# Book: Data Analysis with Pandas and Python (Packt Publishing)

https://github.com/PacktPublishing/Data-Analysis-with-Pandas-and-Python

In [1]:
import pandas as pd

In [None]:
df = pd.read_csv('/content/nba.csv')
df.head(5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [None]:
df.shape

(458, 9)

In [None]:
df.dtypes

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [None]:
# to convert type
df['Salary'] = df['Salary'].astype('int') # does not perform inplace. So it is necessary to assign

0      7730337
1      6796117
3      1148640
6      1170960
7      2165160
        ...   
449    1348440
451     981348
452    2239800
453    2433333
456     947276
Name: Salary, Length: 364, dtype: int64

In [None]:
# categoric types: useful when there are a few column values possible. It is memory efficient

# get the total of unique values for a column
print(df['Position'].nunique()) # there are only 5 different positions. Lets use a categorical type for it!

df['Position'] = df['Position'].astype('category')
df['Team'] = df['Team'].astype('category')

5


In [None]:
df.columns

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [None]:
print(df.index)
print(df.index.values[0:10])

RangeIndex(start=0, stop=458, step=1)
[0 1 2 3 4 5 6 7 8 9]


In [None]:
df.values

array([['Avery Bradley', 'Boston Celtics', 0.0, ..., 180.0, 'Texas',
        7730337.0],
       ['Jae Crowder', 'Boston Celtics', 99.0, ..., 235.0, 'Marquette',
        6796117.0],
       ['John Holland', 'Boston Celtics', 30.0, ..., 205.0,
        'Boston University', nan],
       ...,
       ['Tibor Pleiss', 'Utah Jazz', 21.0, ..., 256.0, nan, 2900000.0],
       ['Jeff Withey', 'Utah Jazz', 24.0, ..., 231.0, 'Kansas', 947276.0],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=object)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


In [None]:
dict(df['Team'].value_counts())

{'New Orleans Pelicans': 19,
 'Memphis Grizzlies': 18,
 'New York Knicks': 16,
 'Milwaukee Bucks': 16,
 'Boston Celtics': 15,
 'Brooklyn Nets': 15,
 'Portland Trail Blazers': 15,
 'Oklahoma City Thunder': 15,
 'Denver Nuggets': 15,
 'Washington Wizards': 15,
 'Miami Heat': 15,
 'Charlotte Hornets': 15,
 'Atlanta Hawks': 15,
 'San Antonio Spurs': 15,
 'Houston Rockets': 15,
 'Dallas Mavericks': 15,
 'Indiana Pacers': 15,
 'Detroit Pistons': 15,
 'Cleveland Cavaliers': 15,
 'Chicago Bulls': 15,
 'Sacramento Kings': 15,
 'Phoenix Suns': 15,
 'Los Angeles Lakers': 15,
 'Los Angeles Clippers': 15,
 'Golden State Warriors': 15,
 'Toronto Raptors': 15,
 'Philadelphia 76ers': 15,
 'Utah Jazz': 15,
 'Orlando Magic': 14,
 'Minnesota Timberwolves': 14}

### Axis

In [None]:
df.sum(axis=0) # sum by column

  df.sum(axis=0) # sum by column


Number    8.079000e+03
Age       1.231100e+04
Weight    1.012360e+05
Salary    2.159837e+09
dtype: float64

In [None]:
df.sum(axis=1) # sum by column

  df.sum(axis=1) # sum by column


0      7730542.0
1      6796476.0
2          262.0
3      1148875.0
4      5000268.0
         ...    
453    2433570.0
454     900228.0
455    2900303.0
456     947557.0
457          0.0
Length: 458, dtype: float64

### Sorting

In [None]:
df.sort_values(by="Name", ascending=False, inplace=True)
df.sort_values(by=["Name", "Team"], ascending=[True, True]).head(10)

Unnamed: 0,Name,Team,Number,new_column_in_file,Position,Age,Height,Weight,College,Salary,new_column
152,Aaron Brooks,Chicago Bulls,0.0,something,PG,31.0,6-0,161.0,Oregon,2250000.0,something
356,Aaron Gordon,Orlando Magic,0.0,something,PF,20.0,6-9,220.0,Arizona,4171680.0,something
328,Aaron Harrison,Charlotte Hornets,9.0,something,SG,21.0,6-6,210.0,Kentucky,525093.0,something
404,Adreian Payne,Minnesota Timberwolves,33.0,something,PF,25.0,6-10,237.0,Michigan State,1938840.0,something
312,Al Horford,Atlanta Hawks,15.0,something,C,30.0,6-10,245.0,Florida,12000000.0,something
428,Al-Farouq Aminu,Portland Trail Blazers,8.0,something,SF,25.0,6-9,215.0,Wake Forest,8042895.0,something
368,Alan Anderson,Washington Wizards,6.0,something,SG,33.0,6-6,220.0,Michigan State,4000000.0,something
135,Alan Williams,Phoenix Suns,15.0,something,C,23.0,6-8,260.0,UC Santa Barbara,83397.0,something
444,Alec Burks,Utah Jazz,10.0,something,SG,24.0,6-6,214.0,Colorado,9463484.0,something
128,Alex Len,Phoenix Suns,21.0,something,C,22.0,7-1,260.0,Maryland,3807120.0,something


In [None]:
df.sort_index(ascending=True, inplace=True)

### Rank

In [None]:
df['SalaryRank'] = df['Salary'].rank(ascending=False).astype('int')
df.sort_values(by='Salary', ascending=False)

Unnamed: 0,Name,Team,Number,new_column_in_file,Position,Age,Height,Weight,College,Salary,new_column,SalaryRank
33,Carmelo Anthony,New York Knicks,7.0,something,SF,32.0,6-8,240.0,Syracuse,22875000.0,something,1
339,Chris Bosh,Miami Heat,1.0,something,PF,32.0,6-11,235.0,Georgia Tech,22192730.0,something,2
100,Chris Paul,Los Angeles Clippers,3.0,something,PG,31.0,6-0,175.0,Wake Forest,21468695.0,something,3
414,Kevin Durant,Oklahoma City Thunder,35.0,something,SF,27.0,6-9,240.0,Texas,20158622.0,something,4
164,Derrick Rose,Chicago Bulls,1.0,something,PG,27.0,6-3,190.0,Memphis,20093064.0,something,5
...,...,...,...,...,...,...,...,...,...,...,...,...
92,Jeff Ayres,Los Angeles Clippers,19.0,something,PF,29.0,6-9,250.0,Arizona State,111444.0,something,359
175,Jordan McRae,Cleveland Cavaliers,12.0,something,SG,25.0,6-5,179.0,Tennessee,111196.0,something,361
135,Alan Williams,Phoenix Suns,15.0,something,C,23.0,6-8,260.0,UC Santa Barbara,83397.0,something,362
291,Orlando Johnson,New Orleans Pelicans,0.0,something,SG,27.0,6-5,220.0,UC Santa Barbara,55722.0,something,363


### Insert Columns

In [None]:
# add new columns to df

# first option (append at the end)
df['new_column'] = 'something'

# second option (allow to choose the column position)
df.insert(loc=3, column='new_column_in_file', value='something')

df.head(5)

Unnamed: 0,Name,Team,Number,new_column_in_file,Position,Age,Height,Weight,College,Salary,new_column
0,Avery Bradley,Boston Celtics,0.0,something,PG,25.0,6-2,180.0,Texas,7730337.0,something
1,Jae Crowder,Boston Celtics,99.0,something,SF,25.0,6-6,235.0,Marquette,6796117.0,something
2,John Holland,Boston Celtics,30.0,something,SG,27.0,6-5,205.0,Boston University,,something
3,R.J. Hunter,Boston Celtics,28.0,something,SG,22.0,6-5,185.0,Georgia State,1148640.0,something
4,Jonas Jerebko,Boston Celtics,8.0,something,PF,29.0,6-10,231.0,,5000000.0,something


### Broadcast operations

In [None]:
df['Salary'].add(1) # or  df['Salary'] + 1
df['Salary'].sub(1) # or  df['Salary'] - 1
df['Salary'].mul(0.8) # or  df['Salary'] * 0.8
df['Salary'].div(0.8) # or  df['Salary'] / 0.8

0      9662921.25
1      8495146.25
2             NaN
3      1435800.00
4      6250000.00
          ...    
453    3041666.25
454    1125000.00
455    3625000.00
456    1184095.00
457           NaN
Name: Salary, Length: 458, dtype: float64

### Missing values


In [None]:
# drop
df.dropna(axis=0, how='any', inplace=True) # remove rows that contains one or more null values
df.dropna(axis=0, how='all', inplace=True) # remove rows that all columns are null
df.dropna(axis=1, how='any', inplace=True) # remove columns that contains one or more null values
df.dropna(subset=['Team'], inplace=True) # remove rows that does not contains Team

In [None]:
# fill
df['Salary'].fillna(0)
df['Team'].fillna('No Team')

0      Boston Celtics
1      Boston Celtics
3      Boston Celtics
6      Boston Celtics
7      Boston Celtics
            ...      
449         Utah Jazz
451         Utah Jazz
452         Utah Jazz
453         Utah Jazz
456         Utah Jazz
Name: Team, Length: 364, dtype: object

In [34]:
# checking for nulls
mask_null = df['Team'].isnull()
df[mask_null].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2023-06-25 06:53:00,61933,4.17,True,
10,Louise,Female,1980-08-12,2023-06-25 09:01:00,63241,15.132,True,
23,,Male,2012-06-14,2023-06-25 16:19:00,125792,5.042,True,


In [35]:
# getting not nulls
mask_not_null= df['Team'].notnull()
df[mask_not_null].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-06-25 12:42:00,97308,6.945,True,Marketing
2,Maria,Female,1993-04-23,2023-06-25 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-06-25 13:00:00,138705,9.34,True,Finance


### Dates

In [53]:
df = pd.read_csv('/content/employees.csv')
df.head(5)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


In [6]:
# converting date strings to datetime
df['Start Date'] = pd.to_datetime(df['Start Date'])
df['Last Login Time'] = pd.to_datetime(df['Last Login Time'])

In [15]:
# another way to set the datetime type is while reading the file
df2 = pd.read_csv('/content/employees.csv', parse_dates=['Start Date', 'Last Login Time'])
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    object        
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  933 non-null    object        
 7   Team               957 non-null    object        
dtypes: datetime64[ns](2), float64(1), int64(1), object(4)
memory usage: 62.6+ KB


In [7]:
# Convert boolean string to boolean
df['Senior Management'] = df['Senior Management'].astype('bool')

In [9]:
# Convert strings to categorical
df['Gender'] = df['Gender'].astype('category')
df['Team'] = df['Team'].astype('category')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    category      
dtypes: bool(1), category(2), datetime64[ns](2), float64(1), int64(1), object(1)
memory usage: 42.6+ KB


### Filtering

In [19]:
# condition
df[df['Gender'] == 'Male'].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-06-25 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-06-25 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2023-06-25 13:00:00,138705,9.34,True,Finance


In [23]:
df[df['Start Date'] >= '2005-03-04'].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
3,Jerry,Male,2005-03-04,2023-06-25 13:00:00,138705,9.34,True,Finance
7,,Female,2015-07-20,2023-06-25 10:43:00,45906,11.598,True,Finance
8,Angela,Female,2005-11-22,2023-06-25 06:29:00,95570,18.523,True,Engineering


In [25]:
# more than one condition
gender_mask = df['Gender'] == 'Male'
start_date_mask = 'Start Date' >= '2005-03-04'

df[gender_mask & start_date_mask].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-06-25 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-06-25 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2023-06-25 13:00:00,138705,9.34,True,Finance


In [26]:
df[gender_mask | start_date_mask].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-06-25 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-06-25 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-06-25 11:17:00,130590,11.858,False,Finance


In [33]:
# filter by column values

df[df['Team'].isin(['Marketing', 'Finance'])].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-06-25 12:42:00,97308,6.945,True,Marketing
2,Maria,Female,1993-04-23,2023-06-25 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-06-25 13:00:00,138705,9.34,True,Finance


In [41]:
# between

df[df['Salary'].between(65476, 97308)].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-06-25 12:42:00,97308,6.945,True,Marketing
6,Ruby,Female,1987-08-17,2023-06-25 16:20:00,65476,10.012,True,Product
8,Angela,Female,2005-11-22,2023-06-25 06:29:00,95570,18.523,True,Engineering


In [49]:
# find duplicates
df.sort_values(by="First Name", ascending=True, inplace=True)

# first occurrence is not threated as duplicated. Another option is to keep the last or False to not keep any duplicate
df[df['First Name'].duplicated(keep='first')]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
327,Aaron,Male,1994-01-29,2023-06-25 18:48:00,58755,5.097,True,Marketing
101,Aaron,Male,2012-02-17,2023-06-25 10:20:00,61602,11.849,True,Marketing
937,Aaron,,1986-01-22,2023-06-25 19:39:00,63126,18.424,False,Client Services
538,Adam,Male,2010-10-08,2023-06-25 21:53:00,45181,3.491,False,Human Resources
302,Adam,Male,2007-07-05,2023-06-25 11:59:00,71276,5.027,True,Human Resources
...,...,...,...,...,...,...,...,...
902,,Male,2001-05-23,2023-06-25 19:52:00,103877,6.322,True,Distribution
925,,Female,2000-08-23,2023-06-25 16:19:00,95866,19.388,True,Sales
946,,Female,1985-09-15,2023-06-25 01:50:00,133472,16.941,True,Distribution
947,,Male,2012-07-30,2023-06-25 15:07:00,107351,5.329,True,Marketing


In [54]:
# drop duplicates
print(len(df))
df.drop_duplicates(subset=['First Name'], keep='first', inplace=True)
print(len(df))

1000
201


In [55]:
# unique values

df['Gender'].unique()

array(['Male', 'Female', nan], dtype=object)

In [57]:
df['Gender'].nunique(dropna=False)

3

#### Where and Query

In [209]:
df = pd.read_csv('/content/jamesbond.csv', index_col='Film')
mask = df['Actor'] == 'Sean Connery'

In [211]:
# with where() rows that do not match are returned with NaNs

df.where(mask).head(5)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962.0,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963.0,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964.0,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Thunderball,1965.0,Sean Connery,Terence Young,848.1,41.9,4.7
Casino Royale,,,,,,


In [212]:
# query() just works if column's names do not have spaces
df.columns = [column.replace(' ', '_') for column in df.columns]
df.columns

Index(['Year', 'Actor', 'Director', 'Box_Office', 'Budget',
       'Bond_Actor_Salary'],
      dtype='object')

In [219]:
df.query('Actor != "Sean Connery" and Actor != "David Niven"').head(5)

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
On Her Majesty's Secret Service,1969,George Lazenby,Peter R. Hunt,291.5,37.3,0.6
Live and Let Die,1973,Roger Moore,Guy Hamilton,460.3,30.8,
The Man with the Golden Gun,1974,Roger Moore,Guy Hamilton,334.0,27.7,
The Spy Who Loved Me,1977,Roger Moore,Lewis Gilbert,533.0,45.1,
Moonraker,1979,Roger Moore,Lewis Gilbert,535.0,91.5,


### Index




In [173]:
df = pd.read_csv('/content/jamesbond.csv', index_col='Film')
df.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [74]:
df2 = pd.read_csv('/content/jamesbond.csv', index_col='Film')
df2.set_index(keys=['Year'], inplace=True)
df2.head(3)

Unnamed: 0_level_0,Actor,Director,Box Office,Budget,Bond Actor Salary
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1962,Sean Connery,Terence Young,448.8,7.0,0.6
1963,Sean Connery,Terence Young,543.8,12.6,1.6
1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [82]:
# before change a index to another one, is a good idea to reset the index before use the set_index method.

df.reset_index(drop=False, inplace=True) # drop=False keeps old index as a normal column. Otherwise, drop it.
df.head(3)

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
1,From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
2,Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [93]:
# It is a good idea to sort the index for better performance

df.sort_index(inplace=True)

### Rename index and/or columns

In [167]:
# rename index using mapper. Mapper can also be used to rename columns
df.rename(mapper={'Dr. No': 'Dr. No renamed'}, axis=0, inplace=True)
df.rename(mapper={'Budget': 'Budget renamed'}, axis=1, inplace=True)


# another option is to use index insted of mapper
df.rename(index={'Thunderball': 'Thunderball renamed'}, inplace=True)

# similarly, there is a columns option
df.rename(columns={'Thunderball renamed': 'Budget renamed2'}, inplace=True)


df.head(5)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget renamed,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No renamed,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Thunderball renamed,1965,Sean Connery,Terence Young,848.1,41.9,4.7
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


### Drop rows and columns

In [168]:
# drop row by index
df.drop('Dr. No renamed', inplace=True, axis=0)

In [None]:
# drop rows by index

df.drop(['Dr. No renamed', 'Die Another Day'], axis=0, inplace=True)

In [169]:
# drop column
df.drop('Budget renamed', axis=1, inplace=True)

In [175]:
# drop columns
df.drop(labels=['Year', 'Bond Actor Salary'], axis=1, inplace=True)

In [176]:
df.head(5)

Unnamed: 0_level_0,Actor,Director,Box Office,Budget
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dr. No,Sean Connery,Terence Young,448.8,7.0
From Russia with Love,Sean Connery,Terence Young,543.8,12.6
Goldfinger,Sean Connery,Guy Hamilton,820.4,18.6
Thunderball,Sean Connery,Terence Young,848.1,41.9
Casino Royale,David Niven,Ken Hughes,315.0,85.0


In [179]:
# another option is to use pop(). It removes the series and returns it

df.pop('Budget')

Film
Dr. No                               7.0
From Russia with Love               12.6
Goldfinger                          18.6
Thunderball                         41.9
Casino Royale                       85.0
You Only Live Twice                 59.9
On Her Majesty's Secret Service     37.3
Diamonds Are Forever                34.7
Live and Let Die                    30.8
The Man with the Golden Gun         27.7
The Spy Who Loved Me                45.1
Moonraker                           91.5
For Your Eyes Only                  60.2
Never Say Never Again               86.0
Octopussy                           53.9
A View to a Kill                    54.5
The Living Daylights                68.8
Licence to Kill                     56.7
GoldenEye                           76.9
Tomorrow Never Dies                133.9
The World Is Not Enough            158.3
Die Another Day                    154.2
Casino Royale                      145.3
Quantum of Solace                  181.4
Skyfall    

In [180]:
df.head(5)

Unnamed: 0_level_0,Actor,Director,Box Office
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dr. No,Sean Connery,Terence Young,448.8
From Russia with Love,Sean Connery,Terence Young,543.8
Goldfinger,Sean Connery,Guy Hamilton,820.4
Thunderball,Sean Connery,Terence Young,848.1
Casino Royale,David Niven,Ken Hughes,315.0


In [181]:
# another option is to use del(). It removes the series.

del df['Director']
df.head(5)

Unnamed: 0_level_0,Actor,Box Office
Film,Unnamed: 1_level_1,Unnamed: 2_level_1
Dr. No,Sean Connery,448.8
From Russia with Love,Sean Connery,543.8
Goldfinger,Sean Connery,820.4
Thunderball,Sean Connery,848.1
Casino Royale,David Niven,315.0


### Copy

In [229]:
df = pd.read_csv('/content/jamesbond.csv', index_col='Film')
df2 = df.copy()

### Loc

Return rows based on index labels

In [200]:
# you can filter by index using loc
## a vantage of series compared to dict is that series allows duplicated keys

df = pd.read_csv('/content/jamesbond.csv', index_col='Film')
df.sort_index(inplace=True)
df.loc[['Casino Royale']]

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [101]:
df.loc['Casino Royale':'Diamonds Are Forever'] # slicing is inclusive (start:end)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8


In [102]:
df.loc[:'Diamonds Are Forever']  # until Diamonds Are Forever

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8


In [104]:
df.loc['Casino Royale':] # from Casino Royale until the end
Output = None

In [105]:
df.loc[['Casino Royale', 'Goldfinger']] # specific indexes

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [118]:
# selecting columns
df.loc['Casino Royale':, ['Actor', 'Director']].head(3)

Unnamed: 0_level_0,Actor,Director
Film,Unnamed: 1_level_1,Unnamed: 2_level_1
Casino Royale,Daniel Craig,Martin Campbell
Casino Royale,David Niven,Ken Hughes
Diamonds Are Forever,Sean Connery,Guy Hamilton


In [120]:
df.loc['Diamonds Are Forever', 'Actor']

'Sean Connery'

In [128]:
df.loc['Diamonds Are Forever', 'Actor':'Box Office']

Actor         Sean Connery
Director      Guy Hamilton
Box Office           442.5
Name: Diamonds Are Forever, dtype: object

In [138]:
# updating cell value
df.loc['Diamonds Are Forever', 'Actor'] = 'Tania'
df.head(5)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Tania,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9


In [143]:
# updating mulitple cells

df.loc['Diamonds Are Forever', ['Actor', 'Director', 'Salary']] = ['Tania2', 'Director2', '1000000']
df.head(5)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary,Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1,
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3,
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,,
Diamonds Are Forever,1971,Tania2,Director2,442.5,34.7,5.8,1000000.0
Die Another Day,2002,Tania-Iloc,Director-Iloc,465.4,154.2,17.9,3000000.0


In [151]:
# updating by condition

actor_sean_connery = df['Actor'] == 'Sean Connery' # condition
df.loc[actor_sean_connery, 'Actor'] = 'Sean Connery2'
df.head(10)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery2,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery2,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery2,Guy Hamilton,820.4,18.6,3.2
Thunderball,1965,Sean Connery2,Terence Young,848.1,41.9,4.7
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
You Only Live Twice,1967,Sean Connery2,Lewis Gilbert,514.2,59.9,4.4
On Her Majesty's Secret Service,1969,George Lazenby,Peter R. Hunt,291.5,37.3,0.6
Diamonds Are Forever,1971,Sean Connery2,Guy Hamilton,442.5,34.7,5.8
Live and Let Die,1973,Roger Moore,Guy Hamilton,460.3,30.8,
The Man with the Golden Gun,1974,Roger Moore,Guy Hamilton,334.0,27.7,


### Iloc

Return rows based on index position. It slicing is exclusive

In [110]:
df.iloc[[0]]

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1


In [111]:
df.iloc[[0,1]]

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3


In [113]:
df.iloc[0:2] # exclusive slicing (do not returns the last)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3


In [115]:
df.iloc[20:] # from line 20 to the end

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
The Man with the Golden Gun,1974,Roger Moore,Guy Hamilton,334.0,27.7,
The Spy Who Loved Me,1977,Roger Moore,Lewis Gilbert,533.0,45.1,
The World Is Not Enough,1999,Pierce Brosnan,Michael Apted,439.5,158.3,13.5
Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
Tomorrow Never Dies,1997,Pierce Brosnan,Roger Spottiswoode,463.2,133.9,10.0
You Only Live Twice,1967,Sean Connery,Lewis Gilbert,514.2,59.9,4.4


In [116]:
df.iloc[:5] # first 5 rows

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9


In [129]:
# select specific columns by column position

df.iloc[:5, [0,2]]

Unnamed: 0_level_0,Year,Director
Film,Unnamed: 1_level_1,Unnamed: 2_level_1
A View to a Kill,1985,John Glen
Casino Royale,2006,Martin Campbell
Casino Royale,1967,Ken Hughes
Diamonds Are Forever,1971,Guy Hamilton
Die Another Day,2002,Lee Tamahori


In [130]:
df.iloc[:5, 1:3]

Unnamed: 0_level_0,Actor,Director
Film,Unnamed: 1_level_1,Unnamed: 2_level_1
A View to a Kill,Roger Moore,John Glen
Casino Royale,Daniel Craig,Martin Campbell
Casino Royale,David Niven,Ken Hughes
Diamonds Are Forever,Sean Connery,Guy Hamilton
Die Another Day,Pierce Brosnan,Lee Tamahori


In [135]:
# updating cell value
df.iloc[4, 1] = 'Tania'
df.head(5)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary,1
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A View to a Kill,1985.0,Roger Moore,John Glen,275.2,54.5,9.1,
Casino Royale,2006.0,Daniel Craig,Martin Campbell,581.5,145.3,3.3,
Casino Royale,1967.0,David Niven,Ken Hughes,315.0,85.0,,
Diamonds Are Forever,1971.0,Tania,Guy Hamilton,442.5,34.7,5.8,
Die Another Day,2002.0,Tania,Lee Tamahori,465.4,154.2,17.9,


In [140]:
# updating multiple cells
df.iloc[4, [1, 2, 6]] = ['Tania-Iloc', 'Director-Iloc', '3000000']
df.head(5)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary,Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1,
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3,
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,,
Diamonds Are Forever,1971,Tania2,Director2,442.5,34.7,5.8,1000000.0
Die Another Day,2002,Tania-Iloc,Director-Iloc,465.4,154.2,17.9,3000000.0


### Random Sampling

In [193]:
df.sample(n=5, random_state=1) # number of rows

Unnamed: 0_level_0,Actor,Box Office
Film,Unnamed: 1_level_1,Unnamed: 2_level_1
GoldenEye,Pierce Brosnan,518.5
Never Say Never Again,Sean Connery,380.0
Licence to Kill,Timothy Dalton,250.9
Thunderball,Sean Connery,848.1
Casino Royale,Daniel Craig,581.5


In [196]:
df.sample(frac=.25, random_state=1) # % of rows

Unnamed: 0_level_0,Actor,Box Office
Film,Unnamed: 1_level_1,Unnamed: 2_level_1
GoldenEye,Pierce Brosnan,518.5
Never Say Never Again,Sean Connery,380.0
Licence to Kill,Timothy Dalton,250.9
Thunderball,Sean Connery,848.1
Casino Royale,Daniel Craig,581.5
The Spy Who Loved Me,Roger Moore,533.0


In [199]:
# select random columns

df.sample(n=1, axis=1, random_state=1).head(5)

Unnamed: 0_level_0,Actor
Film,Unnamed: 1_level_1
Dr. No,Sean Connery
From Russia with Love,Sean Connery
Goldfinger,Sean Connery
Thunderball,Sean Connery
Casino Royale,David Niven


### Find the smallest (nsmallest()) and largest (nlargest()) columns'values

In [201]:
df.nlargest(n=3, columns='Budget', keep='first')

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Spectre,2015,Daniel Craig,Sam Mendes,726.7,206.3,
Quantum of Solace,2008,Daniel Craig,Marc Forster,514.2,181.4,8.1
Skyfall,2012,Daniel Craig,Sam Mendes,943.5,170.2,14.5


In [203]:
# another way
df['Budget'].nlargest(3)

Film
Spectre              206.3
Quantum of Solace    181.4
Skyfall              170.2
Name: Budget, dtype: float64

In [202]:
df.nsmallest(n=3, columns='Budget', keep='first')

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [206]:
# another way
df['Budget'].nsmallest(3)

Film
Dr. No                    7.0
From Russia with Love    12.6
Goldfinger               18.6
Name: Budget, dtype: float64

### Apply

In [224]:
# apply function to column

def some_function(number):
  return str(number) + ' test'

df['Box_Office'] = df['Box_Office'].apply(some_function)

In [225]:
df.head(5)

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8 test,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8 test,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4 test,18.6,3.2
Thunderball,1965,Sean Connery,Terence Young,848.1 test,41.9,4.7
Casino Royale,1967,David Niven,Ken Hughes,315.0 test,85.0,


In [228]:
# apply function to row

def review_movie(row):
  actor = row[1]
  budget = row[4]
  if actor == 'Sean Connery':
    return 'The best!'
  return 'So so'

df['Review'] = df.apply(review_movie, axis=1)
df.head(5)

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary,Review
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8 test,7.0,0.6,The best!
From Russia with Love,1963,Sean Connery,Terence Young,543.8 test,12.6,1.6,The best!
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4 test,18.6,3.2,The best!
Thunderball,1965,Sean Connery,Terence Young,848.1 test,41.9,4.7,The best!
Casino Royale,1967,David Niven,Ken Hughes,315.0 test,85.0,,So so
