In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('gapminder.csv')
df.head()

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.85303
2,Afghanistan,1962,10267083,Asia,31.997,853.10071
3,Afghanistan,1967,11537966,Asia,34.02,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106


## Sorting

In [3]:
df.sort_values(by='year')

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
528,France,1952,42459667,Europe,67.410,7029.809327
540,Gabon,1952,420702,Africa,37.003,4293.476475
1656,West Bank and Gaza,1952,1030585,Asia,43.160,1515.592329
552,Gambia,1952,284320,Africa,30.000,485.230659
...,...,...,...,...,...,...
1127,Niger,2007,12894865,Africa,56.867,619.676892
1139,Nigeria,2007,135031164,Africa,46.859,2013.977305
1151,Norway,2007,4627926,Europe,80.196,49357.190170
1175,Pakistan,2007,169270617,Asia,65.483,2605.947580


In [4]:
df.sort_values(by='year', ascending=False)

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
1703,Zimbabwe,2007,12311143,Africa,43.487,469.709298
491,Equatorial Guinea,2007,551201,Africa,51.579,12154.089750
515,Ethiopia,2007,76511887,Africa,52.947,690.805576
527,Finland,2007,5238460,Europe,79.313,33207.084400
539,France,2007,61083916,Europe,80.657,30470.016700
...,...,...,...,...,...,...
1116,Niger,1952,3379468,Africa,37.444,761.879376
1128,Nigeria,1952,33119096,Africa,36.324,1077.281856
1140,Norway,1952,3327728,Europe,72.670,10095.421720
1152,Oman,1952,507833,Asia,37.578,1828.230307


## Multiple column sorting

In [5]:
df.sort_values(by=['year', 'life_exp'])

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
552,Gambia,1952,284320,Africa,30.000,485.230659
36,Angola,1952,4232095,Africa,30.015,3520.610273
1344,Sierra Leone,1952,2143249,Africa,30.331,879.787736
1032,Mozambique,1952,6446316,Africa,31.286,468.526038
...,...,...,...,...,...,...
71,Australia,2007,20434176,Oceania,81.235,34435.367440
1487,Switzerland,2007,7554661,Europe,81.701,37506.419070
695,Iceland,2007,301931,Europe,81.757,36180.789190
671,"Hong Kong, China",2007,6980412,Asia,82.208,39724.978670


## One column ascending & other descending

In [6]:
df.sort_values(by=['year', 'life_exp'], ascending=[False, True])

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
1463,Swaziland,2007,1133066,Africa,39.613,4513.480643
1043,Mozambique,2007,19951656,Africa,42.082,823.685621
1691,Zambia,2007,11746035,Africa,42.384,1271.211593
1355,Sierra Leone,2007,6144562,Africa,42.568,862.540756
887,Lesotho,2007,2012649,Africa,42.592,1569.331442
...,...,...,...,...,...,...
408,Denmark,1952,4334000,Europe,70.780,9692.385245
1464,Sweden,1952,7124673,Europe,71.860,8527.844662
1080,Netherlands,1952,10381988,Europe,72.130,8941.571858
684,Iceland,1952,147962,Europe,72.490,7267.688428


## Creating series and Df from scratch

In [10]:
ser = pd.Series([1, 10, 100, 1000])
ser

0       1
1      10
2     100
3    1000
dtype: int64

In [11]:
ser.index = ['a', 'b', 'c', 'd']
ser

a       1
b      10
c     100
d    1000
dtype: int64

In [12]:
df = pd.DataFrame(ser)
df

Unnamed: 0,0
a,1
b,10
c,100
d,1000


In [13]:
arr = np.arange(20).reshape(5, 4)
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

In [17]:
temp = pd.DataFrame(arr, columns=['age', 'gender', 'school', 'name'])
temp

Unnamed: 0,age,gender,school,name
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [18]:
# DF to array
temp.values

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

## Concat Dataframes

In [24]:
a = pd.DataFrame({'A':[10, 30], 'B':[20, 40]})
b = pd.DataFrame({'A':[10, 30], 'C':[20, 40]})

In [25]:
a

Unnamed: 0,A,B
0,10,20
1,30,40


In [26]:
b

Unnamed: 0,A,C
0,10,20
1,30,40


In [27]:
pd.concat([a, b])

Unnamed: 0,A,B,C
0,10,20.0,
1,30,40.0,
0,10,,20.0
1,30,,40.0


In [28]:
pd.concat([a, b], axis=0).reset_index(drop=True)

Unnamed: 0,A,B,C
0,10,20.0,
1,30,40.0,
2,10,,20.0
3,30,,40.0


In [29]:
pd.concat([a, b], axis=1)

Unnamed: 0,A,B,A.1,C
0,10,20,10,20
1,30,40,30,40


In [31]:
#Outer join is the default one
pd.concat([a, b], join='outer')

Unnamed: 0,A,B,C
0,10,20.0,
1,30,40.0,
0,10,,20.0
1,30,,40.0


In [33]:
#intersection columns will be displayed
pd.concat([a, b], join='inner')

Unnamed: 0,A
0,10
1,30
0,10
1,30


## Difference b/w Merge & Concat
### In merging there should be atleast one common column

![image.png](attachment:image.png)

In [44]:
users = pd.DataFrame({'userid':[1, 2, 3], 'name':['A', 'B', 'C']})
msgs = pd.DataFrame({'userid':[1, 1, 2, 1, 1, 4], 'msg':['hello', 'hi', 'hi', 'sup', 'bye', 'go']})

In [45]:
users

Unnamed: 0,userid,name
0,1,A
1,2,B
2,3,C


In [46]:
msgs

Unnamed: 0,userid,msg
0,1,hello
1,1,hi
2,2,hi
3,1,sup
4,1,bye
5,4,go


In [47]:
pd.merge(users, msgs)

Unnamed: 0,userid,name,msg
0,1,A,hello
1,1,A,hi
2,1,A,sup
3,1,A,bye
4,2,B,hi


In [48]:
 pd.merge(users, msgs, how='inner')

Unnamed: 0,userid,name,msg
0,1,A,hello
1,1,A,hi
2,1,A,sup
3,1,A,bye
4,2,B,hi


In [49]:
pd.merge(users, msgs, how='left')

Unnamed: 0,userid,name,msg
0,1,A,hello
1,1,A,hi
2,1,A,sup
3,1,A,bye
4,2,B,hi
5,3,C,


In [50]:
pd.merge(users, msgs, how='right')

Unnamed: 0,userid,name,msg
0,1,A,hello
1,1,A,hi
2,2,B,hi
3,1,A,sup
4,1,A,bye
5,4,,go


In [51]:
pd.merge(users, msgs, how='outer')

Unnamed: 0,userid,name,msg
0,1,A,hello
1,1,A,hi
2,1,A,sup
3,1,A,bye
4,2,B,hi
5,3,C,
6,4,,go


### If we do not have common column name

In [55]:
users.rename(columns={'userid':'id'}, inplace=True)
users

Unnamed: 0,id,name
0,1,A
1,2,B
2,3,C


In [57]:
pd.merge(users, msgs, left_on='id', right_on='userid')

Unnamed: 0,id,name,userid,msg
0,1,A,1,hello
1,1,A,1,hi
2,1,A,1,sup
3,1,A,1,bye
4,2,B,2,hi


In [58]:
a

Unnamed: 0,A,B
0,10,20
1,30,40


In [59]:
b

Unnamed: 0,A,C
0,10,20
1,30,40


In [60]:
a.append(b, ignore_index=True)

  a.append(b, ignore_index=True)


Unnamed: 0,A,B,C
0,10,20.0,
1,30,40.0,
2,10,,20.0
3,30,,40.0


## How Join works

![image.png](attachment:image.png)

### How you can calculate mean of two columns population and gdp_cap and store it in another column as "average"

In [63]:
df = pd.read_csv('gapminder.csv')
df

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1967,11537966,Asia,34.020,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,1987,9216418,Africa,62.351,706.157306
1700,Zimbabwe,1992,10704340,Africa,60.377,693.420786
1701,Zimbabwe,1997,11404948,Africa,46.809,792.449960
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623


In [66]:
col = ['population', 'gdp_cap']
df['average'] = df[col].mean(axis=1)
df

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap,average
0,Afghanistan,1952,8425333,Asia,28.801,779.445314,4.213056e+06
1,Afghanistan,1957,9240934,Asia,30.332,820.853030,4.620877e+06
2,Afghanistan,1962,10267083,Asia,31.997,853.100710,5.133968e+06
3,Afghanistan,1967,11537966,Asia,34.020,836.197138,5.769401e+06
4,Afghanistan,1972,13079460,Asia,36.088,739.981106,6.540100e+06
...,...,...,...,...,...,...,...
1699,Zimbabwe,1987,9216418,Africa,62.351,706.157306,4.608562e+06
1700,Zimbabwe,1992,10704340,Africa,60.377,693.420786,5.352517e+06
1701,Zimbabwe,1997,11404948,Africa,46.809,792.449960,5.702870e+06
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623,5.963618e+06


## IMDB

In [67]:
import pandas as pd

In [69]:
movies = pd.read_csv("movies.csv", index_col=0)
movies.head().reset_index(drop=True)

Unnamed: 0,id,budget,popularity,revenue,title,vote_average,vote_count,director_id,year,month,day
0,43597,237000000,150,2787965087,Avatar,7.2,11800,4762,2009,Dec,Thursday
1,43598,300000000,139,961000000,Pirates of the Caribbean: At World's End,6.9,4500,4763,2007,May,Saturday
2,43599,245000000,107,880674609,Spectre,6.3,4466,4764,2015,Oct,Monday
3,43600,250000000,112,1084939099,The Dark Knight Rises,7.6,9106,4765,2012,Jul,Monday
4,43602,258000000,115,890871626,Spider-Man 3,5.9,3576,4767,2007,May,Tuesday


In [70]:
movies.shape

(1465, 11)

In [72]:
directors = pd.read_csv('directors.csv', index_col=0)
directors

Unnamed: 0,director_name,id,gender
0,James Cameron,4762,Male
1,Gore Verbinski,4763,Male
2,Sam Mendes,4764,Male
3,Christopher Nolan,4765,Male
4,Andrew Stanton,4766,Male
...,...,...,...
2344,Shane Carruth,7106,Male
2345,Neill Dela Llana,7107,
2346,Scott Smith,7108,
2347,Daniel Hsia,7109,Male


In [73]:
directors.duplicated().sum()

0

In [74]:
movies['director_id'].nunique()

199

In [75]:
directors['id'].nunique()

2349

In [77]:
np.all(movies['director_id'].isin(directors['id']))

True

In [79]:
data = pd.merge(movies, directors, how='left', left_on='director_id', right_on='id')
data.head()

Unnamed: 0,id_x,budget,popularity,revenue,title,vote_average,vote_count,director_id,year,month,day,director_name,id_y,gender
0,43597,237000000,150,2787965087,Avatar,7.2,11800,4762,2009,Dec,Thursday,James Cameron,4762,Male
1,43598,300000000,139,961000000,Pirates of the Caribbean: At World's End,6.9,4500,4763,2007,May,Saturday,Gore Verbinski,4763,Male
2,43599,245000000,107,880674609,Spectre,6.3,4466,4764,2015,Oct,Monday,Sam Mendes,4764,Male
3,43600,250000000,112,1084939099,The Dark Knight Rises,7.6,9106,4765,2012,Jul,Monday,Christopher Nolan,4765,Male
4,43602,258000000,115,890871626,Spider-Man 3,5.9,3576,4767,2007,May,Tuesday,Sam Raimi,4767,Male


In [80]:
data.drop('id_y', axis= 1, inplace=True)

In [81]:
data.head()

Unnamed: 0,id_x,budget,popularity,revenue,title,vote_average,vote_count,director_id,year,month,day,director_name,gender
0,43597,237000000,150,2787965087,Avatar,7.2,11800,4762,2009,Dec,Thursday,James Cameron,Male
1,43598,300000000,139,961000000,Pirates of the Caribbean: At World's End,6.9,4500,4763,2007,May,Saturday,Gore Verbinski,Male
2,43599,245000000,107,880674609,Spectre,6.3,4466,4764,2015,Oct,Monday,Sam Mendes,Male
3,43600,250000000,112,1084939099,The Dark Knight Rises,7.6,9106,4765,2012,Jul,Monday,Christopher Nolan,Male
4,43602,258000000,115,890871626,Spider-Man 3,5.9,3576,4767,2007,May,Tuesday,Sam Raimi,Male


In [83]:
data['revenue'] = (data['revenue']/1000000).round(2)
data['budget'] = (data['budget']/1000000).round(2)
data

Unnamed: 0,id_x,budget,popularity,revenue,title,vote_average,vote_count,director_id,year,month,day,director_name,gender
0,43597,237.00,150,0.0,Avatar,7.2,11800,4762,2009,Dec,Thursday,James Cameron,Male
1,43598,300.00,139,0.0,Pirates of the Caribbean: At World's End,6.9,4500,4763,2007,May,Saturday,Gore Verbinski,Male
2,43599,245.00,107,0.0,Spectre,6.3,4466,4764,2015,Oct,Monday,Sam Mendes,Male
3,43600,250.00,112,0.0,The Dark Knight Rises,7.6,9106,4765,2012,Jul,Monday,Christopher Nolan,Male
4,43602,258.00,115,0.0,Spider-Man 3,5.9,3576,4767,2007,May,Tuesday,Sam Raimi,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1460,48363,0.00,3,0.0,The Last Waltz,7.9,64,4809,1978,May,Monday,Martin Scorsese,Male
1461,48370,0.03,19,0.0,Clerks,7.4,755,5369,1994,Sep,Tuesday,Kevin Smith,Male
1462,48375,0.00,7,0.0,Rampage,6.0,131,5148,2009,Aug,Friday,Uwe Boll,Male
1463,48376,0.00,3,0.0,Slacker,6.4,77,5535,1990,Jul,Friday,Richard Linklater,Male


## Masking

In [91]:
mask = data['vote_average'] > 7

In [93]:
data.loc[mask]

Unnamed: 0,id_x,budget,popularity,revenue,title,vote_average,vote_count,director_id,year,month,day,director_name,gender
0,43597,237.00,150,0.0,Avatar,7.2,11800,4762,2009,Dec,Thursday,James Cameron,Male
3,43600,250.00,112,0.0,The Dark Knight Rises,7.6,9106,4765,2012,Jul,Monday,Christopher Nolan,Male
14,43616,250.00,120,0.0,The Hobbit: The Battle of the Five Armies,7.1,4760,4777,2014,Dec,Wednesday,Peter Jackson,Male
16,43619,250.00,94,0.0,The Hobbit: The Desolation of Smaug,7.6,4524,4777,2013,Dec,Wednesday,Peter Jackson,Male
19,43622,200.00,100,0.0,Titanic,7.5,7562,4762,1997,Nov,Tuesday,James Cameron,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,48321,0.01,20,0.0,Eraserhead,7.5,485,5265,1977,Mar,Saturday,David Lynch,Male
1457,48323,0.00,5,0.0,The Mighty,7.1,51,4921,1998,Oct,Friday,Peter Chelsom,Male
1458,48335,0.06,27,0.0,Pi,7.1,586,4881,1998,Jul,Friday,Darren Aronofsky,Male
1460,48363,0.00,3,0.0,The Last Waltz,7.9,64,4809,1978,May,Monday,Martin Scorsese,Male


In [97]:
data[mask][['title', 'vote_average']]

Unnamed: 0,title,vote_average
0,Avatar,7.2
3,The Dark Knight Rises,7.6
14,The Hobbit: The Battle of the Five Armies,7.1
16,The Hobbit: The Desolation of Smaug,7.6
19,Titanic,7.5
...,...,...
1456,Eraserhead,7.5
1457,The Mighty,7.1
1458,Pi,7.1
1460,The Last Waltz,7.9


In [101]:
data.loc[mask, ['title', 'vote_average']]

Unnamed: 0,title,vote_average
0,Avatar,7.2
3,The Dark Knight Rises,7.6
14,The Hobbit: The Battle of the Five Armies,7.1
16,The Hobbit: The Desolation of Smaug,7.6
19,Titanic,7.5
...,...,...
1456,Eraserhead,7.5
1457,The Mighty,7.1
1458,Pi,7.1
1460,The Last Waltz,7.9


In [90]:
data.iloc[[1, 10, 100]]

Unnamed: 0,id_x,budget,popularity,revenue,title,vote_average,vote_count,director_id,year,month,day,director_name,gender
1,43598,300.0,139,0.0,Pirates of the Caribbean: At World's End,6.9,4500,4763,2007,May,Saturday,Gore Verbinski,Male
10,43611,225.0,99,0.0,Man of Steel,6.5,6359,4771,2013,Jun,Wednesday,Zack Snyder,Male
100,43763,130.0,59,0.0,G.I. Joe: Retaliation,5.4,3025,4868,2013,Mar,Tuesday,Jon M. Chu,Male


In [96]:
data.iloc[[1, 10, 100]][['title', 'vote_average']]

Unnamed: 0,title,vote_average
1,Pirates of the Caribbean: At World's End,6.9
10,Man of Steel,6.5
100,G.I. Joe: Retaliation,5.4


### Recently released movie

In [108]:
data[(data['vote_average'] > 7) & (data['year'] > 2014)].sort_values(by='vote_average', ascending= False)

Unnamed: 0,id_x,budget,popularity,revenue,title,vote_average,vote_count,director_id,year,month,day,director_name,gender
833,45293,28.0,61,0.0,Straight Outta Compton,7.7,1355,5033,2015,Aug,Thursday,F. Gary Gray,Male
162,43867,108.0,167,0.0,The Martian,7.6,7268,4779,2015,Sep,Wednesday,Ridley Scott,Male
394,44281,44.0,68,0.0,The Hateful Eight,7.6,4274,4927,2015,Dec,Friday,Quentin Tarantino,Male
30,43641,190.0,102,0.0,Furious 7,7.3,4176,4794,2015,Apr,Wednesday,James Wan,Male
106,43773,135.0,100,0.0,The Revenant,7.3,6396,4874,2015,Dec,Friday,Alejandro González Iñárritu,Male
808,45194,30.0,65,0.0,Southpaw,7.3,2067,5034,2015,Jun,Monday,Antoine Fuqua,Male
839,45301,28.0,57,0.0,The Big Short,7.3,2607,4925,2015,Dec,Friday,Adam McKay,Male
78,43724,150.0,434,0.0,Mad Max: Fury Road,7.2,9427,4845,2015,May,Wednesday,George Miller,Male
635,44784,40.0,48,0.0,Bridge of Spies,7.2,2583,4799,2015,Oct,Thursday,Steven Spielberg,Male
312,44128,75.0,48,0.0,The Man from U.N.C.L.E.,7.1,2265,4888,2015,Aug,Thursday,Guy Ritchie,Male


## List down the movies which are listed before 'Avengers' (Alphabetically)

In [109]:
data[data['title'] < 'Avengers']

Unnamed: 0,id_x,budget,popularity,revenue,title,vote_average,vote_count,director_id,year,month,day,director_name,gender
0,43597,237.0,150,0.0,Avatar,7.2,11800,4762,2009,Dec,Thursday,James Cameron,Male
23,43629,200.0,78,0.0,Alice in Wonderland,6.4,4645,4785,2010,Mar,Wednesday,Tim Burton,Male
40,43656,200.0,45,0.0,2012,5.6,4903,4803,2009,Oct,Saturday,Roland Emmerich,Male
41,43657,200.0,39,0.0,A Christmas Carol,6.6,1095,4804,2009,Nov,Wednesday,Robert Zemeckis,Male
69,43709,155.0,39,0.0,Alexander,5.6,927,4839,2004,Nov,Sunday,Oliver Stone,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1390,47491,3.0,7,0.0,A Room with a View,6.9,156,5826,1985,Dec,Friday,James Ivory,Male
1395,47575,3.0,3,0.0,Amnesiac,4.1,52,5951,2015,Aug,Friday,Michael Polish,Male
1405,47686,2.0,23,0.0,Amores perros,7.6,521,4874,2000,Jun,Friday,Alejandro González Iñárritu,Male
1432,47970,0.0,3,0.0,All the Real Girls,5.9,30,5231,2003,Aug,Friday,David Gordon Green,Male


## Display all the movie starts with 'The'

In [114]:
data[data['title'].str.startswith('The')]

Unnamed: 0,id_x,budget,popularity,revenue,title,vote_average,vote_count,director_id,year,month,day,director_name,gender
3,43600,250.00,112,0.0,The Dark Knight Rises,7.6,9106,4765,2012,Jul,Monday,Christopher Nolan,Male
9,43610,255.00,49,0.0,The Lone Ranger,5.9,2311,4763,2013,Jul,Wednesday,Gore Verbinski,Male
11,43612,225.00,53,0.0,The Chronicles of Narnia: Prince Caspian,6.3,1630,4774,2008,May,Thursday,Andrew Adamson,Male
14,43616,250.00,120,0.0,The Hobbit: The Battle of the Five Armies,7.1,4760,4777,2014,Dec,Wednesday,Peter Jackson,Male
16,43619,250.00,94,0.0,The Hobbit: The Desolation of Smaug,7.6,4524,4777,2013,Dec,Wednesday,Peter Jackson,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1435,48095,0.60,10,0.0,The Kentucky Fried Movie,6.4,66,5217,1977,Aug,Wednesday,John Landis,Male
1443,48192,0.35,35,0.0,The Evil Dead,7.3,894,4767,1981,Oct,Thursday,Sam Raimi,Male
1449,48244,0.25,6,0.0,The Canyons,4.1,75,5970,2013,Jul,Monday,Paul Schrader,
1457,48323,0.00,5,0.0,The Mighty,7.1,51,4921,1998,Oct,Friday,Peter Chelsom,Male


## Top 5 popular movies

In [117]:
data.sort_values(by='vote_average', ascending=False).head(5)

Unnamed: 0,id_x,budget,popularity,revenue,title,vote_average,vote_count,director_id,year,month,day,director_name,gender
1284,46829,8.0,121,0.0,Pulp Fiction,8.3,8428,4927,1994,Oct,Saturday,Quentin Tarantino,Male
383,44259,63.0,146,0.0,Fight Club,8.3,9413,4829,1999,Oct,Friday,David Fincher,Male
901,45415,22.0,104,0.0,Schindler's List,8.3,4329,4799,1993,Nov,Monday,Steven Spielberg,Male
459,44406,55.0,138,0.0,Forrest Gump,8.2,7927,4804,1994,Jul,Wednesday,Robert Zemeckis,Male
45,43662,185.0,187,0.0,The Dark Knight,8.2,12002,4765,2008,Jul,Wednesday,Christopher Nolan,Male


## Find all the movies directed by 'Steven Spielberg'

In [125]:
data[data['director_name'] == 'Steven Spielberg']

Unnamed: 0,id_x,budget,popularity,revenue,title,vote_average,vote_count,director_id,year,month,day,director_name,gender
37,43650,185.0,75,0.0,Indiana Jones and the Kingdom of the Crystal S...,5.7,2495,4799,2008,May,Wednesday,Steven Spielberg,Male
105,43772,140.0,44,0.0,The BFG,6.0,1000,4799,2016,Jun,Wednesday,Steven Spielberg,Male
110,43782,132.0,48,0.0,War of the Worlds,6.2,2322,4799,2005,Jun,Tuesday,Steven Spielberg,Male
114,43787,130.0,89,0.0,The Adventures of Tintin,6.7,2061,4799,2011,Oct,Tuesday,Steven Spielberg,Male
166,43872,102.0,65,0.0,Minority Report,7.1,2608,4799,2002,Jun,Thursday,Steven Spielberg,Male
219,43960,100.0,34,0.0,A.I. Artificial Intelligence,6.8,1974,4799,2001,Jun,Friday,Steven Spielberg,Male
296,44105,73.0,2,0.0,The Lost World: Jurassic Park,6.2,2487,4799,1997,May,Friday,Steven Spielberg,Male
304,44118,60.0,57,0.0,The Terminal,7.0,1910,4799,2004,Jun,Thursday,Steven Spielberg,Male
309,44125,70.0,29,0.0,Munich,6.9,696,4799,2005,Dec,Thursday,Steven Spielberg,Male
333,44169,70.0,33,0.0,Hook,6.6,1532,4799,1991,Dec,Wednesday,Steven Spielberg,Male


## Which director produced more movies

In [129]:
data['director_name'].value_counts()

Steven Spielberg      26
Martin Scorsese       19
Clint Eastwood        19
Woody Allen           18
Ridley Scott          16
                      ..
Tim Hill               5
Jonathan Liebesman     5
Roman Polanski         5
Larry Charles          5
Nicole Holofcener      5
Name: director_name, Length: 199, dtype: int64

In [135]:
data[data['director_name'] == 'Woody Allen']['title'].count()

18

## Number of movies released in the specified month

In [136]:
data[data['month'] == 'Dec']['title'].count()

193

## Number of movies released in each month

In [138]:
data_groupby = data.groupby(by='month')
data_groupby

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000248DCFAAF50>

In [141]:
data_groupby.count()

Unnamed: 0_level_0,id_x,budget,popularity,revenue,title,vote_average,vote_count,director_id,year,day,director_name,gender
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Apr,90,90,90,90,90,90,90,90,90,90,90,80
Aug,111,111,111,111,111,111,111,111,111,111,111,97
Dec,193,193,193,193,193,193,193,193,193,193,193,180
Feb,104,104,104,104,104,104,104,104,104,104,104,94
Jan,60,60,60,60,60,60,60,60,60,60,60,51
Jul,127,127,127,127,127,127,127,127,127,127,127,120
Jun,133,133,133,133,133,133,133,133,133,133,133,126
Mar,99,99,99,99,99,99,99,99,99,99,99,87
May,116,116,116,116,116,116,116,116,116,116,116,113
Nov,117,117,117,117,117,117,117,117,117,117,117,110


In [142]:
data_groupby.count()['title']

month
Apr     90
Aug    111
Dec    193
Feb    104
Jan     60
Jul    127
Jun    133
Mar     99
May    116
Nov    117
Oct    149
Sep    166
Name: title, dtype: int64

In [143]:
data_groupby.sum()

Unnamed: 0_level_0,id_x,budget,popularity,revenue,vote_average,vote_count,director_id,year
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Apr,4085512,3835.55,2364,0.0,553.5,92623,453722,180336
Aug,5047885,4309.08,2903,0.0,685.1,92136,565407,222259
Dec,8678997,9861.3,6163,0.0,1284.0,250696,967218,386292
Feb,4718998,3892.65,2401,0.0,639.2,81042,526824,208387
Jan,2746905,1538.2,1224,0.0,353.7,31289,307725,120368
Jul,5704429,8252.48,4922,0.0,806.8,198172,632648,254179
Jun,5952045,9063.91,4457,0.0,831.7,186372,660052,266074
Mar,4474086,4786.26,3057,0.0,611.1,104572,499565,198333
May,5214402,8293.95,5021,0.0,748.9,193024,580933,232284
Nov,5247130,7322.06,4968,0.0,757.4,174094,584293,234246


In [145]:
data_groupby_director = data.groupby(by='director_name')
data_groupby_director

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000248DD097DF0>

In [147]:
data_groupby_director.max()['budget']

director_name
Adam McKay                     100.0
Adam Shankman                   80.0
Alejandro González Iñárritu    135.0
Alex Proyas                    140.0
Alexander Payne                 30.0
                               ...  
Wes Craven                      40.0
Wolfgang Petersen              175.0
Woody Allen                     30.0
Zack Snyder                    250.0
Zhang Yimou                     94.0
Name: budget, Length: 199, dtype: float64