In [2]:
import pandas as pd

# Load the cast list. index_col=None tells read_csv not to use any CSV column
# as the row index, so the frame gets a default integer index.
casts = pd.read_csv('data/cast.csv', index_col=None)
# head() returns the first five records. It must be *called*; the bare
# attribute `casts.head` only references the method. Note that a notebook cell
# renders only its final expression, so this preview is not shown.
casts.head()

# Load the movie titles the same way.
titles = pd.read_csv('data/titles.csv', index_col=None)
titles.tail()  # last five records; final expression, so this is the cell's output

Unnamed: 0,title,year
49995,Rebel,1970
49996,Suzanne,1996
49997,Bomba,2013
49998,Aao Jao Ghar Tumhara,1984
49999,Mrs. Munck,1995


In [3]:
'''
read_csv : read the data from the csv file.
index_col = None : there is no index i.e. first column is data
head() : show only first five elements of the DataFrame
tail() : show only last five elements of the DataFrame
'''

'\nread_csv : read the data from the csv file.\nindex_col = None : there is no index i.e. first column is data\nhead() : show only first five elements of the DataFrame\ntail() : show only last five elements of the DataFrame\n'

In [4]:
# Display the whole titles frame; the rendered output elides the middle rows ("...").
titles

Unnamed: 0,title,year
0,The Rising Son,1990
1,The Thousand Plane Raid,1969
2,Crucea de piatra,1993
3,Country,2000
4,Gaiking II,2011
...,...,...
49995,Rebel,1970
49996,Suzanne,1996
49997,Bomba,2013
49998,Aao Jao Ghar Tumhara,1984


In [5]:
len(titles) # len(titles) gives the total number of rows in the DataFrame

50000

In [6]:
# Pull the 'title' column out of the frame as a Series and display it.
t = titles.loc[:, 'title']
t

0                 The Rising Son
1        The Thousand Plane Raid
2               Crucea de piatra
3                        Country
4                     Gaiking II
                  ...           
49995                      Rebel
49996                    Suzanne
49997                      Bomba
49998       Aao Jao Ghar Tumhara
49999                 Mrs. Munck
Name: title, Length: 50000, dtype: object

In [7]:
# Use the .loc indexer to select a row by its index label
# (.loc/.iloc replace the old .ix indexer, which has been removed from pandas).
titles.loc[1]

title    The Thousand Plane Raid
year                        1969
Name: 1, dtype: object

In [8]:
# Filtering rows with a boolean mask
'''
Data can be filtered by providing some boolean expression in DataFrame.
For example, in below code, movies which released after 1985 are filtered out from
the DataFrame ‘titles’ and stored in a new DataFrame i.e. after85.'''
# Keep only the titles whose release year is strictly greater than 1985.
after85 = titles.loc[titles['year'].gt(1985)]
after85.head()

Unnamed: 0,title,year
0,The Rising Son,1990
2,Crucea de piatra,1993
3,Country,2000
4,Gaiking II,2011
5,Medusa (IV),2015


In [9]:
'''
In the code below, all the movies from the 1990s (i.e. 1990-1999) are selected.
Also, ‘t = titles’ is used only to keep the expressions short.
'''

'\nIn the code below, all the movies from the 1990s (i.e. 1990-1999) are selected.\nAlso, ‘t = titles’ is used only to keep the expressions short.\n'

In [10]:
# Select the movies released from 1990 up to (but not including) 2000.
t = titles
movies90 = t[t['year'].ge(1990) & t['year'].lt(2000)]
movies90.head()

Unnamed: 0,title,year
0,The Rising Son,1990
2,Crucea de piatra,1993
12,Poka Makorer Ghar Bosoti,1996
19,Maa Durga Shakti,1999
24,Conflict of Interest,1993


In [11]:
# Every row whose title is exactly 'Macbeth'.
t = titles
macbeth = t[t['title'].eq('Macbeth')]
macbeth

Unnamed: 0,title,year
4226,Macbeth,1913
9322,Macbeth,2006
11722,Macbeth,2013
17166,Macbeth,1997
25847,Macbeth,1998


In [12]:
# Same 'Macbeth' selection again, written with .loc and a boolean mask.
macbeth = t.loc[t['title'] == 'Macbeth']
macbeth

Unnamed: 0,title,year
4226,Macbeth,1913
9322,Macbeth,2006
11722,Macbeth,2013
17166,Macbeth,1997
25847,Macbeth,1998


In [13]:
# Sort the 'Macbeth' rows by release year (ascending is the default order).
macbeth = t[t['title'] == 'Macbeth'].sort_values(by='year')
macbeth

Unnamed: 0,title,year
4226,Macbeth,1913
17166,Macbeth,1997
25847,Macbeth,1998
9322,Macbeth,2006
11722,Macbeth,2013


In [14]:
# Label-based slice: .loc includes both end points, so rows 3 and 4 are shown.
casts.loc[3:4]

Unnamed: 0,title,year,name,type,character,n
3,Secret in Their Eyes,2015,$hutter,actor,2002 Dodger Fan,
4,Steve Jobs,2015,$hutter,actor,1988 Opera House Patron,


In [15]:
'''
These null values can be easily selected, unselected or contents can be replaced by
any other values e.g. empty strings or 0 etc.
Various examples of null values are shown in this section.
'''

'\nThese null values can be easily selected, unselected or contents can be replaced by\nany other values e.g. empty strings or 0 etc.\nVarious examples of null values are shown in this section.\n'

In [16]:
'''
The ‘isnull’ command returns True for each entry that holds a null (NaN) value.
Since rows 3 and 4 have NaN in column ‘n’,
they are displayed as True.
'''

'\nThe ‘isnull’ command returns True for each entry that holds a null (NaN) value.\nSince rows 3 and 4 have NaN in column ‘n’,\nthey are displayed as True.\n'

In [17]:
# Flag the missing values of column 'n' (True where NaN) and preview the first five.
c = casts
c['n'].isna().head()

0    False
1    False
2    False
3     True
4     True
Name: n, dtype: bool

In [18]:
'''
to display the rows with null values,
the condition must be passed in the DataFrame,
'''

'\nto display the rows with null values,\nthe condition must be passed in the DataFrame,\n'

In [19]:
# Use the null mask as a row filter and show the first three rows with a missing 'n'.
c.loc[c['n'].isna()].head(3)

Unnamed: 0,title,year,name,type,character,n
3,Secret in Their Eyes,2015,$hutter,actor,2002 Dodger Fan,
4,Steve Jobs,2015,$hutter,actor,1988 Opera House Patron,
5,Straight Outta Compton,2015,$hutter,actor,Club Patron,


In [20]:
'''
NaN values can be filled by using fillna, ffill (forward fill),
and bfill (backward fill) etc.
In the code below, ‘NaN’ values are replaced by NA.
Further examples of ffill and bfill are shown in a later part of the tutorial.
'''

'\nNaN values can be filled by using fillna, ffill (forward fill),\nand bfill (backward fill) etc.\nIn the code below, ‘NaN’ values are replaced by NA.\nFurther examples of ffill and bfill are shown in a later part of the tutorial.\n'

In [21]:
# Take the rows where 'n' is missing and replace every NaN in them with the string 'NA'.
c_fill = c[c['n'].isna()].fillna(value='NA')
c_fill.head(2)

Unnamed: 0,title,year,name,type,character,n
3,Secret in Their Eyes,2015,$hutter,actor,2002 Dodger Fan,
4,Steve Jobs,2015,$hutter,actor,1988 Opera House Patron,


In [22]:
'''
NaN values can be filled by using fillna, ffill (forward fill),
and bfill (backward fill) etc. In the code below, ‘NaN’ values are replaced
by the string ‘viki’ instead of NA.
'''

'\nNaN values can be filled by using fillna, ffill (forward fill),\nand bfill (backward fill) etc. In the code below, ‘NaN’ values are replaced\nby the string ‘viki’ instead of NA.\n'

In [23]:
# Same rows as before, but every NaN is replaced with the placeholder string 'viki'.
c_fill = c[c['n'].isna()].fillna(value='viki')
c_fill.head()

Unnamed: 0,title,year,name,type,character,n
3,Secret in Their Eyes,2015,$hutter,actor,2002 Dodger Fan,viki
4,Steve Jobs,2015,$hutter,actor,1988 Opera House Patron,viki
5,Straight Outta Compton,2015,$hutter,actor,Club Patron,viki
6,Straight Outta Compton,2015,$hutter,actor,Dopeman,viki
7,For Thy Love 2,2009,Bee Moe $lim,actor,Thug 1,viki


In [24]:
# String operations
'''
Various string operations can be performed using ‘.str.’ option. 
Let’s search for the movie “Maa” first
'''

'\nVarious string operations can be performed using ‘.str.’ option. \nLet’s search for the movie “Maa” first\n'

In [25]:
# Exact match: rows whose title is precisely 'Maa'.
t = titles
t[t['title'].eq('Maa')]

Unnamed: 0,title,year
38880,Maa,1968


In [26]:
# Prefix match through the .str accessor: titles beginning with "Maa " (note the
# trailing space, so the bare title "Maa" itself does not match); show the first three.
t[t['title'].str.startswith("Maa ")].head(3)

Unnamed: 0,title,year
19,Maa Durga Shakti,1999
3046,Maa Aur Mamta,1970
7470,Maa Vaibhav Laxmi,1989


In [27]:
#Count Values
'''
The total number of occurrences can be counted using the ‘value_counts()’ option.
In the following code, the total number of movies is displayed by year.
'''

'\nThe total number of occurrences can be counted using the ‘value_counts()’ option.\nIn the following code, the total number of movies is displayed by year.\n'

In [28]:
#plots
'''Pandas supports the matplotlib library and can be used to plot the data as well. 
In the previous section, the total number of movies per year was computed from the DataFrame. 
In the code below, 
those values are saved in a new Series and then plotted using pandas.'''

'Pandas supports the matplotlib library and can be used to plot the data as well. \nIn the previous section, the total number of movies per year was computed from the DataFrame. \nIn the code below, \nthose values are saved in a new Series and then plotted using pandas.'

In [29]:
#Groupby
'''
Data can be grouped by column headers.

In the Count Values section, the number of movies per year was counted using the ‘value_counts()’ method.
The same can be achieved with the ‘groupby’ method as well.
The ‘groupby’ command returns an object, and we need to apply an additional operation to it to get results.
For example, in the code below, the data is grouped by ‘year’ and then the size() command is used.
The size() option counts the total number of rows for each year; therefore the result of the code below is the same as that of the ‘value_counts()’ command.
'''

'\nData can be grouped by column headers.\n\nIn the Count Values section, the number of movies per year was counted using the ‘value_counts()’ method.\nThe same can be achieved with the ‘groupby’ method as well.\nThe ‘groupby’ command returns an object, and we need to apply an additional operation to it to get results.\nFor example, in the code below, the data is grouped by ‘year’ and then the size() command is used.\nThe size() option counts the total number of rows for each year; therefore the result of the code below is the same as that of the ‘value_counts()’ command.\n'

In [30]:
#!pip3 install matplotlib
import matplotlib as mpl
try:
    # Prefer the Tk GUI backend, but do not abort the cell when it cannot be
    # loaded (e.g. Tk is not installed or the kernel has no display):
    # matplotlib.use raises ImportError in that case, and the backend that is
    # already configured stays in effect.
    mpl.use('tkagg')
except ImportError:
    pass
import matplotlib.pyplot as plt

t = titles
# Number of titles per release year. value_counts orders its result by count,
# not by year, so this first plot is not drawn in chronological order.
p = t['year'].value_counts()
p.plot()

plt.show()

In [31]:
'''
It’s better to sort the years (i.e. index) first and then plot the data as below.
Here, the plot shows that number of movies are increasing every year.
'''

'\nIt’s better to sort the years (i.e. index) first and then plot the data as below.\nHere, the plot shows that number of movies are increasing every year.\n'

In [32]:
# Re-plot the per-year counts with the index (the years) in ascending order.
p.sort_index(ascending=True).plot()
plt.show()

In [34]:
#Groupby
'''
Data can be grouped by columns-headers
'''

'\nData can be grouped by columns-headers\n'

In [33]:
'''
In the Count Values section, the number of movies per year was counted using the ‘value_counts()’ method.
The same can be achieved with the ‘groupby’ method as well.
The ‘groupby’ command returns an object, and we need to apply an additional operation to it to get results.
For example, in the code below, the data is grouped by ‘year’ and then the size() command is used. The size() option counts the total number of rows for each year;
therefore the result of the code below is the same as that of the ‘value_counts()’ command.
'''

'\nIn the Count Values section, the number of movies per year was counted using the ‘value_counts()’ method.\nThe same can be achieved with the ‘groupby’ method as well.\nThe ‘groupby’ command returns an object, and we need to apply an additional operation to it to get results.\nFor example, in the code below, the data is grouped by ‘year’ and then the size() command is used. The size() option counts the total number of rows for each year;\ntherefore the result of the code below is the same as that of the ‘value_counts()’ command.\n'

In [35]:
# Count the cast rows recorded for each year and plot the resulting Series.
cg = c.groupby('year').size()
cg.plot()
plt.show()

In [36]:
'''
Further, groupby option can take multiple parameters for grouping.
For example, we want to group the movies of the actor ‘Aaron Abrams’
based on year,
'''

'\nFurther, groupby option can take multiple parameters for grouping.\nFor example, we want to group the movies of the actor ‘Aaron Abrams’\nbased on year,\n'

In [37]:
# Restrict the cast list to rows whose name is 'Aaron Abrams',
# then count how many of those rows fall in each year.
c = casts
cf = c[c['name'].eq('Aaron Abrams')]
cf.groupby('year').size()



year
2003    2
2004    2
2005    2
2006    1
2007    2
2008    1
2009    2
2011    5
2013    1
2015    2
2017    2
2018    1
dtype: int64

In [1]:
'''
The above list shows that the year 2003 is found in two rows with the name entry ‘Aaron Abrams’.
In other words, he did 2 movies in 2003.
'''

'\nThe above list shows that the year 2003 is found in two rows with the name entry ‘Aaron Abrams’.\nIn other words, he did 2 movies in 2003.\n'

In [38]:
# Group the filtered rows in cf by the (year, title) pair, count the rows in
# each pair, and show the first five groups.
cf.groupby(['year', 'title']).size().head()

year  title                               
2003  The In-Laws                             1
      The Visual Bible: The Gospel of John    1
2004  Resident Evil: Apocalypse               1
      Siblings                                1
2005  Cinderella Man                          1
dtype: int64

In [42]:
# Largest value of column 'n' recorded in each year. The output shows NaN for
# some years - presumably those where every 'n' is missing.
c.groupby('year')['n'].max()

year
1912      6.0
1913     14.0
1914     39.0
1915     14.0
1916     35.0
        ...  
2017    620.0
2018     21.0
2019      6.0
2020      NaN
2023      NaN
Name: n, Length: 110, dtype: float64