In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"

In [3]:
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)

In [4]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [5]:
basics['tconst'] = basics['tconst'].replace('\\N', np.nan)
basics['titleType'] = basics['titleType'].replace('\\N', np.nan)
basics['primaryTitle'] = basics['primaryTitle'].replace('\\N', np.nan)
basics['originalTitle'] = basics['originalTitle'].replace('\\N', np.nan)
basics['isAdult'] = basics['isAdult'].replace('\\N', np.nan)
basics['startYear'] = basics['startYear'].replace('\\N', np.nan)
basics['endYear'] = basics['endYear'].replace('\\N', np.nan)
basics['runtimeMinutes'] = basics['runtimeMinutes'].replace('\\N', np.nan)
basics['genres'] = basics['genres'].replace('\\N', np.nan)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [6]:
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1286688
endYear           9460475
runtimeMinutes    6782432
genres             434198
dtype: int64

In [7]:
basics.dropna(subset = ['runtimeMinutes'], inplace = True)
basics.dropna(subset = ['genres'], inplace = True)
basics.dropna(subset = ['startYear'], inplace = True)
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear               0
endYear           2509525
runtimeMinutes          0
genres                  0
dtype: int64

In [8]:
basics['titleType'].value_counts()

tvEpisode       1205944
short            582636
movie            369491
video            177929
tvMovie           90336
tvSeries          88064
tvSpecial         17538
tvMiniSeries      16638
tvShort            8548
videoGame           317
Name: titleType, dtype: int64

In [9]:
basics['startYear'].value_counts()

2017    132767
2018    130678
2016    127052
2015    121315
2019    120575
         ...  
1889         2
2027         1
1874         1
1883         1
1885         1
Name: startYear, Length: 148, dtype: int64

In [10]:
basics['genres'].value_counts()

Drama                      195873
Documentary                168076
Comedy                     141663
Drama,Short                110453
Drama,Romance              108791
                            ...  
Action,Reality-TV,War           1
Horror,Romance,Western          1
Romance,Talk-Show               1
Adult,Documentary,Music         1
Drama,Horror,Reality-TV         1
Name: genres, Length: 2213, dtype: int64

In [11]:
basics['startYear'] = basics['startYear'].astype(int)
basics['startYear'].dtype

dtype('int64')

In [12]:
movie_filter = basics['titleType'] == 'movie'
basics = basics[movie_filter]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"


In [13]:
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
930,tt0000941,movie,Locura de amor,Locura de amor,0,1909,,45,Drama


In [14]:
start_date_filter = basics['startYear'] >= 2000
end_date_filter = basics['startYear'] <= 2022
basics = basics[start_date_filter & end_date_filter]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77964,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"


In [15]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [16]:
akas['titleId'] = akas['titleId'].replace('\\N', np.nan)
akas['ordering'] = akas['ordering'].replace('\\N', np.nan)
akas['title'] = akas['title'].replace('\\N', np.nan)
akas['region'] = akas['region'].replace('\\N', np.nan)
akas['language'] = akas['language'].replace('\\N', np.nan)
akas['types'] = akas['types'].replace('\\N', np.nan)
akas['attributes'] = akas['attributes'].replace('\\N', np.nan)
akas['isOriginalTitle'] = akas['isOriginalTitle'].replace('\\N', np.nan)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


In [17]:
US_filter = akas['region'] == 'US'
akas = akas[US_filter]
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [18]:
keepers =basics['tconst'].isin(akas['titleId'])
keepers

34803       True
61116       True
67669       True
77964      False
86801       True
           ...  
9561853     True
9561862     True
9561901    False
9561946     True
9562030    False
Name: tconst, Length: 146226, dtype: bool

In [19]:
basics = basics[keepers]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [20]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1944
1,tt0000002,5.8,263
2,tt0000003,6.5,1769
3,tt0000004,5.6,179
4,tt0000005,6.2,2579


In [21]:
ratings['tconst'] = ratings['tconst'].replace('\\N', np.nan)
ratings['averageRating'] = ratings['averageRating'].replace('\\N', np.nan)
ratings['numVotes'] = ratings['numVotes'].replace('\\N', np.nan)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1944
1,tt0000002,5.8,263
2,tt0000003,6.5,1769
3,tt0000004,5.6,179
4,tt0000005,6.2,2579


In [22]:
keepers =ratings['tconst'].isin(akas['titleId'])
keepers

0           True
1           True
2          False
3          False
4           True
           ...  
1272080    False
1272081    False
1272082    False
1272083    False
1272084    False
Name: tconst, Length: 1272085, dtype: bool

In [23]:
ratings = ratings[keepers]
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1944
1,tt0000002,5.8,263
4,tt0000005,6.2,2579
5,tt0000006,5.1,177
6,tt0000007,5.4,809


In [24]:
import os
os.makedirs('Data/',exist_ok=True)

In [25]:
os.listdir("Data/")

[]

In [26]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [27]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [28]:
akas.to_csv("Data/title.akas.csv.gz",compression='gzip',index=False)

In [29]:
akas = pd.read_csv("Data/title.akas.csv.gz", low_memory = False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [30]:
ratings.to_csv("Data/title.ratings.csv.gz",compression='gzip',index=False)

In [31]:
ratings = pd.read_csv("Data/title.ratings.csv.gz", low_memory = False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1944
1,tt0000002,5.8,263
2,tt0000005,6.2,2579
3,tt0000006,5.1,177
4,tt0000007,5.4,809
