In [31]:
import pandas as pd
import numpy as np

In [32]:
basic_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
title_aka_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
title_rating_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"

In [33]:
basics = pd.read_csv(basic_url, sep='\t', low_memory=False)
title_aka = pd.read_csv(title_aka_url, sep='\t', low_memory=False)
title_rating = pd.read_csv(title_rating_url, sep='\t', low_memory=False)

In [34]:
basics.replace({'\\N':np.nan}, inplace = True)
title_aka.replace({'\\N':np.nan}, inplace = True)
title_rating.replace({'\\N':np.nan}, inplace = True)

In [35]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [36]:
# Eliminate movies that are null for runtimeMinutes and genres
basics = basics.dropna(subset=['runtimeMinutes', 'genres'])

In [37]:
# Keep only titleType==Movie and startYear 2000-2022
basics = basics[basics["titleType"] == "movie"]
basics["startYear"] = pd.to_numeric(basics["startYear"], errors="coerce")
basics = basics[(basics["startYear"].between(2000, 2022, inclusive=True))]

  basics = basics[(basics["startYear"].between(2000, 2022, inclusive=True))]


In [38]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [39]:
us_movies = basics[basics["tconst"].isin(title_aka[title_aka["region"] == "US"]["titleId"])]

In [40]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =basics['tconst'].isin(title_aka['titleId'])
keepers

34803      True
61116      True
67669      True
77964      True
86801      True
           ... 
9808643    True
9808652    True
9808691    True
9808736    True
9808820    True
Name: tconst, Length: 147358, dtype: bool

In [41]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
77964,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9808643,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
9808652,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"
9808691,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020.0,,84,Thriller
9808736,tt9916362,movie,Coven,Akelarre,0,2020.0,,92,"Drama,History"


In [42]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

['title_basics.csv.gz']

In [43]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [44]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
