In [17]:
import csv
import os

import numpy as np
import pandas as pd

In [18]:
# Download locations of the gzip-compressed IMDb TSV files used below
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"

In [19]:
# Load the title basics table from the tab-separated IMDb download.
# The IMDb TSV files do not use quoting, so a literal " inside a title is
# ordinary text. QUOTE_NONE stops the parser from treating it as the start of
# a quoted field, which could otherwise let one field swallow tabs/newlines.
basics_df = pd.read_csv(basics_url, sep='\t', quoting=csv.QUOTE_NONE, low_memory=False)

In [20]:
# Load the title AKAs table from the tab-separated IMDb download.
# The IMDb TSV files do not use quoting, so a literal " inside a title is
# ordinary text. QUOTE_NONE stops the parser from treating it as the start of
# a quoted field, which could otherwise let one field swallow tabs/newlines.
akas_df = pd.read_csv(akas_url, sep='\t', quoting=csv.QUOTE_NONE, low_memory=False)

In [21]:
# Load the title ratings table from the tab-separated IMDb download
ratings_df = pd.read_csv(
    ratings_url,
    sep="\t",
    low_memory=False,
)

**Title Basics**

In [22]:
# Turn the literal "\N" placeholder into np.nan, modifying basics_df in place
basics_df.replace(
    to_replace={'\\N': np.nan},
    inplace=True,
)

In [23]:
# Discard rows that lack a runtime, a genre list, or a start year
basics_df = basics_df.dropna(
    axis="index",
    subset=['runtimeMinutes', 'genres', 'startYear'],
)

In [24]:
# Retain only the rows whose titleType is exactly 'movie'
basics_df = basics_df[basics_df['titleType'].eq('movie')]

In [25]:
# Keep startYear 2000-2021 (both bounds inclusive)
# NOTE(review): the earlier comment said 2000-2022 while the filter stopped at
# 2021; the filter's bound is kept unchanged here - confirm the intended range.
# assign() builds a new frame with startYear converted to int instead of
# setting a column on the filtered frame from the previous cell, a pattern
# pandas may flag with SettingWithCopyWarning.
basics_df = basics_df.assign(startYear=basics_df['startYear'].astype(int))
basics_df = basics_df.loc[basics_df['startYear'].between(2000, 2021)]

In [26]:
# Remove every title whose genres string mentions "Documentary" (case-insensitive)
is_documentary = basics_df['genres'].str.contains(pat='Documentary', case=False)
basics_df = basics_df.loc[~is_documentary]


In [27]:
# Preview the first five rows of the filtered basics table
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61119,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67672,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77968,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86806,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"


In [28]:
# Ensure the output folder exists (no error if it is already present)
os.makedirs(name='Data/', exist_ok=True)

In [29]:
# Show the current contents of the output folder to verify it exists
os.listdir("Data/")

[]

In [30]:
# Write the filtered basics table to a gzip-compressed CSV, omitting the row index
basics_df.to_csv(
    "Data/title_basics.csv.gz",
    index=False,
    compression='gzip',
)

In [31]:
# Reload the file that was just written and show its first rows
basics_df = pd.read_csv(
    "Data/title_basics.csv.gz",
    low_memory=False,
)
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"


**Title Ratings**

In [34]:
# Replace "\N" with np.nan.
# replace() returns a new DataFrame unless inplace=True, so the result is
# bound back to ratings_df; otherwise the cleaned frame is only displayed and
# the unmodified data is what gets saved afterwards.
ratings_df = ratings_df.replace({'\\N':np.nan})
ratings_df

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1879
1,tt0000002,5.9,249
2,tt0000003,6.5,1651
3,tt0000004,5.8,161
4,tt0000005,6.2,2476
...,...,...,...
1240753,tt9916690,6.5,6
1240754,tt9916720,5.1,209
1240755,tt9916730,8.7,6
1240756,tt9916766,6.7,19


In [35]:
# Write the ratings table to a gzip-compressed CSV, omitting the row index
ratings_df.to_csv(
    "Data/title_ratings.csv.gz",
    index=False,
    compression='gzip',
)

In [36]:
# Reload the file that was just written and show its first rows
ratings_df = pd.read_csv(
    "Data/title_ratings.csv.gz",
    low_memory=False,
)
ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1879
1,tt0000002,5.9,249
2,tt0000003,6.5,1651
3,tt0000004,5.8,161
4,tt0000005,6.2,2476


In [38]:
# Retain only the rows whose region is exactly 'US'
akas_df = akas_df[akas_df['region'].eq('US')]

In [37]:
# Turn the literal "\N" placeholder into np.nan, modifying akas_df in place
akas_df.replace(
    to_replace={'\\N': np.nan},
    inplace=True,
)

In [39]:
# Write the US-only AKAs table to a gzip-compressed CSV, omitting the row index
akas_df.to_csv(
    "Data/title_akas.csv.gz",
    index=False,
    compression='gzip',
)

In [40]:
# Open saved file and preview again
# low_memory=False matches the other read_csv calls in this notebook: the file
# is parsed in one pass, so each column's dtype is inferred from all of its
# values rather than chunk by chunk.
akas_df = pd.read_csv("Data/title_akas.csv.gz", low_memory=False)
akas_df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0
