## imports and data urls

In [7]:
import pandas as pd
import numpy as np

url1 = "https://datasets.imdbws.com/title.basics.tsv.gz"
url2 = "https://datasets.imdbws.com/title.akas.tsv.gz"
url3 = "https://datasets.imdbws.com/title.ratings.tsv.gz"

## Lets clean akas

In [2]:
%%time

# Load akas Data
akas = pd.read_csv(url2, sep='\t', low_memory=False)

CPU times: total: 2min 59s
Wall time: 6min 8s


In [3]:
%%time

akas.replace({'\\N':np.nan}, inplace=True)
akas = akas[(akas['region'] == 'US')]
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1361344 entries, 5 to 33544157
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1361344 non-null  object
 1   ordering         1361344 non-null  int64 
 2   title            1361344 non-null  object
 3   region           1361344 non-null  object
 4   language         3695 non-null     object
 5   types            965071 non-null   object
 6   attributes       45205 non-null    object
 7   isOriginalTitle  1359969 non-null  object
dtypes: int64(1), object(7)
memory usage: 93.5+ MB
CPU times: total: 1min 34s
Wall time: 1min 51s


## Lets clean title.basics dataset

In [14]:
%%time

# Load basics Data
basics = pd.read_csv(url1, sep='\t', low_memory=False)

CPU times: total: 44.7 s
Wall time: 1min 14s


In [18]:
basics.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

In [19]:
# Replace '\N' with np.nan
basics.replace({'\\N':np.nan}, inplace=True)

In [20]:
# Drop any rows that have null values in either genre or runtimeMinutes.
print(basics.info())
basics = basics.dropna(subset=['runtimeMinutes', 'genres'])
print(basics.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9308649 entries, 0 to 9308648
Data columns (total 9 columns):
 #   Column          Dtype  
---  ------          -----  
 0   tconst          object 
 1   titleType       object 
 2   primaryTitle    object 
 3   originalTitle   object 
 4   isAdult         object 
 5   startYear       object 
 6   endYear         object 
 7   runtimeMinutes  object 
 8   genres          float64
dtypes: float64(1), object(8)
memory usage: 639.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          0 non-null      object 
 1   titleType       0 non-null      object 
 2   primaryTitle    0 non-null      object 
 3   originalTitle   0 non-null      object 
 4   isAdult         0 non-null      object 
 5   startYear       0 non-null      object 
 6   endYear         0 non-null      object 
 7   runtimeM

In [7]:
# Drop all rows where titleType!=Movie. Keep only titleType==Movie
is_Movie = basics['titleType'] == 'movie'
basics = basics[is_Movie]

In [8]:
# Drop all rows where the start year is not between 2000-2022. Keep startYear 2000-2022

# We first need to change the column type to something numeric.
print("\nData Type before conversion : ", basics['startYear'].dtype)
basics['startYear'] = pd.to_numeric(basics['startYear'])
print("\nData Type after conversion : ", basics['startYear'].dtype)

#Now we can make and apply our filters
filter2000 = basics['startYear'] >= 2000 
filter2022 = basics['startYear'] <=2022
basics = basics[filter2000 & filter2022]


Data Type before conversion :  object

Data Type after conversion :  int64


In [17]:
# Exclude movies that are included in the documentary category.
basics.info()
# is_documentary = basics['genres'].str.contains('documentary', case=False)
# basics = basics[~is_documentary]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          0 non-null      object 
 1   titleType       0 non-null      object 
 2   primaryTitle    0 non-null      object 
 3   originalTitle   0 non-null      object 
 4   isAdult         0 non-null      object 
 5   startYear       0 non-null      int64  
 6   endYear         0 non-null      object 
 7   runtimeMinutes  0 non-null      object 
 8   genres          0 non-null      float64
dtypes: float64(1), int64(1), object(7)
memory usage: 0.0+ bytes


In [None]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers_basics = basics['tconst'].isin(akas['titleId'])
basics = basics[keepers_basics]

basics.info()

## Lets clean ratings

In [None]:
%%time

# Load ratings Data
ratings = pd.read_csv(url3, sep='\t', low_memory=False)

In [None]:
ratings.replace({'\\N':np.nan}, inplace=True)
keepers = ratings['tconst'].isin(akas['titleId'])
ratings = ratings[keepers]

ratings.info()

## save cleaned files in Data folder

In [None]:
# Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)
print("Done!")