# Importing Modules

In [50]:
# Importing dataframe and storage modules
import pandas as pd
import numpy as np

# Importing os module
import os

# Importing SQL modules
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine

In [51]:
basic = 'https://datasets.imdbws.com/title.basics.tsv.gz'
akas = 'https://datasets.imdbws.com/title.akas.tsv.gz'
rating = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

In [52]:
basics = pd.read_csv(basic, sep='\t', low_memory=False)
akas_df = pd.read_csv(akas, sep='\t', low_memory=False)
ratings = pd.read_csv(rating, sep='\t', low_memory=False)

# Filtering/Cleaning

- For this project I want to break down each dataset into 3 parts. This will allow me to make the code more readable for the user.

Now I will create a custom function to evaluate imported dataframes using different Pandas methods, this will allow me to view the data for future proccessing.

In [53]:
# Creating a function that evaluates a given dataset
def eval_df(df):
    print(df.head())
    print(f'Dataset Information Eval: \n{df.info(memory_usage=True, show_counts=True)}\n')
    print(f'Columns with NULL values: \n{df.isna().sum()}\n')
    print(f'Duplicated: \n{df.duplicated().sum()}')

# Title Basics

- I will now use the function created from earlier to get more insight on the basics dataframe.

In [54]:
# Calling the function to evaluate the dataset
eval_df(basics)

      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short          Pauvre Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

  isAdult startYear endYear runtimeMinutes                    genres  
0       0      1894      \N              1         Documentary,Short  
1       0      1892      \N              5           Animation,Short  
2       0      1892      \N              4  Animation,Comedy,Romance  
3       0      1892      \N             12           Animation,Short  
4       0      1893      \N              1              Comedy,Short  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9669713 entries, 0 to 9669712
Data columns (total 9 columns):
 #   Column      

In [55]:
# Using the value_counts() method to get the number of unique values in the column
basics.value_counts()

tconst      titleType  primaryTitle                                                                originalTitle                                                               isAdult  startYear  endYear  runtimeMinutes  genres                    
tt0000001   short      Carmencita                                                                  Carmencita                                                                  0        1894       \N       1               Documentary,Short             1
tt2712674   short      Brett & Kyle                                                                Brett & Kyle                                                                0        2013       \N       \N              Comedy,Short                  1
tt2712634   tvEpisode  Episode dated 23 February 2012                                              Episode dated 23 February 2012                                              0        2012       \N       \N              Comedy,News,Talk-Show        

1. Replace "\N" with np.nan.

In [56]:
# Using the replace() method to replace \N with np.nan
basics.replace({'\\N':np.nan}, inplace=True)
basics.isin(['\\N']).sum()

tconst            0
titleType         0
primaryTitle      0
originalTitle     0
isAdult           0
startYear         0
endYear           0
runtimeMinutes    0
genres            0
dtype: int64

2. Eliminate movies that are null for runtimeMinutes
3. Eliminate movies that are null for genres

I decided to drop both of these columns within one dropna() method.

In [57]:
# Using the dropna() method to drop rows with null values
basics.dropna(subset=['runtimeMinutes', 'genres'], inplace=True)
# Checking the number of null values in the dataset
print(basics.isna().sum())

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear          158421
endYear           2711044
runtimeMinutes          0
genres                  0
dtype: int64


4. Filter column titleType values to movie.

In [58]:
# Filtering titleType column to get only movies
titleType = basics['titleType'] == 'movie'
basics_f = basics.loc[titleType]
# Checking dataframe to see if the filter worked
basics_f.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"


5. Filter column startYear values between 2000-2022.

In [59]:
# Filtering startYear column to get only movies released after 2000
df_filter = basics_f['startYear'] >= '2000'
basics_f = basics_f.loc[df_filter]
# Checking dataframe to see if the filter worked
basics_f.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021,,133,Documentary
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
76059,tt0077684,movie,Histórias de Combóios em Portugal,Histórias de Combóios em Portugal,0,2022,,46,Documentary


6. Eliminate movies that include "Documentary" in genre.

In [60]:
# Filtering genres column to get only movies that are not documentaries
is_doc = basics_f['genres'].str.contains('documentary', case=False)
basics_f = basics_f[~is_doc]
# Checking if the filter worked
print(basics_f['genres'].loc[basics_f['genres']=='Documentary'].sum())

0


----

# AKAs

- First before the cleaning process I want to use the custom function created earlier to get some information on this data.

In [61]:
eval_df(akas_df)

     titleId  ordering                      title region language  \
0  tt0000001         1                 Карменсіта     UA       \N   
1  tt0000001         2                 Carmencita     DE       \N   
2  tt0000001         3  Carmencita - spanyol tánc     HU       \N   
3  tt0000001         4                 Καρμενσίτα     GR       \N   
4  tt0000001         5                 Карменсита     RU       \N   

         types     attributes isOriginalTitle  
0  imdbDisplay             \N               0  
1           \N  literal title               0  
2  imdbDisplay             \N               0  
3  imdbDisplay             \N               0  
4  imdbDisplay             \N               0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35180463 entries, 0 to 35180462
Data columns (total 8 columns):
 #   Column           Non-Null Count     Dtype 
---  ------           --------------     ----- 
 0   titleId          35180463 non-null  object
 1   ordering         35180463 non-null

In [62]:
akas_df.value_counts()

titleId     ordering  title           region  language  types        attributes  isOriginalTitle
tt0000001   1         Карменсіта      UA      \N        imdbDisplay  \N          0                  1
tt25967852  4         Folge #1.122    DE      de        \N           \N          0                  1
tt25967888  3         एपिसोड #1.3     IN      hi        \N           \N          0                  1
            2         エピソード #1.3      JP      ja        \N           \N          0                  1
            1         Episódio #1.3   PT      pt        \N           \N          0                  1
                                                                                                   ..
tt13832462  4         Épisode #1.45   FR      fr        \N           \N          0                  1
            3         エピソード #1.45     JP      ja        \N           \N          0                  1
            2         एपिसोड #1.45    IN      hi        \N           \N          0     

1. Keep only US movies from the region column.

In [63]:
region_f = akas_df['region'] == 'US'
akas_df = akas_df.loc[region_f]
akas_df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
14,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
36,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
41,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


2. Replace "\N" with np.nan

In [64]:
akas_df = akas_df.replace({'\\N':np.nan})
akas_df.isin(['\\N']).sum()

titleId            0
ordering           0
title              0
region             0
language           0
types              0
attributes         0
isOriginalTitle    0
dtype: int64

----

# Ratings

Again, I will be using eval_df() to evaulate the ratings dataframe.

In [65]:
eval_df(ratings)

      tconst  averageRating  numVotes
0  tt0000001            5.7      1959
1  tt0000002            5.8       263
2  tt0000003            6.5      1791
3  tt0000004            5.6       179
4  tt0000005            6.2      2596
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1286543 entries, 0 to 1286542
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1286543 non-null  object 
 1   averageRating  1286543 non-null  float64
 2   numVotes       1286543 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 29.4+ MB
Dataset Information Eval: 
None

Columns with NULL values: 
tconst           0
averageRating    0
numVotes         0
dtype: int64

Duplicated: 
0


In [66]:
ratings.value_counts()

tconst      averageRating  numVotes
tt0000001   5.7            1959        1
tt22182834  8.5            33          1
tt22182890  8.5            18          1
tt22182882  8.1            19          1
tt22182868  7.7            19          1
                                      ..
tt0934786   6.8            53          1
tt0934770   7.6            17          1
tt0934769   6.9            26          1
tt0934768   1.2            121         1
tt9916880   8.2            6           1
Length: 1286543, dtype: int64

1. Replace "/N" with np.nan (if any).

In [67]:
ratings = ratings.replace({'\\N':np.nan})
ratings.isin(['\\N']).sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

# Other filtering

Now I will filter both dataframes basics and ratings to only have movies produced in this US.

In [68]:
keepers = basics['tconst'].isin(akas_df['titleId'])
basics_f = basics_f[keepers]
basics_f.head()

  basics_f = basics_f[keepers]


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [69]:
keepers = ratings['tconst'].isin(akas_df['titleId'])
ratings = ratings[keepers]
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1959
1,tt0000002,5.8,263
4,tt0000005,6.2,2596
5,tt0000006,5.1,177
6,tt0000007,5.4,814


# Storing Dataframes Filtered

In [70]:
# Using make_dirs() method to create a directory
os.makedirs('Data/', exist_ok=True)
# Using the listdir() method to list the files in the directory
os.listdir("Data/")

['title_akas_df.cvs.gz', 'title_basics_f.cvs.gz', 'title_ratings.cvs.gz']

In [71]:
# Using the to_csv() method to save the dataframe as a csv file
basics_f.to_csv("Data/title_basics_f.cvs.gz", compression='gzip', index=False)
# Checking if csv file was created
basics = pd.read_csv("Data/title_basics_f.cvs.gz", low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [72]:
# Using the to_csv() method to save the dataframe as a csv file
akas_df.to_csv("Data/title_akas_df.cvs.gz", compression='gzip', index=False)
# Checking if csv file was created
akas_df = pd.read_csv("Data/title_akas_df.cvs.gz", low_memory=False)
akas_df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [73]:
# Using the to_csv() method to save the dataframe as a csv file
ratings.to_csv("Data/title_ratings.cvs.gz", compression='gzip', index=False)
# Checking if csv file was created
ratings = pd.read_csv("Data/title_ratings.cvs.gz", low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1959
1,tt0000002,5.8,263
2,tt0000005,6.2,2596
3,tt0000006,5.1,177
4,tt0000007,5.4,814


- Before saving, run a final .info() for each of the dataframes to show a summary of how many movies remain and the datatypes of each feature

In [74]:
print(basics.info())
print(akas_df.info())
print(ratings.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86799 entries, 0 to 86798
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86799 non-null  object 
 1   titleType       86799 non-null  object 
 2   primaryTitle    86799 non-null  object 
 3   originalTitle   86799 non-null  object 
 4   isAdult         86799 non-null  int64  
 5   startYear       86799 non-null  int64  
 6   endYear         0 non-null      float64
 7   runtimeMinutes  86799 non-null  int64  
 8   genres          86799 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 6.0+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1419921 entries, 0 to 1419920
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1419921 non-null  object 
 1   ordering         1419921 non-null  int64  
 2   title            1419