In [112]:
import pandas as pd
import numpy as np
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine

In [98]:
basic = 'https://datasets.imdbws.com/title.basics.tsv.gz'
akas = 'https://datasets.imdbws.com/title.akas.tsv.gz'
rating = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

In [99]:
basics = pd.read_csv(basic, sep='\t', low_memory=False)
akas_df = pd.read_csv(akas, sep='\t', low_memory=False)
ratings = pd.read_csv(rating, sep='\t', low_memory=False)

# Filtering/Cleaning

- For this project I want to break down each dataset into 3 parts. This will allow me to make the code more readable for the user.

In [100]:
# Creating a function that evaluates a given dataset
def eval_df(df):
    print(f'Dataset Information Eval: \n{df.info(memory_usage=True, show_counts=True)}\n')
    print(f'Columns with NULL values: \n{df.isna().sum()}\n')
    print(f'Duplicated: \n{df.duplicated().sum()}')

In [101]:
# Creating function that removes NAN and Duplicated values 
def remove_nan_and_duplicates(df):
    df = df.dropna()
    print(f'Null Values Remaining: \n{df.isna().sum()}\n')
    df = df.drop_duplicates()
    print(f'Duplicated Values Removed: \n{df.duplicated().sum()}')
    

In [102]:
eval_df(basics)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9631839 entries, 0 to 9631838
Data columns (total 9 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   tconst          9631839 non-null  object
 1   titleType       9631839 non-null  object
 2   primaryTitle    9631828 non-null  object
 3   originalTitle   9631828 non-null  object
 4   isAdult         9631839 non-null  object
 5   startYear       9631839 non-null  object
 6   endYear         9631839 non-null  object
 7   runtimeMinutes  9631839 non-null  object
 8   genres          9631829 non-null  object
dtypes: object(9)
memory usage: 661.4+ MB
Dataset Information Eval: 
None

Columns with NULL values: 
tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

Duplicated: 
0


In [103]:
remove_nan_and_duplicates(basics)

Null Values Remaining: 
tconst            0
titleType         0
primaryTitle      0
originalTitle     0
isAdult           0
startYear         0
endYear           0
runtimeMinutes    0
genres            0
dtype: int64

Duplicated Values Removed: 
0


In [104]:
basics.value_counts()

tconst      titleType  primaryTitle                 originalTitle                isAdult  startYear  endYear  runtimeMinutes  genres                    
tt0000001   short      Carmencita                   Carmencita                   0        1894       \N       1               Documentary,Short             1
tt2743722   tvEpisode  Episode dated 28 April 1998  Episode dated 28 April 1998  0        1998       \N       \N              News                          1
tt2743690   tvEpisode  Episode dated 6 April 1998   Episode dated 6 April 1998   0        1998       \N       \N              News                          1
tt2743692   tvEpisode  Episode dated 7 April 1998   Episode dated 7 April 1998   0        1998       \N       \N              News                          1
tt2743694   tvEpisode  Episode dated 8 April 1998   Episode dated 8 April 1998   0        1998       \N       \N              News                          1
                                                         

In [105]:
basics.replace({'\\N':np.nan}, inplace=True)

In [106]:
basics.isin(['\\N']).sum()

tconst            0
titleType         0
primaryTitle      0
originalTitle     0
isAdult           0
startYear         0
endYear           0
runtimeMinutes    0
genres            0
dtype: int64

In [107]:
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1303058
endYear           9528911
runtimeMinutes    6809952
genres             435532
dtype: int64

In [108]:
basics.dropna(subset=['runtimeMinutes', 'genres'], inplace=True)

In [109]:
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear          158369
endYear           2697861
runtimeMinutes          0
genres                  0
dtype: int64

In [110]:
basics['startYear'] = basics['startYear'].astype(float)

In [111]:
basics.value_counts()

tconst      titleType  primaryTitle                  originalTitle                 isAdult  startYear  endYear  runtimeMinutes  genres           
tt0025509   tvSeries   Les Misérables                Les misérables                0        1934.0     1934     279             Drama                1
tt2263743   tvSeries   Mountain Man                  Mountain Man                  0        2009.0     2010     11              Adventure            1
tt2262420   tvSeries   Plano Wired                   Plano Wired                   0        2006.0     2011     30              News                 1
tt2262456   tvSeries   The Hollow Crown              The Hollow Crown              0        2012.0     2016     150             Drama,History,War    1
tt2262521   tvSeries   A Thousand Days' Promise      A Thousand Days' Promise      0        2011.0     2011     65              Drama,Romance        1
                                                                                                   