In [1]:
import pandas as pd
import numpy as np
import requests

## Data Gathering

In [9]:
# Download locations of the three IMDb dataset dumps used in this notebook.
_imdb_host = "https://datasets.imdbws.com"
basic_url = f"{_imdb_host}/title.basics.tsv.gz"
title_aka_url = f"{_imdb_host}/title.akas.tsv.gz"
title_rating_url = f"{_imdb_host}/title.ratings.tsv.gz"

In [10]:
# Collect the URLs in download order; each local file name is the last
# path segment of its URL, so the two lists stay aligned by position.
url_list = [basic_url, title_aka_url, title_rating_url]
url_names = [address.rsplit('/', 1)[-1] for address in url_list]

In [11]:
# Download each dataset programmatically and save it under its matching file name.
# Each response is streamed straight to disk so the (large) archives are never
# all held in memory at once.
for url, file_name in zip(url_list, url_names):
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on an HTTP error instead of saving an error page as a .gz file.
        response.raise_for_status()
        with open(file_name, mode='wb') as file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)

In [2]:
# Load the downloaded basics table (tab-separated, gzip-compressed).
title_basics = pd.read_csv(
    'title.basics.tsv.gz',
    sep='\t',
    low_memory=False,
)

In [13]:
# Preview the first five rows of the raw basics table.
title_basics.head(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [53]:
# Load the downloaded AKAs table (tab-separated, gzip-compressed).
title_aka = pd.read_csv(
    'title.akas.tsv.gz',
    sep='\t',
    low_memory=False,
)

In [54]:
# Preview the first five rows of the raw AKAs table.
title_aka.head(5)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [16]:
# Print the column dtypes and memory usage of the raw AKAs table.
title_aka.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35843078 entries, 0 to 35843077
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.1+ GB


In [110]:
# Load the downloaded ratings table (tab-separated, gzip-compressed).
title_rating = pd.read_csv(
    'title.ratings.tsv.gz',
    sep='\t',
    low_memory=False,
)

In [111]:
# Preview the first five rows of the raw ratings table.
title_rating.head(5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1969
1,tt0000002,5.8,263
2,tt0000003,6.5,1815
3,tt0000004,5.6,178
4,tt0000005,6.2,2612


In [19]:
# Print the column dtypes, non-null counts and memory usage of the raw ratings table.
title_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1308743 entries, 0 to 1308742
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1308743 non-null  object 
 1   averageRating  1308743 non-null  float64
 2   numVotes       1308743 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.0+ MB


## Data Cleaning

### Title Basics table:

1. Replace "\N" with np.nan
2. Eliminate movies that are null for runtimeMinutes
3. Eliminate movies that are null for genres
4. Keep only titleType == "movie"
5. Keep startYear from 2000 through 2021 (both inclusive)
6. Eliminate movies that include "Documentary" in genres
7. Keep only US movies (use the AKAs table; see the "Filtering one dataframe based on another" section below)

In [122]:
# Work on an independent (deep) copy so the raw basics table stays untouched.
title_basics_clean = title_basics.copy(deep=True)

**1. Replace "\N" with np.nan**

In [123]:
# Turn the literal "\N" placeholder into a real missing value (NaN).
title_basics_clean = title_basics_clean.replace('\\N', np.nan)

**2. Eliminate movies that are null for runtimeMinutes**

In [124]:
# Discard every row that has no runtimeMinutes value.
title_basics_clean = title_basics_clean.dropna(subset=['runtimeMinutes'])

In [125]:
# Check: count the rows still missing runtimeMinutes (expected to be 0).
title_basics_clean['runtimeMinutes'].isna().sum()

0

**3. Eliminate movies that are null for genres**

In [126]:
# Discard every row that has no genres value.
title_basics_clean = title_basics_clean.dropna(subset=['genres'])

In [127]:
# Check: count the rows still missing genres (expected to be 0).
title_basics_clean['genres'].isna().sum()

0

**4. Keep only titleType == "movie"**

In [128]:
# Keep only the rows whose titleType is exactly 'movie'.
title_basics_clean = title_basics_clean.loc[
    title_basics_clean['titleType'].eq('movie')
]

In [129]:
# Check: list the distinct titleType values that remain.
title_basics_clean['titleType'].unique()

array(['movie'], dtype=object)

**5. Keep startYear 2000–2021 (inclusive)**

In [130]:
# Convert startYear to float so it can be compared numerically; float is used
# because the column may still contain NaN after the "\N" replacement.
# .assign builds a new frame rather than writing a column into a frame that was
# produced by boolean filtering, which avoids pandas' SettingWithCopyWarning.
title_basics_clean = title_basics_clean.assign(
    startYear=title_basics_clean['startYear'].astype(float)
)

In [131]:
# Keep titles whose startYear lies in [2000, 2022): 2000 is included, 2022 is not.
title_basics_clean = title_basics_clean.loc[
    title_basics_clean['startYear'].between(2000, 2022, inclusive='left')
]

In [132]:
# Check: list the distinct startYear values that remain.
title_basics_clean['startYear'].unique()

array([2021., 2001., 2020., 2018., 2005., 2002., 2009., 2017., 2000.,
       2006., 2004., 2008., 2007., 2003., 2012., 2010., 2013., 2011.,
       2015., 2016., 2014., 2019.])

**6. Eliminate movies that include "Documentary" in genre**

In [133]:
# Boolean mask: True where the genres string mentions "documentary" (any letter case).
is_documentary = title_basics_clean['genres'].str.contains('documentary', case=False)
# Keep only the rows that are not flagged by the mask.
title_basics_clean = title_basics_clean.loc[~is_documentary]

In [134]:
# Check: the distinct results of the documentary match (expected to be only False).
title_basics_clean['genres'].str.contains('documentary', case=False).unique()

array([False])

In [135]:
# Preview the first five rows of the basics table after the filters so far.
title_basics_clean.head(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


### Title AKAs table:

1. Replace "\N" with np.nan
2. Keep only US rows (region == 'US').

In [100]:
# Work on an independent (deep) copy so the raw AKAs table stays untouched.
title_aka_clean = title_aka.copy(deep=True)

**1. Replace "\N" with np.nan**

In [101]:
# Turn the literal "\N" placeholder into a real missing value (NaN).
title_aka_clean = title_aka_clean.replace('\\N', np.nan)

**2. Keep only US rows (region == 'US')**

In [103]:
# Keep only the AKA rows whose region is exactly 'US'.
title_aka_clean = title_aka_clean.loc[title_aka_clean['region'].eq('US')]

In [104]:
# Check: list the distinct region values that remain.
title_aka_clean['region'].unique()

array(['US'], dtype=object)

In [105]:
# Show (rows, columns) of the US-only AKAs table.
title_aka_clean.shape

(1435435, 8)

**7. Keep only US movies in title_basics (using the filtered AKAs table)**

In [137]:
# Boolean mask over the basics rows: True where the title's tconst also
# appears as a titleId in the US-filtered AKAs table.
keepers = title_basics_clean.tconst.isin(title_aka_clean.titleId)

In [139]:
# Keep only the basics rows whose tconst appears among the US AKA titleIds.
# The mask is computed here instead of being read from the shared `keepers`
# variable: `keepers` is also assigned for the ratings table in another cell,
# so running cells out of order could apply a mask built for the wrong frame.
title_basics_clean = title_basics_clean[
    title_basics_clean['tconst'].isin(title_aka_clean['titleId'])
]
title_basics_clean.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [140]:
# Show (rows, columns) of the fully filtered basics table.
title_basics_clean.shape

(81710, 9)

### Title Ratings table:

1. Replace "\N" with np.nan (if any)
2. Keep only US movies (Use AKAs table)

In [112]:
# Work on an independent (deep) copy so the raw ratings table stays untouched.
title_rating_clean = title_rating.copy(deep=True)

**1. Replace "\N" with np.nan (if any)**

In [113]:
# Turn any literal "\N" placeholder into a real missing value (NaN).
title_rating_clean = title_rating_clean.replace('\\N', np.nan)

**2. Keep only US movies (Use AKAs table)**

In [114]:
# Boolean mask over the ratings rows: True where the title's tconst also
# appears as a titleId in the US-filtered AKAs table.
keepers = title_rating_clean.tconst.isin(title_aka_clean.titleId)

In [115]:
# Keep only the ratings rows whose tconst appears among the US AKA titleIds.
# The mask is computed here instead of being read from the shared `keepers`
# variable: `keepers` is also assigned for the basics table in another cell,
# so running cells out of order could apply a mask built for the wrong frame.
title_rating_clean = title_rating_clean[
    title_rating_clean['tconst'].isin(title_aka_clean['titleId'])
]
title_rating_clean.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1969
1,tt0000002,5.8,263
4,tt0000005,6.2,2612
5,tt0000006,5.1,181
6,tt0000007,5.4,818


In [117]:
# Show (rows, columns) of the US-only ratings table.
title_rating_clean.shape

(497874, 3)

In [156]:
# Make sure the output folder exists before any file is saved into it.
import os

FOLDER = 'Data/'

# exist_ok=True makes this cell safe to re-run: an existing folder (and any
# files already saved in it) is left alone instead of being removed, which
# previously made the listing below fail on a second run.
os.makedirs(FOLDER, exist_ok=True)

# Confirm the folder exists by listing its contents
os.listdir(FOLDER)

[]

In [161]:
# Write the cleaned basics table to a gzip-compressed CSV, leaving out the index.
title_basics_clean.to_csv(
    "Data/title_basics_clean.csv.gz",
    compression='gzip',
    index=False,
)

In [162]:
# Write the US-only AKAs table to a gzip-compressed CSV, leaving out the index.
title_aka_clean.to_csv(
    "Data/title_aka_clean.csv.gz",
    compression='gzip',
    index=False,
)

In [163]:
# Write the US-only ratings table to a gzip-compressed CSV, leaving out the index.
title_rating_clean.to_csv(
    "Data/title_rating_clean.csv.gz",
    compression='gzip',
    index=False,
)

In [164]:
# Read the saved basics file back in and preview it to confirm the round trip.
# NOTE(review): this rebinds `title_basics`, so the raw table loaded at the top
# of the notebook is no longer available under that name after this cell.
title_basics = pd.read_csv(
    "Data/title_basics_clean.csv.gz",
    low_memory=False,
)
title_basics.head(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [165]:
# Read the saved ratings file back in and preview it to confirm the round trip.
# NOTE(review): this rebinds `title_rating`, so the raw table loaded at the top
# of the notebook is no longer available under that name after this cell.
title_rating = pd.read_csv(
    "Data/title_rating_clean.csv.gz",
    low_memory=False,
)
title_rating.head(5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1969
1,tt0000002,5.8,263
2,tt0000005,6.2,2612
3,tt0000006,5.1,181
4,tt0000007,5.4,818
