# IMDB: Recommendations for a Successful Movie
- Victoria White
- 11 October 2022


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Data

In [2]:
#importing data
title_basics = 'https://datasets.imdbws.com/title.basics.tsv.gz'
title_akas = 'https://datasets.imdbws.com/title.akas.tsv.gz'
title_ratings = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

## Saving Compressed Files

In [3]:
#creating folder for Data
import os
os.makedirs('Data/', exist_ok=True)
os.listdir("Data/")

['title_akas.csv.gz', 'title_basics.csv.gz', 'title_ratings.csv.gz']

In [4]:
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [5]:
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory=False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [6]:
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1917
1,tt0000002,5.8,260
2,tt0000005,6.2,2541
3,tt0000006,5.1,175
4,tt0000007,5.4,796


In [7]:
#basics = pd.read_csv(title_basics, sep='\t', low_memory=False)
df_basics = basics.copy()
df_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [8]:
#akas = pd.read_csv(title_akas, sep='\t', low_memory=False)
df_akas = akas.copy()
df_akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [9]:
#ratings = pd.read_csv(title_ratings, sep='\t', low_memory=False)
df_ratings = ratings.copy()
df_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1917
1,tt0000002,5.8,260
2,tt0000005,6.2,2541
3,tt0000006,5.1,175
4,tt0000007,5.4,796


In [10]:
#saving dataframes as compressed files
df_basics.to_csv("Data/title_basics.csv.gz", compression='gzip', index=False)
df_akas.to_csv("Data/title_akas.csv.gz", compression='gzip', index=False)
df_ratings.to_csv("Data/title_ratings.csv.gz", compression='gzip',index=False)

## Preprocessing

In [11]:
#replacing null values
df_basics.replace({'\\N':np.nan}, inplace=True)

In [12]:
#replacing null values
df_ratings.replace({'\\N':np.nan}, inplace=True)

In [13]:
#replacing null values
df_akas.replace({'\\N':np.nan}, inplace=True)

In [14]:
#checking for duplicates
df_basics.duplicated().sum()

0

In [15]:
#checking for duplicates
df_akas.duplicated().sum()

0

In [16]:
#checking for duplicates
df_ratings.duplicated().sum()

0

In [17]:
#finding all missing values
df_basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear              0
endYear           272504
runtimeMinutes         0
genres                 0
dtype: int64

In [18]:
#eliminating movies that are null in runtimeMinutes and genres
df_basics.dropna(subset='runtimeMinutes', inplace=True)
df_basics.dropna(subset='genres', inplace=True)
df_basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear              0
endYear           272504
runtimeMinutes         0
genres                 0
dtype: int64

In [19]:
#changing dtype from object for startYear
df_basics.astype({'startYear':'int'}).dtypes

tconst             object
titleType          object
primaryTitle       object
originalTitle      object
isAdult             int64
startYear           int32
endYear           float64
runtimeMinutes      int64
genres             object
dtype: object

In [28]:
#filtering basics for movies outside of 2000-2021
drop_movies = df_basics[(df_basics['titleType'] != 'Movie') &
                        (df_basics['startYear'] <=1999) &
                        (df_basics['startYear'] >= 2022)].index
df_basics.drop(drop_movies, inplace=True)
df_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [21]:
#dropping short from titleType so only movie is included
short = df_basics[(df_basics['titleType'] == 'short')].index
df_basics.drop(short, inplace=True)
df_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [22]:
#dropping documentaries
is_documentary = df_basics['genres'].str.contains('documentary', case=False)
df_basics = df_basics[~is_documentary]
df_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [23]:
#filtering movies not in US region
not_US = df_akas[(df_akas['region'] != 'US')].index
df_akas.drop(not_US, inplace=True)
df_akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [24]:
#filtering akas dataframe on the basics dataframe
keepers_basics = df_basics['tconst'].isin(df_akas['titleId'])
keepers_basics

0         True
1         True
2         True
3         True
4         True
          ... 
282599    True
282600    True
282601    True
282602    True
282603    True
Name: tconst, Length: 282604, dtype: bool

In [25]:
#filtering basics
df_basics = df_basics[keepers_basics]
df_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [26]:
#filtering akas dataframe on ratings dataframe
keepers_ratings = df_ratings['tconst'].isin(df_akas['titleId'])
keepers_ratings

0         True
1         True
2         True
3         True
4         True
          ... 
471730    True
471731    True
471732    True
471733    True
471734    True
Name: tconst, Length: 471735, dtype: bool

In [27]:
#filtering ratings
df_ratings = df_ratings[keepers_ratings]
df_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1917
1,tt0000002,5.8,260
2,tt0000005,6.2,2541
3,tt0000006,5.1,175
4,tt0000007,5.4,796


Checking info after filtering dataframes

In [29]:
df_basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 282604 entries, 0 to 282603
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          282604 non-null  object 
 1   titleType       282604 non-null  object 
 2   primaryTitle    282604 non-null  object 
 3   originalTitle   282604 non-null  object 
 4   isAdult         282604 non-null  int64  
 5   startYear       282604 non-null  int64  
 6   endYear         10100 non-null   float64
 7   runtimeMinutes  282604 non-null  int64  
 8   genres          282604 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 21.6+ MB


In [30]:
df_akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1359209 entries, 0 to 1359208
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1359209 non-null  object 
 1   ordering         1359209 non-null  int64  
 2   title            1359209 non-null  object 
 3   region           1359209 non-null  object 
 4   language         3689 non-null     object 
 5   types            964586 non-null   object 
 6   attributes       45157 non-null    object 
 7   isOriginalTitle  1357834 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 83.0+ MB


In [31]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 471735 entries, 0 to 471734
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         471735 non-null  object 
 1   averageRating  471735 non-null  float64
 2   numVotes       471735 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 14.4+ MB
