# IMDB Data Exploration

A separate notebook for querying the IMDb database file.

In [50]:
import pandas as pd 
import sqlite3

In [51]:
con = sqlite3.connect('../data/raw/im.db')

In [52]:
%%bash 

sqlite3 ../data/raw/im.db
.tables

directors      movie_akas     movie_ratings  principals   
known_for      movie_basics   persons        writers      


In [53]:
%%bash

sqlite3 ../data/raw/im.db 
.schema

CREATE TABLE IF NOT EXISTS "movie_basics" (
"movie_id" TEXT,
  "primary_title" TEXT,
  "original_title" TEXT,
  "start_year" INTEGER,
  "runtime_minutes" REAL,
  "genres" TEXT
);
CREATE TABLE IF NOT EXISTS "directors" (
"movie_id" TEXT,
  "person_id" TEXT
);
CREATE TABLE IF NOT EXISTS "known_for" (
"person_id" TEXT,
  "movie_id" TEXT
);
CREATE TABLE IF NOT EXISTS "movie_akas" (
"movie_id" TEXT,
  "ordering" INTEGER,
  "title" TEXT,
  "region" TEXT,
  "language" TEXT,
  "types" TEXT,
  "attributes" TEXT,
  "is_original_title" REAL
);
CREATE TABLE IF NOT EXISTS "movie_ratings" (
"movie_id" TEXT,
  "averagerating" REAL,
  "numvotes" INTEGER
);
CREATE TABLE IF NOT EXISTS "persons" (
"person_id" TEXT,
  "primary_name" TEXT,
  "birth_year" REAL,
  "death_year" REAL,
  "primary_profession" TEXT
);
CREATE TABLE IF NOT EXISTS "principals" (
"movie_id" TEXT,
  "ordering" INTEGER,
  "person_id" TEXT,
  "category" TEXT,
  "job" TEXT,
  "characters" TEXT
);
CREATE TABLE IF NOT EXISTS "writers" (
"m

In [58]:
query = \
"""
SELECT *
FROM movie_basics

"""

In [55]:
df_movie_akas = pd.read_sql(query, con)

In [56]:
df_movie_akas.sample(20)

Unnamed: 0,movie_id,ordering,title,region,language,types,attributes,is_original_title
45030,tt2267704,1,Palestine,FR,,,,0.0
96921,tt3220192,4,Braquage à la suédoise,FR,,imdbDisplay,,0.0
125690,tt2244886,1,Unexpected Places,US,,,,0.0
164095,tt3992566,1,Kizasi: The man who erace radioactivity of Fuk...,JP,,,,0.0
147613,tt1941705,2,Treasure Inn,HK,en,,,0.0
223538,tt5097012,3,21,,,original,,1.0
296378,tt4696704,6,Sometimes You Get Lucky: Conversations with Ma...,,,working,,0.0
186651,tt6120756,3,Mysterious Masterpiece: Cold Case Torrentius,,,original,,1.0
132489,tt3112838,1,No Exit,US,,,,0.0
72461,tt2592476,1,"His story, His fall, His Victory",US,,,,0.0


In [57]:
df_movie_akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 331703 entries, 0 to 331702
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   movie_id           331703 non-null  object 
 1   ordering           331703 non-null  int64  
 2   title              331703 non-null  object 
 3   region             278410 non-null  object 
 4   language           41715 non-null   object 
 5   types              168447 non-null  object 
 6   attributes         14925 non-null   object 
 7   is_original_title  331678 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 20.2+ MB


In [63]:
df_movie_akas['region'].value_counts()[:20]

region
US     51490
XWW    18467
RU     13817
DE     11634
FR     10990
ES      9007
GB      8942
CA      8871
PL      8691
IN      8435
BR      8167
IT      7983
GR      6311
JP      6308
HU      6258
TR      5052
AR      4729
PT      4629
SE      4444
MX      3869
Name: count, dtype: int64

In [59]:
df_movie_basics = pd.read_sql(query, con)
df_movie_basics.sample(20)

Unnamed: 0,movie_id,primary_title,original_title,start_year,runtime_minutes,genres
65670,tt3746924,Cretativity,Cretativity,2014,80.0,Documentary
13128,tt1678050,Cure for Pain: The Mark Sandman Story,Cure for Pain: The Mark Sandman Story,2011,86.0,Documentary
50419,tt2991202,Genuine Risk,Genuine Risk,2012,87.0,Drama
17786,tt1791686,Wimbledon Official Film 2010,Wimbledon Official Film 2010,2010,52.0,"Documentary,Sport"
62943,tt3616562,Never Enough Thunder,Never Enough Thunder,2015,,Comedy
2119,tt10210336,Chon-cheol-sal-in,Chon-cheol-sal-in,2011,87.0,Drama
105692,tt6181368,Pasos de Fe,Pasos de Fe,2014,,"Drama,Music"
110779,tt6463248,Rising from the Ashes,Rising from the Ashes,2010,60.0,Documentary
118802,tt7124742,Four Springs,Four Springs,2017,105.0,"Documentary,Family,Music"
16145,tt1754506,Battleground,Skeleton Lake,2012,85.0,"Action,Horror,Thriller"


In [60]:
df_movie_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   movie_id         146144 non-null  object 
 1   primary_title    146144 non-null  object 
 2   original_title   146123 non-null  object 
 3   start_year       146144 non-null  int64  
 4   runtime_minutes  114405 non-null  float64
 5   genres           140736 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 6.7+ MB


In [68]:
df_tn = pd.read_csv('../data/raw/tn.movie_budgets.csv.gz', compression='gzip')
df_tn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


In [69]:
df_tn['movie'].isin(df_movie_basics['primary_title']).value_counts()

movie
False    3406
True     2376
Name: count, dtype: int64

In [70]:
df_tn['movie'].isin(df_movie_basics['original_title']).value_counts()

movie
False    3452
True     2330
Name: count, dtype: int64

In [71]:
df_movie_basics['primary_title'].isin(df_tn['movie']).value_counts()

primary_title
False    142538
True       3606
Name: count, dtype: int64

In [72]:
df_movie_basics['original_title'].isin(df_tn['movie']).value_counts()

original_title
False    142786
True       3358
Name: count, dtype: int64

In [79]:
df_movie_basics.loc[df_movie_basics['primary_title'].str.contains('Harry Potter')]

Unnamed: 0,movie_id,primary_title,original_title,start_year,runtime_minutes,genres
457,tt0926084,Harry Potter and the Deathly Hallows: Part 1,Harry Potter and the Deathly Hallows: Part 1,2010,146.0,"Adventure,Fantasy,Mystery"
5859,tt1201607,Harry Potter and the Deathly Hallows: Part 2,Harry Potter and the Deathly Hallows: Part 2,2011,130.0,"Adventure,Drama,Fantasy"
17273,tt1781796,"Creating the World of Harry Potter, Part 4: So...","Creating the World of Harry Potter, Part 4: So...",2010,54.0,Documentary
20960,tt1867094,The Seekers Guide to Harry Potter,The Seekers Guide to Harry Potter,2010,75.0,Documentary
127195,tt7783322,Harry Potter: A History of Magic,Harry Potter: A History of Magic,2017,59.0,Documentary
133278,tt8358970,The Harry Potter Saga Analyzed,The Harry Potter Saga Analyzed,2018,,Documentary
134218,tt8443702,Harry Potter and the Untold Stories of Hogwarts,Harry Potter and the Untold Stories of Hogwarts,2012,58.0,"Adventure,Comedy,Fantasy"


In [80]:
df_movie_basics.rename(columns={'primary_title': 'title'}, inplace=True)

In [81]:
df_movie_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   movie_id         146144 non-null  object 
 1   title            146144 non-null  object 
 2   original_title   146123 non-null  object 
 3   start_year       146144 non-null  int64  
 4   runtime_minutes  114405 non-null  float64
 5   genres           140736 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 6.7+ MB


In [82]:
df_movie_basics.drop(columns=['movie_id', 'original_title', 'start_year'], inplace=True)

In [83]:
df_tn.rename(columns={'movie': 'title'}, inplace=True)

In [108]:
df_merged = pd.merge(df_tn, df_movie_basics, how='inner', on='title')

In [109]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3815 entries, 0 to 3814
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 3815 non-null   int64  
 1   release_date       3815 non-null   object 
 2   title              3815 non-null   object 
 3   production_budget  3815 non-null   object 
 4   domestic_gross     3815 non-null   object 
 5   worldwide_gross    3815 non-null   object 
 6   runtime_minutes    3328 non-null   float64
 7   genres             3743 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 238.6+ KB


In [100]:
df_merged[:20]

Unnamed: 0,id,release_date,title,production_budget,domestic_gross,worldwide_gross,runtime_minutes,genres
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279",93.0,Horror
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",136.0,"Action,Adventure,Fantasy"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350",113.0,"Action,Adventure,Sci-Fi"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963",141.0,"Action,Adventure,Sci-Fi"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747",,
5,6,"Dec 18, 2015",Star Wars Ep. VII: The Force Awakens,"$306,000,000","$936,662,225","$2,053,311,220",,
6,7,"Apr 27, 2018",Avengers: Infinity War,"$300,000,000","$678,815,482","$2,048,134,200",149.0,"Action,Adventure,Sci-Fi"
7,8,"May 24, 2007",Pirates of the Caribbean: At Worldâs End,"$300,000,000","$309,420,425","$963,420,425",,
8,9,"Nov 17, 2017",Justice League,"$300,000,000","$229,024,295","$655,945,209",120.0,"Action,Adventure,Fantasy"
9,10,"Nov 6, 2015",Spectre,"$300,000,000","$200,074,175","$879,620,923",148.0,"Action,Adventure,Thriller"


In [112]:
df_merged.drop_duplicates(subset='title').info()

<class 'pandas.core.frame.DataFrame'>
Index: 2312 entries, 0 to 3814
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 2312 non-null   int64  
 1   release_date       2312 non-null   object 
 2   title              2312 non-null   object 
 3   production_budget  2312 non-null   object 
 4   domestic_gross     2312 non-null   object 
 5   worldwide_gross    2312 non-null   object 
 6   runtime_minutes    2138 non-null   float64
 7   genres             2287 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 162.6+ KB


In [102]:
df_merged.duplicated(subset='title').value_counts()

False    5698
True     1523
Name: count, dtype: int64

In [111]:
len(df_merged['title'].unique())

2312

In [113]:
df_merged.drop_duplicates(subset='title', inplace=True)
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2312 entries, 0 to 3814
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 2312 non-null   int64  
 1   release_date       2312 non-null   object 
 2   title              2312 non-null   object 
 3   production_budget  2312 non-null   object 
 4   domestic_gross     2312 non-null   object 
 5   worldwide_gross    2312 non-null   object 
 6   runtime_minutes    2138 non-null   float64
 7   genres             2287 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 162.6+ KB


In [115]:
df_merged.to_csv('../data/processed/tn_imdb_merged.csv', index=False)