In [1]:
# Install dependencies as needed:
# !pip install --upgrade kagglehub
import kagglehub

# Download the dataset. The handle pins version 824 rather than the latest release.
path = kagglehub.dataset_download("ashirwadsangwan/imdb-dataset/version/824") # if already downloaded, will still give us the path

# Show where the files live; later cells build file paths from `path`
print("Path to dataset files:", path)

Path to dataset files: /Users/sunyoungpark/.cache/kagglehub/datasets/ashirwadsangwan/imdb-dataset/versions/824


In [2]:
# check if the files are successfully downloaded in the said path
!ls -alh $path

total 16662144
drwxr-xr-x@ 7 sunyoungpark  staff   224B Feb 26 09:27 .
drwxr-xr-x@ 3 sunyoungpark  staff    96B Feb 26 09:26 ..
-rw-r--r--@ 1 sunyoungpark  staff   833M Feb 26 09:26 name.basics.tsv
-rw-r--r--@ 1 sunyoungpark  staff   2.4G Feb 26 09:26 title.akas.tsv
-rw-r--r--@ 1 sunyoungpark  staff   944M Feb 26 09:26 title.basics.tsv
-rw-r--r--@ 1 sunyoungpark  staff   3.8G Feb 26 09:27 title.principals.tsv
-rw-r--r--@ 1 sunyoungpark  staff    25M Feb 26 09:27 title.ratings.tsv


## We'll use two files that contain basic information and ratings for each title.
Note: if matching against the titles in the basics file is not successful, try the akas file, which contains localized/original titles and language information that may help with matching.

### title.basics.tsv.gz - Contains the following information for titles:
- tconst (string) - alphanumeric unique identifier of the title.
- titleType (string) – the type/format of the title (e.g. movie, short,
tvseries, tvepisode, video, etc).
- primaryTitle (string) – the more popular title / the title used by
the filmmakers on promotional materials at the point of release.
- originalTitle (string) - original title, in the original language.
- isAdult (boolean) - 0: non-adult title; 1: adult title.
- startYear (YYYY) – represents the release year of a title. In the
case of TV Series, it is the series start year.
- endYear (YYYY) – TV Series end year. '\N' for all other title types.
- runtimeMinutes – primary runtime of the title, in minutes.
- genres (string array) – includes up to three genres associated with
the title.

### title.ratings.tsv.gz – Contains the IMDb rating and votes information for titles:
- tconst (string) - alphanumeric unique identifier of the title.
- averageRating – weighted average of all the individual user ratings.
- numVotes - number of votes the title has received.

Load the two tsv files to dataframes and then write to a sqlite table.

In [3]:
import pandas as pd
import os
import sqlite3 

# Open a connection to the SQLite database file 'films.db';
# every table in this notebook is written through this connection
conn = sqlite3.connect('films.db') 

# Cursor used by the later cells to run SELECT checks on the saved tables
cur = conn.cursor() 

In [4]:
# Read the tab-separated title.basics file from the download folder,
# then let pandas pick the best-fitting nullable dtype for each column
basics_tsv = os.path.join(path, 'title.basics.tsv')
df_basics = pd.read_csv(basics_tsv, sep='\t').convert_dtypes()
df_basics.info()

  df_basics = pd.read_csv(os.path.join(path,'title.basics.tsv'), sep='\t')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11470477 entries, 0 to 11470476
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          string
 1   titleType       string
 2   primaryTitle    string
 3   originalTitle   string
 4   isAdult         object
 5   startYear       string
 6   endYear         string
 7   runtimeMinutes  string
 8   genres          string
dtypes: object(1), string(8)
memory usage: 787.6+ MB


In [5]:
# Change data types to correct types.
# isAdult is still an object column holding a mix of numbers and text, so a plain
# astype(bool) would map any non-empty string -- including '0' -- to True.
# Parse it numerically first and treat only an explicit 1 as True; values that
# cannot be parsed (or are missing) become False.
df_basics['isAdult'] = (
    pd.to_numeric(df_basics['isAdult'], errors='coerce')
    .eq(1)
    .fillna(False)
    .astype(bool)
)

# errors='coerce' turns non-numeric placeholders in these columns into missing values
for numeric_col in ('startYear', 'endYear', 'runtimeMinutes'):
    df_basics[numeric_col] = pd.to_numeric(df_basics[numeric_col], errors='coerce')

# Drop rows where titleType is 'tvEpisode' to reduce memory usage since we're only looking at movies
df_basics = df_basics[df_basics['titleType']!='tvEpisode']

# Check before saving to database
df_basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2650163 entries, 0 to 11470475
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          string
 1   titleType       string
 2   primaryTitle    string
 3   originalTitle   string
 4   isAdult         bool  
 5   startYear       Int64 
 6   endYear         Int64 
 7   runtimeMinutes  Int64 
 8   genres          string
dtypes: Int64(3), bool(1), string(5)
memory usage: 192.1 MB


In [6]:
# Ratings file: same recipe as for the basics file --
# read the tab-separated file, then convert to the best-fitting nullable dtypes
ratings_tsv = os.path.join(path, 'title.ratings.tsv')
df_ratings = pd.read_csv(ratings_tsv, sep='\t').convert_dtypes()
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1537005 entries, 0 to 1537004
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1537005 non-null  string 
 1   averageRating  1537005 non-null  Float64
 2   numVotes       1537005 non-null  Int64  
dtypes: Float64(1), Int64(1), string(1)
memory usage: 38.1 MB


In [7]:
# Write the data to SQLite tables through the open connection.
# if_exists='replace' overwrites a table of the same name left by an earlier run;
# index=False keeps the dataframe index out of the table.
df_basics.to_sql('imdb_basics', conn, if_exists='replace', index=False) 
df_ratings.to_sql('imdb_ratings', conn, if_exists='replace', index=False) 

1537005

In [8]:
# Sanity check: read the first 10 rows of imdb_basics back from the database
# and print them one per line
query = """
        SELECT *
        FROM imdb_basics
        LIMIT 10;
        """
basics_preview = cur.execute(query).fetchall()
for record in basics_preview:
    print(record)

('tt0000001', 'short', 'Carmencita', 'Carmencita', 0, 1894, None, 1, 'Documentary,Short')
('tt0000002', 'short', 'Le clown et ses chiens', 'Le clown et ses chiens', 0, 1892, None, 5, 'Animation,Short')
('tt0000003', 'short', 'Poor Pierrot', 'Pauvre Pierrot', 0, 1892, None, 5, 'Animation,Comedy,Romance')
('tt0000004', 'short', 'Un bon bock', 'Un bon bock', 0, 1892, None, 12, 'Animation,Short')
('tt0000005', 'short', 'Blacksmith Scene', 'Blacksmith Scene', 0, 1893, None, 1, 'Short')
('tt0000006', 'short', 'Chinese Opium Den', 'Chinese Opium Den', 0, 1894, None, 1, 'Short')
('tt0000007', 'short', 'Corbett and Courtney Before the Kinetograph', 'Corbett and Courtney Before the Kinetograph', 0, 1894, None, 1, 'Short,Sport')
('tt0000008', 'short', 'Edison Kinetoscopic Record of a Sneeze', 'Edison Kinetoscopic Record of a Sneeze', 0, 1894, None, 1, 'Documentary,Short')
('tt0000009', 'movie', 'Miss Jerry', 'Miss Jerry', 0, 1894, None, 45, 'Romance')
('tt0000010', 'short', 'Leaving the Factory',

In [9]:
# Sanity check: read the first 10 rows of imdb_ratings back from the database
# and print them one per line
query = """
        SELECT *
        FROM imdb_ratings
        LIMIT 10;
        """
ratings_preview = cur.execute(query).fetchall()
for record in ratings_preview:
    print(record)

('tt0000001', 5.7, 2133)
('tt0000002', 5.5, 289)
('tt0000003', 6.4, 2169)
('tt0000004', 5.3, 184)
('tt0000005', 6.2, 2896)
('tt0000006', 5.0, 208)
('tt0000007', 5.3, 901)
('tt0000008', 5.4, 2280)
('tt0000009', 5.3, 220)
('tt0000010', 6.8, 7871)


### Let's load Netflix original film list and add to the database

First, I want to add the netflix data to the database so we can access everything from one place in the future.

In [10]:
# Load the Netflix originals list (a comma-separated file) and rename its columns
# to space-free camelCase names so they are easy to reference in SQL
netflix_column_names = {
    'Title': 'title',
    'Release date': 'releaseDate',
    'Genre': 'genre',
    'Runtime': 'runtime',
    'Language': 'language',
    'Film type': 'filmType',
}
df_netflix = pd.read_csv('netflix_og_films.csv').rename(columns=netflix_column_names)

Before we save this dataframe into the database, I want to convert runtime ('x h x min') to minutes, so that we can use this as an additional constraint when matching the title of the films to IMDB.

In [11]:
def hm2m(time_str):
    """Convert a runtime string such as '2 h 17 min' into total minutes.

    The hours part (a number followed by 'h') and the minutes part (a number
    followed by 'min') are both optional, and whitespace around the numbers
    is ignored, so '2 h', '45 min', '2h 17min' and '1 h 30min' are all accepted.

    Parameters
    ----------
    time_str : str
        Runtime text in 'X h Y min' form.

    Returns
    -------
    int
        Total runtime in minutes; 0 when neither part is present.

    Raises
    ------
    ValueError
        If the text in front of 'h' or 'min' is not an integer.
    """
    cuml_min = 0

    # Split on the unit marker instead of slicing at fixed offsets, so the
    # result does not depend on there being exactly one space before the unit.
    hours_text, h_marker, after_hours = time_str.partition('h')
    if h_marker and hours_text.strip():
        cuml_min += int(hours_text) * 60
    else:
        # No usable hours part: look for minutes in the whole string
        after_hours = time_str

    minutes_text, min_marker, _ = after_hours.partition('min')
    if min_marker and minutes_text.strip():
        cuml_min += int(minutes_text)

    return cuml_min

In [12]:
# Switch to pandas' nullable dtypes (text columns become 'string'), then derive
# runtime_min by running hm2m over every 'X h X min' runtime value
df_netflix = (
    df_netflix
    .convert_dtypes()
    .assign(runtime_min=lambda frame: frame['runtime'].apply(hm2m))
)
df_netflix.head() # check if it worked correctly

Unnamed: 0,title,releaseDate,genre,runtime,language,filmType,runtime_min
0,Beasts of No Nation,"October 16, 2015",War drama,2 h 17 min,English,Feature films,137
1,The Ridiculous 6,"December 11, 2015",Western comedy,2 h,English,Feature films,120
2,Pee-wee's Big Holiday,"March 18, 2016",Adventure comedy,1 h 30 min,English,Feature films,90
3,Special Correspondents,"April 29, 2016",Satire,1 h 41 min,English,Feature films,101
4,The Do-Over,"May 27, 2016",Action comedy,1 h 48 min,English,Feature films,108


In [28]:
# The release year is the last four characters of releaseDate; store it as an integer column
release_year = df_netflix['releaseDate'].str.slice(start=-4).astype(int)
df_netflix['releaseYear'] = release_year

In [29]:
# Preview the first rows to confirm the releaseYear column was added
df_netflix.head()

Unnamed: 0,title,releaseDate,genre,runtime,language,filmType,runtime_min,releaseYear
0,Beasts of No Nation,"October 16, 2015",War drama,2 h 17 min,English,Feature films,137,2015
1,The Ridiculous 6,"December 11, 2015",Western comedy,2 h,English,Feature films,120,2015
2,Pee-wee's Big Holiday,"March 18, 2016",Adventure comedy,1 h 30 min,English,Feature films,90,2016
3,Special Correspondents,"April 29, 2016",Satire,1 h 41 min,English,Feature films,101,2016
4,The Do-Over,"May 27, 2016",Action comedy,1 h 48 min,English,Feature films,108,2016


In [30]:
# Review column dtypes and non-null counts before the frame is written to the database
df_netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1413 entries, 0 to 1412
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        1413 non-null   string
 1   releaseDate  1413 non-null   string
 2   genre        1065 non-null   string
 3   runtime      1413 non-null   string
 4   language     1413 non-null   string
 5   filmType     1413 non-null   string
 6   runtime_min  1413 non-null   int64 
 7   releaseYear  1413 non-null   int64 
dtypes: int64(2), string(6)
memory usage: 88.4 KB


In [32]:
# Write the data to a SQLite table named 'netflix'.
# if_exists='replace' overwrites the table if it already exists;
# index=False keeps the dataframe index out of the table.
df_netflix.to_sql('netflix', conn, if_exists='replace', index=False) 

1413

In [33]:
# Read the netflix table back: print its column names first, then its first 10 rows
query = """
        SELECT *
        FROM netflix
        LIMIT 10;
        """
out = cur.execute(query)
# The first item of each cursor.description entry is the column name
names = [column[0] for column in cur.description]
print(names)
for record in out.fetchall():
    print(record)

['title', 'releaseDate', 'genre', 'runtime', 'language', 'filmType', 'runtime_min', 'releaseYear']
('Beasts of No Nation', 'October 16, 2015', 'War drama', '2 h 17 min', 'English', 'Feature films', 137, 2015)
('The Ridiculous 6', 'December 11, 2015', 'Western comedy', '2 h', 'English', 'Feature films', 120, 2015)
("Pee-wee's Big Holiday", 'March 18, 2016', 'Adventure comedy', '1 h 30 min', 'English', 'Feature films', 90, 2016)
('Special Correspondents', 'April 29, 2016', 'Satire', '1 h 41 min', 'English', 'Feature films', 101, 2016)
('The Do-Over', 'May 27, 2016', 'Action comedy', '1 h 48 min', 'English', 'Feature films', 108, 2016)
('The Fundamentals of Caring', 'June 24, 2016', 'Comedy drama', '1 h 37 min', 'English', 'Feature films', 97, 2016)
('Brahman Naman', 'July 7, 2016', 'Sex comedy', '1 h 35 min', 'English', 'Feature films', 95, 2016)
('Rebirth', 'July 15, 2016', 'Thriller', '1 h 40 min', 'English', 'Feature films', 100, 2016)
('Tallulah', 'July 29, 2016', 'Comedy drama', '1 

Close out the connection to SQLite database

In [7]:
# Close the connection to the SQLite database now that the tables are written and checked
conn.close() 