In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

In [2]:
# Relative locations of the two raw IMDb CSV exports used by this notebook.
imdb_movies_filepath, imdb_names_filepath = (
    "./data/IMDb movies.csv",
    "./data/IMDb names.csv",
)

In [5]:
# Read both CSV exports (movies first, then names) and preview the movies table.
imdb_movies, imdb_names = [
    pd.read_csv(csv_path)
    for csv_path in (imdb_movies_filepath, imdb_names_filepath)
]
imdb_movies.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,537,$ 2250,,,,7.0,7.0
1,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.9,171,,,,,4.0,2.0
2,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,420,$ 45000,,,,24.0,3.0
3,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2019,,,,,28.0,14.0
4,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913,"Biography, Drama",60,USA,English,Sidney Olcott,...,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ...",5.7,438,,,,,12.0,5.0


In [6]:
# MOVIES FILE
# Count the missing values in every column of the movies table.
# (The statement was commented out although its output is recorded below;
# it is restored so a fresh Restart & Run All reproduces that output.)
imdb_movies.isnull().sum()

imdb_title_id                0
title                        0
original_title               0
year                         0
date_published               0
genre                        0
duration                     0
country                     39
language                   755
director                    73
writer                    1493
production_company        4325
actors                      66
description               2430
avg_vote                     0
votes                        0
budget                   58469
usa_gross_income         66179
worlwide_gross_income    51381
metascore                68551
reviews_from_users        7077
reviews_from_critics     10987
dtype: int64

In [7]:
# Print the column names, non-null counts and dtypes of the movies table.
imdb_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81273 entries, 0 to 81272
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_title_id          81273 non-null  object 
 1   title                  81273 non-null  object 
 2   original_title         81273 non-null  object 
 3   year                   81273 non-null  int64  
 4   date_published         81273 non-null  object 
 5   genre                  81273 non-null  object 
 6   duration               81273 non-null  int64  
 7   country                81234 non-null  object 
 8   language               80518 non-null  object 
 9   director               81200 non-null  object 
 10  writer                 79780 non-null  object 
 11  production_company     76948 non-null  object 
 12  actors                 81207 non-null  object 
 13  description            78843 non-null  object 
 14  avg_vote               81273 non-null  float64
 15  vo

In [8]:
### Drop the other columns and focus on language and country

In [9]:
# Keep only the title id together with the country and language columns.
country_language_columns = ['imdb_title_id', 'country', 'language']
movie_df = imdb_movies[country_language_columns]
movie_df

Unnamed: 0,imdb_title_id,country,language
0,tt0000574,Australia,
1,tt0001892,"Germany, Denmark",
2,tt0002101,USA,English
3,tt0002130,Italy,Italian
4,tt0002199,USA,English
...,...,...,...
81268,tt9903716,India,Telugu
81269,tt9905412,India,Malayalam
81270,tt9905462,India,Malayalam
81271,tt9911774,India,Malayalam


In [10]:
# Split each comma-separated country string into separate entries keyed by
# title id, then collect the distinct country names and show how many there are.
country_parts = movie_df.set_index('imdb_title_id')['country'].str.split(', ', expand=True)
movie_df_temp1 = country_parts.stack()
uniq_country = movie_df_temp1.unique()
uniq_country.size

190

In [11]:
# create a dataframe 'country_origin_df' with 'country_id' and 'country_name'
#
# The number of ids is derived from the data instead of the hard-coded 191, so
# ids and names stay aligned if the source CSV changes. The id range is kept one
# longer than the list of names, exactly as before: pandas pads the shorter
# name row, so the table ends with a single id whose country_name is missing.
country_id_count = uniq_country.size + 1
temp1 = pd.DataFrame(data=[np.arange(country_id_count), uniq_country])
# rows -> columns: one row per country, first column the id, second the name
country_origin_df = temp1.transpose()
country_origin_df.columns = ['country_id','country_name']
country_origin_df

Unnamed: 0,country_id,country_name
0,0,Australia
1,1,Germany
2,2,Denmark
3,3,USA
4,4,Italy
...,...,...
186,186,Guadeloupe
187,187,Malawi
188,188,Holy See (Vatican City State)
189,189,Oman


In [12]:
# count the distinct values in each column of movie_df
# (a distinct count of imdb_title_id equal to the row count means the ids are unique)
movie_df.nunique()

imdb_title_id    81273
country           4632
language          4251
dtype: int64

In [13]:
# Title id plus the raw, possibly comma-separated, country string.
title_country_columns = ['imdb_title_id', 'country']
movie_country_df = imdb_movies[title_country_columns]
movie_country_df

Unnamed: 0,imdb_title_id,country
0,tt0000574,Australia
1,tt0001892,"Germany, Denmark"
2,tt0002101,USA
3,tt0002130,Italy
4,tt0002199,USA
...,...,...
81268,tt9903716,India
81269,tt9905412,India
81270,tt9905462,India
81271,tt9911774,India


In [14]:
# Add 'country_new' holding a single country per row: titles that list several
# countries are repeated once for each of them.
movie_df_temp1 = (
    movie_country_df
    .assign(country_new=movie_country_df['country'].str.split(', '))
    .explode('country_new')
)
movie_df_temp1

Unnamed: 0,imdb_title_id,country,country_new
0,tt0000574,Australia,Australia
1,tt0001892,"Germany, Denmark",Germany
1,tt0001892,"Germany, Denmark",Denmark
2,tt0002101,USA,USA
3,tt0002130,Italy,Italy
...,...,...,...
81268,tt9903716,India,India
81269,tt9905412,India,India
81270,tt9905462,India,India
81271,tt9911774,India,India


In [16]:
# Attach the country_id for each single country name by looking the name up
# in country_origin_df (indexed by country_name).
country_id_by_name = country_origin_df.set_index('country_name')['country_id']
movie_df_temp1['country_id'] = movie_df_temp1['country_new'].map(country_id_by_name)
movie_df_temp1

Unnamed: 0,imdb_title_id,country,country_new,country_id
0,tt0000574,Australia,Australia,0
1,tt0001892,"Germany, Denmark",Germany,1
1,tt0001892,"Germany, Denmark",Denmark,2
2,tt0002101,USA,USA,3
3,tt0002130,Italy,Italy,4
...,...,...,...,...
81268,tt9903716,India,India,19
81269,tt9905412,India,India,19
81270,tt9905462,India,India,19
81271,tt9911774,India,India,19


In [20]:
# Junction table: discard the country text columns so only the
# (imdb_title_id, country_id) pairs remain.
movie_country_junction_df = movie_df_temp1.drop(['country_new', 'country'], axis='columns')
movie_country_junction_df

Unnamed: 0,imdb_title_id,country_id
0,tt0000574,0
1,tt0001892,1
1,tt0001892,2
2,tt0002101,3
3,tt0002130,4
...,...,...
81268,tt9903716,19
81269,tt9905412,19
81270,tt9905462,19
81271,tt9911774,19


In [23]:
# Repeat the same steps for language; start by looking at the raw column.
movie_df['language']

0              NaN
1              NaN
2          English
3          Italian
4          English
           ...    
81268       Telugu
81269    Malayalam
81270    Malayalam
81271    Malayalam
81272      Turkish
Name: language, Length: 81273, dtype: object

In [25]:
# Split each comma-separated language string into separate entries keyed by
# title id; titles without a language produce no entry.
language_parts = movie_df.set_index('imdb_title_id')['language'].str.split(', ', expand=True)
movie_df_temp2 = language_parts.stack()
movie_df_temp2

imdb_title_id   
tt0002101      0      English
tt0002130      0      Italian
tt0002199      0      English
tt0002423      0       German
tt0002445      0      Italian
                      ...    
tt9903716      0       Telugu
tt9905412      0    Malayalam
tt9905462      0    Malayalam
tt9911774      0    Malayalam
tt9914286      0      Turkish
Length: 103728, dtype: object

In [26]:
# Distinct language names, followed by how many there are.
uniq_language = movie_df_temp2.unique()
len(uniq_language)

264

In [29]:
# create a dataframe 'language_df' with 'language_id' and 'language'
#
# The number of ids is derived from the data instead of the hard-coded 265, so
# ids and names stay aligned if the source CSV changes. The id range is kept one
# longer than the list of names, exactly as before: pandas pads the shorter
# name row, so the table ends with a single id whose language is missing.
language_id_count = uniq_language.size + 1
temp2 = pd.DataFrame(data=[np.arange(language_id_count), uniq_language])
# rows -> columns: one row per language, first column the id, second the name
language_df = temp2.transpose()
language_df.columns = ['language_id','language']
language_df

Unnamed: 0,language_id,language
0,0,English
1,1,Italian
2,2,German
3,3,Danish
4,4,French
...,...,...
260,260,Bemba
261,261,Wayuu
262,262,Balinese
263,263,Haida


In [31]:
# Title id plus the raw, possibly comma-separated, language string.
title_language_columns = ['imdb_title_id', 'language']
movie_language_df = imdb_movies[title_language_columns]
movie_language_df

Unnamed: 0,imdb_title_id,language
0,tt0000574,
1,tt0001892,
2,tt0002101,English
3,tt0002130,Italian
4,tt0002199,English
...,...,...
81268,tt9903716,Telugu
81269,tt9905412,Malayalam
81270,tt9905462,Malayalam
81271,tt9911774,Malayalam


In [32]:
# Some movies list more than one language: add a 'language_new' column holding
# a single language per row, repeating the title once for each language.
movie_df_temp2 = (
    movie_language_df
    .assign(language_new=movie_language_df['language'].str.split(', '))
    .explode('language_new')
)
movie_df_temp2

Unnamed: 0,imdb_title_id,language,language_new
0,tt0000574,,
1,tt0001892,,
2,tt0002101,English,English
3,tt0002130,Italian,Italian
4,tt0002199,English,English
...,...,...,...
81268,tt9903716,Telugu,Telugu
81269,tt9905412,Malayalam,Malayalam
81270,tt9905462,Malayalam,Malayalam
81271,tt9911774,Malayalam,Malayalam


In [33]:
# Attach the language_id for each single language name by looking the name up
# in language_df (indexed by language).
language_id_by_name = language_df.set_index('language')['language_id']
movie_df_temp2['language_id'] = movie_df_temp2['language_new'].map(language_id_by_name)
movie_df_temp2

Unnamed: 0,imdb_title_id,language,language_new,language_id
0,tt0000574,,,264
1,tt0001892,,,264
2,tt0002101,English,English,0
3,tt0002130,Italian,Italian,1
4,tt0002199,English,English,0
...,...,...,...,...
81268,tt9903716,Telugu,Telugu,59
81269,tt9905412,Malayalam,Malayalam,91
81270,tt9905462,Malayalam,Malayalam,91
81271,tt9911774,Malayalam,Malayalam,91


In [34]:
# Junction table: discard the language text columns so only the
# (imdb_title_id, language_id) pairs remain.
movie_language_junction_df = movie_df_temp2.drop(['language', 'language_new'], axis='columns')
movie_language_junction_df

Unnamed: 0,imdb_title_id,language_id
0,tt0000574,264
1,tt0001892,264
2,tt0002101,0
3,tt0002130,1
4,tt0002199,0
...,...,...
81268,tt9903716,59
81269,tt9905412,91
81270,tt9905462,91
81271,tt9911774,91


In [None]:
# TODO: database load step is not implemented yet; the template below is inactive.
# NOTE(review): `username` and `password` are not defined in the visible cells, and
# the database name 'customer_db' may not match this movie data — confirm before enabling.
# rds_connection_string = f'{username}:{password}@localhost:5432/customer_db'