# Movie Hackathon ETL Pipeline - Part 3

### Dependencies

In [1]:
%matplotlib inline

import datetime
import json
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine

import config

### Data

In [2]:
# Path to data directory
data_path = 'data/'

In [3]:
# Movie data
movies_df = pd.read_pickle(data_path + 'movies.pkl')
print(movies_df.info())
movies_df.head(2)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5982 entries, 0 to 5982
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   kaggle_id             5982 non-null   int64         
 1   imdb_id               5982 non-null   object        
 2   imdb_link             5982 non-null   object        
 3   wikipedia_url         5982 non-null   object        
 4   poster_path           5981 non-null   object        
 5   title                 5982 non-null   object        
 6   original_title        5982 non-null   object        
 7   tagline               4880 non-null   object        
 8   based_on              1966 non-null   object        
 9   overview              5977 non-null   object        
 10  release_date          5982 non-null   datetime64[ns]
 11  year                  5982 non-null   int64         
 12  runtime               5982 non-null   float64       
 13  budget            

Unnamed: 0,kaggle_id,imdb_id,imdb_link,wikipedia_url,poster_path,title,original_title,tagline,based_on,overview,...,writers,producers,director,stars,cinematographers,composers,editors,production_companies,production_countries,distributor
0,9548,tt0098987,https://www.imdb.com/title/tt0098987/,https://en.wikipedia.org/wiki/The_Adventures_o...,/yLeX2QLkHeRlYQRcbU8BKgMaYYD.jpg,The Adventures of Ford Fairlane,The Adventures of Ford Fairlane,Kojak. Columbo. Dirty Harry. Wimps.,"[Characters, by Rex Weiner]","Ford ""Mr. Rock n' Roll Detective"" Fairlane is ...",...,"[David Arnott, James Cappe]","[Steve Perry, Joel Silver]",Renny Harlin,"[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",Oliver Wood,"[Cliff Eidelman, Yello]",Michael Tronick,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",20th Century Fox
1,25501,tt0098994,https://www.imdb.com/title/tt0098994/,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",/3hjcHNtWn9T6jVGXgNXyCsMWBdj.jpg,"After Dark, My Sweet","After Dark, My Sweet",All they risked was everything.,"[the novel, After Dark, My Sweet, by, Jim Thom...",The intriguing relationship between three desp...,...,"[James Foley, Robert Redlin]","[Ric Kidney, Robert Redlin]",James Foley,"[Jason Patric, Rachel Ward, Bruce Dern, George...",Mark Plummer,Maurice Jarre,Howard E. Smith,"[{'name': 'Avenue Pictures Productions', 'id':...","[{'iso_3166_1': 'US', 'name': 'United States o...",Avenue Pictures


In [4]:
# Rating data
ratings_df = pd.read_csv(data_path + 'raw/ratings.csv')
print(ratings_df.info())
ratings_df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 794.2 MB
None


Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435


In [5]:
# Convert timestamp to datetime type
ratings_df['timestamp'] = pd.to_datetime(ratings_df['timestamp'], unit='s')
print(ratings_df.info())
ratings_df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 4 columns):
 #   Column     Dtype         
---  ------     -----         
 0   userId     int64         
 1   movieId    int64         
 2   rating     float64       
 3   timestamp  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 794.2 MB
None


Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,2015-03-09 22:52:09
1,1,147,4.5,2015-03-09 23:07:15


### Aggregate ratings by movie

In [6]:
# Count each star-rating for each movie
ratings_count_df = ratings_df.groupby(['movieId', 'rating'], as_index=False).timestamp.count()
ratings_count_df.columns = ['kaggle_id', 'rating', 'count']
ratings_count_df.head(2)

Unnamed: 0,kaggle_id,rating,count
0,1,0.5,441
1,1,1.0,804


In [7]:
# Pivot ratings into columns
ratings_pivot_df = ratings_count_df.pivot(index='kaggle_id', columns='rating', values='count')
ratings_pivot_df.columns.name = None
ratings_pivot_df.columns = ['rating_' + str(col) for col in ratings_pivot_df.columns]
ratings_pivot_df.reset_index(inplace=True)
ratings_pivot_df.head(2)

Unnamed: 0,kaggle_id,rating_0.5,rating_1.0,rating_1.5,rating_2.0,rating_2.5,rating_3.0,rating_3.5,rating_4.0,rating_4.5,rating_5.0
0,1,441.0,804.0,438.0,2083.0,1584.0,11577.0,5741.0,22020.0,5325.0,15995.0
1,2,263.0,797.0,525.0,2479.0,1810.0,8510.0,2916.0,6035.0,690.0,2035.0


### Join movie and rating data

In [8]:
df = pd.merge(movies_df, ratings_pivot_df, on='kaggle_id', how='left').fillna(0)
print(df.info())
df.head(2)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5982 entries, 0 to 5981
Data columns (total 43 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   kaggle_id             5982 non-null   int64         
 1   imdb_id               5982 non-null   object        
 2   imdb_link             5982 non-null   object        
 3   wikipedia_url         5982 non-null   object        
 4   poster_path           5982 non-null   object        
 5   title                 5982 non-null   object        
 6   original_title        5982 non-null   object        
 7   tagline               5982 non-null   object        
 8   based_on              5982 non-null   object        
 9   overview              5982 non-null   object        
 10  release_date          5982 non-null   datetime64[ns]
 11  year                  5982 non-null   int64         
 12  runtime               5982 non-null   float64       
 13  budget            

Unnamed: 0,kaggle_id,imdb_id,imdb_link,wikipedia_url,poster_path,title,original_title,tagline,based_on,overview,...,rating_0.5,rating_1.0,rating_1.5,rating_2.0,rating_2.5,rating_3.0,rating_3.5,rating_4.0,rating_4.5,rating_5.0
0,9548,tt0098987,https://www.imdb.com/title/tt0098987/,https://en.wikipedia.org/wiki/The_Adventures_o...,/yLeX2QLkHeRlYQRcbU8BKgMaYYD.jpg,The Adventures of Ford Fairlane,The Adventures of Ford Fairlane,Kojak. Columbo. Dirty Harry. Wimps.,"[Characters, by Rex Weiner]","Ford ""Mr. Rock n' Roll Detective"" Fairlane is ...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,25501,tt0098994,https://www.imdb.com/title/tt0098994/,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",/3hjcHNtWn9T6jVGXgNXyCsMWBdj.jpg,"After Dark, My Sweet","After Dark, My Sweet",All they risked was everything.,"[the novel, After Dark, My Sweet, by, Jim Thom...",The intriguing relationship between three desp...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Connect to PostgreSQL database

In [9]:
# Connection string format: "postgres://[user]:[password]@[location]:[port]/[database]"
db_string = f'postgres://postgres:{config.DB_PW}@127.0.0.1:5432/movie_data'

# Create engine
engine = create_engine(db_string)
engine

Engine(postgres://postgres:***@127.0.0.1:5432/movie_data)

### Load movie data into database

In [12]:
# Create table for movie data
df.to_sql('movies', engine, if_exists='replace')
pd.read_sql('SELECT * FROM movies', engine).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5982 entries, 0 to 5981
Data columns (total 44 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   index                 5982 non-null   int64         
 1   kaggle_id             5982 non-null   int64         
 2   imdb_id               5982 non-null   object        
 3   imdb_link             5982 non-null   object        
 4   wikipedia_url         5982 non-null   object        
 5   poster_path           5982 non-null   object        
 6   title                 5982 non-null   object        
 7   original_title        5982 non-null   object        
 8   tagline               5982 non-null   object        
 9   based_on              5982 non-null   object        
 10  overview              5982 non-null   object        
 11  release_date          5982 non-null   datetime64[ns]
 12  year                  5982 non-null   int64         
 13  runtime           

### Load rating data into database

In [13]:
# Load data in chunks
loaded, chunksize = 0, 1e6
start = datetime.datetime.now()
for chunk in pd.read_csv(data_path + 'raw/ratings.csv', chunksize=chunksize):
    print('Loading rows', loaded, 'to', loaded + chunksize, end=' | ')
    chunk['timestamp'] = pd.to_datetime(chunk['timestamp'], unit='s')
    chunk.to_sql('ratings', engine, if_exists='append')
    loaded += chunksize
    print((datetime.datetime.now() - start), 'elapsed')

Loading rows 0 to 1000000.0 | 0:02:05.423014 elapsed
Loading rows 1000000.0 to 2000000.0 | 0:04:32.170540 elapsed
Loading rows 2000000.0 to 3000000.0 | 0:06:50.403111 elapsed
Loading rows 3000000.0 to 4000000.0 | 0:09:12.970840 elapsed
Loading rows 4000000.0 to 5000000.0 | 0:11:37.039540 elapsed
Loading rows 5000000.0 to 6000000.0 | 0:13:55.987443 elapsed
Loading rows 6000000.0 to 7000000.0 | 0:15:54.089590 elapsed
Loading rows 7000000.0 to 8000000.0 | 0:17:52.754633 elapsed
Loading rows 8000000.0 to 9000000.0 | 0:19:56.805774 elapsed
Loading rows 9000000.0 to 10000000.0 | 0:22:31.368234 elapsed
Loading rows 10000000.0 to 11000000.0 | 0:24:38.865080 elapsed
Loading rows 11000000.0 to 12000000.0 | 0:26:37.714832 elapsed
Loading rows 12000000.0 to 13000000.0 | 0:28:37.783845 elapsed
Loading rows 13000000.0 to 14000000.0 | 0:30:47.192705 elapsed
Loading rows 14000000.0 to 15000000.0 | 0:32:48.827282 elapsed
Loading rows 15000000.0 to 16000000.0 | 0:34:55.346271 elapsed
Loading rows 160000

In [15]:
pd.read_sql('SELECT * FROM ratings', engine).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 5 columns):
 #   Column     Dtype         
---  ------     -----         
 0   index      int64         
 1   userId     int64         
 2   movieId    int64         
 3   rating     float64       
 4   timestamp  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 992.7 MB
