In [1]:
# Import packages
import pandas as pd
import seaborn as sb
import sqlite3 as sq

# Add extra relevant packages below as needed: 
import shutil
import numpy as np
import os

# Non-SQL Databases

### TMDB

In [2]:
db = pd.read_csv('zippedData/tmdb.movies.csv.gz')
db['genre_ids'].replace([''], np.nan, inplace=True)

db.head()

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [3]:
genre_ids = db['genre_ids'].tolist()
genre_ids
counter = 0
genre_list = []
for list_ in genre_ids:
    temp_item = genre_ids[counter].strip('][').split(', ')
    genre_list.append(temp_item)
    counter += 1

In [4]:
genres = ['Action','Adventure','Animation','Comedy','Crime','Documentary','Drama','Family','Fantasy','History','Horror','Music','Mystery','Romance','Science Fiction','TV Movie','Thriller','War','Western']
genre_id = [28,12,16,35,80,99,18,10751,14,36,27,10402,9648,10749,878,10770,53,10752,37]

genre_ref = {genres[i]: genre_id[i] for i in range(len(genres))}
genre_ref

{'Action': 28,
 'Adventure': 12,
 'Animation': 16,
 'Comedy': 35,
 'Crime': 80,
 'Documentary': 99,
 'Drama': 18,
 'Family': 10751,
 'Fantasy': 14,
 'History': 36,
 'Horror': 27,
 'Music': 10402,
 'Mystery': 9648,
 'Romance': 10749,
 'Science Fiction': 878,
 'TV Movie': 10770,
 'Thriller': 53,
 'War': 10752,
 'Western': 37}

### TN DB

In [5]:
tn_budget_raw = pd.read_csv('zippedData/tn.movie_budgets.csv.gz')
tn_budget_raw.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


### BOM DB

In [6]:
bom_budget_raw = pd.read_csv('zippedData/bom.movie_gross.csv.gz')
bom_budget_raw.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


### Combining tables

In [7]:
result = pd.merge(tn_budget_raw, db, left_on='movie', right_on='original_title',how='inner')
result.drop(columns=['id_x','Unnamed: 0','genre_ids','id_y','original_title','title'], inplace=True)
result

Unnamed: 0,release_date_x,movie,production_budget,domestic_gross,worldwide_gross,original_language,popularity,release_date_y,vote_average,vote_count
0,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279",en,26.526,2009-12-18,7.4,18676
1,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",en,30.579,2011-05-20,6.4,8571
2,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963",en,44.383,2015-05-01,7.3,13457
3,"Apr 27, 2018",Avengers: Infinity War,"$300,000,000","$678,815,482","$2,048,134,200",en,80.773,2018-04-27,8.3,13948
4,"Nov 17, 2017",Justice League,"$300,000,000","$229,024,295","$655,945,209",en,34.953,2017-11-17,6.2,7510
...,...,...,...,...,...,...,...,...,...,...
2311,"Sep 1, 2015",Exeter,"$25,000",$0,"$489,792",en,5.934,2015-03-26,4.7,121
2312,"Apr 21, 2015",Ten,"$25,000",$0,$0,en,1.575,2014-03-28,5.4,5
2313,"Dec 31, 2014",Dry Spell,"$22,000",$0,$0,en,0.600,2013-02-14,6.0,1
2314,"Jan 4, 2013",All Superheroes Must Die,"$20,000",$0,$0,en,2.078,2013-01-04,3.9,19


# IMDB SQL Database

In [8]:
# Unzip IMDB SQL to new folder repository
import zipfile
with zipfile.ZipFile("zippedData/im.db.zip", 'r') as zip_ref:
    zip_ref.extractall("temp/IMDB_sql")
    
db_file = 'temp/IMDB_sql/im.db'

In [9]:
# Initializing imdb SQL database
# Create a SQL connection to our SQLite database
con = sq.connect(db_file)


## Connection test and reference
genre_reviews = pd.read_sql(""" SELECT genres, primary_title
                
                FROM movie_basics
                LEFT JOIN movie_ratings ON movie_ratings.movie_id = movie_basics.movie_id
                JOIN movie_akas ON movie_akas.movie_id = movie_basics.movie_id
                WHERE start_year >= 2018 and region = 'US'
                ORDER BY averagerating DESC

""", con)

genre_reviews.dropna(inplace = True)
genre_reviews

Unnamed: 0,genres,primary_title
0,Documentary,All Around Us
1,"Documentary,History",Ellis Island: The Making of a Master Race in A...
2,Documentary,Fly High: Story of the Disc Dog
3,Documentary,Hiro's Table
4,"Documentary,Music",RISE: The Story of Augustines
...,...,...
5690,Thriller,The Last Exorcist
5691,Documentary,Conway Pride
5692,"Comedy,Romance",Straight Up
5693,Drama,The Journey Ahead


In [10]:
# DO THINGS
comb_data = pd.merge(result, genre_reviews, left_on='movie', right_on='primary_title',how='inner')
comb_data.drop(columns=['primary_title'], inplace=True)
comb_data.drop_duplicates(subset=['movie'], inplace=True, ignore_index=True)
comb_data.sort_values(by="worldwide_gross", ascending=False, inplace=True, ignore_index=True)


comb_data

Unnamed: 0,release_date_x,movie,production_budget,domestic_gross,worldwide_gross,original_language,popularity,release_date_y,vote_average,vote_count,genres
0,"Aug 17, 2018",Alpha,"$51,000,000","$35,851,379","$99,624,873",en,24.536,2018-08-17,5.9,1167,"Adventure,Drama,Family"
1,"Jun 15, 1994",The Lion King,"$79,300,000","$421,785,283","$986,214,868",en,28.583,1994-06-23,8.2,10160,"Adventure,Animation,Drama"
2,"Sep 14, 2018",A Simple Favor,"$20,000,000","$53,548,586","$97,628,717",en,21.121,2018-09-14,6.6,1756,"Comedy,Crime,Drama"
3,"Apr 13, 2018",Truth or Dare,"$3,500,000","$41,411,015","$95,127,344",en,8.102,2012-08-06,5.6,194,"Horror,Thriller"
4,"Nov 23, 2018",The Favourite,"$15,000,000","$34,366,783","$94,113,929",en,28.651,2018-11-23,7.7,2096,"Biography,Drama,History"
...,...,...,...,...,...,...,...,...,...,...,...
146,"Jul 12, 2019",Crawl,"$17,000,000",$0,$0,en,1.937,2011-10-15,5.6,18,"Action,Horror,Thriller"
147,"Dec 13, 2018",Bird Box,"$19,800,000",$0,$0,en,26.156,2018-11-12,7.0,4777,"Drama,Horror,Sci-Fi"
148,"Dec 31, 2012",After,"$650,000",$0,$0,en,7.712,2012-09-14,5.7,86,"Drama,Romance"
149,"Oct 5, 2010",Rust,"$250,000",$0,$0,en,0.660,2010-10-07,8.0,1,Drama


In [12]:
#remove temporary files
con.close()
shutil.rmtree('temp/')