## Compiling Master Data Set

In [None]:
# Import packages
import pandas as pd
import seaborn as sns
import sqlite3 as sq
import matplotlib.pyplot as plt
%matplotlib inline

# Add extra relevant packages below as needed: 
import shutil
import numpy as np
import os
import re

In [None]:
# Load the TMDB movie table.
db = pd.read_csv('zippedData/tmdb.movies.csv.gz')
# Treat empty genre strings as missing values. The result is assigned back
# to the column rather than calling an in-place method on the selection
# db['genre_ids']: with pandas Copy-on-Write that selection is a temporary
# object, so an in-place call on it would leave `db` unchanged.
db['genre_ids'] = db['genre_ids'].replace([''], np.nan)

# Load the TN budget table.
tn_budget_raw = pd.read_csv('zippedData/tn.movie_budgets.csv.gz')

# Combine TN and TMDB: keep only rows where the TN 'movie' title exactly
# equals a TMDB 'original_title'.
result = pd.merge(
    tn_budget_raw,
    db,
    left_on='movie',
    right_on='original_title',
    how='inner',
)
# Drop columns that are not used downstream. The '_x'/'_y' names are the
# suffixes pandas adds to same-named columns from the left/right table.
result.drop(
    columns=['id_x', 'Unnamed: 0', 'genre_ids', 'id_y', 'original_title', 'title'],
    inplace=True,
)

# Unzip the IMDB SQLite database into a working folder
import zipfile
with zipfile.ZipFile("zippedData/im.db.zip", 'r') as zip_ref:
    zip_ref.extractall("temp/IMDB_sql")

db_file = 'temp/IMDB_sql/im.db'

# Open a connection to the extracted SQLite database. The connection is
# closed as soon as the query has been read (even if the query fails) so the
# database file handle is not leaked; reconnect with sq.connect(db_file) if
# further queries are needed.
con = sq.connect(db_file)
try:
    ## Query database for relevant information
    # - LEFT JOIN keeps movies that have no row in movie_ratings.
    # - JOIN movie_akas + region = 'US' keeps movies with a US entry
    #   (presumably `region` is a movie_akas column -- confirm against the
    #   schema); a movie with several US entries yields several rows.
    # - ORDER BY averagerating DESC returns the highest-rated rows first.
    genre_reviews = pd.read_sql(""" SELECT genres, primary_title, start_year

                FROM movie_basics
                LEFT JOIN movie_ratings ON movie_ratings.movie_id = movie_basics.movie_id
                JOIN movie_akas ON movie_akas.movie_id = movie_basics.movie_id
                WHERE start_year >= 2015 and region = 'US'
                ORDER BY averagerating DESC

""", con)
finally:
    con.close()

# Remove rows with a missing value in any of the selected columns
genre_reviews.dropna(inplace=True)

# Build the combined table in one chain:
#   1. inner-join the CSV-derived table to the SQL-derived table where
#      'movie' exactly equals 'primary_title';
#   2. drop the redundant title column and both release-date columns;
#   3. keep only the first row for each 'movie' and renumber the index.
comb_data = (
    result
    .merge(genre_reviews, how='inner', left_on='movie', right_on='primary_title')
    .drop(columns=['primary_title', 'release_date_x', 'release_date_y'])
    .drop_duplicates(subset=['movie'], ignore_index=True)
)

# Convert production budget, domestic gross and worldwide gross from text to
# integers (presumably formatted like "$1,234,567" -- the code strips '$'
# from the ends and removes every ',').
for money_col in ('production_budget', 'domestic_gross', 'worldwide_gross'):
    comb_data[money_col] = (
        comb_data[money_col]
        .str.strip('$')
        # regex=False: the comma is a literal, not a pattern.
        .str.replace(',', '', regex=False)
        # Explicit 64-bit width: plain `int` maps to the platform default
        # integer, which is 32-bit on some platforms and cannot hold values
        # above ~2.147 billion.
        .astype('int64')
    )

# Keep only movies whose worldwide gross is strictly greater than $25,000.
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments below do not act on a slice (which would trigger
# pandas' SettingWithCopyWarning).
comb_data = comb_data[comb_data['worldwide_gross'] > 25000].copy()

# Create movie net columns: gross minus production budget
comb_data['worldwide_net'] = comb_data['worldwide_gross'] - comb_data['production_budget']
comb_data['domestic_net'] = comb_data['domestic_gross'] - comb_data['production_budget']

# Sort by worldwide net, largest first, and renumber the index 0..n-1
comb_data.sort_values(by="worldwide_net", ascending=False, inplace=True, ignore_index=True)

In [None]:
# Build df1: the rows of comb_data whose exact 'genres' value occurs in more
# than 10 rows.
genre_counts = comb_data['genres'].value_counts()
# Look up, for every row, how many rows share its 'genres' value.
is_frequent_genre = comb_data['genres'].map(genre_counts) > 10
df1 = comb_data[is_frequent_genre]

In [None]:
# Display the master data frame: as the last expression of the cell, the
# notebook renders its value (sorted by worldwide net by the earlier cell).
comb_data