In [8]:
import itertools
import numpy as np
import pandas as pd 
from numbers import Number
import sqlite3
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import statsmodels.api as sm
warnings.filterwarnings('ignore')

In [9]:
# --- Box Office Mojo grosses --------------------------------------------------
gross = pd.read_csv('../Data/bom.movie_gross.csv')

# Fill gaps with plain column assignment. The chained form
# `gross[col].fillna(..., inplace=True)` operates on a temporary Series and is
# not guaranteed to write back into `gross` (pandas copy-on-write).
gross['foreign_gross'] = gross['foreign_gross'].fillna(0)
gross['domestic_gross'] = gross['domestic_gross'].fillna(0)
gross['studio'] = gross['studio'].fillna('N/A')

# .copy() so the column rewrite below works on an independent frame.
gross_90s_on = gross[gross['year'] >= 1990].copy()

# Strip thousands separators from foreign_gross ONLY. Running the replace on
# the whole frame would also delete commas from `title` and `studio`, and a
# title without its commas no longer matches `movie` in the budget table.
gross_90s_on['foreign_gross'] = (
    gross_90s_on['foreign_gross'].replace(',', '', regex=True).astype(np.float64)
)

# Keep only rows that report a positive gross both domestically and abroad.
money_made_gross_90s = gross_90s_on[
    (gross_90s_on['domestic_gross'] > 0) & (gross_90s_on['foreign_gross'] > 0)
]


# --- TMDB movies (loaded as-is) -----------------------------------------------
tmdb = pd.read_csv('../Data/tmdb.movies.csv')


# --- The Numbers budgets ------------------------------------------------------
budget = pd.read_csv('../Data/tn.movie_budgets.csv')

# Compare parsed dates, not raw strings: a string comparison against
# '1990-01-01' is lexicographic and is only meaningful for ISO-formatted text.
# NOTE(review): assumes every release_date value is parseable by
# pd.to_datetime - confirm against the CSV.
released_1990_on = pd.to_datetime(budget['release_date']) >= pd.Timestamp('1990-01-01')
budget_90s_on = budget[released_1990_on].copy()

# Every column from position 3 onward is converted from text with '$' and ','
# to int64 (presumably the money columns - verify against the CSV header).
money_columns = budget_90s_on.columns[3:]
budget_90s_on[money_columns] = (
    budget_90s_on[money_columns].replace(r'[\$,]', '', regex=True).astype(np.int64)
)
money_making_budget_90s_on = budget_90s_on[budget_90s_on['worldwide_gross'] > 0]

# --- Rotten Tomatoes movie info and reviews (loaded as-is) --------------------
movie_info = pd.read_csv('../Data/rt.movie_info.tsv', sep='\t')

reviews = pd.read_csv('../Data/rt.reviews.tsv', sep='\t', encoding='windows-1252')

In [10]:
# Combine the gross table with the budget table on the movie name.
budget_gross_cleaned = pd.merge(
    money_made_gross_90s,
    money_making_budget_90s_on,
    how='outer',
    left_on='title',
    right_on='movie',
)

# The budget table's domestic gross (suffix _y) is the one kept.
budget_gross_cleaned = budget_gross_cleaned.drop('domestic_gross_x', axis=1)

# Only rows that have a budget can get a success ratio. .copy() makes the
# filtered result an independent frame, so the column additions below do not
# write into a slice of the merge output.
budget_gross_cleaned = budget_gross_cleaned[
    budget_gross_cleaned['production_budget'].notna()
].copy()

# Titles used by later cells to filter the IMDB tables. Rows that came only
# from the budget table have NaN in `title` (outer merge), so this list can
# contain NaN entries.
titles = list(budget_gross_cleaned.title)

# Success = gross divided by production budget; a value above 1 means the
# gross exceeded the budget.
budget_gross_cleaned['domestic_success'] = (
    budget_gross_cleaned['domestic_gross_y'] / budget_gross_cleaned['production_budget']
)
budget_gross_cleaned['foreign_success'] = (
    budget_gross_cleaned['foreign_gross'] / budget_gross_cleaned['production_budget']
)
budget_gross_cleaned['WW_success'] = (
    budget_gross_cleaned['worldwide_gross'] / budget_gross_cleaned['production_budget']
)

# Budget tiers: low <= 5M < mid <= 50M < high.
# The upper edge of each tier is inclusive so that a budget of exactly
# 5,000,000 or 50,000,000 lands in a tier instead of being dropped from all
# three; this matches the right-closed pd.cut bins used for `budget_tier`
# elsewhere in this notebook.
low_budget = budget_gross_cleaned[budget_gross_cleaned['production_budget'] <= 5000000]
mid_budget = budget_gross_cleaned[
    (budget_gross_cleaned['production_budget'] > 5000000)
    & (budget_gross_cleaned['production_budget'] <= 50000000)
]
high_budget = budget_gross_cleaned[budget_gross_cleaned['production_budget'] > 50000000]

In [11]:
# Connection to the IMDB SQLite database; reused by the queries below.
conn = sqlite3.connect('../Data/im.db')

# Every rated movie together with its basic metadata.
ratings_query = '''
SELECT *

FROM movie_ratings
    JOIN movie_basics
    USING (movie_id)

'''

# Drop the id/rating columns, then keep only movies whose primary or original
# title appears in `titles`.
imdb_movie_ratings = (
    pd.read_sql(ratings_query, conn)
    .drop(columns=['movie_id', 'averagerating', 'numvotes', 'start_year'])
    .loc[lambda df: df['original_title'].isin(titles) | df['primary_title'].isin(titles)]
)

# Attach the budget/gross figures by primary title and remove the columns that
# are not carried forward.
imdb_ratings_money = (
    imdb_movie_ratings
    .merge(budget_gross_cleaned, left_on='primary_title', right_on='title')
    .drop(columns=['primary_title', 'original_title', 'studio', 'year', 'id', 'release_date'])
)

# Split the IMDB + money table into budget tiers: low <= 5M < mid <= 50M < high.
# The upper edge of each tier is inclusive so a budget of exactly 5,000,000 or
# 50,000,000 belongs to a tier instead of falling between them (this matches
# the right-closed pd.cut bins used for `budget_tier` in this notebook).
# .copy() gives each tier its own frame, because a later cell rewrites the
# `genres` column on these frames in place.
low_budget_ratings = imdb_ratings_money[
    imdb_ratings_money['production_budget'] <= 5000000
].copy()
# "Moneymaker" = worldwide gross exceeded the production budget.
low_budget_ratings_moneymaker = low_budget_ratings[low_budget_ratings['WW_success'] > 1]

mid_budget_ratings = imdb_ratings_money[
    (imdb_ratings_money['production_budget'] > 5000000)
    & (imdb_ratings_money['production_budget'] <= 50000000)
].copy()
mid_budget_ratings_moneymaker = mid_budget_ratings[mid_budget_ratings['WW_success'] > 1]

high_budget_ratings = imdb_ratings_money[
    imdb_ratings_money['production_budget'] > 50000000
].copy()
high_budget_ratings_moneymaker = high_budget_ratings[high_budget_ratings['WW_success'] > 1]

# One row per (person, movie) credit, with the person's name and the movie's
# primary title.
people_query = '''
SELECT p.person_id, p.category, mb.primary_title, pe.primary_name

FROM principals as p
    JOIN movie_basics as mb
    USING(movie_id)
    JOIN persons as pe
    USING (person_id)
    
'''
people = pd.read_sql(people_query, conn)

# Restrict to movies present in `titles`, then attach the budget/gross figures.
people = people.loc[people['primary_title'].isin(titles)]
people_money = people.merge(budget_gross_cleaned, left_on='primary_title', right_on='title')



# Language and region of every alternate title, keyed by the movie's primary
# title.
reg_lang_query = '''
SELECT mak.language, mak.region, mb.primary_title

FROM movie_akas as mak
    JOIN movie_basics as mb
    USING(movie_id)
    
    
'''
reg_lang = pd.read_sql(reg_lang_query, conn)

# Language code -> readable language name.
# 'qbp' and 'qbn' have no readable name here and are mapped to None.
language_dict = {
    'bg': 'Bulgarian', 'he': 'Hebrew', 'fr': 'French', 'tr': 'Turkish',
    'ka': 'Georgian', 'ta': 'Tamil', 'te': 'Telugu', 'en': 'English',
    'sv': 'Swedish', 'hr': 'Croatian', 'sl': 'Slovenian', 'ca': 'Catalan',
    'fa': 'Persian', 'es': 'Spanish', 'cs': 'Czech', 'nl': 'Dutch',
    'sr': 'Serbian', 'sk': 'Slovak', 'et': 'Estonian', 'it': 'Italian',
    'hu': 'Hungarian', 'da': 'Danish', 'de': 'German', 'el': 'Greek',
    'pt': 'Portuguese', 'fi': 'Finnish', 'no': 'Norwegian', 'ru': 'Russian',
    'uk': 'Ukrainian', 'ro': 'Romanian', 'lt': 'Lithuanian', 'cmn': 'Mandarin Chinese',
    'bs': 'Bosnian', 'hi': 'Hindi', 'th': 'Thai', 'ms': 'Malay',
    'ar': 'Arabic', 'qbp': None, 'ml': 'Malayalam', 'yue': 'Cantonese',
    'qbn': None, 'id': 'Indonesian', 'gl': 'Galician', 'ga': 'Irish',
    'mr': 'Marathi', 'pl': 'Polish', 'tl': 'Tagalog', 'vi': 'Vietnamese',
    'eu': 'Basque', 'ja': 'Japanese', 'bn': 'Bengali', 'ur': 'Urdu',
}

# Translate the codes in the `language` column; codes that are not in the
# mapping are left unchanged by replace().
reg_lang = reg_lang.replace({"language": language_dict})

# Attach money figures to the language/region rows: once for all movies, then
# once per budget tier. The join is always primary_title -> title.
reg_lang_money = reg_lang.merge(budget_gross_cleaned, left_on='primary_title', right_on='title')

reg_lang_low_budget = reg_lang.merge(low_budget, left_on='primary_title', right_on='title')
reg_lang_mid_budget = reg_lang.merge(mid_budget, left_on='primary_title', right_on='title')
reg_lang_high_budget = reg_lang.merge(high_budget, left_on='primary_title', right_on='title')

# Hand-picked language lists per tier. Each larger tier's list is the previous
# tier's list followed by additional languages; `+` builds a new list each
# time, so the three lists are independent objects.
low_budget_language = [
    'English', 'Turkish', 'Bulgarian', 'French', 'Hebrew', 'Croatian', 'Swedish',
]
mid_budget_language = low_budget_language + [
    'Persian', 'Catalan', 'Mandarin Chinese', 'Spanish', 'Bosnian', 'Cantonese', 'Hindi',
]
high_budget_language = mid_budget_language + ['Dutch', 'Tamil', 'Serbian']



In [12]:
# Turn each tier's comma-separated `genres` string into a list, in place on
# the existing frames.
# NOTE(review): this rewrites `genres` on the tier frames themselves, so the
# cell is intended to run once per kernel session.
for tier_ratings in (low_budget_ratings, mid_budget_ratings, high_budget_ratings):
    tier_ratings['genres'] = tier_ratings['genres'].str.split(',')

# One row per (movie, genre) for each tier.
low_budget_genres, mid_budget_genres, high_budget_genres = (
    tier_ratings.explode('genres')
    for tier_ratings in (low_budget_ratings, mid_budget_ratings, high_budget_ratings)
)

In [17]:
# Movies whose worldwide gross exceeded their production budget.
# .copy() makes this an independent frame, so adding `budget_tier` below does
# not assign into a filtered slice of budget_gross_cleaned.
WW_cleaned = budget_gross_cleaned[budget_gross_cleaned['WW_success'] > 1].copy()

# Label each movie with its budget tier. pd.cut bins are right-closed by
# default: Low = (-inf, 5M], Medium = (5M, 50M], High = (50M, inf).
WW_cleaned['budget_tier'] = pd.cut(
    WW_cleaned['production_budget'],
    bins=[-float('Inf'), 5000000, 50000000, float('Inf')],
    labels=['Low', 'Medium', 'High'],
)
WW_cleaned.to_csv('../Data/WW_cleaned_tiered.csv')

In [16]:
# Write the cleaned tables to disk, one CSV per frame.
budget_gross_cleaned.to_csv('../Data/success_by_title_cleaned.csv')
low_budget.to_csv('../Data/low_budget_cleaned.csv')
mid_budget.to_csv('../Data/mid_budget_cleaned.csv')
high_budget.to_csv('../Data/high_budget_cleaned.csv')
low_budget_genres.to_csv('../Data/low_budget_genres_cleaned.csv')
mid_budget_genres.to_csv('../Data/mid_budget_genres_cleaned.csv')
# The high-budget genres get their own file. Writing them to
# 'low_budget_genres_cleaned.csv' would overwrite the low-budget export made
# two lines above and leave no high-budget genres file at all.
high_budget_genres.to_csv('../Data/high_budget_genres_cleaned.csv')