In [394]:
import pandas as pd
import numpy as np
from ast import literal_eval
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from fuzzywuzzy import fuzz
import fuzzymatcher

In [395]:
df1 = pd.read_csv('data/mojo_pg1.csv')
df2 = pd.read_csv('data/mojo_pg2.csv')
df3 = pd.read_csv('data/mojo_pg3.csv')
df4 = pd.read_csv('data/mojo_pg4.csv')
df5 = pd.read_csv('data/mojo_pg5.csv')

In [526]:
#Create dummy variables for genres
genres_list = ['Action','Adventure','Animation','Biography','Comedy','Crime','Drama','Fantasy','Family','History','Horror','Musical','Mystery','Romance','Sci-Fi','Sport','Thriller','Western']

for i in genres_list:
    df_all[i] = df_all['genres'].apply(lambda x : 1 if i in x else 0)
    
df_all['genre_top10'] = np.where((df_all['Biography'] == 1) | (df_all['Crime'] == 1) | 
                             (df_all['History'] == 1) |
                             (df_all['Horror'] == 1) |
                             (df_all['Musical'] == 1) |
                             (df_all['Mystery'] == 1) |
                             (df_all['Sport'] == 1) |
                             (df_all['Western'] == 1), 0, 1)

df_all['genre_top3'] = np.where((df_all['Action'] == 1) |
                             (df_all['Adventure'] == 1) |
                             (df_all['Comedy'] == 1), 1, 0)

In [527]:
#Rebucket domestic distributors to group smaller distributors
df_all['domestic_distributor'] = df_all['domestic_distributor'].replace(np.nan,'Universal Pictures')
dist_counts = df_all['domestic_distributor'].value_counts()
other_dist = list(dist_counts[dist_counts <= 50].index)

df_all['domestic_dist_processed'] = df_all['domestic_distributor'].replace(other_dist, 'Other')
df_all['domestic_dist_processed'].value_counts()


df_all['dd_top5'] = np.where((df_all['domestic_dist_processed'] == 'Other') | (df_all['domestic_dist_processed'] == 'Sony Pictures Entertainment (SPE)') | 
                             (df_all['domestic_dist_processed'] == 'Paramount Pictures') |
                             (df_all['domestic_dist_processed'] == 'Universal Pictures'), 0, 1)

In [528]:
domestic_distributors_list = df_all['domestic_dist_processed'].unique().tolist()

In [529]:
# Import and clean budget data for merging
df_budget = pd.read_csv('data/budget-all.csv')
df_budget.rename(columns = {'title': 'movie_title'},inplace=True)

In [530]:
#Fuzzy merge tables to match on movie title
merged_df = fuzzymatcher.fuzzy_left_join(df_all,
                                            df_budget,
                                            left_on,
                                            right_on,
                                            left_id_col='movie_title',
                                            right_id_col='movie_title')

In [531]:
#Replace values of budget column based on accuracy of match
merged_df['budget_left'] = np.where((merged_df['best_match_score'] > 0) & (merged_df['budget_left'].isnull()), 
                                    merged_df['budget_right'], merged_df['budget_left'])

In [532]:
#Rename columns
merged_df.rename(columns = {'movie_title_left': 'movie_title', 'budget_left':'budget'},inplace=True)

In [533]:
#Drop unnecessary columns
merged_df.drop(columns=['best_match_score','__id_left', '__id_right','Unnamed: 0','Unnamed: 0.1','movie_title_right','budget_right','domestic_gross','worldwide_gross','release', 'rank_all_movies', 'link_stub'],inplace=True)

In [534]:
#Change date string to datetime
merged_df['year'] = pd.to_datetime(merged_df['earliest_release'])
merged_df.head()

Unnamed: 0,movie_title,domestic_distributor,budget,domestic_opening_sales,earliest_release,rating,runtime_minutes,genres,summary_sales,crew_list,...,Romance,Sci-Fi,Sport,Thriller,Western,domestic_dist_processed,dd_top5,genre_top10,genre_top3,year
0,Star Wars: Episode VII,Walt Disney Studios Motion Pictures,245000000.0,247966675.0,2015-12-16,PG-13,138.0,"['Action', 'Adventure', 'Sci-Fi']","[['Domestic', 936662225], ['International', 11...","['J.J. Abrams', 'Lawrence Kasdan', 'J.J. Abram...",...,0,1,0,0,0,Walt Disney Studios Motion Pictures,1,1,1,2015-12-16
1,Avengers: Endgame,Walt Disney Studios Motion Pictures,356000000.0,357115007.0,2019-04-24,PG-13,181.0,"['Action', 'Adventure', 'Drama', 'Sci-Fi']","[['Domestic', 858373000], ['International', 19...","['Anthony Russo', 'Joe Russo', 'Christopher Ma...",...,0,1,0,0,0,Walt Disney Studios Motion Pictures,1,1,1,2019-04-24
2,Avatar,Twentieth Century Fox,237000000.0,77025481.0,2009-12-16,PG-13,162.0,"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']","[['Domestic', 760507625], ['International', 20...","['James Cameron', 'James Cameron', 'James Came...",...,0,1,0,0,0,Twentieth Century Fox,1,1,1,2009-12-16
3,Black Panther,Walt Disney Studios Motion Pictures,200000000.0,202003951.0,2018-02-13,PG-13,134.0,"['Action', 'Adventure', 'Sci-Fi']","[['Domestic', 700426566], ['International', 64...","['Ryan Coogler', 'Ryan Coogler', 'Joe Robert C...",...,0,1,0,0,0,Walt Disney Studios Motion Pictures,1,1,1,2018-02-13
4,Avengers: Infinity War,Walt Disney Studios Motion Pictures,300000000.0,257698183.0,2018-04-25,PG-13,149.0,"['Action', 'Adventure', 'Sci-Fi']","[['Domestic', 678815482], ['International', 13...","['Anthony Russo', 'Joe Russo', 'Christopher Ma...",...,0,1,0,0,0,Walt Disney Studios Motion Pictures,1,1,1,2018-04-25


In [535]:
# Drop years before 2000 for more complete data / more fair comparison
at_2000 = merged_df[(merged_df['year'].dt.year >= 2000)]

In [536]:
# Fill remaining empty budget values
for index in at_2000.index:
    if at_2000.loc[index,'movie_title']=='Over the Hedge':
        at_2000.loc[index,'budget'] = 80000000
    elif at_2000.loc[index,'movie_title']=='Marley & Me':
        at_2000.loc[index,'budget'] = 60000000
    elif at_2000.loc[index,'movie_title']=='Traffic':
        at_2000.loc[index,'budget'] = 48000000
    elif at_2000.loc[index,'movie_title']=='Hairspray':
        at_2000.loc[index,'budget'] = 75000000
    elif at_2000.loc[index,'movie_title']=='How to Lose a Guy in 10 Days':
        at_2000.loc[index,'budget'] = 50000000
    elif at_2000.loc[index,'movie_title']=='Dreamgirls':
        at_2000.loc[index,'budget'] = 75000000
    elif at_2000.loc[index,'movie_title']=='Gnomeo & Juliet':
        at_2000.loc[index,'budget'] = 36000000
    elif at_2000.loc[index,'movie_title']=='Meet the Robinsons':
        at_2000.loc[index,'budget'] = 150000000
    elif at_2000.loc[index,'movie_title']=='Beverly Hills Chihuahua':
        at_2000.loc[index,'budget'] = 20000000
    elif at_2000.loc[index,'movie_title']=="He's Just Not That Into You":
        at_2000.loc[index,'budget'] = 40000000
    elif at_2000.loc[index,'movie_title']=='Space Station 3D':
        at_2000.loc[index,'budget'] = 1000000
    elif at_2000.loc[index,'movie_title']=='John Wick: Chapter 2':
        at_2000.loc[index,'budget'] = 40000000
    elif at_2000.loc[index,'movie_title']=='The Imitation Game':
        at_2000.loc[index,'budget'] = 14000000
    elif at_2000.loc[index,'movie_title']=='Save the Last Dance':
        at_2000.loc[index,'budget'] = 13000000
    elif at_2000.loc[index,'movie_title']=='The Game Plan':
        at_2000.loc[index,'budget'] = 22000000
    elif at_2000.loc[index,'movie_title']=='Madea Goes to Jail':
        at_2000.loc[index,'budget'] = 17500000
    elif at_2000.loc[index,'movie_title']=='Flightplan':
        at_2000.loc[index,'budget'] = 55000000
    elif at_2000.loc[index,'movie_title']=='Brother Bear':
        at_2000.loc[index,'budget'] = 46000000
    elif at_2000.loc[index,'movie_title']=='The Santa Clause 3: The Escape Clause':
        at_2000.loc[index,'budget'] = 12000000
    elif at_2000.loc[index,'movie_title']=='Scooby':
        at_2000.loc[index,'budget'] = 84000000
    elif at_2000.loc[index,'movie_title']=='Smallfoot':
        at_2000.loc[index,'budget'] = 80000000
    elif at_2000.loc[index,'movie_title']=='The Descendants':
        at_2000.loc[index,'budget'] = 35000000
    elif at_2000.loc[index,'movie_title']=='Cheaper by the Dozen 2':
        at_2000.loc[index,'budget'] = 60000000
    elif at_2000.loc[index,'movie_title']=='Bridge to Terabithia':
        at_2000.loc[index,'budget'] = 20000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [537]:
for index in at_2000.index:
    if at_2000.loc[index,'movie_title']=='Get Out':
        at_2000.loc[index,'domestic_opening_sales'] = 34000000

In [538]:
at_2000['budget'] = at_2000['budget'].astype(int)
at_2000['domestic_opening_sales'] = at_2000['domestic_opening_sales'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at_2000['budget'] = at_2000['budget'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at_2000['domestic_opening_sales'] = at_2000['domestic_opening_sales'].astype(int)


In [539]:
at_2000.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 723 entries, 0 to 12811
Data columns (total 37 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   movie_title              723 non-null    object        
 1   domestic_distributor     723 non-null    object        
 2   budget                   723 non-null    int64         
 3   domestic_opening_sales   723 non-null    int64         
 4   earliest_release         723 non-null    object        
 5   rating                   705 non-null    object        
 6   runtime_minutes          708 non-null    float64       
 7   genres                   723 non-null    object        
 8   summary_sales            723 non-null    object        
 9   crew_list                723 non-null    object        
 10  cast_list                723 non-null    object        
 11  domestic_sales           588 non-null    object        
 12  international_sales      588 non-n

In [540]:
at_2000.to_csv('data/final_data_first.csv')

In [541]:
at_2000.groupby('Western').count()

Unnamed: 0_level_0,movie_title,domestic_distributor,budget,domestic_opening_sales,earliest_release,rating,runtime_minutes,genres,summary_sales,crew_list,...,Mystery,Romance,Sci-Fi,Sport,Thriller,domestic_dist_processed,dd_top5,genre_top10,genre_top3,year
Western,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,716,716,716,716,716,698,701,716,716,716,...,716,716,716,716,716,716,716,716,716,716
1,7,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7


In [494]:
action = 314
adventure = 360
animation = 118
#bio = 36
comedy = 32
#crime = 89
drama = 205
fantasy = 192
family 193
#history = 21
#horror = 37
#musical = 31
#mystery = 69
romance = 104
sci-fi = 178
#sport = 23
thriller = 175
#western = 7


SyntaxError: invalid syntax (<ipython-input-494-1327bb25b66a>, line 9)