In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Load the raw data files. Both are gzip-compressed CSVs referenced by relative path,
# so the working directory must be the one that makes '../data' resolve.
df_budgets = pd.read_csv('../data/tn.movie_budgets.csv.gz')
df_basics = pd.read_csv('../data/imdb.title.basics.csv.gz')

In [3]:
def cleanbudgetdata(df):
    """Return a copy of the budgets table with its money columns as integers.

    The three money columns ('production_budget', 'domestic_gross' and
    'worldwide_gross') are expected to hold text such as '$425,000,000'.
    The '$' signs and ',' separators are removed and the values are cast to
    64-bit integers. Every other column is returned untouched (titles keep
    any '$' they contain, and non-money columns keep their dtype), and the
    caller's frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three money columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns; the money columns have dtype int64.

    Raises
    ------
    KeyError
        If one of the money columns is missing.
    ValueError
        If a money value is not a whole number once '$' and ',' are removed.
    """
    money_columns = ['production_budget', 'domestic_gross', 'worldwide_gross']
    # Work on a copy so the raw frame stays available when cells are re-run.
    cleaned = df.copy()
    for column in money_columns:
        cleaned[column] = (
            cleaned[column]
            .astype(str)
            .str.replace(r'[$,]', '', regex=True)
            # Explicit int64: the platform default integer can be 32-bit,
            # and worldwide grosses exceed 2**31 - 1.
            .astype('int64')
        )
    return cleaned
    

In [4]:
# Clean the raw budgets table; the raw frame itself is left as loaded.
df_budgets2 = cleanbudgetdata(df_budgets)
# Show a bounded preview rather than dumping the whole frame into the notebook.
df_budgets2.head(10)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,425000000,760507625,2776345279
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875
2,3,"Jun 7, 2019",Dark Phoenix,350000000,42762350,149762350
3,4,"May 1, 2015",Avengers: Age of Ultron,330600000,459005868,1403013963
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747
5,6,"Dec 18, 2015",Star Wars Ep. VII: The Force Awakens,306000000,936662225,2053311220
6,7,"Apr 27, 2018",Avengers: Infinity War,300000000,678815482,2048134200
7,8,"May 24, 2007",Pirates of the Caribbean: At Worldâs End,300000000,309420425,963420425
8,9,"Nov 17, 2017",Justice League,300000000,229024295,655945209
9,10,"Nov 6, 2015",Spectre,300000000,200074175,879620923


In [11]:
# Show only the first row of the cleaned budgets frame.
df_budgets2.head(1)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,425000000,760507625,2776345279


In [23]:
def manipulatebudgetdata(df, outlier_labels=(5745,)):
    """Add a worldwide ROI column, rank rows by it, and drop known outliers.

    'worldwide_roi' is computed as worldwide_gross / production_budget * 100,
    i.e. gross expressed as a percentage of budget, so a film that exactly
    earns back its budget scores 100 rather than 0. The caller's frame is
    not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric 'worldwide_gross' and 'production_budget' columns.
    outlier_labels : iterable, optional
        Index labels (as they appear in ``df``'s index, before re-indexing)
        of rows to remove. Defaults to the single label 5745, the outlier
        the analysis excludes. Labels that are not present are skipped, so
        the function also works on frames that do not contain them.

    Returns
    -------
    pandas.DataFrame
        New frame sorted by 'worldwide_roi' from highest to lowest, with the
        outlier rows removed and a fresh 0..n-1 index.
    """
    # assign() builds a new frame, so the input does not silently gain a column.
    ranked = df.assign(
        worldwide_roi=(df['worldwide_gross'] / df['production_budget']) * 100
    ).sort_values('worldwide_roi', ascending=False)
    # Labels refer to the original index, so drop before the index is rebuilt.
    ranked = ranked.drop(index=list(outlier_labels), errors='ignore')
    # The old labels are meaningless after sorting and dropping; discard them.
    return ranked.reset_index(drop=True)


    


In [26]:
# Build the ROI-ranked budgets frame from the cleaned data.
df_budgets3 = manipulatebudgetdata(df_budgets2)

# Preview the first rows, which are the ones with the highest 'worldwide_roi'.
df_budgets3.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,worldwide_roi
0,14,"Mar 21, 1980",Mad Max,200000,8750000,99750000,49875.0
1,93,"Sep 25, 2009",Paranormal Activity,450000,107918810,194183034,43151.785333
2,80,"Jul 10, 2015",The Gallows,100000,22764410,41656474,41656.474
3,7,"Jul 14, 1999",The Blair Witch Project,600000,140539099,248300000,41383.333333
4,10,"May 7, 2004",Super Size Me,65000,11529368,22233808,34205.858462


In [28]:
# Create a new variable 'roi_category' to classify levels of ROI (based on quintiles).
# pd.qcut splits 'worldwide_roi' into 5 equal-sized bins; the labels are applied
# in order from the lowest fifth ('low') to the highest fifth ('high').
names = ['low', 'somewhat low', 'moderate', 'somewhat high', 'high']
df_budgets3['roi_category'] = pd.qcut(df_budgets3['worldwide_roi'], 5, labels=names)

In [29]:
# Create a merge key 'movie_year' of the form "<movie> (<year>)" on the budgets side.
# The year is taken as the last four characters of 'release_date'
# (assumes every date ends in a four-digit year, e.g. "Dec 18, 2009" -- verify on the full data).
df_budgets3['movie_year'] = df_budgets3['movie'] + " (" + df_budgets3['release_date'].str[-4:] + ")"

df_budgets3.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,worldwide_roi,roi_category,movie_year
0,14,"Mar 21, 1980",Mad Max,200000,8750000,99750000,49875.0,high,Mad Max (1980)
1,93,"Sep 25, 2009",Paranormal Activity,450000,107918810,194183034,43151.785333,high,Paranormal Activity (2009)
2,80,"Jul 10, 2015",The Gallows,100000,22764410,41656474,41656.474,high,The Gallows (2015)
3,7,"Jul 14, 1999",The Blair Witch Project,600000,140539099,248300000,41383.333333,high,The Blair Witch Project (1999)
4,10,"May 7, 2004",Super Size Me,65000,11529368,22233808,34205.858462,high,Super Size Me (2004)


In [30]:
# Build the matching 'movie_year' key on the IMDB side: "<primary_title> (<start_year>)".
df_basics['movie_year'] = df_basics['primary_title']+" (" + df_basics['start_year'].astype(str)+")"

# Preview the first rows, now including the new key column.
df_basics.head()
    
    

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres,movie_year
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",Sunghursh (2013)
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama",One Day Before the Rainy Season (2019)
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama,The Other Side of the Wind (2018)
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama",Sabse Bada Sukh (2018)
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy",The Wandering Soap Opera (2017)


In [31]:
# Left-join the IMDB 'basics' columns onto the budget rows, matching on the
# 'movie_year' key that was created in both datasets.
# NOTE(review): if several IMDB titles share one 'movie_year', the budget row
# is repeated once per match -- confirm this is acceptable for the analysis.
df_merged_data = pd.merge(df_budgets3, df_basics, how='left', on='movie_year')

# Create a new dataset keeping only complete rows, i.e. rows with no missing
# value in any column.
df_fulldata = df_merged_data.dropna()
df_fulldata.head()




Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,worldwide_roi,roi_category,movie_year,tconst,primary_title,original_title,start_year,runtime_minutes,genres
2,80,"Jul 10, 2015",The Gallows,100000,22764410,41656474,41656.474,high,The Gallows (2015),tt2309260,The Gallows,The Gallows,2015.0,81.0,"Horror,Mystery,Thriller"
25,12,"Jan 6, 2012",The Devil Inside,1000000,53262945,101759490,10175.949,high,The Devil Inside (2012),tt1560985,The Devil Inside,The Devil Inside,2012.0,83.0,Horror
51,65,"Oct 20, 2010",Paranormal Activity 2,3000000,84752907,177512032,5917.067733,high,Paranormal Activity 2 (2010),tt1536044,Paranormal Activity 2,Paranormal Activity 2,2010.0,91.0,Horror
59,49,"Feb 24, 2017",Get Out,5000000,176040665,255367951,5107.35902,high,Get Out (2017),tt5052448,Get Out,Get Out,2017.0,104.0,"Horror,Mystery,Thriller"
65,64,"Oct 21, 2016",Moonlight,1500000,27854931,65245512,4349.7008,high,Moonlight (2016),tt4975722,Moonlight,Moonlight,2016.0,111.0,Drama


In [32]:
# Use the 'id' column as the row index.
# Reassigning instead of inplace=True keeps the step explicit, and the guard
# lets this cell be re-run after 'id' has already been moved into the index
# (set_index would otherwise raise KeyError on the second run).
# NOTE(review): confirm 'id' is unique per film before relying on it as a key.
if "id" in df_fulldata.columns:
    df_fulldata = df_fulldata.set_index("id")

df_fulldata.head()

Unnamed: 0_level_0,release_date,movie,production_budget,domestic_gross,worldwide_gross,worldwide_roi,roi_category,movie_year,tconst,primary_title,original_title,start_year,runtime_minutes,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
80,"Jul 10, 2015",The Gallows,100000,22764410,41656474,41656.474,high,The Gallows (2015),tt2309260,The Gallows,The Gallows,2015.0,81.0,"Horror,Mystery,Thriller"
12,"Jan 6, 2012",The Devil Inside,1000000,53262945,101759490,10175.949,high,The Devil Inside (2012),tt1560985,The Devil Inside,The Devil Inside,2012.0,83.0,Horror
65,"Oct 20, 2010",Paranormal Activity 2,3000000,84752907,177512032,5917.067733,high,Paranormal Activity 2 (2010),tt1536044,Paranormal Activity 2,Paranormal Activity 2,2010.0,91.0,Horror
49,"Feb 24, 2017",Get Out,5000000,176040665,255367951,5107.35902,high,Get Out (2017),tt5052448,Get Out,Get Out,2017.0,104.0,"Horror,Mystery,Thriller"
64,"Oct 21, 2016",Moonlight,1500000,27854931,65245512,4349.7008,high,Moonlight (2016),tt4975722,Moonlight,Moonlight,2016.0,111.0,Drama


In [39]:
# Show a bounded preview rather than dumping the whole frame into the notebook.
df_fulldata.head(10)



Unnamed: 0_level_0,release_date,movie,production_budget,domestic_gross,worldwide_gross,worldwide_roi,roi_category,movie_year,tconst,primary_title,original_title,start_year,runtime_minutes,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
80,"Jul 10, 2015",The Gallows,100000,22764410,41656474,41656.474000,high,The Gallows (2015),tt2309260,The Gallows,The Gallows,2015.0,81.0,"Horror,Mystery,Thriller"
12,"Jan 6, 2012",The Devil Inside,1000000,53262945,101759490,10175.949000,high,The Devil Inside (2012),tt1560985,The Devil Inside,The Devil Inside,2012.0,83.0,Horror
65,"Oct 20, 2010",Paranormal Activity 2,3000000,84752907,177512032,5917.067733,high,Paranormal Activity 2 (2010),tt1536044,Paranormal Activity 2,Paranormal Activity 2,2010.0,91.0,Horror
49,"Feb 24, 2017",Get Out,5000000,176040665,255367951,5107.359020,high,Get Out (2017),tt5052448,Get Out,Get Out,2017.0,104.0,"Horror,Mystery,Thriller"
64,"Oct 21, 2016",Moonlight,1500000,27854931,65245512,4349.700800,high,Moonlight (2016),tt4975722,Moonlight,Moonlight,2016.0,111.0,Drama
18,"May 25, 2012",Chernobyl Diaries,1000000,18119640,42411721,4241.172100,high,Chernobyl Diaries (2012),tt1991245,Chernobyl Diaries,Chernobyl Diaries,2012.0,86.0,"Horror,Mystery,Thriller"
51,"Oct 21, 2011",Paranormal Activity 3,5000000,104028807,207039844,4140.796880,high,Paranormal Activity 3 (2011),tt1778304,Paranormal Activity 3,Paranormal Activity 3,2011.0,83.0,"Horror,Mystery,Thriller"
84,"Oct 3, 2014",Annabelle,6500000,84273813,256862920,3951.737231,high,Annabelle (2014),tt3322940,Annabelle,Annabelle,2014.0,99.0,"Horror,Mystery,Thriller"
15,"Aug 27, 2010",The Last Exorcism,1800000,41034350,70165900,3898.105556,high,The Last Exorcism (2010),tt1320244,The Last Exorcism,The Last Exorcism,2010.0,87.0,"Drama,Horror,Thriller"
56,"Dec 21, 2016",Dangal,9500000,12391761,294654618,3101.627558,high,Dangal (2016),tt5074352,Dangal,Dangal,2016.0,161.0,"Action,Biography,Drama"
