In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Initial Analysis to see if foreign success predictions will work

In [141]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
%matplotlib inline

In [136]:
df = pd.DataFrame([
   {
       "x1": 2,
       "genre": "G1,G2"
   },
  {
       "x1": 3,
       "genre": "G1,G3"
   },
   {
       "x1": 4,
       "genre": "G2,G3"
   }])


In [137]:
df

Unnamed: 0,genre,x1
0,"G1,G2",2
1,"G1,G3",3
2,"G2,G3",4


In [142]:
def split_genres(row):
    genres = row['genre'].split(',')
    for genre in genres:
        row[genre] = 1
    return pd.Series(row)

In [143]:
df_new = df.apply(split_genres,axis=1)

In [144]:
df_new

Unnamed: 0,G1,G2,G3,genre,x1
0,1.0,1.0,,"G1,G2",2
1,1.0,,1.0,"G1,G3",3
2,,1.0,1.0,"G2,G3",4


In [260]:
################## Make it this way (the fast way) for the blog post
# load data into list of lists and then put it into pandas
##################

allFiles = glob.glob("/Users/williamcosby/Documents/metis/Project_Luther/movie_*.csv")
movie_data = pd.DataFrame()
list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_,index_col=None, header=0)
    list_.append(df)
movie_data_raw = pd.concat(list_)

In [261]:
# movie_data_2015_2016 = pd.read_csv('movie_data_2015_2016.csv')

In [262]:
movie_data_raw.head()

Unnamed: 0,MOVIE_NAME,DOMESTIC_OPENING,FOREIGN_TOTAL,BUDGET,GENRE,RELEASE_DATE
0,Marvel's The Avengers,207438708,896200000,220000000.0,Action / Adventure,2012-05-04 00:00:00
1,The Dark Knight Rises,160887295,636800000,250000000.0,Action Thriller,2012-07-20 00:00:00
2,The Hunger Games,152535747,286384032,78000000.0,Action / Adventure,2012-03-23 00:00:00
3,Skyfall,88364714,804200736,200000000.0,Action,2012-11-09 00:00:00
4,The Hobbit:An Unexpected Journey,84617303,718100000,0.0,Fantasy,2012-12-14 00:00:00


# Set the index to the movie name (dont need it in the analysis)

In [263]:
movie_data_raw = movie_data_raw.set_index(['MOVIE_NAME'])

# Find number of movies that have foreign gross info

In [264]:
# total number of movies
movie_data_raw.shape

(4256, 5)

In [265]:
# movies with foreign earnings information
movie_data_raw[movie_data_raw['FOREIGN_TOTAL']!=0].shape

(1528, 5)

# Subset to get just the movies with foreign earnings information

In [266]:
movie_data = movie_data_raw[movie_data_raw['FOREIGN_TOTAL']!=0]

In [267]:
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1528 entries, Marvel's The Avengers to Capture the Flag
Data columns (total 5 columns):
DOMESTIC_OPENING    1528 non-null int64
FOREIGN_TOTAL       1528 non-null int64
BUDGET              1527 non-null float64
GENRE               1528 non-null object
RELEASE_DATE        1528 non-null object
dtypes: float64(1), int64(2), object(2)
memory usage: 71.6+ KB


# Start some analysis

## Combine the genres so they arent "action / adventure"

In [268]:
def split_genres(row):
    genres=[]
    if '/' in row['GENRE']:
        genres = row['GENRE'].split('/')
    else:
        genres = row['GENRE'].split()
    for genre in genres:
        # need to strip the whitespace off the columns to avoid things like 'Action' and 'Action '
        row[genre.strip()] = 1
    return pd.Series(row)

In [269]:
movie_data_split = movie_data.apply(split_genres,axis=1)

In [270]:
movie_data_split.head(3)

Unnamed: 0_level_0,Action,Adventure,Animation,BUDGET,Comedy,Concert,Crime,DOMESTIC_OPENING,Documentary,Drama,...,Musical,Period,RELEASE_DATE,Romance,Romantic,Sci-Fi,Sports,Thriller,War,Western
MOVIE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Marvel's The Avengers,1.0,1.0,,220000000.0,,,,207438708,,,...,,,2012-05-04 00:00:00,,,,,,,
The Dark Knight Rises,1.0,,,250000000.0,,,,160887295,,,...,,,2012-07-20 00:00:00,,,,,1.0,,
The Hunger Games,1.0,1.0,,78000000.0,,,,152535747,,,...,,,2012-03-23 00:00:00,,,,,,,


## Now fill in the missing values with 0


In [271]:
movie_data = movie_data_split.fillna(0)

## Need to drop the 'GENRE' column now

In [272]:
movie_data.drop(['GENRE'],axis=1,inplace=True)

In [273]:
movie_data.columns

Index([u'Action', u'Adventure', u'Animation', u'BUDGET', u'Comedy', u'Concert',
       u'Crime', u'DOMESTIC_OPENING', u'Documentary', u'Drama',
       u'FOREIGN_TOTAL', u'Family', u'Fantasy', u'Foreign', u'Historical',
       u'Horror', u'IMAX', u'Music', u'Musical', u'Period', u'RELEASE_DATE',
       u'Romance', u'Romantic', u'Sci-Fi', u'Sports', u'Thriller', u'War',
       u'Western'],
      dtype='object')

In [274]:
movie_data.head(2)

Unnamed: 0_level_0,Action,Adventure,Animation,BUDGET,Comedy,Concert,Crime,DOMESTIC_OPENING,Documentary,Drama,...,Musical,Period,RELEASE_DATE,Romance,Romantic,Sci-Fi,Sports,Thriller,War,Western
MOVIE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Marvel's The Avengers,1.0,1.0,0.0,220000000.0,0.0,0.0,0.0,207438708,0.0,0.0,...,0.0,0.0,2012-05-04 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Dark Knight Rises,1.0,0.0,0.0,250000000.0,0.0,0.0,0.0,160887295,0.0,0.0,...,0.0,0.0,2012-07-20 00:00:00,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Will want to encode the dates as 1st quarter, 2nd quarter, 3rd quarter, 4th quarter
#### Do this by first transforming the column into strings "1","2","3","4", then apply pandas .get_dummies to make it one-hot

In [275]:
from pandas import DatetimeIndex
import dateutil.parser

In [276]:
# turn RELEASE_DATE into actual datetime
movie_data['RELEASE_DATE'] = movie_data['RELEASE_DATE'].apply(lambda x: dateutil.parser.parse(x))

In [277]:
movie_data['RELEASE_DATE'].dtype

dtype('<M8[ns]')

In [278]:
movie_data["RELEASE_QUARTER"] = movie_data["RELEASE_DATE"].apply(lambda x: x.quarter)

In [279]:
# now just want release quarter
movie_data.drop(['RELEASE_DATE'],axis=1,inplace=True)

#### Now vectorize release quarter column

In [280]:
movie_data = pd.get_dummies(movie_data,columns=["RELEASE_QUARTER"])

In [281]:
movie_data.head()

Unnamed: 0_level_0,Action,Adventure,Animation,BUDGET,Comedy,Concert,Crime,DOMESTIC_OPENING,Documentary,Drama,...,Romantic,Sci-Fi,Sports,Thriller,War,Western,RELEASE_QUARTER_1,RELEASE_QUARTER_2,RELEASE_QUARTER_3,RELEASE_QUARTER_4
MOVIE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Marvel's The Avengers,1.0,1.0,0.0,220000000.0,0.0,0.0,0.0,207438708,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
The Dark Knight Rises,1.0,0.0,0.0,250000000.0,0.0,0.0,0.0,160887295,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
The Hunger Games,1.0,1.0,0.0,78000000.0,0.0,0.0,0.0,152535747,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Skyfall,1.0,0.0,0.0,200000000.0,0.0,0.0,0.0,88364714,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
The Hobbit:An Unexpected Journey,0.0,0.0,0.0,0.0,0.0,0.0,0.0,84617303,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Check the data for missing values, and other weird things

## Impute what I can into the raw data

### Check Domestic Openings

In [282]:
len(movie_data[movie_data["DOMESTIC_OPENING"] == 0])

0

## Check Budget 

In [283]:
# woah...missing a lot~~
len (movie_data[movie_data['BUDGET'] == 0])

774

In [284]:
movie_no_budget = movie_data[movie_data['BUDGET'] == 0]

In [285]:
movie_no_budget.head(10)

Unnamed: 0_level_0,Action,Adventure,Animation,BUDGET,Comedy,Concert,Crime,DOMESTIC_OPENING,Documentary,Drama,...,Romantic,Sci-Fi,Sports,Thriller,War,Western,RELEASE_QUARTER_1,RELEASE_QUARTER_2,RELEASE_QUARTER_3,RELEASE_QUARTER_4
MOVIE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Hobbit:An Unexpected Journey,0.0,0.0,0.0,0.0,0.0,0.0,0.0,84617303,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Argo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19458109,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
The Campaign,0.0,0.0,0.0,0.0,1.0,0.0,0.0,26588460,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
Hope Springs,0.0,0.0,0.0,0.0,1.0,0.0,0.0,14650121,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
The Lucky One,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22518358,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Project X,0.0,0.0,0.0,0.0,1.0,0.0,0.0,21051363,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
The Woman in Black,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20874072,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
The Devil Inside,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33732515,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
The Odd Life of Timothy Green,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10822903,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
Beauty and the Beast (3D),0.0,0.0,1.0,0.0,0.0,0.0,0.0,17751905,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


# Note!!!!!! one reason for weird domest/foreign offset is if a movie is released somewhere like korea...it might do super well there but not well in america (which is considered domestic)

# Have option of ignoring films with foreign as the genre

In [286]:
movies_no_domestic = movie_data[movie_data["DOMESTIC_OPENING"] == 0]

In [287]:
movies_no_domestic

Unnamed: 0_level_0,Action,Adventure,Animation,BUDGET,Comedy,Concert,Crime,DOMESTIC_OPENING,Documentary,Drama,...,Romantic,Sci-Fi,Sports,Thriller,War,Western,RELEASE_QUARTER_1,RELEASE_QUARTER_2,RELEASE_QUARTER_3,RELEASE_QUARTER_4
MOVIE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


# Just get rid of entries with no budget info

In [288]:
movie_data = movie_data[movie_data['BUDGET'] != 0]

In [289]:
movie_data.shape

(754, 31)

In [290]:
len(movie_data[movie_data['Foreign'] == 1])

9

# Drop foreign movies because 

In [291]:
movie_data = movie_data[movie_data['Foreign'] == 0]

In [292]:
movie_data.shape

(745, 31)

# Get some plots the data

In [293]:
import sklearn
from sklearn.cross_validation import train_test_split,KFold,cross_val_score
from pandas.tools.plotting import scatter_matrix

### Drop na values (there is apparently 1)

In [294]:
movie_data.dropna(inplace=True)

In [295]:
movie_data.shape

(745, 31)

In [296]:
movie_features,movie_target = movie_data.drop('FOREIGN_TOTAL', axis=1),movie_data['FOREIGN_TOTAL']

## Want to get some plots...write these to a directory
#### currently without the extra actor/director/writer information...

In [298]:
column_names = list(movie_features.columns)

for column in column_names:
    try:
    #     print movie_features[column]
        # create plots of column vs foreign budget

        ## want to rescale budget and domestic fields to make the graphs look better and be 
        ## easier to understand
        if column == 'BUDGET':
            plt.scatter(movie_features[column]/float(1000000),movie_target/float(1000000),alpha=0.2)
        elif column == 'DOMESTIC_OPENING':
            plt.scatter(movie_features[column]/float(1000000),movie_target/float(1000000),alpha=0.2)
        else:
            plt.scatter(movie_features[column],movie_target/float(1000000),alpha=0.2)

        plt.xlabel(column)
        plt.ylabel('foreign gross (millions $)')

        path = '/Users/williamcosby/Documents/metis/Project_Luther/plots/'+str(column)+'.png'
        plt.savefig(path)
        plt.close() 
    except:
        print column
        print movie_features[column]

# Start some modeling
### Split the data into features and targets



In [299]:
from sklearn import linear_model
from sklearn.cross_validation import cross_val_predict, train_test_split

# Training, Testing, and holdout splits

#### nh means "not holdout"

### Make a holdout set from the data

In [300]:
movie_features_nh, movie_features_holdout, movie_target_nh, movie_target_holdout = train_test_split(movie_features,movie_target,
                                                            test_size=0.1)

### Now split the data into the train and test sets (within the non-holdout data) and create the model

In [301]:
movie_features_train, movie_features_test, movie_target_train, movie_target_test = train_test_split(
                                                            movie_features_nh,movie_target_nh,
                                                            test_size=0.3)

### Drop domestic opening

In [302]:
# try dropping domestic opening 
movie_features_train_nodom = movie_features_train.drop(['DOMESTIC_OPENING'],axis=1)
movie_features_test_nodom = movie_features_test.drop(['DOMESTIC_OPENING'],axis=1)

## Train lin reg without domestic opening

In [303]:
lr_nodom = linear_model.LinearRegression()
lr_nodom.fit(movie_features_train_nodom,movie_target_train)
print "vanilla lin reg score: ", lr_nodom.score(movie_features_test_nodom, movie_target_test)

vanilla lin reg score:  0.508054778714


## Train lin reg with domestic opening

In [304]:
lr = linear_model.LinearRegression()
lr.fit(movie_features_train,movie_target_train)
print "vanilla lin reg score: ", lr.score(movie_features_test, movie_target_test)
# sort features by coefficient
sorted_features = sorted(zip(list(movie_features_train.columns),lr.coef_),
                         key=lambda tup: abs(tup[1]),reverse=True) 

for feature in sorted_features:
    print(feature)

vanilla lin reg score:  0.743651819823
('War', -114366952.09262225)
('Western', -45914350.772754051)
('Animation', 42452172.776336581)
('Sports', -31751647.188146755)
('Adventure', -31294011.999237716)
('Comedy', -26084141.373817116)
('Romantic', 25536252.980433028)
('Family', 22309085.463242464)
('Sci-Fi', 21026218.972608313)
('RELEASE_QUARTER_4', 20231531.648331355)
('RELEASE_QUARTER_1', -19476089.107278675)
('Documentary', -17687736.75847102)
('Crime', -16561465.095250443)
('Music', 15384814.817870604)
('Period', -15224393.340719763)
('Horror', -13324281.893274412)
('Thriller', -8848401.9231874179)
('Action', 6686892.2360856012)
('Fantasy', 5171996.8304276522)
('Drama', -4554159.2044115365)
('Romance', -4534384.9249272235)
('RELEASE_QUARTER_3', -2868661.2488414329)
('Historical', 2227352.6883650078)
('RELEASE_QUARTER_2', 2113218.3103773557)
('Concert', 419854.64525326714)
('Musical', -257597.13907351345)
('DOMESTIC_OPENING', 3.23981923609972)
('BUDGET', 1.0241132080554962)
('Foreign

# Run on holdout set

In [307]:
# with domestic opening information
lr.score(movie_features_holdout, movie_target_holdout)

0.80391026298045098

In [308]:
# without domestic opening information
movie_features_holdout_nodom = movie_features_holdout.drop(['DOMESTIC_OPENING'],axis=1)
lr_nodom.score(movie_features_holdout_nodom,movie_target_holdout)

0.48751224064287485

# Random Forest

In [314]:
len(movie_features.columns)

30

In [309]:
from sklearn.ensemble import RandomForestRegressor

In [315]:
rf_nodom = RandomForestRegressor(n_estimators=1000,max_features=25)
rf_nodom.fit(movie_features_train_nodom,movie_target_train)
print "Random Forest result with no domestic information: ", rf_nodom.score(movie_features_test_nodom,movie_target_test)

rf = RandomForestRegressor(n_estimators=1000,max_features=25)
rf.fit(movie_features_train,movie_target_train)
print "Random Forest result with domestic information: ", rf.score(movie_features_test,movie_target_test)



Random Forest result with no domestic information:  0.271716315384
Random Forest result with domestic information:  0.69245249013


In [316]:
rf.feature_importances_

array([  6.50519774e-03,   8.69346969e-03,   9.88065731e-03,
         2.46849085e-01,   3.11734730e-03,   4.35443149e-07,
         3.47573970e-04,   6.80320938e-01,   2.51802435e-05,
         2.00145807e-03,   7.21388467e-04,   2.36440686e-03,
         0.00000000e+00,   1.06729606e-04,   1.27277803e-03,
         0.00000000e+00,   5.33656548e-04,   1.47703437e-03,
         2.74192564e-04,   1.51963814e-03,   4.50932827e-04,
         4.08350187e-03,   9.66856152e-05,   2.01151050e-03,
         1.67941493e-04,   7.85662671e-05,   5.01252555e-03,
         5.82718387e-03,   6.68405622e-03,   9.57592855e-03])

# Gradient Boost

In [318]:
from sklearn.ensemble import GradientBoostingRegressor

In [347]:
grad_boost = GradientBoostingRegressor(n_estimators=100,learning_rate=0.1,alpha=0.3,loss='huber')

grad_boost.fit(movie_features_train,movie_target_train)
grad_boost.score(movie_features_test,movie_target_test)

0.72547271353233156

In [348]:
grad_boost.score(movie_features_holdout, movie_target_holdout)

0.78180295463652605