### Run Oscar Bait Analysis Linear Regression Models

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
import statsmodels.formula.api as smf
%matplotlib inline

#### Read in clean movie data

In [2]:
# Load the cleaned movie data set.
# NOTE(review): unpickling can execute arbitrary code -- only load a pickle
# file that was produced by a trusted step of this project.
df = pd.read_pickle("data/movie_data_clean.pickle")

# Audience score minus critic score: positive values mean the audience score
# is higher than the critic score for that title.
df["aud_critic_diff"] = df["tomato_user_meter"] - df["tomato_meter"]

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Number of Records:  %d" % df.shape[0])
df[["title", "tomato_meter", "tomato_user_meter", "aud_critic_diff"]].head(5)

Number of Records:  6797


Unnamed: 0,title,tomato_meter,tomato_user_meter,aud_critic_diff
0,The 40-Year-Old Virgin,85,84,-1
1,Beauty Shop,37,62,25
2,Guess Who,43,49,6
3,Monster-in-Law,16,55,39
4,The Ringer,40,68,28


#### Create dummy variables for missing budget and IMDb data for model

In [3]:
# Build 0/1 indicator columns for missing budget / IMDb data and zero-fill the
# affected predictors, so the indicator coefficient absorbs the "missing"
# effect. Uses .loc because DataFrame.ix is deprecated and removed in newer pandas.
#
# NOTE: this cell overwrites `budget` in place. Re-running it without first
# re-running the load cell would find no nulls left in `budget` and reset
# `budget_null` to all zeros.

# --- budget ---------------------------------------------------------------
missing_budget = df["budget"].isnull()  # capture the mask before zero-filling
df["budget_null"] = 0
df.loc[missing_budget, "budget_null"] = 1
df.loc[missing_budget, "budget"] = 0

# --- IMDb -----------------------------------------------------------------
# Rows without an IMDb id get their critic score and audience/critic gap zeroed.
missing_imdb = df["imdb_id"].isnull()
df["imdb_null"] = 0
df.loc[missing_imdb, "imdb_null"] = 1
df.loc[missing_imdb, "tomato_meter"] = 0
df.loc[missing_imdb, "aud_critic_diff"] = 0

# Sanity check: how many rows fall in each indicator group.
print(df["budget_null"].value_counts(dropna=False))
print(df["imdb_null"].value_counts(dropna=False))

1    5459
0    1338
Name: budget_null, dtype: int64
0    3999
1    2798
Name: imdb_null, dtype: int64


#### Run Model 1

In [4]:
# Model 1: full specification -- genre, runtime, release quarter, critic score,
# audience/critic gap, budget, plus the two missing-data indicators.
formula = (
    "oscar_noms ~ C(genre_group) + runtime + C(release_qtr) + "
    "tomato_meter + aud_critic_diff + imdb_null + budget + budget_null"
)

# Fit ordinary least squares on the prepared frame and display the summary table.
results = smf.ols(formula=formula, data=df).fit()
results.summary()

0,1,2,3
Dep. Variable:,oscar_noms,R-squared:,0.179
Model:,OLS,Adj. R-squared:,0.177
Method:,Least Squares,F-statistic:,78.55
Date:,"Fri, 22 Jul 2016",Prob (F-statistic):,2.97e-232
Time:,16:51:39,Log-Likelihood:,-7260.7
No. Observations:,5767,AIC:,14560.0
Df Residuals:,5750,BIC:,14670.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-1.2353,0.097,-12.740,0.000,-1.425 -1.045
C(genre_group)[T.Comedy],0.0640,0.053,1.207,0.227,-0.040 0.168
C(genre_group)[T.Documentary],-0.1089,0.056,-1.946,0.052,-0.219 0.001
C(genre_group)[T.Drama],0.1697,0.052,3.236,0.001,0.067 0.272
C(genre_group)[T.Family/ Animation],0.0772,0.070,1.099,0.272,-0.061 0.215
C(genre_group)[T.Foreign],-0.0317,0.056,-0.566,0.571,-0.141 0.078
C(genre_group)[T.Horror/ Thriller/ Sci-Fi],0.1163,0.056,2.074,0.038,0.006 0.226
C(genre_group)[T.Other],0.1851,0.068,2.719,0.007,0.052 0.319
C(release_qtr)[T.2],0.0212,0.033,0.644,0.519,-0.043 0.086

0,1,2,3
Omnibus:,6740.437,Durbin-Watson:,1.912
Prob(Omnibus):,0.0,Jarque-Bera (JB):,745737.556
Skew:,6.174,Prob(JB):,0.0
Kurtosis:,57.323,Cond. No.,301000.0


#### Run Model 2 (Exclude Genre)

In [5]:
# Model 2: same specification as Model 1 with the genre term removed.
formula = (
    "oscar_noms ~ runtime + C(release_qtr) + tomato_meter + "
    "aud_critic_diff + imdb_null + budget + budget_null"
)

# Fit ordinary least squares on the prepared frame and display the summary table.
results = smf.ols(formula=formula, data=df).fit()
results.summary()

0,1,2,3
Dep. Variable:,oscar_noms,R-squared:,0.169
Model:,OLS,Adj. R-squared:,0.167
Method:,Least Squares,F-statistic:,129.8
Date:,"Fri, 22 Jul 2016",Prob (F-statistic):,2.1300000000000003e-223
Time:,16:51:49,Log-Likelihood:,-7302.2
No. Observations:,5772,AIC:,14620.0
Df Residuals:,5762,BIC:,14690.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-1.0976,0.080,-13.700,0.000,-1.255 -0.941
C(release_qtr)[T.2],0.0209,0.033,0.633,0.527,-0.044 0.085
C(release_qtr)[T.3],-0.0264,0.033,-0.792,0.429,-0.092 0.039
C(release_qtr)[T.4],0.2652,0.035,7.611,0.000,0.197 0.333
runtime,0.0049,0.001,9.265,0.000,0.004 0.006
tomato_meter,0.0171,0.001,20.562,0.000,0.015 0.019
aud_critic_diff,0.0097,0.001,8.676,0.000,0.008 0.012
imdb_null,0.8927,0.056,15.940,0.000,0.783 1.002
budget,2.268e-06,7.17e-07,3.161,0.002,8.61e-07 3.67e-06

0,1,2,3
Omnibus:,6785.462,Durbin-Watson:,1.896
Prob(Omnibus):,0.0,Jarque-Bera (JB):,758006.833
Skew:,6.237,Prob(JB):,0.0
Kurtosis:,57.738,Cond. No.,190000.0


#### Run Model 3 (Exclude Budget)

In [6]:
# Model 3: keeps genre but drops `budget` and its missing-data indicator.
# `genre_group` appears here without the C() wrapper used in Model 1; the
# [T.<level>] terms in the summary show it is still expanded as a categorical.
formula = (
    "oscar_noms ~ genre_group + runtime + C(release_qtr) + "
    "tomato_meter + aud_critic_diff + imdb_null"
)

# Fit ordinary least squares on the prepared frame and display the summary table.
results = smf.ols(formula=formula, data=df).fit()
results.summary()

0,1,2,3
Dep. Variable:,oscar_noms,R-squared:,0.16
Model:,OLS,Adj. R-squared:,0.158
Method:,Least Squares,F-statistic:,78.27
Date:,"Fri, 22 Jul 2016",Prob (F-statistic):,2.15e-205
Time:,16:51:54,Log-Likelihood:,-7327.9
No. Observations:,5767,AIC:,14690.0
Df Residuals:,5752,BIC:,14790.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-1.3775,0.090,-15.234,0.000,-1.555 -1.200
genre_group[T.Comedy],-0.0042,0.053,-0.079,0.937,-0.107 0.099
genre_group[T.Documentary],-0.2506,0.055,-4.552,0.000,-0.358 -0.143
genre_group[T.Drama],0.0749,0.052,1.447,0.148,-0.027 0.176
genre_group[T.Family/ Animation],0.1001,0.071,1.409,0.159,-0.039 0.239
genre_group[T.Foreign],-0.1619,0.055,-2.938,0.003,-0.270 -0.054
genre_group[T.Horror/ Thriller/ Sci-Fi],0.0790,0.056,1.409,0.159,-0.031 0.189
genre_group[T.Other],0.1440,0.069,2.100,0.036,0.010 0.279
C(release_qtr)[T.2],0.0174,0.033,0.524,0.601,-0.048 0.083

0,1,2,3
Omnibus:,6834.634,Durbin-Watson:,1.881
Prob(Omnibus):,0.0,Jarque-Bera (JB):,793279.603
Skew:,6.314,Prob(JB):,0.0
Kurtosis:,59.052,Cond. No.,1330.0
