## DEALING WITH CATEGORICAL FEATURES

#### Read the data

In [None]:
import os

import pandas as pd

def get_movies(aDir):
    """Load every JSON object stored as a file directly inside ``aDir``.

    Each regular file in the directory is parsed with ``json.load``. A parsed
    value is kept only when it is dict-like (has a ``keys`` attribute), so
    files holding a JSON list, string or number are ignored.

    Parameters
    ----------
    aDir : str
        Directory containing one JSON document per file.

    Returns
    -------
    list
        The dict-like documents, ordered by file name.

    Raises
    ------
    OSError / IOError
        If ``aDir`` cannot be listed or a file cannot be opened.
    ValueError
        If a file does not contain valid JSON.
    """
    import os, json

    movie_list = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of anything built from this list reproducible between runs.
    for filename in sorted(os.listdir(aDir)):
        filepath = os.path.join(aDir, filename)

        # Sub-directories cannot be opened as files, so skip them instead
        # of crashing in open().
        if not os.path.isfile(filepath):
            continue

        with open(filepath, 'r') as movie_file:
            movie_data = json.load(movie_file)
        if hasattr(movie_data, "keys"):  # keep JSON objects (dicts) only
            movie_list.append(movie_data)
    return movie_list

# Directory holding one JSON file per movie. The default is a
# machine-specific absolute path; set the BOXOFFICEMOJO_DIR environment
# variable to point the notebook at another copy of the data without
# editing this cell.
data_dir = os.environ.get(
    "BOXOFFICEMOJO_DIR",
    "/Users/mfenner/repos/metis/ct16_cap1_ds4"
    "/project_1/data/boxofficemojo/")

movie_list = get_movies(data_dir)
movies = pd.DataFrame(movie_list)
movies.head()

In [None]:
import pandas as pd
# Rebuild the frame from the already-loaded list of records and preview
# its first five rows.
movies = pd.DataFrame(data=movie_list)
movies.head(n=5)

#### Who are the directors with most movies?

In [None]:
#?pd.Series.value_counts

In [None]:
# value_counts() orders directors by number of movies, most first;
# show the top five.
movies['director'].value_counts().head(5)

#### Let's only take movies of the top directors

In [None]:
# Number of most-prolific directors to keep.
N = 4

# value_counts() sorts by frequency (descending), so the first N index
# labels are the N directors with the most movies in the data.
top_directors = movies.director.value_counts().index[:N]
top_dir_movies = movies[movies['director'].isin(top_directors)]

# print(...) with one pre-formatted string behaves identically on
# Python 2 and Python 3, unlike the bare print statement.
print('%i movies by top %i directors: %s.' % (len(top_dir_movies),
                                              N,
                                              ', '.join(top_directors)))

top_dir_movies.head()

We need to convert each director into a column, where the value is either 1 (directed by our guy) or 0 (not directed by our guy). Pandas has a quick way of handling this.

In [None]:
# ?pd.get_dummies

In [None]:
# Expand the director column into one indicator column per director and
# preview the first five rows.
pd.get_dummies(top_dir_movies['director']).head(n=5)

#### Let's put the dummy variables for director into our data frame

In [None]:
# ?pd.merge

In [None]:
# One indicator column per director. Cast to int: recent pandas versions
# return boolean dummies, and mixing bool with float columns yields an
# object-dtype design matrix when the frame is later handed to the
# regression.
dummies = pd.get_dummies(top_dir_movies['director']).astype(int)

# MEF - note
# if we don't want to rely on magical-order-sameness, we can join on key column or index
# top_dir_movies = pd.merge(top_dir_movies, dummies,
#                           left_index=True,
#                           right_index=True)

# Drop dummy columns left over from a previous run of this cell, so that
# re-running it does not append a second, duplicate set of columns.
kept_columns = [col for col in top_dir_movies.columns
                if col not in dummies.columns]
top_dir_movies = pd.concat([top_dir_movies[kept_columns], dummies], axis=1)
print(len(top_dir_movies))
top_dir_movies.head()

#### Build the model, use dummies among the features in the model

In [None]:
# Predictors: the production budget plus one indicator column per director.
# NOTE(review): if these four directors are exactly the N top directors
# kept earlier, every row has exactly one of the four indicators set, so
# together they duplicate the intercept added by sm.add_constant
# (dummy-variable trap). Consider leaving one director out as the baseline;
# confirm before interpreting individual coefficients.
features = ['production_budget',
            'Steven Spielberg',
            'Woody Allen',
            'Ridley Scott',
            'Joel Schumacher']

# only pick columns you'll use in the model and dropna so we get
# rid of movies without budget info, etc.
related_columns = features + ['domestic_gross']
# Parenthesised single-argument print works on Python 2 and Python 3.
print(related_columns)

In [None]:
# Keep only the modelling columns and drop every movie that has a missing
# value in any of them.
clean_top_dir_movies = top_dir_movies[related_columns].dropna()
# Parenthesised single-argument print works on Python 2 and Python 3.
print('%i movies with all necessary info.' % len(clean_top_dir_movies))

In [None]:
# Ordinary least squares: domestic gross explained by the feature columns,
# with an explicit intercept column added by add_constant.
import statsmodels.api as sm
X = sm.add_constant(clean_top_dir_movies[features])
Y = clean_top_dir_movies['domestic_gross']
director_model = sm.OLS(endog=Y, exog=X).fit()
director_model.summary()

In [None]:
import numpy as np

# mse_resid is the model's residual mean squared error; its square root is
# on the same scale (dollars) as the response, and twice that value is
# reported as a rough 95% error band.
mse = director_model.mse_resid
rmse = np.sqrt(mse)
# Parenthesised single-argument print works on Python 2 and Python 3.
print('2 x Root Mean Squared Error = %g' % (2 * rmse))

Our best model with these features ends up being:

    Gross of movie =  $55M cash guaranteed
                      + 0.3 * budget
                      + $101M * (did Steven Spielberg direct it?)
                      - $42M  * (did Woody Allen direct it?)
                      + $5M   * (did Ridley Scott direct it?)
                      - $17M  * (did Joel Schumacher direct it?)
                      + random noise (+- $162M with 95% confidence) 
                 
                 
However, only Spielberg's effect is statistically significant, so the effects we measured for the other directors may not be real and could simply be artifacts of noise. It does look like Spielberg definitely brings extra value. 

You could say that we cannot meaningfully improve our guess by knowing whether Ridley Scott, Woody Allen, or Joel Schumacher directed a movie. But by knowing that Spielberg directed it, we can increase our prediction of the gross by $101 Million, and this will reduce the error of our prediction. Spielberg does carry valuable information.

## Separating training and test sets

##### Define your X and Y

In [None]:
# Response column first, followed by the three predictors used for the
# train/test exercise.
model_columns = ['domestic_gross', 'opening_weekend_take',
                 'Steven Spielberg', 'Woody Allen']
# Discard every row where any of these columns is missing.
df = top_dir_movies.loc[:, model_columns].dropna()

In [None]:
# Response vector.
Y = df['domestic_gross']
# Design matrix: the three predictors plus an intercept column.
X = sm.add_constant(
    df[['opening_weekend_take', 'Steven Spielberg', 'Woody Allen']]
)

In [None]:
# Preview the first five rows of the design matrix.
X.head(5)

##### scikit-learn has a function to do this split

In [None]:
# train_test_split lives in sklearn.model_selection in current
# scikit-learn; sklearn.cross_validation only exists in old releases,
# so try the new location first and fall back for legacy installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# splits x -> x_train, x_test
#        y -> y_train, y_test
# A fixed random_state makes the split, and every number derived from it,
# reproducible from one run to the next.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.33,
                                                    random_state=42)

# Same "train test" output as before, valid on Python 2 and Python 3.
print('%d %d' % (len(X_train), len(X_test)))

##### Fit model to the training set

In [None]:
# Fit ordinary least squares on the training portion only, then show the
# regression summary table.
model = sm.OLS(endog=Y_train, exog=X_train).fit()
model.summary()

##### Evaluate performance on the test set

In [None]:
from sklearn.metrics import mean_squared_error

# Root mean squared error: the square root of the average squared
# difference between predicted and actual values.
def RMSE(model, X_, Y_):
    """Return the root mean squared error of ``model`` on ``(X_, Y_)``.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X_)``.
    X_ : object
        Features, passed unchanged to ``model.predict``.
    Y_ : array-like
        Actual target values corresponding to ``X_``.

    Returns
    -------
    float
        ``sqrt(mean_squared_error(Y_, model.predict(X_)))``.

    The ``(model, X_, Y_)`` signature also lets this function be passed
    as a ``scoring`` callable, as the cross-validation cell does.
    """
    Y_true = Y_
    Y_pred = model.predict(X_)
    # scikit-learn documents the signature as
    # mean_squared_error(y_true, y_pred); pass the arguments in that order.
    MSE = mean_squared_error(Y_true, Y_pred)
    return np.sqrt(MSE)

# Error on the rows the model was fitted on versus error on held-out rows.
train_RMSE = RMSE(model, X_train, Y_train)
test_RMSE  = RMSE(model, X_test, Y_test)

# Parenthesised single-argument print works on Python 2 and Python 3.
print('Training RMSE is $%.1f Million' % (train_RMSE / 1e6))
print('Test RMSE     is $%.1f Million' % (test_RMSE / 1e6))

### Cross Validation

##### For the same X and Y, instead of creating a single training/test split, let's do a 5-fold cross validation

In [None]:
from sklearn.linear_model import LinearRegression
# cross_val_score lives in sklearn.model_selection in current
# scikit-learn; sklearn.cross_validation only exists in old releases.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# Note: `model` is rebound here from the fitted statsmodels result to an
# unfitted scikit-learn estimator; cross_val_score fits a fresh copy on
# each fold.
model = LinearRegression()
# RMSE is used as the scoring callable, so each entry of RMSE_folds is the
# held-out RMSE of one of the 5 folds.
RMSE_folds = cross_val_score(model, X, Y,
                             cv=5,
                             scoring=RMSE)
for i, error in enumerate(RMSE_folds):
    # Parenthesised single-argument print works on Python 2 and Python 3.
    print('Cross val fold %i: %.1f Million' % (i, error/1e6))