In this notebook, we explore different techniques to optimize hyperparameters working with
an IMDb dataset and trying to predict the score each movie received. 

In [144]:
# Data analysis imports
import pandas as pd
import numpy as np

# Machine learning imports
import xgboost as xgb
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score

# Hyperopt imports
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Plotting imports
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline

In [169]:
# Some constants
SEED = 100
TEST_SIZE = 0.2
MAX_EVALS = 100

In [146]:
# Some utility functions
def compute_rmse(model, features, targets):
    prediction = model.predict(features)
    rmse = np.sqrt(np.mean((prediction - targets) ** 2))
    return rmse

def train_grid_search(cv_parameters, features, targets):
    xgb_regressor = xgb.XGBRegressor({'eval_score':'rmse'})
    grid_search = GridSearchCV(xgb_regressor, cv_parameters, cv=5, verbose=1, n_jobs=4)
    grid_search.fit(features, targets)
    return grid_search

# Load processed data

In [25]:
imdb_df = pd.read_csv('../data/processed_movie_metadata.csv')

In [26]:
imdb_df.head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,num_voted_users,num_user_for_reviews,title_year,actor_2_facebook_likes,imdb_score,...,Not Rated,PG,PG-13,Passed,R,TV-14,TV-G,TV-PG,Unrated,X
0,723.0,178.0,0.0,855.0,1000.0,886204,3054.0,2009.0,936.0,7.9,...,0,0,1,0,0,0,0,0,0,0
1,302.0,169.0,563.0,1000.0,40000.0,471220,1238.0,2007.0,5000.0,7.1,...,0,0,1,0,0,0,0,0,0,0
2,602.0,148.0,0.0,161.0,11000.0,275868,994.0,2015.0,393.0,6.8,...,0,0,1,0,0,0,0,0,0,0
3,813.0,164.0,22000.0,23000.0,27000.0,1144337,2701.0,2012.0,23000.0,8.5,...,0,0,1,0,0,0,0,0,0,0
4,462.0,132.0,475.0,530.0,640.0,212204,738.0,2012.0,632.0,6.6,...,0,0,1,0,0,0,0,0,0,0


# Train - test split 

In [147]:
train_df, test_df = train_test_split(imdb_df, test_size=TEST_SIZE, random_state=SEED)

In [148]:
train_features = train_df.drop('imdb_score', axis=1)
train_targets = train_df.loc[:, 'imdb_score']
test_features = test_df.drop('imdb_score', axis=1)
test_targets = test_df.loc[:, 'imdb_score']

# Naive grid search 

In [149]:
naive_cv_parameters = {'max_depth':[4, 6, 8, 10],
                 'n_estimators': [10, 15, 20, 25],
                 'learning_rate': [0.2, 0.4, 0.6, 0.8], 
                 'gamma': [0.2, 0.4, 0.6, 0.8]
}

In [150]:
naive_gs = train_grid_search(naive_cv_parameters, train_features, train_targets)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    7.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   50.9s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  3.3min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  5.0min
[Parallel(n_jobs=4)]: Done 1280 out of 1280 | elapsed:  5.2min finished


In [151]:
compute_rmse(naive_gs, test_features, test_targets)

0.68653962371567878

In [152]:
compute_rmse(naive_gs, train_features, train_targets)

0.52874754923578193

In [199]:
naive_gs.best_score_

0.5899843789082917

In [200]:
expert_gs

0.610430915628532

# Expert grid search

This grid search is based on the recommmendations of the following article: 

In [153]:
expert_cv_parameters = {'max_depth':[4, 6, 10, 15],
                 'n_estimators': [10, 50, 100, 500],
                 'learning_rate': [0.01, 0.025, 0.05, 0.1],
                 'gamma': [0.05, 0.5, 0.9, 1.]
}

In [154]:
expert_gs = train_grid_search(expert_cv_parameters, train_features, train_targets)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   55.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  7.7min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed: 18.6min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed: 34.3min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed: 55.0min
[Parallel(n_jobs=4)]: Done 1280 out of 1280 | elapsed: 57.9min finished


In [155]:
compute_rmse(expert_gs, train_features, train_targets)

0.29533407358232083

In [156]:
compute_rmse(expert_gs.best_estimator_, test_features, test_targets)

0.66304732147221468

In [157]:
expert_gs.best_params_

{'gamma': 0.05, 'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 500}

In [158]:
naive_gs.best_params_

{'gamma': 0.6, 'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 25}

# Hyperopt

## Link to the theory:
* score => f(x)
* optimize => defines the hyperparameters space and the optimization strategy (here TPE) 

In [170]:
def score(params):
    params["gamma"] = np.log(params["gamma"])
    params["learning_rate"] = np.log(params["learning_rate"])
    params["n_estimators"] = int(params["n_estimators"])
    params["max_depth"] = int(params["max_depth"])
    xgb_regressor = xgb.XGBRegressor(silent=False, **params)
    score = cross_val_score(xgb_regressor, train_features, train_targets, 
                            cv=5, verbose=0, 
                            n_jobs=4).mean()
    # Try - score instead of 1 - score
    loss = - score
    return {'loss': loss, 
            'status': STATUS_OK}

In [171]:
def optimize(trials):
    space = {'n_estimators' : hp.quniform('n_estimators', 10, 1000, 1),
             'learning_rate' : hp.loguniform('learning_rate', 0.01, 1),
             'max_depth' : hp.quniform('max_depth', 3, 15, 1),
             'gamma': hp.loguniform('gamma', 0.01, 1)}
    best = fmin(score, space, algo=tpe.suggest, 
                trials=trials, 
                max_evals=MAX_EVALS)
    return best

In [172]:
trials = Trials()
optimal_param = optimize(trials)

## Get CV score for Hyperopt tunned model

In [195]:
hyperopt_cv_score = min(abs(t['result']['loss']) for t in trials.trials)

In [174]:
params = {'max_depth': int(optimal_param['max_depth']),
                 'n_estimators':  int(optimal_param['n_estimators']),
               'learning_rate': np.log(optimal_param['learning_rate']),
                'gamma': np.log(optimal_param['gamma'])
                }
params = {'gamma': 1.0, 'learning_rate': 0.025, 'max_depth': 15, 'n_estimators': 500}
xgb_regressor = xgb.XGBRegressor(**params)
xgb_regressor.fit(train_features, train_targets)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=1.0, learning_rate=0.025, max_delta_step=0, max_depth=15,
       min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [32]:
# Max evals 100 , 4 ** 4 (256)
# Doesn't mean less trainng time 

In [None]:
# Show selected features 

# Results

In [209]:
configurations = {'model':['naive_grid', 'expert_grid', 'hyperopt_tpe'], 
                  'hyperparameters': [naive_gs.best_params_, expert_gs.best_params_, params]}

In [215]:
def get_model_results(hyperparameters):
    xgb_regressor = xgb.XGBRegressor(**hyperparameters)
    cv_scores = cross_val_score(xgb_regressor, train_features, train_targets, cv=5, verbose=0, n_jobs=4)
    print cv_scores.mean()
    xgb_regressor.fit(train_features, train_targets)
    train_score = compute_rmse(xgb_regressor, train_features, train_targets)
    test_score = compute_rmse(xgb_regressor, test_features, test_targets)
    return {'hyperparameters': hyperparameters,
            'cv_scores': cv_scores, 
            'train_score': train_score, 
            'test_score': test_score}

In [216]:
results = [get_model_results(hp) for hp in configurations['hyperparameters']] 

0.589986767364
0.610438233751
0.612212670721


In [218]:
results_df = pd.DataFrame(results)

In [221]:
results_df.to_csv('/tmp/hyperopt_imdb.csv', sep=';')