In [14]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
% matplotlib inline
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from os import path as osp
from sklearn.grid_search import GridSearchCV

In [5]:
SEED = 0
TRAIN_TEST_SPLIT = 0.2
np.random.seed(SEED) # Set seed for reproducibility

# Load processed data

In [8]:
imdb_df = pd.read_csv('../data/processed_movie_metadata.csv')
imdb_df.head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,num_voted_users,num_user_for_reviews,title_year,actor_2_facebook_likes,imdb_score,...,Not Rated,PG,PG-13,Passed,R,TV-14,TV-G,TV-PG,Unrated,X
0,723.0,178.0,0.0,855.0,1000.0,886204,3054.0,2009.0,936.0,7.9,...,0,0,1,0,0,0,0,0,0,0
1,302.0,169.0,563.0,1000.0,40000.0,471220,1238.0,2007.0,5000.0,7.1,...,0,0,1,0,0,0,0,0,0,0
2,602.0,148.0,0.0,161.0,11000.0,275868,994.0,2015.0,393.0,6.8,...,0,0,1,0,0,0,0,0,0,0
3,813.0,164.0,22000.0,23000.0,27000.0,1144337,2701.0,2012.0,23000.0,8.5,...,0,0,1,0,0,0,0,0,0,0
4,462.0,132.0,475.0,530.0,640.0,212204,738.0,2012.0,632.0,6.6,...,0,0,1,0,0,0,0,0,0,0


# Train - test split

In [3]:
shuffle_index = np.random.permutation(imdb_df.shape[0])
train_df = imdb_df.loc[shuffle_index[:int(len(shuffle_index) * train_test_split)], :]
test_df = data.loc[shuffle_index[int(len(shuffle_index) * train_test_split):], :]

In [None]:
train_df, test_df = train_test_split()

In [4]:
train_features = train_df.drop('imdb_score', axis=1)
train_targets = train_df.loc[:, 'imdb_score']
test_features = test_df.drop('imdb_score', axis=1)
test_targets = test_df.loc[:, 'imdb_score']

In [6]:
def get_training_error(model):
    prediction = model.predict(train_features)
    error = np.mean((prediction - train_targets) ** 2)
    return error

In [7]:
def evaluate_on_test_set(model):
    prediction = model.predict(test_features)
    error = np.mean((prediction - test_targets) ** 2)
    return error

# Naive grid search 

In [10]:
# naive search
naive_cv_parameters = {'max_depth':[4, 6, 8, 10],
                 'n_estimators': [10, 15, 20, 25],
                 'learning_rate': [0.2, 0.4, 0.6, 0.8], 
                 'gamma': [0.2, 0.4, 0.6, 0.8]
}

In [12]:
xgb_regressor = xgb.XGBRegressor({'eval_score':'rmse'})
naive_gs = GridSearchCV(xgb_regressor, naive_cv_parameters, cv=5, verbose=1, n_jobs=4)

In [13]:
naive_gs.fit(train_features, train_targets)

NameError: name 'train_features' is not defined

# Expert grid search

In [24]:
expert_cv_parameters = {'max_depth':[4, 6, 10, 15],
                 'n_estimators': [10, 50, 100, 500],
                 'learning_rate': [0.01, 0.025, 0.05, 0.1],
                 'gamma': [0.05, 0.5, 0.9, 1.]
}

In [25]:
xgb_regressor = xgb.XGBRegressor({'eval_score':'rmse'})
expert_gs = GridSearchCV(xgb_regressor, cv_parameters, cv=5, verbose=1, n_jobs=4)

In [26]:
gs.fit(train_features, train_targets)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   24.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  2.6min
[Parallel(n_jobs=4)]: Done 320 out of 320 | elapsed:  4.4min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0,
       max_depth={'eval_score': 'rmse'}, min_child_weight=1, missing=None,
       n_estimators=100, nthread=-1, objective='reg:linear', reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'n_estimators': [10, 50, 100, 500], 'learning_rate': [0.01, 0.025, 0.05, 0.1], 'max_depth': [4, 6, 10, 15]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=1)

In [27]:
gs.best_params_

{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100}

In [28]:
- gs.best_score_

0.50060425913595474

In [29]:
get_training_error(gs.best_estimator_)

0.17289026745140981

In [30]:
evaluate_on_test_set(gs.best_estimator_)

0.57769943223165821

# Hyperopt

In [31]:
def score(params):
    cv_parameters = {'max_depth':[int(params['max_depth'])],
                     'n_estimators': [int(params['n_estimators'])],
                     'learning_rate': [np.log(params['learning_rate'])],
                     'gamma': [np.log(params['gamma'])]
                    }
    xgb_regressor = xgb.XGBRegressor({'eval_score':'rmse'})
    gs = GridSearchCV(xgb_regressor, cv_parameters, cv=5, verbose=0, n_jobs=4)
    gs.fit(train_features, train_targets)
    loss = - gs.best_score_
    return {'loss': loss, 
            'status': STATUS_OK}

In [32]:
def optimize(trials):
    space = {
             'n_estimators' : hp.quniform('n_estimators', 10, 1000, 1),
             'learning_rate' : hp.loguniform('learning_rate', 0.01, 1),
             'max_depth' : hp.quniform('max_depth', 3, 15, 1),
             'gamma': hp.loguniform('gamma', 0.01, 1),
             'eval_score': 'rmse',
             'objective': 'reg:linear'
             }
    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=100)
    print best
    return best

In [33]:
trials = Trials()
optimal_param = optimize(trials)

{'n_estimators': 172.0, 'learning_rate': 1.1860142751674096, 'max_depth': 3.0, 'gamma': 1.771817276666266}


In [34]:
cv_parameters = {'max_depth':[int(optimal_param['max_depth'])],
                 'n_estimators': [int(optimal_param['n_estimators'])],
                 'learning_rate': [np.log(optimal_param['learning_rate'])],
                 'gamma': [np.log(optimal_param['gamma'])]
                }
xgb_regressor = xgb.XGBRegressor({'eval_score':'rmse'})
gs = GridSearchCV(xgb_regressor, cv_parameters, cv=5, verbose=0, n_jobs=4)
gs.fit(train_features, train_targets)

GridSearchCV(cv=2, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0,
       max_depth={'eval_score': 'rmse'}, min_child_weight=1, missing=None,
       n_estimators=100, nthread=-1, objective='reg:linear', reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'n_estimators': [172], 'learning_rate': [0.17059833690057549], 'max_depth': [3], 'gamma': [0.57200572986047216]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [35]:
- gs.best_score_

0.47622344941626482

In [36]:
get_training_error(gs.best_estimator_)

0.23960940411466794

In [37]:
evaluate_on_test_set(gs.best_estimator_)

0.57842418163747689