In [1]:
# This script searches for the best hyperparameters of our models.
# Read the instructions in each cell to use it.
# Many ideas in this script are based on the scikit-learn tutorials on hyperparameter tuning; the links are below:
#   http://scikit-learn.org/stable/modules/grid_search.html
#   http://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html
#   http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html

# Echo the module docstring; when run inside IPython this prints the
# auto-generated description of the interactive module.
print(__doc__)

import numpy as np

from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler    
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
import pandas as pd
import math
from sklearn import svm

# A function to do some data preprocessing
def transformDf(a_df):
    """Add derived feature columns to ``a_df`` in place.

    Two groups of columns are added:

    * ``norm_myTfIdf_{all,title,desc,attrib}``: the matching ``myTfIdf_*``
      column divided by ``sizeOfQuery``.
    * ``query_product_brands_noMatch``, ``query_has_no_brand`` and
      ``query_product_brands_match``: indicator columns obtained by one-hot
      encoding ``brandMatches``.

    Parameters
    ----------
    a_df : pandas.DataFrame
        Frame holding the ``myTfIdf_*``, ``sizeOfQuery`` and ``brandMatches``
        columns. It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If one of the required input columns is missing.
    ValueError
        If ``brandMatches`` does not contain exactly three distinct values
        (the positional renaming of the indicator columns then fails).
    """
    # Read every input from the frame that was passed in, not from a global
    # ``df``, so the function works on any frame (e.g. a test set).
    query_size = a_df.loc[:, "sizeOfQuery"]
    for suffix in ("all", "title", "desc", "attrib"):
        a_df.loc[:, "norm_myTfIdf_" + suffix] = a_df.loc[:, "myTfIdf_" + suffix] / query_size

    brandMatches_df = pd.get_dummies(a_df.loc[:, "brandMatches"])
    # get_dummies orders its columns by sorted category value; the names below
    # are applied positionally.
    # NOTE(review): this assumes the three codes sort as
    # (no match, query has no brand, match) - confirm against the feature file.
    brandMatches_df.columns = ["query_product_brands_noMatch", "query_has_no_brand", "query_product_brands_match"]
    for indicator in brandMatches_df.columns:
        a_df.loc[:, indicator] = brandMatches_df.loc[:, indicator]
    


Automatically created module for IPython interactive environment


In [None]:
# Point this at the merged feature file on your machine.
FEATURE_FILE = "/Users/taklumbo/Ucl_assignments/IRDM/HomeDepot/featureFiles/train_set/MergedFeatures.csv"
df = pd.read_csv(FEATURE_FILE, delimiter=",")

# Adds the derived feature columns to df in place.
transformDf(df)

In [5]:
# This cell fine-tunes the hyperparameters of one model with a randomized search.
# Set the variable "modelToSearch" below to one of:
#   RF   = Random forest
#   SVR  = Support vector machine with Gaussian kernel
#   GB   = Gradient boosting
#   EN   = Elastic net
#   LSVR = Linear support vector machine

features = ['queryId', 'productId', 'productTitle', 'originalQuery', 'ratioNumberOfQueryTermsIn_title', 'ratioNumberOfExpQueryTermsIn_all', 'productRank_title','ratioNumberOfExpQueryTermsIn_attrib','ratioNumberOfExpQueryTermsIn_desc', 'ratioNumberOfExpQueryTermsIn_title', 'productRank_all', 'sizeOfExpQuery_title', 'ratioNumberOfQueryTermsIn_desc', 'productRank_desc', 'ratioNumberOfQueryTermsIn_all', 'ratioNumberOfQueryTermsIn_attrib', 'productRank_attrib', 'sizeOfQuery', 'sizeOfExpQuery_attrib', 'sizeOfExpQuery_desc', 'sizeOfExpQuery_all', "norm_myTfIdf_all", "norm_myTfIdf_title", "norm_myTfIdf_desc", "norm_myTfIdf_attrib", "norm_myTfIdf_expQuery_all", "norm_myTfIdf_expQuery_title", "norm_myTfIdf_expQuery_desc", "norm_myTfIdf_expQuery_attrib", "docLength_all", "docLength_title", "docLength_desc", "docLength_attrib", "query_product_brands_noMatch", "query_has_no_brand", "query_product_brands_match", "sim_score_all", "sim_rank_all"]
# The first four entries (ids and raw text) are skipped; the rest are model inputs.
xFeatures = features[4:]

y = df.loc[:, 'y'].values
X = df.loc[:, xFeatures]

steps = None
param_dist = None

modelToSearch = "SVR" #RF, SVR, GB, EN, LSVR

if modelToSearch == "RF":
    steps = [('scaler', StandardScaler()), ('RF', RandomForestRegressor())]
    param_dist = {"RF__max_depth": [3, None],
              # randint's upper bound is exclusive; +1 lets the search also
              # try "use every feature".
              "RF__max_features": sp_randint(1, X.columns.size + 1),
              # A split needs at least two samples; newer scikit-learn
              # releases reject 1 outright, so start the range at 2.
              "RF__min_samples_split": sp_randint(2, 11),
              "RF__min_samples_leaf": sp_randint(1, 11),
              "RF__bootstrap": [True, False],
              "RF__n_estimators": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
             }

elif modelToSearch == "GB":
    steps = [('scaler', StandardScaler()), ('GB', GradientBoostingRegressor())]
    param_dist = {'GB__learning_rate': [0.1, 0.05, 0.02, 0.01],
                'GB__max_depth': [1, 4, 6],
                'GB__min_samples_leaf': [1, 3, 5, 9, 17],
                'GB__max_features': [1.0, 0.5, 0.1],
                'GB__n_estimators': [100, 300, 500]
                  }
elif modelToSearch == "SVR":
    steps = [('scaler', StandardScaler()), ('SVR', svm.SVR(cache_size=500, verbose=True))]
    param_dist = {'SVR__C': [math.pow(2, -5), math.pow(2, -3), math.pow(2, -1), 1.0, math.pow(2, 1), math.pow(2, 3)],
                'SVR__gamma': [math.pow(2, -5), math.pow(2, -3), math.pow(2, -1), 1.0, math.pow(2, 1), math.pow(2, 3)]
                  }
elif modelToSearch == "EN": #Elastic net
    steps = [('scaler', StandardScaler()), ('EN', ElasticNet())]
    param_dist = {'EN__l1_ratio': [.1, .5, .7, .9, .95, .99, 1]
                  }
elif modelToSearch == "LSVR":
    steps = [('scaler', StandardScaler()), ('LSVR', svm.LinearSVR())]
    param_dist = {'LSVR__C': [math.pow(2, -5), math.pow(2, -3), math.pow(2, -1), 1.0, math.pow(2, 1), math.pow(2, 3)]}
else:
    # Fail early with a readable message instead of passing steps=None to Pipeline.
    raise ValueError("Unknown modelToSearch %r; expected one of RF, SVR, GB, EN, LSVR" % (modelToSearch,))


pipeline = Pipeline(steps)

# run randomized search
n_iter_search = 25
# When every parameter is a plain list the search space is finite. Asking for
# more candidates than exist is rejected by older scikit-learn releases, so cap
# the number of iterations at the grid size (this affects the small EN and
# LSVR grids).
if all(not hasattr(values, "rvs") for values in param_dist.values()):
    grid_size = 1
    for values in param_dist.values():
        grid_size *= len(values)
    n_iter_search = min(n_iter_search, grid_size)

# The scorer reports negated MSE, so every real score is <= 0. A failed fit
# must therefore score -inf (not 0), otherwise it would rank as the best model.
random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, cv=3, verbose=1,
                                   n_iter=n_iter_search, scoring="mean_squared_error", n_jobs=-1,
                                  error_score=float("-inf"))

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))


Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 204.6min
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed: 411.8min finished


[LibSVM]RandomizedSearchCV took 25297.26 seconds for 25 candidates parameter settings.[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]


In [6]:
# After the cross-validation has finished, you can run this snippet of code to get the top 3 parameter sets
# Utility function to report best scores
def report(grid_scores, n_top=3):
    """Print the ``n_top`` best-scoring candidates of a parameter search.

    Parameters
    ----------
    grid_scores : iterable
        Score records; each is indexable (position 1 is the sort key) and
        exposes ``parameters``, ``mean_validation_score`` and
        ``cv_validation_scores``.
    n_top : int, optional
        How many of the highest-scoring records to print (default 3).

    Scores are negated before the square root is taken, so the printed values
    are the root of the negated mean score and the standard deviation of the
    per-fold roots.
    """
    ranked = sorted(grid_scores, key=lambda entry: entry[1], reverse=True)
    for rank, entry in enumerate(ranked[:n_top], start=1):
        print("Model with rank: {0}".format(rank))

        root_of_mean = math.sqrt(-1 * entry.mean_validation_score)
        fold_roots = np.sqrt(-1 * entry.cv_validation_scores)
        spread = np.std(fold_roots)

        print("Mean RMSE: {0:.3f} (std: {1:.3f})".format(root_of_mean, spread))
        print("Parameters: {0}".format(entry.parameters))
        print("")
        
# Print the best candidates recorded by the fitted search object.
report(random_search.grid_scores_)

Model with rank: 1
Mean RMSE: 0.476 (std: 0.013)
Parameters: {'SVR__C': 0.5, 'SVR__gamma': 0.03125}

Model with rank: 2
Mean RMSE: 0.476 (std: 0.014)
Parameters: {'SVR__C': 0.125, 'SVR__gamma': 0.03125}

Model with rank: 3
Mean RMSE: 0.477 (std: 0.015)
Parameters: {'SVR__C': 0.125, 'SVR__gamma': 0.125}

