In [2]:
# This script is used to train models using training data
# Read the comments at each cell to use it

# Many ideas for this script were based on SciKit tutorial on how to do hyper parameter tuning. Below are the links
#   http://scikit-learn.org/stable/modules/grid_search.html
#   http://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html
#   http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html
#   http://scikit-learn.org/stable/modules/preprocessing.html

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn import svm

In [3]:
# A function to do some data preprocessing
def transformDf(a_df):
    a_df.loc[:, "norm_myTfIdf_all"] = a_df.loc[:, "myTfIdf_all"] / a_df.loc[:, "sizeOfQuery"]
    a_df.loc[:, "norm_myTfIdf_title"] = a_df.loc[:, "myTfIdf_title"] / a_df.loc[:, "sizeOfQuery"]
    a_df.loc[:, "norm_myTfIdf_desc"] = a_df.loc[:, "myTfIdf_desc"] / a_df.loc[:, "sizeOfQuery"]
    a_df.loc[:, "norm_myTfIdf_attrib"] = a_df.loc[:, "myTfIdf_attrib"] / a_df.loc[:, "sizeOfQuery"]

    brandMatches_df = pd.get_dummies(a_df.loc[:, "brandMatches"])
    brandMatches_df.columns = ["query_product_brands_noMatch", "query_has_no_brand", "query_product_brands_match"]
    a_df.loc[:, "query_product_brands_noMatch"] = brandMatches_df.loc[:, "query_product_brands_noMatch"]
    a_df.loc[:, "query_has_no_brand"] = brandMatches_df.loc[:, "query_has_no_brand"]
    a_df.loc[:, "query_product_brands_match"] = brandMatches_df.loc[:, "query_product_brands_match"]

In [4]:
# Set here the location of your feature file
df = pd.read_csv("/Users/taklumbo/Ucl_assignments/IRDM/HomeDepot/featureFiles/train_set/MergedFeatures.csv", 
                 delimiter=",")
transformDf(df)


In [5]:
# This part separate data for the cross validation training. 
# You don't have to change anything here
features = ['queryId', 'productId', 'productTitle', 'originalQuery', 'ratioNumberOfQueryTermsIn_title', 'ratioNumberOfExpQueryTermsIn_all', 'productRank_title','ratioNumberOfExpQueryTermsIn_attrib','ratioNumberOfExpQueryTermsIn_desc', 'ratioNumberOfExpQueryTermsIn_title', 'productRank_all', 'sizeOfExpQuery_title', 'ratioNumberOfQueryTermsIn_desc', 'productRank_desc', 'ratioNumberOfQueryTermsIn_all', 'ratioNumberOfQueryTermsIn_attrib', 'productRank_attrib', 'sizeOfQuery', 'sizeOfExpQuery_attrib', 'sizeOfExpQuery_desc', 'sizeOfExpQuery_all', "norm_myTfIdf_all", "norm_myTfIdf_title", "norm_myTfIdf_desc", "norm_myTfIdf_attrib", "norm_myTfIdf_expQuery_all", "norm_myTfIdf_expQuery_title", "norm_myTfIdf_expQuery_desc", "norm_myTfIdf_expQuery_attrib", "docLength_all", "docLength_title", "docLength_desc", "docLength_attrib", "query_product_brands_noMatch", "query_has_no_brand", "query_product_brands_match", "sim_score_all", "sim_rank_all"]

xFeatures = features[4:]
y = df.loc[:, 'y'].values
feat_train, feat_test, y_train, y_test = cross_validation.train_test_split(
    df.loc[:, features], y, test_size=0.4, random_state=0)

In [6]:
# Train a StandardScaler instance which is use to normalize the train and test data.
# You don't have to change anything here
scaler = preprocessing.StandardScaler().fit(feat_train.loc[:, xFeatures])
X_train = scaler.transform(feat_train.loc[:, xFeatures])
X_test = scaler.transform(feat_test.loc[:, xFeatures])

In [7]:
# Set the variable modelToRun to train different models
#
# RF = Random forest
# SVR-rbf = Support vector machine with Gaussian kernel
# GB = Gradient boosting
 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import svm

modelToRun = "SVR-rbf"
model = None
if modelToRun == "RF":
    print("Training Random Forest")
    params = {'min_samples_leaf': 10, 'bootstrap': False, 'max_features': 11, 
              'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'verbose': 1}
    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)

if modelToRun == "GB":
    print("Training Gradient Boosting")

    params = {'max_depth': 6, 'learning_rate': 0.05, 'max_features': 1.0, 'n_estimators': 300, 
              'min_samples_leaf': 17, 'verbose': 1}
    
    model = GradientBoostingRegressor(**params)
    model.fit(X_train, y_train)

if modelToRun == "SVR-rbf":
    print("Training SVR-rbf")
    model = svm.SVR(verbose=True, C = 0.5, gamma = 0.03125)
    model.fit(X_train, y_train)


Training SVR-rbf
[LibSVM]

In [8]:
# Run this part to calculate the RMSE error for your trained model
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math
MSE = mean_squared_error(y_test ,model.predict(X_test))
RMSE = math.sqrt(MSE)
print("RMSE: " + str(RMSE) + 
      "   MAE: " + str(mean_absolute_error(y_test,model.predict(X_test))))

RMSE: 0.46791180492810563   MAE: 0.370345964199


In [9]:
# Run this cell to show the importance of each feature. Only available for
# random forest and gradient boosting
featureImportance = {}
for i in range(0, len(xFeatures)):
    featureImportance[xFeatures[i]] = model.feature_importances_[i]

table = pd.Series(featureImportance)
sortedTable = table.sort_values(ascending=False)
sortedTable

AttributeError: 'SVR' object has no attribute 'feature_importances_'

In [10]:
def reportPredictions(fittedModel, X, X_features):
    result = []
    predictions = fittedModel.predict(X)
    #print(str(len(X_features)) + " " + str(predictions.size))
    for i in range(0, len(X_features)):
        queryId = str(X_features.iloc[i, 0])
        aPred = predictions[i]
        if aPred > 3:
            aPred = 3
        result.append(queryId + "," + str(aPred))
    return result

def reportPredictionsWithDetail(fittedModel, X, X_features, y):
    result = []
    predictions = fittedModel.predict(X)
    #print(str(len(X_features)) + " " + str(predictions.size))
    for i in range(0, len(X_features)):
        aPred = predictions[i]
        if aPred > 3:
            aPred = 3
        row = str(aPred) + "," + str(y[i])
        for j in range(0, X_features.columns.size):
            row = row + "," + str(X_features.iloc[i, j])
        result.append(row)
    return result



In [11]:
# Set this path to your test feature file
test_df = pd.read_csv("/Users/taklumbo/Ucl_assignments/IRDM/HomeDepot/featureFiles/test_set/MergedFeatures.csv", 
                 delimiter=",")
transformDf(test_df)


In [79]:
# Debugging code
# Useful code to check for Nan column during scaling
myFeatures = test_df.loc[:, xFeatures]

myFeatures.loc[:, ["norm_myTfIdf_all"]].to_csv(path_or_buf="/Users/taklumbo/Ucl_assignments/IRDM/HomeDepot/Result/Null.csv")
for i in range(4, len(features)):
    print(myFeatures.columns[i])
    preprocessing.StandardScaler().fit(myFeatures.iloc[:, i:i+1])

ratioNumberOfExpQueryTermsIn_desc
ratioNumberOfExpQueryTermsIn_title
productRank_all
sizeOfExpQuery_title
ratioNumberOfQueryTermsIn_desc
productRank_desc
ratioNumberOfQueryTermsIn_all
ratioNumberOfQueryTermsIn_attrib
productRank_attrib
sizeOfQuery
sizeOfExpQuery_attrib
sizeOfExpQuery_desc
sizeOfExpQuery_all
norm_myTfIdf_all


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [12]:
# Run this to standarize your test feature and make prediction
X_test_submit = scaler.transform(test_df.loc[:, xFeatures])
report = reportPredictions(model, X_test_submit, test_df)

In [14]:
# Set here the location in which you want to save your predictions
file = open('/Users/taklumbo/Ucl_assignments/IRDM/HomeDepot/Result/test_result_SVM.csv', 'w')
file.write('"id","relevance"\n')
for i in range(0, len(report)):
    file.write(report[i] + "\n")
    
file.close()