In [None]:
# Data for this project is taken from the ML competition by G-Research

In [None]:
#data manipulation

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# machine learning
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score


#xgboost
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import XGBRegressor



In [None]:
# Load the training and test CSV files into DataFrames (df = train, dt = test)
df, dt = (pd.read_csv(csv_path)
          for csv_path in ('../Data/train.csv', '../Data/test.csv'))

In [None]:
# Replace missing values with 0.
# The training frame (df) is processed first, then the test frame (dt); every
# column that contains nulls is reported before it is filled.
for frame in (df, dt):
    for name in frame.columns:
        null_rows = frame.index[frame[name].isnull()]
        if null_rows.shape[0] > 0:
            # Shows the column name next to the shape tuple of its null-row index
            print(name, null_rows.shape)
            frame[name] = frame[name].fillna(0)

In [None]:
# Split the training frame into target, per-row weights and features;
# the test frame is used unchanged as the prediction input
y_train = df['y']
x_weight = np.array(df['Weight'])
x_train = df.drop(columns=['Weight', 'y'])
x_test = dt

In [None]:
# Function to find the best stopping point
def BestStoppingRounds(XGBmodel,
                       x_train,
                       y_train,
                       weight,
                       eval_metric = 'mae',
                       n_splits = 10,
                       early_stopping_rounds = 10,
                       verbose = True):
    """Run K-fold cross-validation with early stopping and collect the results.

    For every fold the model is re-fitted on the training part (with the
    matching sample weights) and evaluated on both the training part and the
    held-out part. After each fit the model's `best_score` and
    `best_iteration` attributes are recorded and printed.

    Parameters
    ----------
    XGBmodel : estimator exposing `fit`, `best_score` and `best_iteration`.
        It is re-fitted in place once per fold.
    x_train, y_train : features and target; DataFrame/Series or array-like.
    weight : per-row sample weights, aligned with the rows of `x_train`.
    eval_metric : metric name handed to `fit` (wrapped in a list).
    n_splits : number of KFold splits (default 10).
    early_stopping_rounds : patience handed to `fit` (default 10).
    verbose : forwarded to `fit` (default True).

    Returns
    -------
    list of [best_score, best_iteration], one entry per fold, in fold order.
    """
    # np.asarray accepts both pandas objects and plain arrays, so the
    # function no longer requires a `.values` attribute on its inputs.
    features = np.asarray(x_train)
    targets = np.asarray(y_train)
    weights = np.asarray(weight)

    result_set = []
    for train_index, test_index in KFold(n_splits=n_splits).split(features):
        xfold_train, xfold_test = features[train_index], features[test_index]
        yfold_train, yfold_test = targets[train_index], targets[test_index]
        # The held-out fold is deliberately listed last.
        # NOTE(review): the XGBoost docs describe the last eval_set entry as the
        # one monitored for early stopping - confirm for the installed version.
        eval_set = [(xfold_train, yfold_train), (xfold_test, yfold_test)]
        # NOTE(review): newer xgboost releases expect early_stopping_rounds and
        # eval_metric on the estimator rather than in fit() - confirm against
        # the installed version before upgrading.
        XGBmodel.fit(xfold_train,
                     yfold_train,
                     sample_weight = weights[train_index],
                     early_stopping_rounds = early_stopping_rounds,
                     eval_metric = [eval_metric],
                     eval_set = eval_set,
                     verbose = verbose)
        # Record what early stopping settled on for this fold
        best_score = XGBmodel.best_score
        best_iteration = XGBmodel.best_iteration
        result_set.append([best_score, best_iteration])
        print("Minimum " + eval_metric + " : %f" %best_score)
        print("Best iteration: %d" %best_iteration)
    return result_set

In [None]:
# Tune XGBoost hyperparameters
# Part 1: fix the learning rate at 0.1, allow up to 1000 boosting rounds and
# let BestStoppingRounds report where each fold stops improving

XGBmodel = XGBRegressor(n_estimators = 1000,
                        learning_rate = 0.1)

result_set = BestStoppingRounds(XGBmodel,
                                x_train,
                                y_train,
                                x_weight,
                                eval_metric = 'mae')


In [None]:
#Find the best stopping point
# Each row of result_set is [best score, best iteration] for one fold
result_array = np.array(result_set)
fold_scores = result_array[:, 0]
fold_iterations = result_array[:, 1]

# Folds arrive in split order, not sorted by iteration, so they are drawn as
# individual points; joining them with a line would produce a meaningless zig-zag
fig, ax = plt.subplots()
ax.scatter(fold_iterations, fold_scores)
ax.set_xlabel("best iteration")
ax.set_ylabel("best score")
ax.set_title("Early-stopping result per fold")

#take the mean of stopping points
mean_mae = fold_scores.mean()
# int() truncates the mean; keep at least one round so that the value is
# still usable as n_estimators when every fold stopped at iteration 0
best_stopping = max(1, int(fold_iterations.mean()))
print("best_stopping: %d" %best_stopping)
print("mean mae: %f" %mean_mae)


In [None]:
# Function to tune parameters
def XGBTuneParam(XGBmodel,
                 param_grid,
                 x_train,
                 y_train,
                 weight,
                 scoring = None):
    """Grid-search `param_grid` for `XGBmodel` using GridSearchCV.

    Parameters
    ----------
    XGBmodel : estimator handed to GridSearchCV.
    param_grid : dict mapping parameter names to lists of candidate values.
    x_train, y_train : training features and target.
    weight : per-row sample weights. Pass None or any string (the original
        sentinel convention) to fit without weights.
    scoring : optional scoring specification forwarded to GridSearchCV;
        None keeps GridSearchCV's default.

    Returns
    -------
    tuple of (fitted GridSearchCV object, its best_score_, its best_params_).
    """
    xgbcv = GridSearchCV(XGBmodel, param_grid, scoring=scoring)
    if weight is None or isinstance(weight, str):
        xgbcv.fit(x_train, y_train)
    else:
        # The weights must be passed by keyword. A third positional argument
        # to GridSearchCV.fit is not sample_weight, so the weights would never
        # reach the underlying estimator.
        xgbcv.fit(x_train, y_train, sample_weight=weight)
    return xgbcv, xgbcv.best_score_, xgbcv.best_params_


In [None]:
# Part 2: Tune max_depth and min_child_weight
# best fit parameter: learning_rate = 0.1, max_depth = 6, min_child_weight = 2
param_grid = {
        "max_depth"             : [2, 4, 6],
        "min_child_weight"      : [2, 4, 8]
        }
# Base model: same learning rate as Part 1, rounds capped at the stopping point found there
XGBmodel = XGBRegressor(n_estimators = best_stopping,
                        learning_rate = 0.1)

Tune2model, Tune2score, Tune2params = XGBTuneParam(
    XGBmodel, param_grid, x_train, y_train, x_weight)

# Keep the winning values for the later tuning stages
Tuned_max_depth = Tune2params['max_depth']
Tuned_min_child_weight = Tune2params['min_child_weight']
print("max_depth %d" %Tuned_max_depth)
print("min_child_weight %f" %Tuned_min_child_weight)
# Tune2score is the grid search's best_score_
print("Accuracy_score %f" %Tune2score)

In [None]:
# Part 3: Tune gamma
# best fit parameter: gamma = 0.0
# Tune2model is the fitted GridSearchCV object returned by XGBTuneParam, not a
# regressor: it has no "gamma" parameter, so searching over it directly fails.
# Continue from the regressor that the Part 2 search selected instead.
XGBmodel = Tune2model.best_estimator_

param_grid = {
        "gamma"             : [0.0, 0.1, 0.2]
        }

Tune3model, Tune3score, Tune3params = XGBTuneParam(XGBmodel,
                                                   param_grid,
                                                   x_train,
                                                   y_train,
                                                   x_weight)
# Keep the winning gamma for the later stages
Tuned_gamma = Tune3params["gamma"]
print("gamma %f" %Tuned_gamma)
print("Accuracy_score %f" %Tune3score)


In [None]:
# Revisit best stopping rounds
# Re-run the early-stopping search now that gamma, max_depth and
# min_child_weight have been tuned

XGBmodel = XGBRegressor(learning_rate = 0.1,
                        gamma = Tuned_gamma,
                        max_depth = Tuned_max_depth,
                        min_child_weight = Tuned_min_child_weight,
                        n_estimators = 1000)

result_set = BestStoppingRounds(XGBmodel, x_train, y_train, x_weight)

#Find the best stopping point
# Each row of result_set is [best score, best iteration] for one fold
result_array = np.array(result_set)
fold_scores = result_array[:, 0]
fold_iterations = result_array[:, 1]

# Folds arrive in split order, not sorted by iteration, so they are drawn as
# individual points; joining them with a line would produce a meaningless zig-zag
fig, ax = plt.subplots()
ax.scatter(fold_iterations, fold_scores)
ax.set_xlabel("best iteration")
ax.set_ylabel("best score")
ax.set_title("Early-stopping result per fold (tuned tree parameters)")

#take the mean of stopping points
mean_mae = fold_scores.mean()
# int() truncates the mean; keep at least one round so that the value is
# still usable as n_estimators when every fold stopped at iteration 0
best_stopping = max(1, int(fold_iterations.mean()))
print("best_stopping: %d" %best_stopping)
print("mean mae: %f" %mean_mae)


In [None]:
# Part 4: Tune colsample_bytree and subsample
# best fit parameter: colsample_bytree = 0.7, subsample = 1
# Both sampling fractions are searched over 0.7 .. 1.0 in steps of 0.1
param_grid = {
        "subsample"         : [0.7, 0.8, 0.9, 1.0],
        "colsample_bytree"  : [0.7, 0.8, 0.9, 1.0]
        }

# Base model carries every value tuned so far plus the revisited stopping point
XGBmodel = XGBRegressor(n_estimators = best_stopping,
                        learning_rate = 0.1,
                        max_depth = Tuned_max_depth,
                        min_child_weight = Tuned_min_child_weight,
                        gamma = Tuned_gamma)

Tune4model, Tune4score, Tune4params = XGBTuneParam(
    XGBmodel, param_grid, x_train, y_train, x_weight)

# Keep the winning sampling fractions
Tuned_subsample = Tune4params['subsample']
Tuned_colsample_bytree = Tune4params['colsample_bytree']
print("subsample %f" %Tuned_subsample)
print("colsample_bytree %f" %Tuned_colsample_bytree)
# Tune4score is the grid search's best_score_
print("Accuracy_score %f" %Tune4score)


In [None]:
#Part 5: Tune learning rate
# The slower the learning rate, the more boosting rounds will be needed, and the longer
# it will take to train
# Find the optimal trade off between time and error
# NOTE(review): despite the heading, the grid below searches subsample and
# colsample_bytree on a coarser grid; learning_rate itself is not varied here.
from sklearn.model_selection import GridSearchCV
def XGBTunePart4(XGBmodel, x_train, y_train, weight):
    """Grid-search subsample and colsample_bytree over {0.6, 0.8, 1}.

    `weight` holds per-row sample weights; pass None or any string to fit
    without weights. Returns a tuple of (best_score_, best_params_) taken
    from the fitted GridSearchCV object.
    """
    param_grid = {
            "subsample"             : [0.6,0.8,1],
            "colsample_bytree"      : [0.6,0.8,1],
            }
    xgbcv = GridSearchCV(XGBmodel,param_grid)
    if weight is None or isinstance(weight, str):
        xgbcv.fit(x_train,y_train)
    else:
        # The weights must be passed by keyword. A third positional argument
        # to GridSearchCV.fit is not sample_weight, so the weights would never
        # reach the underlying estimator.
        xgbcv.fit(x_train,y_train,sample_weight=weight)
    return xgbcv.best_score_, xgbcv.best_params_

# XGBTunePart4 returns exactly two values, so unpack two. Separate names keep
# the Part 4 results (Tune4model / Tune4score / Tune4params) intact.
Tune5score, Tune5params = XGBTunePart4(XGBmodel, x_train, y_train, x_weight)

In [None]:
# Part 6: XGBoost with all tuned parameters
# Build the final model from the values selected in Parts 2-4 and the revisited
# stopping point, instead of hard-coded numbers that can drift away from what
# the tuning cells actually found.
XGBmodel = XGBRegressor(learning_rate = 0.1,
                        n_estimators = best_stopping,
                        gamma = Tuned_gamma,
                        max_depth = Tuned_max_depth,
                        min_child_weight = Tuned_min_child_weight,
                        colsample_bytree = Tuned_colsample_bytree,
                        subsample = Tuned_subsample
                       )
# Weights are passed by keyword so they cannot land in a different positional slot
XGBmodel.fit(x_train, y_train, sample_weight = x_weight)
# This is the score on the training data itself, not a held-out estimate
print(XGBmodel.score(x_train, y_train, sample_weight = x_weight))

In [None]:
# Predict on the test set and wrap the result in a Series named 'y'
# whose index is labelled 'Index'
yp = pd.Series(XGBmodel.predict(x_test), name='y')
yp.index.name = 'Index'


In [None]:
# Write the predictions, including the header row, to a CSV file
# in the current working directory
yp.to_csv(
    'XGBoostRegressor_model.csv',
    header=True,
)