# Using XGBoost for Regression

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

## Data Processing

In [None]:
# Directory that holds train.csv and test.csv. Kept in a single constant so the
# notebook can be pointed at another checkout by editing one line.
# NOTE(review): this is a machine-specific absolute path — consider a path
# relative to the repository root instead.
DATA_DIR = "C:/Users/Kevin Zhang/Documents/GitHub/Group20-Project/Doc"

# Load both splits and print (rows, columns) of each as a quick sanity check.
Train_Data = pd.read_csv(DATA_DIR + "/train.csv")
print(Train_Data.shape)
Test_Data = pd.read_csv(DATA_DIR + "/test.csv")
print(Test_Data.shape)

In [None]:
# Separate the target column from the features. `pop` removes the column from
# the frame in place, so Train_X / Test_X are the very same objects as
# Train_Data / Test_Data, now without 'review_scores_rating'.
Train_Y = Train_Data.pop('review_scores_rating')
Test_Y = Test_Data.pop('review_scores_rating')
Train_X, Test_X = Train_Data, Test_Data

# Report shapes in the order: train target, train features, test target,
# test features.
for _part in (Train_Y, Train_X, Test_Y, Test_X):
    print(_part.shape)

In [None]:
# Print the remaining column names, then display how many columns there are
# (second entry of the frame's shape) as the cell output.
print(Train_X.columns)
Train_X.shape[1]

In [None]:
# Collect the 'id' column of both splits (train rows first, then test rows)
# into a single one-column frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
All_Data_ID = pd.concat([Train_X[['id']], Test_X[['id']]], ignore_index=True)

In [None]:
# Remove the non-feature columns from both feature frames in place and print
# each frame's shape afterwards.
# ('Unnamed: 0' is presumably a row index saved into the CSV — verify.)
_non_features = ['Unnamed: 0', 'id']
for _frame in (Train_X, Test_X):
    _frame.drop(columns=_non_features, inplace=True)
    print(_frame.shape)

In [None]:
# Wrap features and targets in DMatrix objects for XGBoost's native API.
XgBoost_Train = xgb.DMatrix(Train_X, label=Train_Y)
XgBoost_Test = xgb.DMatrix(Test_X, label=Test_Y)

## Selection of num_boost_round

In [None]:
# Baseline booster parameters used while choosing the number of boosting rounds.
# NOTE(review): "reg:linear" is a deprecated alias of "reg:squarederror" in
# newer XGBoost releases — switch once the minimum XGBoost version is confirmed.
params = {
    "objective": "reg:linear",
    'colsample_bytree': 1,
    'learning_rate': 0.03,
    'max_depth': 6,
    'alpha': 0,
    'gamma': 0,
    # This key used to be 'min_child_weight ' (trailing space), which does not
    # name the real parameter, so the intended setting was never applied.
    'min_child_weight': 1,
    'subsample': 1,
    'lambda': 0,
}

# 5-fold cross-validation over at most 500 boosting rounds, tracking RMSE.
# early_stopping_rounds is a COUNT of consecutive rounds without improvement;
# the previous value `True` is numerically 1, i.e. training stopped at the
# first round that failed to improve. 10 is a conventional patience value —
# adjust as needed.
cv_results = xgb.cv(dtrain=XgBoost_Train, params=params, nfold=5, num_boost_round=500,
                    metrics="rmse", as_pandas=True, early_stopping_rounds=10)

In [None]:
# Row label of the lowest mean validation RMSE across the CV folds.
_best_row = cv_results['test-rmse-mean'].idxmin()

# Row i of cv_results describes the ensemble after i + 1 boosting rounds, and
# Min_MAE_Index is passed to XGBRegressor as n_estimators further down, so it
# must hold the round COUNT rather than the 0-based row label (using the label
# directly trains one tree too few, and zero trees when the best row is 0).
# Despite the name, the metric being minimised is RMSE, not MAE.
Min_MAE_Index = int(_best_row) + 1

# Best mean validation RMSE, looked up by column name instead of the
# deprecated positional Series[2] access.
print(cv_results['test-rmse-mean'].min())
print(Min_MAE_Index)

## Selection of Learning Rate

In [None]:
# Candidate learning rates to compare, in increasing order.
learning_rate_vector = [
    0.01, 0.03, 0.05, 0.07,
    0.1, 0.2, 0.3,
]

In [None]:
# Score every candidate learning rate by its mean validation RMSE over 10
# random 90/10 splits of the training data, then keep the best candidate.
# (The RMAE_* names notwithstanding, the metric computed here is RMSE.)
RMAE_learning_rate = []
for learning_rate_current in learning_rate_vector:
    RMAEj = []
    for j in range(1, 11):
        XgBoost_Fit = xgb.XGBRegressor(objective='reg:linear', colsample_bytree=1,
                                       learning_rate=learning_rate_current,
                                       max_depth=6, alpha=0, n_estimators=Min_MAE_Index,
                                       gamma=0, min_child_weight=1,
                                       subsample=1, reg_lambda=0)
        # Seed each split with the repeat number: every candidate is then
        # scored on the same 10 splits (a paired comparison instead of one
        # confounded by split noise) and the cell is reproducible.
        X_Train, X_Validation, Y_Train, Y_Validation = train_test_split(
            Train_X, Train_Y, test_size=0.1, random_state=j)
        XgBoost_Fit.fit(X_Train, Y_Train)
        Preds = XgBoost_Fit.predict(X_Validation)
        RMAEj.append(np.sqrt(mean_squared_error(Y_Validation, Preds)))
    _mean_rmse = np.mean(RMAEj)
    RMAE_learning_rate.append(_mean_rmse)
    print("RMSE: %f" % _mean_rmse)
# argmin returns the first minimum, matching list.index(min(...)).
Optimal_Learning_Rate = learning_rate_vector[int(np.argmin(RMAE_learning_rate))]

In [None]:
# Display the selected learning rate as the cell output.
Optimal_Learning_Rate

## Selection of Max Depth

In [None]:
# Candidate tree depths: the integers 1 through 10 inclusive.
Max_Depth_Vector = range(1, 10 + 1)

In [None]:
# Score every candidate max_depth by its mean validation RMSE over 10 random
# 90/10 splits of the training data, then keep the best candidate.
# (The RMAE_* names notwithstanding, the metric computed here is RMSE.)
RMAE_Max_Depth = []
for Max_Depth_Current in Max_Depth_Vector:
    RMAEj = []
    for j in range(1, 11):
        XgBoost_Fit = xgb.XGBRegressor(objective='reg:linear', colsample_bytree=1,
                                       learning_rate=Optimal_Learning_Rate,
                                       max_depth=Max_Depth_Current, alpha=0,
                                       n_estimators=Min_MAE_Index, gamma=0, min_child_weight=1,
                                       subsample=1, reg_lambda=0)
        # Seed each split with the repeat number: every candidate is then
        # scored on the same 10 splits (a paired comparison instead of one
        # confounded by split noise) and the cell is reproducible.
        X_Train, X_Validation, Y_Train, Y_Validation = train_test_split(
            Train_X, Train_Y, test_size=0.1, random_state=j)
        XgBoost_Fit.fit(X_Train, Y_Train)
        Preds = XgBoost_Fit.predict(X_Validation)
        RMAEj.append(np.sqrt(mean_squared_error(Y_Validation, Preds)))
    _mean_rmse = np.mean(RMAEj)
    RMAE_Max_Depth.append(_mean_rmse)
    print("RMSE: %f" % _mean_rmse)
# argmin returns the first minimum, matching list.index(min(...)).
Optimal_Max_Depth = Max_Depth_Vector[int(np.argmin(RMAE_Max_Depth))]

In [None]:
# Display the selected max_depth as the cell output.
Optimal_Max_Depth

## Selection of min_child_weight

In [None]:
# Candidate min_child_weight values: the integers 1 through 10 inclusive.
min_child_weight_Vector = range(1, 10 + 1)

In [None]:
# Score every candidate min_child_weight by its mean validation RMSE over 10
# random 90/10 splits of the training data, then keep the best candidate.
# (The RMAE_* names notwithstanding, the metric computed here is RMSE.)
RMAE_min_child_weight = []
for min_child_weight_Current in min_child_weight_Vector:
    RMAEj = []
    for j in range(1, 11):
        XgBoost_Fit = xgb.XGBRegressor(objective='reg:linear', colsample_bytree=1,
                                       learning_rate=Optimal_Learning_Rate,
                                       max_depth=Optimal_Max_Depth, alpha=0,
                                       n_estimators=Min_MAE_Index,
                                       min_child_weight=min_child_weight_Current, gamma=0,
                                       subsample=1, reg_lambda=0)
        # Seed each split with the repeat number: every candidate is then
        # scored on the same 10 splits (a paired comparison instead of one
        # confounded by split noise) and the cell is reproducible.
        X_Train, X_Validation, Y_Train, Y_Validation = train_test_split(
            Train_X, Train_Y, test_size=0.1, random_state=j)
        XgBoost_Fit.fit(X_Train, Y_Train)
        Preds = XgBoost_Fit.predict(X_Validation)
        RMAEj.append(np.sqrt(mean_squared_error(Y_Validation, Preds)))
    _mean_rmse = np.mean(RMAEj)
    RMAE_min_child_weight.append(_mean_rmse)
    print("RMSE: %f" % _mean_rmse)
# argmin returns the first minimum, matching list.index(min(...)).
Optimal_min_child_weight = min_child_weight_Vector[int(np.argmin(RMAE_min_child_weight))]

In [None]:
# Display the selected min_child_weight as the cell output.
Optimal_min_child_weight

## Selection of Gamma

In [None]:
# Candidate gamma values: 0.0 to 0.6 in steps of 0.1.
gamma_vector = [tenths / 10.0 for tenths in range(7)]

In [None]:
# Score every candidate gamma by its mean validation RMSE over 10 random 90/10
# splits of the training data, then keep the best candidate.
# (The RMAE_* names notwithstanding, the metric computed here is RMSE.)
RMAE_gamma = []
for gamma_Current in gamma_vector:
    RMAEj = []
    for j in range(1, 11):
        XgBoost_Fit = xgb.XGBRegressor(objective='reg:linear', colsample_bytree=1,
                                       learning_rate=Optimal_Learning_Rate,
                                       max_depth=Optimal_Max_Depth, alpha=0,
                                       n_estimators=Min_MAE_Index,
                                       min_child_weight=Optimal_min_child_weight,
                                       gamma=gamma_Current, subsample=1, reg_lambda=0)
        # Seed each split with the repeat number: every candidate is then
        # scored on the same 10 splits (a paired comparison instead of one
        # confounded by split noise) and the cell is reproducible.
        X_Train, X_Validation, Y_Train, Y_Validation = train_test_split(
            Train_X, Train_Y, test_size=0.1, random_state=j)
        XgBoost_Fit.fit(X_Train, Y_Train)
        Preds = XgBoost_Fit.predict(X_Validation)
        RMAEj.append(np.sqrt(mean_squared_error(Y_Validation, Preds)))
    _mean_rmse = np.mean(RMAEj)
    RMAE_gamma.append(_mean_rmse)
    print("RMSE: %f" % _mean_rmse)
# argmin returns the first minimum, matching list.index(min(...)).
Optimal_gamma = gamma_vector[int(np.argmin(RMAE_gamma))]

In [None]:
# Display the selected gamma as the cell output.
Optimal_gamma

## Selection of subsample

In [None]:
# Candidate row-subsampling fractions: 0.5 to 1.0 in steps of 0.1.
subsample_vector = [tenths / 10.0 for tenths in range(5, 11)]

In [None]:
# Score every candidate subsample fraction by its mean validation RMSE over 10
# random 90/10 splits of the training data, then keep the best candidate.
# (The RMAE_* names notwithstanding, the metric computed here is RMSE.)
RMAE_subsample = []
for subsample_Current in subsample_vector:
    RMAEj = []
    for j in range(1, 11):
        XgBoost_Fit = xgb.XGBRegressor(objective='reg:linear', colsample_bytree=1,
                                       learning_rate=Optimal_Learning_Rate,
                                       max_depth=Optimal_Max_Depth, alpha=0,
                                       n_estimators=Min_MAE_Index,
                                       min_child_weight=Optimal_min_child_weight,
                                       gamma=Optimal_gamma, subsample=subsample_Current,
                                       reg_lambda=0)
        # Seed each split with the repeat number: every candidate is then
        # scored on the same 10 splits (a paired comparison instead of one
        # confounded by split noise) and the cell is reproducible.
        X_Train, X_Validation, Y_Train, Y_Validation = train_test_split(
            Train_X, Train_Y, test_size=0.1, random_state=j)
        XgBoost_Fit.fit(X_Train, Y_Train)
        Preds = XgBoost_Fit.predict(X_Validation)
        RMAEj.append(np.sqrt(mean_squared_error(Y_Validation, Preds)))
    _mean_rmse = np.mean(RMAEj)
    RMAE_subsample.append(_mean_rmse)
    print("RMSE: %f" % _mean_rmse)
# argmin returns the first minimum, matching list.index(min(...)).
Optimal_subsample = subsample_vector[int(np.argmin(RMAE_subsample))]

In [None]:
# Display the selected subsample fraction as the cell output.
Optimal_subsample

## Selection of colsample_bytree

In [None]:
# Candidate per-tree column-sampling fractions: 0.5 to 1.0 in steps of 0.1.
colsample_bytree_vector = [tenths / 10.0 for tenths in range(5, 11)]

In [None]:
# Score every candidate colsample_bytree by its mean validation RMSE over 10
# random 90/10 splits of the training data, then keep the best candidate.
# (The RMAE_* names notwithstanding, the metric computed here is RMSE.)
RMAE_colsample_bytree = []
for colsample_bytree_Current in colsample_bytree_vector:
    RMAEj = []
    for j in range(1, 11):
        XgBoost_Fit = xgb.XGBRegressor(objective='reg:linear',
                                       colsample_bytree=colsample_bytree_Current,
                                       learning_rate=Optimal_Learning_Rate,
                                       max_depth=Optimal_Max_Depth, alpha=0,
                                       n_estimators=Min_MAE_Index,
                                       min_child_weight=Optimal_min_child_weight,
                                       gamma=Optimal_gamma, subsample=Optimal_subsample,
                                       reg_lambda=0)
        # Seed each split with the repeat number: every candidate is then
        # scored on the same 10 splits (a paired comparison instead of one
        # confounded by split noise) and the cell is reproducible.
        X_Train, X_Validation, Y_Train, Y_Validation = train_test_split(
            Train_X, Train_Y, test_size=0.1, random_state=j)
        XgBoost_Fit.fit(X_Train, Y_Train)
        Preds = XgBoost_Fit.predict(X_Validation)
        RMAEj.append(np.sqrt(mean_squared_error(Y_Validation, Preds)))
    _mean_rmse = np.mean(RMAEj)
    RMAE_colsample_bytree.append(_mean_rmse)
    print("RMSE: %f" % _mean_rmse)
# argmin returns the first minimum, matching list.index(min(...)).
Optimal_colsample_bytree = colsample_bytree_vector[int(np.argmin(RMAE_colsample_bytree))]

In [None]:
# Display the selected colsample_bytree as the cell output.
Optimal_colsample_bytree

## Selection of alpha

In [None]:
# Candidate alpha values to compare, spanning 0 up to 100.
alpha_vector = [
    0,
    0.001, 0.005,
    0.01, 0.05,
    0.5,
    1, 3, 5,
    10, 50, 100,
]

In [None]:
# Score every candidate alpha by its mean validation RMSE over 10 random 90/10
# splits of the training data, then keep the best candidate.
# (The RMAE_* names notwithstanding, the metric computed here is RMSE.)
RMAE_alpha = []
for alpha_Current in alpha_vector:
    RMAEj = []
    for j in range(1, 11):
        XgBoost_Fit = xgb.XGBRegressor(objective='reg:linear',
                                       colsample_bytree=Optimal_colsample_bytree,
                                       learning_rate=Optimal_Learning_Rate,
                                       max_depth=Optimal_Max_Depth, alpha=alpha_Current,
                                       n_estimators=Min_MAE_Index,
                                       min_child_weight=Optimal_min_child_weight,
                                       gamma=Optimal_gamma, subsample=Optimal_subsample,
                                       reg_lambda=0)
        # Seed each split with the repeat number: every candidate is then
        # scored on the same 10 splits (a paired comparison instead of one
        # confounded by split noise) and the cell is reproducible.
        X_Train, X_Validation, Y_Train, Y_Validation = train_test_split(
            Train_X, Train_Y, test_size=0.1, random_state=j)
        XgBoost_Fit.fit(X_Train, Y_Train)
        Preds = XgBoost_Fit.predict(X_Validation)
        RMAEj.append(np.sqrt(mean_squared_error(Y_Validation, Preds)))
    _mean_rmse = np.mean(RMAEj)
    RMAE_alpha.append(_mean_rmse)
    print("RMSE: %f" % _mean_rmse)
# argmin returns the first minimum, matching list.index(min(...)).
Optimal_alpha = alpha_vector[int(np.argmin(RMAE_alpha))]

In [None]:
# Display the selected alpha as the cell output.
Optimal_alpha

## Trained XgBoost Model

In [None]:
# Fit the final regressor on the full training set using the selected
# hyperparameters. start_time is taken before the model is built so that the
# elapsed time can be computed from it afterwards.
start_time = time.time()
_final_settings = dict(
    objective='reg:linear',
    colsample_bytree=Optimal_colsample_bytree,
    learning_rate=Optimal_Learning_Rate,
    max_depth=Optimal_Max_Depth,
    alpha=Optimal_alpha,
    n_estimators=Min_MAE_Index,
    min_child_weight=Optimal_min_child_weight,
    gamma=Optimal_gamma,
    subsample=Optimal_subsample,
    reg_lambda=0,
)
XgBoost_Fit = xgb.XGBRegressor(**_final_settings)
XgBoost_Fit.fit(Train_X, Train_Y)

In [None]:
# Predict on the test features and report the time elapsed since start_time.
# NOTE(review): start_time is set in a different cell, so elapsed_time also
# includes any idle time between running the two cells.
Preds = XgBoost_Fit.predict(Test_X)
elapsed_time = time.time() - start_time
print(elapsed_time)
# Test-set RMSE. Arguments are given as (y_true, y_pred), matching the sklearn
# signature and the order used in the tuning loops; the value is unchanged
# because squared error is symmetric.
print("RMSE: %f" % (np.sqrt(mean_squared_error(Test_Y, Preds))))

In [None]:
# Display the largest value in Preds (the test-set predictions when the cells
# are run in order) as the cell output.
np.max(Preds)

## Prediction for Whole Dataset

In [None]:
# Stack train and test features (train rows first) under a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
All_Data = pd.concat([Train_X, Test_X], ignore_index=True)

# Predict for every row; note that this overwrites the previous Preds.
Preds = XgBoost_Fit.predict(All_Data)

# Place the predictions next to the ids. Rows are aligned by position:
# All_Data_ID is built in the same train-then-test order with the same index.
All_Data_Pred = pd.concat([All_Data_ID, pd.DataFrame(Preds, columns=['Prediction'])], axis=1)

In [None]:
# Write the id/prediction table to 'All_Data_Pred.csv' (a relative path, so it
# lands in the current working directory).
All_Data_Pred.to_csv('All_Data_Pred.csv')

## Feature Importance

In [None]:
# Set the default figure size BEFORE plotting: rcParams only affect figures
# created afterwards, so setting it after plot_importance left this plot at
# the previous default size.
plt.rcParams['figure.figsize'] = [19, 10]
# Plot the 10 most important features of the fitted model.
xgb.plot_importance(XgBoost_Fit, max_num_features=10)
plt.show()

## Reference

Why MAE?

https://medium.com/human-in-a-machine-world/mae-and-rmse-which-metric-is-better-e60ac3bde13d

XgBoost

https://www.datacamp.com/community/tutorials/xgboost-in-python

Tuning Parameter for XgBoost

https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

XgBoost Website

https://xgboost.readthedocs.io/en/latest/parameter.html