In [None]:
!pip install xgboost

In [None]:
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBRegressor
from sklearn import metrics
import random

In [None]:
# Upload local files into the runtime through the google.colab helper.
# Presumably these are the test2_PCA.csv / train2_PCA.csv files read by the
# next cell — confirm. `uploaded` is not referenced again in the visible cells.
from google.colab import files
uploaded = files.upload()

In [None]:
# Read the Kaggle test set and the training set from CSV files in the
# current working directory.
test_kaggle = pd.read_csv("test2_PCA.csv")
data_train = pd.read_csv("train2_PCA.csv")

# Preview the first rows of each frame (test set first, then training set).
for preview_frame in (test_kaggle, data_train):
    print(preview_frame.head())

**Sequentially Divided Training Dataset**

Numpy Array

In [None]:
# Convert the training frame (without its 'ID' column) to a plain array, then
# split it: every column except the last is a feature, the last is the target.
data_train_array = np.array(data_train.drop(columns='ID'))
X_train, Y_train = data_train_array[:, :-1], data_train_array[:, -1]

# Report the shapes in the order: full array, features, target.
for shaped in (data_train_array, X_train, Y_train):
    print(shaped.shape)

nfolds = 5
# Rows per validation fold. `//` is floor division, so when the row count is
# not a multiple of nfolds the trailing remainder rows never fall inside a
# validation fold built from nfolds runs of test_CV_N rows.
test_CV_N = len(X_train) // nfolds

**RandomSearch Parameter Setting**

In [None]:
# Search space for the random search over XGBoost hyper-parameters.
LearningRate_Max, LearningRate_Min = 1, 0.1
LearningRate_Step = 0.1
# Number of learning rates on the grid. The float quotient is rounded so the
# count is an exact int instead of a float such as 10.0.
LearningRate_N = round((LearningRate_Max - LearningRate_Min) / LearningRate_Step) + 1
# np.linspace takes the count explicitly, so both endpoints are always present
# exactly once; np.arange with a fractional step can gain or lose the last
# element through floating-point rounding.
LearningRate = np.linspace(LearningRate_Min, LearningRate_Max, LearningRate_N)  # 0.1, 0.2, ..., 1.0

# Start at 100 rather than 0: a draw of n_estimators=0 builds no trees, so the
# trial would be wasted on a model that has learned nothing.
nEstimators = np.arange(100, 5000+100, 100)  # 100, 200, ..., 5000
MaxDepth = np.arange(1, 9, 2)  # 1, 3, 5, 7
MinChildWeight = np.arange(1, 9, 2)  # 1, 3, 5, 7

# Draw one configuration to sanity-check the search space.
ParamDict = {"learning_rate": random.choice(LearningRate), "n_estimators": random.choice(nEstimators),
             "max_depth": random.choice(MaxDepth), "min_child_weight": random.choice(MinChildWeight)}
print(ParamDict)

In [None]:
# Number of random draws for the random search. Keep it below the number of
# distinct combinations in the search space:
# len(LearningRate) * len(nEstimators) * len(MaxDepth) * len(MinChildWeight).
RandonSearch_N = 200

**Complete Code for RandomSearch**

In [None]:
# Random search: draw RandonSearch_N parameter sets and score each one with
# sequential (unshuffled) k-fold cross-validation.
# Result maps mean CV RMSE -> the parameter dict that produced it. Because the
# RMSE is the key, two parameter sets with an identical mean RMSE would share
# one entry (the later one wins).
Result = {}

# Validation row indices per fold: fold i is the i-th contiguous run of
# test_CV_N rows. They do not depend on the sampled parameters, so they are
# built once instead of once per trial and fold.
fold_test_idx = [np.arange(i * test_CV_N, (i + 1) * test_CV_N) for i in range(nfolds)]

# Iterate over a range instead of counting RandonSearch_N down to zero, so the
# trial budget is left intact and this cell can be re-run.
for trial in range(RandonSearch_N):
    print(RandonSearch_N - trial - 1)  # trials remaining after this one
    ParamDict = {"learning_rate": random.choice(LearningRate), "n_estimators": random.choice(nEstimators),
                "max_depth": random.choice(MaxDepth), "min_child_weight": random.choice(MinChildWeight)}
    # A combination that was already evaluated is skipped but still consumes
    # one trial, so fewer than RandonSearch_N results may be stored.
    if ParamDict in Result.values():
        continue

    rmse_CV = 0
    for test_CV_idx in fold_test_idx:

        ## get train and test data in CV
        test_CV_x, test_CV_y = X_train[test_CV_idx, :], Y_train[test_CV_idx]
        train_CV_x, train_CV_y = np.delete(X_train, test_CV_idx, axis=0), np.delete(Y_train, test_CV_idx)

        ## train xgb model
        xgb_CV = XGBRegressor(
            learning_rate = ParamDict["learning_rate"],
            n_estimators = ParamDict["n_estimators"],
            max_depth = ParamDict["max_depth"],
            min_child_weight = ParamDict["min_child_weight"],
            objective='reg:squarederror',
            subsample=0.8,
            colsample_bytree=0.8)
        xgb_CV.fit(train_CV_x, train_CV_y)

        # accumulate the fold RMSE; arguments follow scikit-learn's
        # (y_true, y_pred) order (the value is the same either way for MSE)
        rmse_CV += np.sqrt(metrics.mean_squared_error(test_CV_y, xgb_CV.predict(test_CV_x)))

    # store the mean RMSE over the folds
    Result[rmse_CV / nfolds] = ParamDict

In [None]:
# Re-key the random-search results in ascending order of mean CV RMSE.
Result_sorted = {score: Result[score] for score in sorted(Result)}
# One column per evaluated parameter set (labelled by its RMSE), one row per
# hyper-parameter.
Result_df = pd.DataFrame(Result_sorted)
Result_df

In [None]:
# Save the random-search table twice: once under the fixed "200" label and once
# labelled with the number of result columns actually present.
random_search_csv_names = [
    "RandomSearch v1_200.csv",
    "RandomSearch v1_"+str(Result_df.shape[1])+".csv",
]
for random_search_csv in random_search_csv_names:
    Result_df.to_csv(random_search_csv)

**GridSearch Parameter Setting**

In [None]:
# Grid for the grid search.
# Learning rates 0.01, 0.02, ..., 0.10. They are built from an integer range and
# then scaled: np.arange(0.01, 0.1+0.01, 0.01) overshoots through floating-point
# rounding and appends an unintended 11th value (~0.11).
LearningRate_GS = np.arange(1, 10+1) / 100
nEstimators_GS = np.arange(1200, 1800+30, 30) # 1200, 1230, ..., 1800 (1500 +/- 300, i.e. +/- 20%, in 2% steps)
MaxDepth_GS = np.arange(2, 5+1, 1) # 2, 3, 4, 5
MinChildWeight_GS = np.arange(2, 5+1, 1) # 2, 3, 4, 5
# Total number of grid points.
GridSearch_N = len(LearningRate_GS) * len(nEstimators_GS) * len(MaxDepth_GS) * len(MinChildWeight_GS)

# Enumerate every combination. ParamComb maps a running index to its parameter
# dict, with learning_rate varying slowest and min_child_weight fastest.
count = 0
ParamComb = {}
for lr in LearningRate_GS:
    for n in nEstimators_GS:
        for d in MaxDepth_GS:
            for c in MinChildWeight_GS:
                ParamComb[count] = {"learning_rate": lr, "n_estimators": n, "max_depth": d, "min_child_weight": c}
                count += 1

In [None]:
# Show how many grid points exist, then list them ordered by learning rate,
# number of estimators and max depth (the sort is stable, so entries that tie
# on these three keep their original relative order).
print(len(ParamComb))
sorted(
    ParamComb.items(),
    key=lambda entry: tuple(entry[1][field] for field in ("learning_rate", "n_estimators", "max_depth")),
)

**Complete Code for GridSearch**

In [None]:
# Evaluate only 300 grid points in the grid-search loop (ParamComb indices
# 0-299). This replaces the full-grid count computed when the grid was built.
GridSearch_N = 300


In [None]:

# Grid search: score the first GridSearch_N entries of ParamComb with
# sequential (unshuffled) k-fold cross-validation.
Result_GS = {} # mean CV RMSE -> parameter dict (equal RMSEs share one entry)
Param_GS = {}  # ParamComb index -> mean CV RMSE

# Validation row indices per fold: fold i is the i-th contiguous run of
# test_CV_N rows. They are the same for every grid point, so build them once.
fold_test_idx = [np.arange(i * test_CV_N, (i + 1) * test_CV_N) for i in range(nfolds)]

# Never index past the end of ParamComb, and leave GridSearch_N untouched so
# this cell can be re-run. Indices are visited from high to low.
n_grid_eval = min(GridSearch_N, len(ParamComb))
for comb_idx in reversed(range(n_grid_eval)):
    print(comb_idx)

    # work on a copy so the stored result does not alias the ParamComb entry
    ParamDict = dict(ParamComb[comb_idx])

    rmse_CV = 0
    for test_CV_idx in fold_test_idx:
        ## get train and test data in CV
        test_CV_x, test_CV_y = X_train[test_CV_idx, :], Y_train[test_CV_idx]
        train_CV_x, train_CV_y = np.delete(X_train, test_CV_idx, axis=0), np.delete(Y_train, test_CV_idx)

        ## train xgb model
        xgb_CV = XGBRegressor(
            learning_rate = ParamDict["learning_rate"],
            n_estimators = ParamDict["n_estimators"],
            max_depth = ParamDict["max_depth"],
            min_child_weight = ParamDict["min_child_weight"],
            objective='reg:squarederror',
            subsample=0.8,
            colsample_bytree=0.8)
        xgb_CV.fit(train_CV_x, train_CV_y)
        # accumulate the fold RMSE; arguments follow scikit-learn's
        # (y_true, y_pred) order (the value is the same either way for MSE)
        rmse_CV += np.sqrt(metrics.mean_squared_error(test_CV_y, xgb_CV.predict(test_CV_x)))
    # store the mean RMSE over the folds under both lookups
    Param_GS[comb_idx] = rmse_CV / nfolds
    Result_GS[rmse_CV / nfolds] = ParamDict

In [None]:
# Collect the mean-RMSE keys of the grid-search results and show how many
# distinct scores were stored.
rmse = [*Result_GS]
len(rmse)

In [None]:
# Re-key the grid-search results in ascending order of mean CV RMSE.
Result_GS_sorted = {score: Result_GS[score] for score in sorted(Result_GS)}
# One column per evaluated grid point (labelled by its RMSE), one row per
# hyper-parameter.
Result_GS_df = pd.DataFrame(Result_GS_sorted)
print(Result_GS_df)

In [None]:
# Save the grid-search table, labelling the file with the number of result
# columns it contains.
grid_search_csv = "GridSearch v1_"+str(Result_GS_df.shape[1])+".csv"
Result_GS_df.to_csv(grid_search_csv)

In [None]:
# Save the same table under a fixed index-range label. The "0-300" text is
# hard-coded and is not derived from GridSearch_N.
grid_search_range_csv = "GridSearch v1_"+"0-300"+".csv"
Result_GS_df.to_csv(grid_search_range_csv)