In [None]:
# Import
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [None]:
# Read & Preprocessing Data
bankRaw = pd.read_csv("data/Bank/bank_train.csv")

# Integer-encode only the non-numeric columns. Running LabelEncoder over every
# column would also replace each numeric value with the rank of that value
# among the column's distinct values, discarding the original magnitudes.
# NOTE(review): numeric columns are now passed through unchanged, so any
# missing values in them must be handled before fitting -- confirm the CSV
# has none.
bank = bankRaw.copy()
for col in bankRaw.select_dtypes(exclude="number").columns:
    # A fresh encoder per column keeps each column's code mapping independent.
    bank[col] = LabelEncoder().fit_transform(bankRaw[col])

In [None]:
# General Fitting
# Fit a single boosted model on the complete data set and score it on that
# same data, so the value printed below is in-sample (training) accuracy.
xTrain = bank.drop(columns='y')
yTrain = bank['y']

cl = GradientBoostingClassifier(
    n_estimators=20000,
    learning_rate=0.3,
    max_depth=1,
    random_state=0,
)
rs = cl.fit(xTrain, yTrain)

# Predict on the very rows the model was trained on.
pr = rs.predict(xTrain)
trainAccuracy = accuracy_score(yTrain, pr)
print("Accuracy: ", trainAccuracy)
# Recorded runs, presumably n_estimators/learning_rate/max_depth = accuracy
# (the last entry matches the settings used above):
# 20000/0.2/2 = 0.9862445565770374
# 20000/0.3/2 = 0.9939517522637727
# 20000/0.3/1 = 0.9134927766641322

In [None]:
# KFold for test
# Shuffled K-fold cross-validation: every fold is held out once while a fresh
# model is fitted on the remaining rows; accuracy is recorded on both parts.
nSplits = 20  # single source of truth: sizes the score arrays AND the splitter
scoresTest = np.zeros(nSplits)
scoresTrain = np.zeros(nSplits)
cv = KFold(nSplits, shuffle=True, random_state=0)
for i, (idxTrain, idxTest) in enumerate(cv.split(bank)):
    bankTrain = bank.iloc[idxTrain]
    bankTest = bank.iloc[idxTest]

    # Prepare training and held-out data (built once, reused for fit/predict)
    yTrain = bankTrain['y']
    xTrain = bankTrain.drop('y', axis=1)
    yTest = bankTest['y']
    xTest = bankTest.drop('y', axis=1)

    # Fitting
    cl = GradientBoostingClassifier(random_state=0, n_estimators=100, learning_rate=0.2)
    rs = cl.fit(xTrain, yTrain)

    # Score on the held-out fold, then on the rows used for fitting
    pr = rs.predict(xTest)
    scoresTest[i] = accuracy_score(yTest, pr)
    prTrain = rs.predict(xTrain)
    scoresTrain[i] = accuracy_score(yTrain, prTrain)

# Report the mean accuracy over all folds.
print("Train Set Accuracy: ", np.mean(scoresTrain))
print("Test Set Accuracy: ", np.mean(scoresTest))