# AdaBoost DecisionTree

In [1]:
import logging, os, warnings
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from data_helper import XY_from_df
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from bincls import BinaryClassificationAverageReport
from sklearn.metrics import classification_report, confusion_matrix

# sklearn ignore warnings
warnings.filterwarnings('ignore')

## Data Preparing

In [2]:
SEED = 5
FOLD = 10
TRAIN_PATH = "../dataset/train.csv"
TARGET_NAMES = ["bad", "good"]

np.random.seed(SEED)
df_train = pd.read_csv(TRAIN_PATH)
X, Y = XY_from_df(df_train)

stratified_folder = StratifiedKFold(n_splits=FOLD, random_state=SEED, shuffle=False)

## Args tunning ( RandomForestClassifier )
### Decision Tree
- max_features :選擇最適屬性時劃分的特徵不能超過此值。 當為整數時，即最大特徵數；當為小數時，訓練集特徵數*小數；
- max_depth : (default=None)設置樹的最大深度，默認為None，這樣建樹時，會使每一個葉節點只有一個類別，或是達到min_samples_split。
- min_samples_split :根據屬性劃分節點時，每個劃分最少的樣本數。
- min_samples_leaf :葉子節點最少的樣本數。
- max_leaf_nodes : (default=None)葉子樹的最大樣本數。
- min_weight_fraction_leaf : (default=0)葉子節點所需要的最小權值。

### Adaboost
- base_estimator:基分類器，默認是決策樹，在該分類器基礎上進行boosting，理論上可以是任意一個分類器，但是如果是其他分類器時需要指明樣本權重。
- n_estimators:基分類器提升（循環）次數，默認是50次，這個值過大，模型容易過擬合；值過小，模型容易欠擬合。
- learning_rate:學習率，表示梯度收斂速度，默認為1，如果過大，容易錯過最優值，如果過小，則收斂速度會很慢；該值需要和n_estimators進行一個權衡，當分類器迭代次數較少時，學習率可以小一些，當迭代次數較多時，學習率可以適當放大。
- algorithm:boosting算法，也就是模型提升準則，有兩種方式SAMME, 和SAMME.R兩種，默認是SAMME.R，兩者的區別主要是弱學習器權重的度量，前者是對樣本集預測錯誤的概率進行劃分的，後者是對樣本集的預測錯誤的比例，即錯分率進行劃分的，默認是用的SAMME.R。  

In [3]:
def cross_valid_process(stratified_folder, X, Y, model, report, mode="report", up=False):
    
    def upsampling(X,Y,train_index):
        pos = []
        for i, t in enumerate(Y[train_index]):
            if t == 0: pos.append(i)
        X_new = np.append(X[train_index], X[pos], axis=0)
        Y_new = np.append(Y[train_index], Y[pos], axis=0)
        idxs = [i for i in range(len(Y_new))]
        idxs = shuffle(idxs, random_state=3)
        return X_new[idxs], Y_new[idxs]
    
    for train_index, valid_index in stratified_folder.split(X, Y):
        if up:
            X_train ,Y_train = upsampling(X,Y,train_index)
        else:
            X_train ,Y_train = X[train_index], Y[train_index]
        if mode == "report": print(".", end=" ")
        m = model
        m.fit(X_train, Y_train)
        Y_valid_pred = m.predict(X[valid_index])
        cm = confusion_matrix(Y[valid_index], Y_valid_pred)
        report.cm_append(cm)
    if mode == "report":
        report.avg_cm_report()
        return None
    if mode == "obj":
        return report.object_score()

## Argument Combination

In [10]:
max_obj_score = 0
candidates = []

max_depth = [1,2]
min_samples_leaf = [4, 5, 10]
min_samples_split= [10,15,20]
algo = ["SAMME.R","SAMME",]

for md in max_depth:
    for msl in min_samples_leaf:
        for mss in min_samples_split:
            for a in algo:
                for up in [True, False]:
                    print(md,msl,mss,a,up)
                    report = BinaryClassificationAverageReport(TARGET_NAMES)
                    dt = DecisionTreeClassifier(max_depth=md, min_samples_leaf=msl, min_samples_split=mss )
                    bdt = AdaBoostClassifier(dt, algorithm=a, n_estimators=500, learning_rate=0.3, random_state= SEED)
                    obj_score = cross_valid_process(stratified_folder, X, Y, bdt, report, mode="obj", up=up)
                    if obj_score >= max_obj_score:
                        candidates.append((md, msl,mss, a, up, obj_score))
                        max_obj_score = obj_score

1 4 10 SAMME.R True
1 4 10 SAMME.R False
1 4 10 SAMME True
1 4 10 SAMME False
1 4 15 SAMME.R True
1 4 15 SAMME.R False
1 4 15 SAMME True
1 4 15 SAMME False
1 4 20 SAMME.R True
1 4 20 SAMME.R False
1 4 20 SAMME True
1 4 20 SAMME False
1 5 10 SAMME.R True
1 5 10 SAMME.R False
1 5 10 SAMME True
1 5 10 SAMME False
1 5 15 SAMME.R True
1 5 15 SAMME.R False
1 5 15 SAMME True
1 5 15 SAMME False
1 5 20 SAMME.R True
1 5 20 SAMME.R False
1 5 20 SAMME True
1 5 20 SAMME False
1 10 10 SAMME.R True
1 10 10 SAMME.R False
1 10 10 SAMME True
1 10 10 SAMME False
1 10 15 SAMME.R True
1 10 15 SAMME.R False
1 10 15 SAMME True
1 10 15 SAMME False
1 10 20 SAMME.R True
1 10 20 SAMME.R False
1 10 20 SAMME True
1 10 20 SAMME False
2 4 10 SAMME.R True
2 4 10 SAMME.R False
2 4 10 SAMME True
2 4 10 SAMME False
2 4 15 SAMME.R True
2 4 15 SAMME.R False
2 4 15 SAMME True
2 4 15 SAMME False
2 4 20 SAMME.R True
2 4 20 SAMME.R False
2 4 20 SAMME True
2 4 20 SAMME False
2 5 10 SAMME.R True
2 5 10 SAMME.R False
2 5 10 SAMM

In [11]:
candidates

[(1, 4, 10, 'SAMME.R', True, 0.6401323714324441),
 (1, 4, 15, 'SAMME.R', True, 0.6401323714324441),
 (1, 4, 20, 'SAMME.R', True, 0.6401323714324441),
 (1, 10, 10, 'SAMME.R', True, 0.6519454772801614),
 (1, 10, 15, 'SAMME.R', True, 0.6519454772801614),
 (1, 10, 20, 'SAMME.R', True, 0.6519454772801614),
 (2, 4, 10, 'SAMME.R', True, 0.6888876072665755),
 (2, 5, 15, 'SAMME.R', True, 0.7020063671883592),
 (2, 10, 10, 'SAMME.R', True, 0.7101335865225722),
 (2, 10, 15, 'SAMME.R', True, 0.7101335865225722),
 (2, 10, 20, 'SAMME.R', True, 0.7101335865225722)]

## Robustness and Performance Good

In [12]:
md = 1
msl = 4
mss = 10

a = "SAMME.R"

report = BinaryClassificationAverageReport(TARGET_NAMES)

dt = DecisionTreeClassifier(max_depth=md, min_samples_leaf=msl, min_samples_split=mss)
bdt = AdaBoostClassifier(dt, algorithm=a, n_estimators=500, learning_rate=0.3, random_state= SEED)

cross_valid_process(stratified_folder, X, Y, bdt, report, "report", True)

. . . . . . . . . . 

Below number are the average of 10 fold.

bad
             precision:    58.41%
                recall:    57.00%
                    F1:    57.00%
good
             precision:    82.00%
                recall:    82.43%
                    F1:    82.05%
---------------------------------
             weight_F1:    74.53%
                   acc:    74.80%



## Final Model

In [13]:
def upsampling(X,Y):
    pos = []
    for i, t in enumerate(Y):
        if t == 0: pos.append(i)
    X_new = np.append(X, X[pos], axis=0)
    Y_new = np.append(Y, Y[pos], axis=0)
    idxs = [i for i in range(len(Y_new))]
    idxs = shuffle(idxs, random_state=3)
    return X_new[idxs], Y_new[idxs]

X_new, Y_new = upsampling(X,Y)

md = 1
msl = 4
mss = 10

a = "SAMME.R"

dt = DecisionTreeClassifier(max_depth=md, min_samples_leaf=msl, min_samples_split=mss)
final_bdt = AdaBoostClassifier(dt, algorithm=a, n_estimators=500, learning_rate=0.3, random_state= SEED)

final_bdt.fit(X_new, Y_new)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=4,
                                                         min_samples_split=10,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                         

In [15]:
import pickle

model_name = 'best_mdoel.pickle'

with open(model_name, 'wb') as f:
    pickle.dump(final_bdt, f)