# Random Forest

In [1]:
import logging, os, warnings
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from data_helper import XY_from_df
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from bincls import BinaryClassificationAverageReport
from sklearn.metrics import classification_report, confusion_matrix

# sklearn ignore warnings
warnings.filterwarnings('ignore')

## Data Preparing

In [2]:
SEED = 5
FOLD = 10
TRAIN_PATH = "../dataset/train.csv"
TARGET_NAMES = ["bad", "good"]

np.random.seed(SEED)
df_train = pd.read_csv(TRAIN_PATH)
X, Y = XY_from_df(df_train)

stratified_folder = StratifiedKFold(n_splits=FOLD, random_state=SEED, shuffle=False)

## Args tunning ( RandomForestClassifier )

### Decision Tree
- max_features :選擇最適屬性時劃分的特徵不能超過此值。 當為整數時，即最大特徵數；當為小數時，訓練集特徵數*小數；
- max_depth : (default=None)設置樹的最大深度，默認為None，這樣建樹時，會使每一個葉節點只有一個類別，或是達到min_samples_split。
- min_samples_split :根據屬性劃分節點時，每個劃分最少的樣本數。
- min_samples_leaf :葉子節點最少的樣本數。
- max_leaf_nodes : (default=None)葉子樹的最大樣本數。
- min_weight_fraction_leaf : (default=0)葉子節點所需要的最小權值。

### Random Forest
- n_estimators =10：決策樹的個數，越多越好，但是性能就會越差，至少100左右，可以達到可接受的性能和誤差率。 
- bootstrap =True ：是否有放回的採樣。  
- oob_score=False：oob（out of band，帶外）數據，即：在某次決策樹訓練中沒有被bootstrap選中的數據。多單個模型的參數訓練，我們知道可以用cross validation（cv）來進行，但是特別消耗時間，而且對於隨機森林這種情況也沒有大的必要，所以就用這個數據對決策樹模型進行驗證，算是一個簡單的交叉驗證。性能消耗小，但是效果不錯。  
- n_jobs=1：並行job個數。這個在ensemble算法中非常重要，尤其是bagging（而非boosting，因為boosting的每次迭代之間有影響，所以很難進行並行化），因為可以並行從而提高性能。1=不並行；n：n個並行；-1：CPU有多少core，就啟動多少job
- warm_start=False：熱啟動，決定是否使用上次調用該類的結果然後增加新的。  
- class_weight=None：各個label的權重。  


## 10 cross validation

In [3]:
def cross_valid_process(stratified_folder, X, Y, model, report, mode="report", up=False):
    
    def upsampling(X,Y,train_index):
        pos = []
        for i, t in enumerate(Y[train_index]):
            if t == 0: pos.append(i)
        X_new = np.append(X[train_index], X[pos], axis=0)
        Y_new = np.append(Y[train_index], Y[pos], axis=0)
        idxs = [i for i in range(len(Y_new))]
        idxs = shuffle(idxs, random_state=3)
        return X_new[idxs], Y_new[idxs]
    
    for train_index, valid_index in stratified_folder.split(X, Y):
        if up:
            X_train ,Y_train = upsampling(X,Y,train_index)
        else:
            X_train ,Y_train = X[train_index], Y[train_index]
        if mode == "report": print(".", end=" ")
        m = model
        m.fit(X_train, Y_train)
        Y_valid_pred = m.predict(X[valid_index])
        cm = confusion_matrix(Y[valid_index], Y_valid_pred)
        report.cm_append(cm)
    if mode == "report":
        report.avg_cm_report()
        return None
    if mode == "obj":
        return report.object_score()

## Argument Combination 

In [4]:
max_obj_score = 0
candidates = []

max_features = [0.2, 0.5, 1.]
min_samples_leaf = [10, 20]
class_weight =[{0:2.}, {0:5.}, {0:10.}]

for mf in max_features:
    for msl in min_samples_leaf:
        for cw in class_weight:
            for up in [True, False]:
                print(mf, msl, cw, up)
                report = BinaryClassificationAverageReport(TARGET_NAMES)
                rf = RandomForestClassifier(n_estimators = 500, max_features=mf, bootstrap =True, oob_score=True, n_jobs=3,\
                                            min_samples_leaf=msl, random_state=SEED, class_weight=cw)
                obj_score = cross_valid_process(stratified_folder, X, Y, rf, report, mode="obj", up=up)
                if obj_score >= max_obj_score:
                    candidates.append((mf, msl, cw, up, obj_score))
                    max_obj_score = obj_score

0.2 10 {0: 2.0} True
0.2 10 {0: 2.0} False
0.2 10 {0: 5.0} True
0.2 10 {0: 5.0} False
0.2 10 {0: 10.0} True
0.2 10 {0: 10.0} False
0.2 20 {0: 2.0} True
0.2 20 {0: 2.0} False
0.2 20 {0: 5.0} True
0.2 20 {0: 5.0} False
0.2 20 {0: 10.0} True
0.2 20 {0: 10.0} False
0.5 10 {0: 2.0} True
0.5 10 {0: 2.0} False
0.5 10 {0: 5.0} True
0.5 10 {0: 5.0} False
0.5 10 {0: 10.0} True
0.5 10 {0: 10.0} False
0.5 20 {0: 2.0} True
0.5 20 {0: 2.0} False
0.5 20 {0: 5.0} True
0.5 20 {0: 5.0} False
0.5 20 {0: 10.0} True
0.5 20 {0: 10.0} False
1.0 10 {0: 2.0} True
1.0 10 {0: 2.0} False
1.0 10 {0: 5.0} True
1.0 10 {0: 5.0} False
1.0 10 {0: 10.0} True
1.0 10 {0: 10.0} False
1.0 20 {0: 2.0} True
1.0 20 {0: 2.0} False
1.0 20 {0: 5.0} True
1.0 20 {0: 5.0} False
1.0 20 {0: 10.0} True
1.0 20 {0: 10.0} False


In [8]:
candidates

[(0.2, 10, {0: 2.0}, True, 0.7361493398639518),
 (0.2, 10, {0: 5.0}, True, 0.7943834391153833),
 (0.5, 10, {0: 5.0}, True, 0.7951712604045212)]

## Robustness and Performance Good

In [9]:
mf = 0.5
msl = 10
cw = {0: 5.0}

report = BinaryClassificationAverageReport(TARGET_NAMES)

rf = RandomForestClassifier(n_estimators = 500, max_features=mf, bootstrap =True, oob_score=True, n_jobs=3,\
                                        min_samples_leaf=msl, random_state=SEED, class_weight=cw)

cross_valid_process(stratified_folder, X, Y, rf, report, "report", True)

. . . . . . . . . . 

Below number are the average of 10 fold.

bad
             precision:    46.99%
                recall:    87.67%
                    F1:    61.02%
good
             precision:    91.70%
                recall:    57.00%
                    F1:    69.98%
---------------------------------
             weight_F1:    67.29%
                   acc:    66.20%



## Final Model

In [15]:
mf = 0.5
msl = 10
cw = {0: 5.0}

def upsampling(X,Y):
    pos = []
    for i, t in enumerate(Y):
        if t == 0: pos.append(i)
    X_new = np.append(X, X[pos], axis=0)
    Y_new = np.append(Y, Y[pos], axis=0)
    idxs = [i for i in range(len(Y_new))]
    idxs = shuffle(idxs, random_state=3)
    return X_new[idxs], Y_new[idxs]

X_new, Y_new = upsampling(X,Y)

final_rf = RandomForestClassifier(n_estimators = 500, max_features=mf, bootstrap =True, oob_score=True, n_jobs=3,\
                                        min_samples_leaf=msl, random_state=SEED, class_weight=cw)
final_rf.fit(X_new, Y_new)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight={0: 5.0},
                       criterion='gini', max_depth=None, max_features=0.5,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=3,
                       oob_score=True, random_state=5, verbose=0,
                       warm_start=False)

In [16]:
import pickle

model_name = 'best_mdoel.pickle'

with open(model_name, 'wb') as f:
    pickle.dump(final_rf, f)