In [1]:
%matplotlib inline
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
sns.set()

In [2]:
# Read the pre-split baseline spreadsheets; column 0 of each sheet becomes the row index.
df1 = pd.read_excel('training_data_baseline.xlsx', index_col=0)
df2 = pd.read_excel('testing_data_baseline.xlsx', index_col=0)

# In both frames the last column is used as the label and every column before it as a feature.
X_train, y_train = df1.iloc[:, :-1], df1.iloc[:, -1]
X_test, y_test = df2.iloc[:, :-1], df2.iloc[:, -1]

## **決策樹**

In [27]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited Gini tree. random_state is fixed so that re-running the notebook
# rebuilds the same tree (scikit-learn shuffles candidate features at each split).
clf2 = DecisionTreeClassifier(criterion='gini', max_depth=10, random_state=0)
clf2.fit(X_train, y_train)

# Accuracy on the data the tree was fitted on, then on the held-out test split.
# The second line was previously mislabelled as a training score.
print('Training data score: {}'.format(clf2.score(X_train, y_train)))
print('Testing data score: {}'.format(clf2.score(X_test, y_test)))


Training data score: 0.8792115231259968
Training data score: 0.8782598010870822


## **決策樹**
* 不要求對資料標準化
* sklearn中並不能容忍特徵值的缺失
* 可以學習忽略與任務無關的特徵，決定哪些特徵是有用的
* 支援多輸出任務
* 小型決策樹可以使用sklearn tree模組中的export_graphviz，輕鬆解釋和視覺化

## **Testing**

## **混淆矩陣解讀**
https://www.libinx.com/2018/understanding-sklearn-classification-report/

In [28]:
from sklearn.metrics import classification_report

# Predict labels for the test features, then print the per-class
# precision / recall / F1 table comparing them with the true labels.
predictions = clf2.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.85      0.91      0.88     69432
           1       0.91      0.84      0.87     68920

    accuracy                           0.88    138352
   macro avg       0.88      0.88      0.88    138352
weighted avg       0.88      0.88      0.88    138352



In [30]:
# Report accuracy on both splits. The second value is computed on X_test / y_test,
# so it is labelled as the testing score (it was previously mislabelled as training).
print('Training data score: {}'.format(clf2.score(X_train, y_train)))
print('Testing data score: {}'.format(clf2.score(X_test, y_test)))

Training data score: 0.8792115231259968
Training data score: 0.8782598010870822


## Feature Importance

In [29]:
from sklearn.inspection import permutation_importance

# Shuffle each feature 30 times on the test split and record the resulting score drops.
r = permutation_importance(clf2, X_test, y_test,
                           n_repeats=30,
                           random_state=0)

# Pad the name column to the longest feature name plus a gap. The previous fixed
# width of 8 let longer names run straight into the number that follows them.
name_width = max((len(str(col)) for col in X_test.columns), default=0) + 2

# Walk the features from most to least important.
for i in r.importances_mean.argsort()[::-1]:
    # Only report features whose mean importance stays above zero
    # after subtracting two standard deviations.
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
        print(f"{X_test.columns[i]:<{name_width}}"
              f"{r.importances_mean[i]:.3f}"
              f" +/- {r.importances_std[i]:.3f}")


stick_level20.252 +/- 0.001
recency_m0.231 +/- 0.001
DiagnosisCode_DESC0.229 +/- 0.001
illness_desc0.222 +/- 0.001
cust_group20.220 +/- 0.001
BundleSubtype20.161 +/- 0.001
REIMBURSED_YR_TW0.094 +/- 0.001
WEALTH_LEVEL0.058 +/- 0.001
ternure_m0.056 +/- 0.001
REG_his 0.026 +/- 0.000
GENDER  0.024 +/- 0.000
REG     0.022 +/- 0.000
AHb_his 0.016 +/- 0.000
SIN_his 0.015 +/- 0.000
DIGI_FLG0.011 +/- 0.000
ILP     0.011 +/- 0.000
ILP_his 0.009 +/- 0.000
AHb     0.009 +/- 0.000
AHa     0.008 +/- 0.000
AHd     0.008 +/- 0.000
AHc     0.006 +/- 0.000
SIN     0.004 +/- 0.000
AHd_his 0.003 +/- 0.000
VIP_CLASS0.003 +/- 0.000
TOPCARD 0.001 +/- 0.000
AHc_his 0.001 +/- 0.000
AHa_his 0.000 +/- 0.000
VIP     0.000 +/- 0.000


## **不同參數的詳細結果**

## **視覺化**

In [32]:
from sklearn import tree

# Dump the fitted tree as Graphviz DOT text into tree.dot.
# feature_names is converted to a plain list: a pandas Index is reportedly rejected by
# the argument validation in some scikit-learn releases (TODO confirm which versions),
# while a list is accepted everywhere.
tree.export_graphviz(
    clf2,
    out_file="tree.dot",
    feature_names=list(X_train.columns),
    class_names=['neg', 'pos'],
)

In [33]:
import pydot

# graph_from_dot_file returns a sequence of graphs; unpacking into a one-element
# target keeps the single graph and fails loudly if the file holds any other number.
[graph] = pydot.graph_from_dot_file('tree.dot')

In [34]:
# Render the graph parsed from tree.dot into the image file tree.png.
graph.write_png('tree.png')