In [6]:
%matplotlib inline
import pandas as pd
from sklearn.model_selection import train_test_split


In [7]:
df1 = pd.read_excel('training_data.xlsx',index_col=0)
df2 = pd.read_excel('testing_data.xlsx',index_col=0)
X_train = df1.iloc[:,:-1]
y_train = df1.iloc[:,-1]
X_test = df2.iloc[:,:-1]
y_test = df2.iloc[:,-1]

## **不調參的決策樹**

In [89]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion='entropy')
clf.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [94]:
print('Training data score: {}'.format(clf.score(X_train, y_train)))
print('Training data score: {}'.format(clf.score(X_test, y_test)))

Training data score: 1.0
Training data score: 0.9237969028056492


In [8]:
## 畫畫用
clf2 = DecisionTreeClassifier(criterion='entropy',max_depth=5)
clf2.fit(X_train,y_train)
print('Training data score: {}'.format(clf2.score(X_train, y_train)))
print('Training data score: {}'.format(clf2.score(X_test, y_test)))

Training data score: 0.8948569808773232
Training data score: 0.894566657831964


## **建立Pipeline模型並自動調參數**

In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
pipeline = Pipeline([('clf',DecisionTreeClassifier(criterion='entropy'))])

## **決策樹**
* 不要求對資料標準化
* sklearn中並不能容忍特徵職的缺失
* 可以學習忽略與任務無關的特徵，決定那些特徵是有用的
* 支援多輸出任務
* 小型決策樹可以使用sklearn tree模組中的export_graphviz，輕鬆解釋和視覺化

In [2]:
## 需要調參數的部位
parameters = {'clf__max_depth':(20,100,500),
              'clf__min_samples_split':(20,100,500),
              'clf__min_samples_leaf':(2,3,4)}

In [3]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')

In [4]:
grid_search.fit(X_train, y_train)

NameError: name 'X_train' is not defined

In [104]:
## 回傳最好的參數
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('{}:{}'.format(param_name,best_parameters[param_name]))

clf__max_depth:100
clf__min_samples_leaf:20
clf__min_samples_split:20


In [105]:
## 最好的score
grid_search.best_score_

0.9175343327439514

## **Testing**

In [106]:
grid_search.score(X_test, y_test)

0.920046191481926

## **混淆矩陣解讀**
https://www.libinx.com/2018/understanding-sklearn-classification-report/

In [96]:
from sklearn.metrics import classification_report
predictions = grid_search.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92     66056
           1       0.92      0.92      0.92     65999

    accuracy                           0.92    132055
   macro avg       0.92      0.92      0.92    132055
weighted avg       0.92      0.92      0.92    132055



In [107]:
print('Training data score: {}'.format(grid_search.score(X_train, y_train)))
print('Training data score: {}'.format(grid_search.score(X_test, y_test)))

Training data score: 0.9362802709885092
Training data score: 0.920046191481926


## **不同參數的詳細結果**

In [112]:
pd.DataFrame(grid_search.cv_results_).tail()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__max_depth,param_clf__min_samples_leaf,param_clf__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
22,5.563213,0.217165,0.071406,0.008236,500,100,100,"{'clf__max_depth': 500, 'clf__min_samples_leaf...",0.908102,0.907831,0.907556,0.910576,0.908653,0.908544,0.001079,9
23,5.334195,0.302861,0.067406,0.005161,500,100,500,"{'clf__max_depth': 500, 'clf__min_samples_leaf...",0.906543,0.905632,0.905499,0.909548,0.905454,0.906535,0.001558,17
24,4.406127,0.15284,0.066806,0.006432,500,500,20,"{'clf__max_depth': 500, 'clf__min_samples_leaf...",0.896825,0.896561,0.897292,0.898454,0.896505,0.897127,0.000719,19
25,4.509134,0.229263,0.063606,0.006408,500,500,100,"{'clf__max_depth': 500, 'clf__min_samples_leaf...",0.896825,0.896561,0.897292,0.898454,0.896505,0.897127,0.000719,19
26,4.102302,0.229378,0.048808,0.007726,500,500,500,"{'clf__max_depth': 500, 'clf__min_samples_leaf...",0.896825,0.896561,0.897292,0.898454,0.896505,0.897127,0.000719,19


## **視覺化**

In [17]:
from sklearn import tree
tree.export_graphviz(clf2,out_file="tree.dot",feature_names=X_train.columns,class_names=['neg','pos'])

In [18]:
import pydot
(graph, ) = pydot.graph_from_dot_file('tree.dot')

In [19]:
graph.write_png('tree.png')

In [None]:
# export_graphviz(clf, out_file="adspy_temp.dot", feature_names=feature_names, class_names=class_names, filled = True, impurity = False)
from sklearn.tree import export_graphviz
import graphviz
with open("tree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)

In [None]:
# Alternate method using pydotplus, if installed.
import pydotplus
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
graph = pydotplus.graphviz.graph_from_dot_data(dot_graph)
graph.create_png()

# **問題**

Q1:怎麼決定哪個feature好? future importance

In [None]:
def plot_decision_tree(clf, feature_names, class_names):
    # This function requires the pydotplus module and assumes it's been installed.
    # In some cases (typically under Windows) even after running conda install, there is a problem where the
    # pydotplus module is not found when running from within the notebook environment.  The following code
    # may help to guarantee the module is installed in the current notebook environment directory.
    #
    # import sys; sys.executable
    # !{sys.executable} -m pip install pydotplus

    export_graphviz(clf, out_file="adspy_temp.dot", feature_names=feature_names, class_names=class_names, filled = True, impurity = False)
    with open("adspy_temp.dot") as f:
        dot_graph = f.read()
    # Alternate method using pydotplus, if installed.
    # graph = pydotplus.graphviz.graph_from_dot_data(dot_graph)
    # return graph.create_png()
    return graphviz.Source(dot_graph)