**Q1决策树**

*实现基于信息增益率进行划分选择的决策树算法，   
对男女生样本数据中的（喜欢颜色，喜欢运动，喜欢文学，鞋码）4个特征进行分类，   
计算模型预测性能（包含 SE、SP、ACC），并以友好的方式图示化结果。*

`1.导入模块`

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import graphviz
import os

`2.处理数据`

In [None]:
def pre_data(file_path):
    """Load the survey spreadsheet and build a standardized feature matrix.

    The colour column is ordinal-encoded, stacked with the sport and
    literature columns, and the resulting three columns are standardized
    with a ``StandardScaler`` fitted on the whole table.

    Args:
        file_path: Path of the Excel file passed to ``pd.read_excel``.

    Returns:
        A ``(data_feature, data_label)`` tuple: the standardized feature
        array (encoded colour, sport, literature) and the values of the
        gender column (the column name suggests 1 = male, 0 = female).
    """
    table = pd.read_excel(file_path)

    # Turn the categorical colour answers into numeric codes.
    colour_frame = table[["喜欢颜色"]]
    colour_codes = OrdinalEncoder().fit_transform(colour_frame)
    print(colour_codes, colour_frame)

    # Stack the encoded colour with the remaining feature columns.
    other_columns = pd.DataFrame(table, columns=['喜欢运动', '喜欢文学']).values
    stacked = np.hstack((colour_codes, other_columns))

    # Zero-mean / unit-variance scaling, fitted on the full data set.
    standardizer = preprocessing.StandardScaler().fit(stacked)
    return standardizer.transform(stacked), table.性别男1女0.values

# Load the course spreadsheet from the `data` directory (relative to the
# working directory) and keep the features/labels as module-level names.
data_feature, data_label = pre_data(os.path.join('data', '作业数据_2021合成.xls'))

`3.数据分层` 

In [None]:
def split_data(data_feature, data_label):
    """Make one stratified 80/20 train/test split of the data.

    Args:
        data_feature: Feature array, indexed by row with integer index arrays.
        data_label: Label array aligned with ``data_feature``; used for
            stratification.

    Returns:
        ``(data_train, data_test, label_train, label_test)``.
    """
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

    # n_splits=1, so the generator yields exactly one (train, test) index pair.
    train_index, test_index = next(splitter.split(data_feature, data_label))

    data_train = data_feature[train_index]
    data_test = data_feature[test_index]
    label_train = data_label[train_index]
    label_test = data_label[test_index]

    # Report the sizes of the resulting partitions.
    print(len(data_train), len(data_test), len(label_train))
    return data_train, data_test, label_train, label_test

# Split once at module level; `vitualize` reads these four names from the
# module scope for its depth sweep.
data_train, data_test, label_train, label_test = split_data(data_feature, data_label)

`4.训练`

In [None]:
def train(data_train, data_test, label_train, label_test):
    """Fit an entropy-criterion decision tree and score it on both splits.

    The tree uses random splitting, a fixed random seed of 30 and a maximum
    depth of 7.

    Args:
        data_train: Training features.
        data_test: Test features.
        label_train: Training labels.
        label_test: Test labels.

    Returns:
        ``(score_train, score, clf)``: the classifier's ``score`` on the
        training split, its ``score`` on the test split, and the fitted
        classifier itself.
    """
    clf = DecisionTreeClassifier(
        max_depth=7,
        splitter='random',
        random_state=30,
        criterion="entropy",
    ).fit(data_train, label_train)

    train_accuracy = clf.score(data_train, label_train)
    test_accuracy = clf.score(data_test, label_test)
    return train_accuracy, test_accuracy, clf

# Train on the module-level split and print the training score followed by
# the test score.
score_train, score, clf = train(data_train, data_test, label_train, label_test)
print(score_train, score)

`5.可视化`

In [None]:
def vitualize(clf):
    """Render the fitted tree, print feature importances and plot a depth sweep.

    Steps:
      1. Export ``clf`` to DOT with feature/class labels and colours, and
         render it through graphviz under the base name ``"classify"``.
      2. Print the raw feature importances and the same values paired with
         their feature names.
      3. Retrain a tree for every ``max_depth`` from 1 to 10 and plot the
         resulting test-split score against the depth.

    Step 3 reads ``data_train``, ``label_train``, ``data_test`` and
    ``label_test`` from the enclosing module scope; they are not parameters.

    Args:
        clf: A fitted decision-tree classifier trained on three features
            (named colour, sport, literature in the export).

    Returns:
        None.
    """
    feature_name = ['color', 'sport', 'literature']

    # Export once, with labels and colours. The earlier unlabeled exports and
    # throw-away Source objects were discarded and have been dropped.
    dot_data = tree.export_graphviz(clf, out_file=None, feature_names=feature_name,
                                    class_names=['girl', 'boy'], filled=True, rounded=True,
                                    special_characters=True)

    # Use the render method to write the graph to disk.
    graph = graphviz.Source(dot_data)
    graph.render("classify")

    print(clf.feature_importances_)
    print([*zip(feature_name, clf.feature_importances_)])

    # Search for the best pruning parameter (max_depth). A separate local name
    # is used so the `clf` argument is not rebound inside the loop.
    depths = range(1, 11)
    test = []
    for depth in depths:
        candidate = tree.DecisionTreeClassifier(criterion="entropy", random_state=30,
                                                splitter='random', max_depth=depth)
        candidate.fit(data_train, label_train)
        test.append(candidate.score(data_test, label_test))

    plt.plot(depths, test, color="red", label="max_depth")
    plt.xlabel('决策树深度')
    plt.ylabel('ACC')
    plt.legend()
    plt.show()

# Render the tree trained above and show the depth-vs-score plot.
vitualize(clf)