## 一个例子

In [3]:
import pandas as pd 

# 原始数据
datasets = [
    ['晴', 29, 85, '否', '0'],
    ['晴', 26, 88, '是', '0'],
    ['多云', 28, 78, '否', '1'],
    ['雨', 21, 96, '否', '1'],
    ['雨', 20, 80, '否', '1'],
    ['雨', 18, 70, '是', '0'],
    ['多云', 18, 65, '是', '1'],
    ['晴', 22, 90, '否', '0'],
    ['晴', 21, 68, '否', '1'],
    ['雨', 24, 80, '否', '1'],
    ['晴', 24, 63, '是', '1'],
    ['多云', 22, 90, '是', '1'],
    ['多云', 27, 75, '否', '1'],
    ['雨', 21, 80, '是', '0']
]
# 数据的列名
labels = ['天气', '温度', '湿度', '是否有风', '是否前往游乐场']
# 将湿度大小分为大于 75 和小于等于 75 这两个属性值，
# 将温度大小分为大于 26 和小于等于 26 这两个属性值
for i in range(len(datasets)):
    if datasets[i][2] > 75:
        datasets[i][2] = '>75'
    else:
        datasets[i][2] = '<=75'
    if datasets[i][1] > 26:
        datasets[i][1] = '>26'
    else:
        datasets[i][1] = '<=26'
# 构建 dataframe 并查看数据
df = pd.DataFrame(datasets, columns=labels)
df

Unnamed: 0,天气,温度,湿度,是否有风,是否前往游乐场
0,晴,>26,>75,否,0
1,晴,<=26,>75,是,0
2,多云,>26,>75,否,1
3,雨,<=26,>75,否,1
4,雨,<=26,>75,否,1
5,雨,<=26,<=75,是,0
6,多云,<=26,<=75,是,1
7,晴,<=26,>75,否,0
8,晴,<=26,<=75,否,1
9,雨,<=26,>75,否,1


## 计算信息熵

In [5]:
def calc_entropy(total_num, count_dict):
    """
    计算信息熵
    :param total_num: 总样本数, 例如总的样本数是 14
    :param count_dict: 每类样本及其对应数目的字典，例如：{'前往游乐场': 9, '不前往游乐场': 5}
    :return: 信息熵
    """    
    ent = 0    
    for key, value in count_dict.items():        
        if value > 0:            
            pk = float(value)/total_num            
            ent -= (pk * np.log2(pk))    
    return round(ent, 3)


## 用`Sklearn`训练决策树模型

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# 按属性和标签载入数据，共150个数据集
X, y = load_iris(return_X_y=True)

# 切分训练集合测试集，将150个数据集一分为二，100个作为训练集，50个作为测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# 初始化模型，可以调整 max_depth 来观察模型的表现， 
# 也可以调整 criterion  为 gini 来使用 gini 指数构建决策树
clf = tree.DecisionTreeClassifier()
# 训练模型
clf = clf.fit(X_train, y_train)

In [6]:
# 使用 graphviz 来展示构建好的决策树

import graphviz

feature_names = ['萼片长度','萼片宽度','花瓣长度','花瓣宽度']
target_names = ['山鸢尾', '杂色鸢尾', '维吉尼亚鸢尾']
# 可视化生成的决策树
dot_data = tree.export_graphviz(clf, out_file=None,
                     feature_names=feature_names,
                     class_names=target_names,
                     filled=True, rounded=True,
                     special_characters=True)
graph = graphviz.Source(dot_data)
graph

ModuleNotFoundError: No module named 'graphviz'