<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#C4.5-决策树实验验证" data-toc-modified-id="C4.5-决策树实验验证-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>C4.5 决策树实验验证</a></span></li></ul></div>

## C4.5 决策树实验验证

In [1]:
import numpy as np
from collections import Counter
from math import log
import pandas as pd

In [2]:
class DTNode:
    def __init__(self, i_feature=None, label=None):
        # 该节点包含的标记
        self.i_feature = i_feature
        # 该节点的分类标签
        self.label = label
        self.tree = {}
        
    def isLeaf(self):
        return len(self.tree) == 0
    
    def addChild(self, feature_value, child_node):
        self.tree[feature_value] = child_node
        
    def _predict(self, x):
        if self.isLeaf():
            return self.label
        return self.tree[x[self.i_feature]]._predict(x)
    
    def predict(self, X):
        result = []
        for x in X:
            result.append(self._predict(x))
        return result
    
    def __repr__(self):
        # 通过在类中重写这个方法，从而实现当输出实例化对象时，输出我们想要的信息。
        return 'lanel={}, i_feature={}, subtreee={}'.format(self.label, self.i_feature, self.tree)

In [23]:
class C45Tree:
    def __init__(self, threshold=0.001):
        self.threshold = threshold
        self.root = None
    
    def CalcDatasetEntropy(self, Y):
        N = len(Y)
        labels_counter = Counter(Y)
        return -sum([Ni/N*log(Ni/N, 2) for Ni in labels_counter.values()])
    
    def CalConditionEntropy(self, X, Y, i_feature):
        N = len(X)
        subsets = {}
        for i in range(N):
            feature_value = X[i][i_feature]
            if feature_value not in subsets:
                subsets[feature_value] = []
            subsets[feature_value].append(Y[i])
        return sum([len(subset)/N*self.CalcDatasetEntropy(subset) \
                    for subset in subsets.values()])
    
    def CalInformationGainRatio(self, X, Y, i_feature):
        return self.CalConditionEntropy(X, Y, i_feature) \
                / self.CalcDatasetEntropy(Y) - 1
    
    def SelectBestFeature(self, X, Y, used):
        feature_info = []
        for i in range(len(X[0])):
            if i in used:
                continue
            feature_info.append((i, self.CalInformationGainRatio(X, Y, i)))
        print(feature_info)
        return min(feature_info, key=lambda x:x[-1])
    
    '''
    function : 使用数训练得到ID3决策树
    '''
    def train(self, X, Y, used):
        if (len(X[0]) - len(used) == 0):
            return DTNode(label=max(Counter(Y), key=lambda x:x[-1])[0])
        
        if len(Counter(Y)) == 1:
            return DTNode(label=Y[0])
        
        best_i_feature, best_cond_entropy = self.SelectBestFeature(X, Y, used)
        
        # 信息熵小于阈值
        if self.CalcDatasetEntropy(Y) - best_cond_entropy < self.threshold:
            return DTNode(label=max(Counter(Y), key=lambda x:x[-1])[0])    
        
        # 非叶子节点
        current_dtnode = DTNode(i_feature=best_i_feature)
        used.append(best_i_feature)
        
        # 重新划分 X 与 Y
        subsets = {}
        for i in range(len(X)):
            feature_value = X[i][best_i_feature]
            if feature_value not in subsets:
                subsets[feature_value] = []
            subsets[feature_value].append((X[i], Y[i]))
        
        # 处理每一个特征离散值对应的数据
        for feature_value, newdataset in subsets.items():
            New_X = np.array([xy[0] for xy in newdataset])
            New_Y = np.array([xy[1] for xy in newdataset])
            child = self.train(New_X, New_Y, used)
            current_dtnode.addChild(feature_value, child)
        
        return current_dtnode
    
    '''
    function : 生成决策树
    '''
    def fit(self, X, Y):
        self.root = self.train(X, Y, [])
        print(self.root)
        print('ID1 dection tree training completed !')
        
    '''
    预测
    '''
    def predict(self, X):
        return self.root.predict(X)

In [24]:
datasets = [['青年', '否', '否', '一般', '否'],
            ['青年', '否', '否', '好', '否'],
            ['青年', '是', '否', '好', '是'],
            ['青年', '是', '是', '一般', '是'],
            ['青年', '否', '否', '一般', '否'],
            ['中年', '否', '否', '一般', '否'],
            ['中年', '否', '否', '好', '否'],
            ['中年', '是', '是', '好', '是'],
            ['中年', '否', '是', '非常好', '是'],
            ['中年', '否', '是', '非常好', '是'],
            ['老年', '否', '是', '非常好', '是'],
            ['老年', '否', '是', '好', '是'],
            ['老年', '是', '否', '好', '是'],
            ['老年', '是', '否', '非常好', '是'],
            ['老年', '否', '否', '一般', '否'],
            ]
labels = [u'年龄', u'有工作', u'有自己的房子', u'信贷情况', u'类别']
train_data = pd.DataFrame(datasets, columns=labels)
X = np.array(train_data.iloc[:, :-1])
Y = np.array(train_data.iloc[:, -1])
train_data

Unnamed: 0,年龄,有工作,有自己的房子,信贷情况,类别
0,青年,否,否,一般,否
1,青年,否,否,好,否
2,青年,是,否,好,是
3,青年,是,是,一般,是
4,青年,否,否,一般,否
5,中年,否,否,一般,否
6,中年,否,否,好,否
7,中年,是,是,好,是
8,中年,否,是,非常好,是
9,中年,否,是,非常好,是


In [25]:
C45_dt = C45Tree()
C45_dt.fit(X, Y)

[(0, -0.08549096146585067), (1, -0.33333333333333337), (2, -0.43253806776631254), (3, -0.3738496733100589)]
[(0, -0.27401754212128093), (1, -1.0), (3, -0.5160116947475206)]
lanel=None, i_feature=2, subtreee={'否': lanel=None, i_feature=1, subtreee={'否': lanel=否, i_feature=None, subtreee={}, '是': lanel=是, i_feature=None, subtreee={}}, '是': lanel=是, i_feature=None, subtreee={}}
ID1 dection tree training completed !


In [27]:
XTEST = [['老年', '否', '否', '一般'],
        ['中年', '是', '否', '一般'],
        ['中年', '否', '否', '一般'],
        ['青年', '否', '否', '一般'],
        ['青年', '否', '否', '好'],
        ['青年', '否', '是', '一般'],
        ]
C45_dt.predict(XTEST)

['否', '是', '否', '否', '否', '是']