## 决策树算法

决策树对每个特征的筛选根据信息增益区分

In [3]:
import numpy as np
import pandas as pd
from sklearn import datasets

> 导入贷款申请样本数据表

In [4]:
data = [
    ['青年','否','否','一般','否'],
    ['青年','否','否','好','否'],
    ['青年','是','否','好','是'],
    ['青年','是','是','一般','是'],
    ['青年','否','否','一般','否'],
    ['中年','否','否','一般','否'],
    ['中年','否','否','好','否'],
    ['中年','是','是','好','是'],
    ['中年','否','是','非常好','是'],
    ['中年','否','是','非常好','是'],
    ['老年','否','是','非常好','是'],
    ['老年','否','是','好','是'],
    ['老年','是','否','好','是'],
    ['老年','是','否','非常好','是'],
    ['老年','否','否','一般','否'],
]

headers = ['年龄','有工作','有自己的房子','信贷情况','类别']

dataset = pd.DataFrame(data,columns=headers)

In [5]:
dataset

Unnamed: 0,年龄,有工作,有自己的房子,信贷情况,类别
0,青年,否,否,一般,否
1,青年,否,否,好,否
2,青年,是,否,好,是
3,青年,是,是,一般,是
4,青年,否,否,一般,否
5,中年,否,否,一般,否
6,中年,否,否,好,否
7,中年,是,是,好,是
8,中年,否,是,非常好,是
9,中年,否,是,非常好,是


#### 定义一个函数来计算熵

In [6]:
def H(x):
    K = x.value_counts().values
    D = K.sum()
    P = K/D
    return -(P*np.log2(P)).sum()

#### 定义一个函数来计算条件熵 H(Y|X) = SUM(H(Y|X=x_i))

In [7]:
def Hyx(y,x):
    Kname = x.value_counts().index
    K = x.value_counts()
    Kname = K.index
    D = K.sum()
    Hyx = 0.0
    for k in Kname:
        Hyx = Hyx + K[k]/D*H(y[x==k])
    return Hyx

#### 定义信息增益函数g(D,A1) = H(D) - H(Y|A1)

In [8]:
def g(D,A1):
    return H(D) - Hyx(D,A1)

In [9]:
def g_radio(D,A1):
    h = H(A1)
    if h == 0:
        return 0
    return g(D,A1)/H(A1)

#### 根据信息增益选择最好的特征

In [10]:
def select_feature(x,y,metric='entropy'):
    C = x.columns
    max_g = -1
    feature = -1
    for c in C:
        if metric == 'entropy':
            c_g = g(y, x[c])
        else:
            c_g = g_radio(y,x[c])
        if c_g >= max_g:
            max_g = c_g
            feature = c
    return feature,max_g

In [15]:
X = dataset.drop(['类别'],axis=1)
Y = dataset['类别']
print(select_feature(X,Y,metric='entropy_radio'))

('有自己的房子', 0.4325380677663126)


In [89]:
class Node:
    
    def __init__(self,fname="",fdict={},leaf=False,tag="",y_labels=None): # y_labels记录叶节点的分布情况 方便剪枝
        self.feature = fname
        self.fdict = fdict
        self.tag = tag
        self.leaf = leaf
        self.y_labels = y_labels
        

In [187]:
class Dicision_Tree:
    
    def __init__(self,type='ID3'):
        self.type = type
            
    def _H(self,x):
        K = x.value_counts().values
        D = K.sum()
        P = K/D
        return -(P*np.log2(P)).sum()
    
    def _Hyx(self,y,x):
        Kname = x.value_counts().index
        K = x.value_counts()
        Kname = K.index
        D = K.sum()
        Hyx = 0.0
        for k in Kname:
            Hyx = Hyx + K[k]/D*self._H(y[x==k])
        return Hyx
    
    def _g(self,D,A1):
        return self._H(D) - self._Hyx(D,A1)
    
    def _g_radio(self,D,A1):
        h = H(A1)
        if h == 0:
            return 0
        return self._g(D,A1)/self._H(A1)
    
    def _select_feature(self,x,y,metric='entropy'):
        C = x.columns
        max_g = -1
        feature = -1
        for c in C:
            if metric == 'entropy':
                c_g = self._g(y, x[c])
            else:
                c_g = self._g_radio(y,x[c])
            if c_g > max_g:
                max_g = c_g
                feature = c
        print('选择',feature,'增益：',max_g)
        return feature,max_g
    
    
    def _build_tree(self,dataset,labels,epsilon):
        Ck_num = labels.value_counts().shape[0]
        # 所有实例属于同一类Ck
        if Ck_num == 1:
            # 设T为单节点数，并将类Ck作为该节点的类标记
            ck = labels[labels.index[0]]
            return Node(leaf=True,tag=ck,y_labels=labels) # 叶节点记录y值
        # 如果dataset没有其他属性了 那么将T设置为单节点数，投票决定Ck
        if dataset.shape[1] == 0:
            ck = labels.value_counts().index[0]
            return Node(leaf=True,tag=ck,y_labels=labels) # 叶节点记录y值

        # 特征选择
        metric = 'entropy'
        if self.type == 'C4.5':
            metric = 'ent_radio'
        fname,Ag = self._select_feature(dataset,labels,metric=metric)
        # 判断信息增益是否小于阈值
        if Ag < epsilon:
            ck = labels.value_counts().index[0]
            return Node(leaf=True,tag=ck,y_labels=labels) # 叶节点记录y值

        # 按照fname列的每一个属性值进一步分类
        kname = dataset[fname].value_counts().index

        # 判断是否属于同一类
        fdict = {}
        for k in kname:
            flags = (dataset[fname]==k) 
            # 删去已经作为分类标准的属性
            dat = dataset[flags].drop([fname],axis=1)
            lab = labels[flags]
            # 将这一路径加入子节点
            fdict[k] = self._build_tree(dataset=dat,labels=lab,epsilon=epsilon)
        return Node(fname=fname,fdict=fdict)
    
    def _get_index(self,feature):
        return int(np.argwhere(self.columns==feature))
    
    def fit(self,dataset=None,labels=None,epsilon=0.1):
        if not isinstance(dataset,pd.DataFrame):
            dataset = pd.DataFrame(dataset)
        if not isinstance(labels,pd.Series):
            labels = pd.Series(labels)
        self.columns = dataset.columns
        self.root = self._build_tree(dataset,labels,epsilon)
        
    def _predict(self,X):
        # 对单个数据做预测
        node = self.root #获取根节点
        while not node.leaf: # 当节点不是叶节点一直循环
            feature = node.feature # 获取属性值
            index = self._get_index(feature) # 获取属性值对应的缩影
            value = X[index] # 获取测试数据对应索引的值
            node = node.fdict[value]
        return node.tag
        
    def predict(self,X):
        # 对多个数据做预测
        if isinstance(X,pd.DataFrame):
            X = X.values
        y_pre = []
        for x in X:
            y_pre.append(self._predict(x))
        return y_pre
    
    # 深度搜索 计算损失
    def deep_loss(self,node):
        if not node.leaf:
            print('我不是叶节点')
            fdict = node.fdict
            H = 0
            num = 0 # 统计此节点下的信息熵和叶节点个数
            for value in fdict:
                n = fdict[value]
                a,b = self.deep_loss(n)
                H = H + a
                num = num + b
            return H,num
        print('我是叶节点')
        H = self._H(node.y_labels)
        return H,1
    
    def loss(self):
        return self.deep_loss(self.root)
    
    # 决策数剪枝算法
    def cut_ai():
        return 0

In [188]:
id_tree = Dicision_Tree('C4.5')
id_tree.fit(X,Y,0.1)

选择 有自己的房子 增益： 0.4325380677663126
选择 有工作 增益： 1.0


In [189]:
id_tree.loss()

我不是叶节点
我不是叶节点
我是叶节点
我是叶节点
我是叶节点


(0.0, 3)

# 西瓜书测试数据

In [192]:
dataset = [
    ['青绿','蜷缩','浊响','清晰','凹陷','硬滑','是'],
    ['乌黑','蜷缩','沉闷','清晰','凹陷','硬滑','是'],
    ['乌黑','蜷缩','浊响','清晰','凹陷','硬滑','是'],    
    ['青绿','稍蜷','浊响','清晰','稍凹','软粘','是'],
    ['乌黑','稍蜷','浊响','稍糊','稍凹','软粘','是'],    
    ['青绿','硬挺','清脆','清晰','平坦','软粘','否'],
    ['浅白','稍蜷','沉闷','稍糊','凹陷','硬滑','否'],
    ['乌黑','稍蜷','浊响','清晰','稍凹','软粘','否'],
    ['浅白','蜷缩','浊响','模糊','平坦','硬滑','否'],
    ['青绿','蜷缩','沉闷','稍糊','稍凹','硬滑','否'],
]

valset = [
    ['青绿','蜷缩','沉闷','清晰','凹陷','硬滑','是'],
    ['浅白','蜷缩','浊响','清晰','凹陷','硬滑','是'],
    ['乌黑','稍蜷','浊响','清晰','稍凹','硬滑','是'],
    ['乌黑','稍蜷','沉闷','稍糊','稍凹','硬滑','否'],
    ['浅白','硬挺','清脆','模糊','平坦','硬滑','否'],
    ['浅白','蜷缩','浊响','模糊','平坦','软粘','否'],
    ['青绿','稍蜷','浊响','稍糊','凹陷','硬滑','否'],
]

columns = ['色泽','根蒂','敲声','纹理','脐部','触感','好瓜']

Train_data = pd.DataFrame(dataset,columns=columns)
Val_data = pd.DataFrame(dataset,columns=columns)

X_train = Train_data.drop(['好瓜'],axis=1)
y_train = Train_data['好瓜']


X_val = Val_data.drop(['好瓜'],axis=1)
y_val = Val_data['好瓜']

In [193]:
id_tree = Dicision_Tree()
id_tree.fit(X_train,y_train,0)

选择 色泽 增益： 0.2754887502163468
选择 根蒂 增益： 0.31127812445913283
选择 纹理 增益： 1.0
选择 敲声 增益： 1.0


In [194]:
id_tree.loss()

我不是叶节点
我不是叶节点
我是叶节点
我不是叶节点
我是叶节点
我是叶节点
我不是叶节点
我是叶节点
我是叶节点
我是叶节点
我是叶节点


(0.0, 7)

In [174]:
node = id_tree.root

In [181]:
node.fdict['清晰'].fdict['稍蜷'].fdict

{'乌黑': <__main__.Node at 0x7f49276efef0>,
 '青绿': <__main__.Node at 0x7f4927a9afd0>}