## 决策树算法

决策树在每个节点上根据信息增益（ID3）或信息增益比（C4.5）来选择用于划分的特征。

In [3]:
import numpy as np
import pandas as pd
from sklearn import datasets

> 导入贷款申请样本数据表

In [4]:
# Column names of the loan-application table; the last column ('类别') is the
# one later used as the class label.
headers = ['年龄', '有工作', '有自己的房子', '信贷情况', '类别']

# One row per sample, values in the same order as `headers`.
data = [
    ['青年', '否', '否', '一般', '否'],
    ['青年', '否', '否', '好', '否'],
    ['青年', '是', '否', '好', '是'],
    ['青年', '是', '是', '一般', '是'],
    ['青年', '否', '否', '一般', '否'],
    ['中年', '否', '否', '一般', '否'],
    ['中年', '否', '否', '好', '否'],
    ['中年', '是', '是', '好', '是'],
    ['中年', '否', '是', '非常好', '是'],
    ['中年', '否', '是', '非常好', '是'],
    ['老年', '否', '是', '非常好', '是'],
    ['老年', '否', '是', '好', '是'],
    ['老年', '是', '否', '好', '是'],
    ['老年', '是', '否', '非常好', '是'],
    ['老年', '否', '否', '一般', '否'],
]

# Wrap the raw rows in a DataFrame so columns can be addressed by name.
dataset = pd.DataFrame(data=data, columns=headers)

In [5]:
# Display the DataFrame as the cell's output.
dataset

Unnamed: 0,年龄,有工作,有自己的房子,信贷情况,类别
0,青年,否,否,一般,否
1,青年,否,否,好,否
2,青年,是,否,好,是
3,青年,是,是,一般,是
4,青年,否,否,一般,否
5,中年,否,否,一般,否
6,中年,否,否,好,否
7,中年,是,是,好,是
8,中年,否,是,非常好,是
9,中年,否,是,非常好,是


#### 定义一个函数来计算熵

In [6]:
def H(x):
    """Return the empirical entropy (base 2) of the values in ``x``.

    Args:
        x: pandas Series whose distinct values are treated as classes.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value reported by ``x.value_counts()``.
    """
    counts = x.value_counts().values
    total = counts.sum()
    probs = counts / total
    weighted_logs = probs * np.log2(probs)
    return -np.sum(weighted_logs)

#### 定义一个函数来计算条件熵 $H(Y|X) = \sum_i P(X=x_i)\,H(Y|X=x_i)$

In [7]:
def Hyx(y,x):
    """Return the conditional entropy H(Y|X) in bits.

    For every distinct value ``v`` of ``x`` the entropy of the matching
    subset ``y[x == v]`` is computed with ``H`` and weighted by the share of
    rows in which ``x`` equals ``v``.

    Args:
        y: pandas Series of target values.
        x: pandas Series of the conditioning feature; ``x == v`` is used as a
            boolean mask on ``y``, so both must be alignable.

    Returns:
        The weighted sum of the per-value entropies (0.0 when ``x`` is empty).
    """
    # value_counts() is evaluated once; the original computed it twice and
    # threw the first result away.
    counts = x.value_counts()
    total = counts.sum()
    # Named differently from the function so the local does not shadow it.
    cond_entropy = 0.0
    for value, count in counts.items():
        cond_entropy = cond_entropy + count/total*H(y[x==value])
    return cond_entropy

#### 定义信息增益函数 $g(D,A) = H(D) - H(D|A)$

In [8]:
def g(D,A1):
    """Return the information gain of feature ``A1`` with respect to ``D``.

    Args:
        D: pandas Series of target values.
        A1: pandas Series of the candidate feature.

    Returns:
        ``H(D)`` minus the conditional entropy ``Hyx(D, A1)``.
    """
    prior_entropy = H(D)
    remaining_entropy = Hyx(D,A1)
    return prior_entropy - remaining_entropy

In [9]:
def g_radio(D,A1):
    """Return the information gain ratio of feature ``A1`` for target ``D``.

    The gain ``g(D, A1)`` is divided by the entropy of the feature itself.

    Args:
        D: pandas Series of target values.
        A1: pandas Series of the candidate feature.

    Returns:
        ``g(D, A1) / H(A1)``, or ``0`` when ``H(A1)`` is zero (the feature has
        a single distinct value, so the ratio would be a division by zero).
    """
    split_entropy = H(A1)
    if split_entropy == 0:
        return 0
    # Reuse the entropy computed above instead of evaluating H(A1) a second time.
    return g(D,A1)/split_entropy

#### 根据信息增益（或信息增益比）选择最好的特征

In [10]:
def select_feature(x,y,metric='entropy'):
    """Pick the column of ``x`` with the highest score against ``y``.

    Args:
        x: DataFrame of candidate feature columns.
        y: Series of target values.
        metric: ``'entropy'`` scores columns with ``g`` (information gain);
            any other value scores them with ``g_radio`` (gain ratio).

    Returns:
        ``(column_name, score)`` of the winning column. Because the comparison
        is ``>=``, a tie is won by the later column. When no column reaches a
        score of at least -1 the result is ``(-1, -1)``.
    """
    use_plain_gain = metric == 'entropy'
    best_name, best_score = -1, -1
    for column in x.columns:
        candidate = x[column]
        score = g(y, candidate) if use_plain_gain else g_radio(y, candidate)
        if score >= best_score:
            best_name, best_score = column, score
    return best_name, best_score

In [15]:
# Target column and feature columns of the loan table.
Y = dataset['类别']
X = dataset.drop(columns=['类别'])
# Any metric other than 'entropy' makes select_feature score by gain ratio.
print(select_feature(X, Y, metric='entropy_radio'))

('有自己的房子', 0.4325380677663126)


In [16]:
class Node:
    """A single node of the decision tree.

    Attributes are plain data:
        feature: name of the feature this node splits on (``fname``).
        fdict: mapping from a feature value to the child node for that value.
        tag: class label stored on the node.
        leaf: True when the node is a leaf.
    """

    def __init__(self,fname="",fdict=None,leaf=False,tag=""):
        self.feature = fname
        # A literal `{}` default would be created once and shared by every
        # Node built without an explicit fdict; allocate a fresh dict instead.
        self.fdict = {} if fdict is None else fdict
        self.tag = tag
        self.leaf = leaf
        

In [82]:
class Dicision_Tree:
    """Decision-tree classifier for categorical features.

    With ``type='C4.5'`` split features are chosen by information gain ratio;
    with any other ``type`` (default ``'ID3'``) they are chosen by plain
    information gain. The tree is stored as nested ``Node`` objects under
    ``self.root`` after ``fit``.
    """

    def __init__(self,type='ID3',verbose=True):
        """
        Args:
            type: ``'C4.5'`` selects the gain-ratio criterion, anything else
                the information-gain criterion.
            verbose: when True (the default, which keeps the previous output)
                the score of every candidate feature is printed during fit.
        """
        self.type = type
        self.verbose = verbose

    def _H(self,x):
        """Return the empirical entropy (base 2) of the values in Series ``x``."""
        counts = x.value_counts().values
        probs = counts/counts.sum()
        return -(probs*np.log2(probs)).sum()

    def _Hyx(self,y,x):
        """Return the conditional entropy of ``y`` given feature ``x``.

        Each distinct value of ``x`` contributes the entropy of the matching
        rows of ``y`` weighted by that value's relative frequency.
        """
        counts = x.value_counts()
        total = counts.sum()
        cond_entropy = 0.0
        for value, count in counts.items():
            cond_entropy = cond_entropy + count/total*self._H(y[x==value])
        return cond_entropy

    def _g(self,D,A1):
        """Return the information gain of feature ``A1`` for target ``D``."""
        return self._H(D) - self._Hyx(D,A1)

    def _g_radio(self,D,A1):
        """Return the gain ratio ``_g(D, A1) / _H(A1)`` (0 when ``_H(A1)`` is 0)."""
        # Use the class's own entropy so the class does not depend on a
        # module-level H being defined, and compute it only once.
        split_entropy = self._H(A1)
        if split_entropy == 0:
            return 0
        return self._g(D,A1)/split_entropy

    def _select_feature(self,x,y,metric='entropy'):
        """Return ``(column, score)`` for the best-scoring column of ``x``.

        ``metric='entropy'`` scores with information gain, any other value
        with gain ratio. Ties go to the later column because of ``>=``.
        """
        max_g = -1
        feature = -1
        for c in x.columns:
            if metric == 'entropy':
                c_g = self._g(y, x[c])
            else:
                c_g = self._g_radio(y,x[c])
            if self.verbose:
                print('feature:',c,'ent:',c_g)
            if c_g >= max_g:
                max_g = c_g
                feature = c
        return feature,max_g

    def _build_tree(self,dataset,labels,epsilon):
        """Recursively build the subtree for ``dataset`` / ``labels``.

        Args:
            dataset: DataFrame holding the feature columns still available.
            labels: Series of class labels sharing ``dataset``'s index.
            epsilon: minimum score a split must reach; below it a leaf is made.

        Returns:
            A ``Node``: a leaf carrying a class tag, or an internal node whose
            ``fdict`` maps each value of the chosen feature to a child.
        """
        Ck_num = labels.value_counts().shape[0]
        # All samples belong to one class: single-node tree tagged with it.
        if Ck_num == 1:
            # Positional access: label-based access returns a Series instead
            # of a scalar when the index contains duplicate labels.
            ck = labels.iloc[0]
            return Node(leaf=True,tag=ck)
        # No features left: single-node tree tagged by majority vote.
        if dataset.shape[1] == 0:
            ck = labels.value_counts().index[0]
            return Node(leaf=True,tag=ck)

        # Feature selection.
        metric = 'entropy'
        if self.type == 'C4.5':
            metric = 'ent_radio'
        fname,Ag = self._select_feature(dataset,labels,metric=metric)
        # Best score below the threshold: stop splitting, majority vote.
        if Ag < epsilon:
            ck = labels.value_counts().index[0]
            return Node(leaf=True,tag=ck)

        # Split on every observed value of the chosen feature.
        kname = dataset[fname].value_counts().index

        fdict = {}
        for k in kname:
            flags = (dataset[fname]==k)
            # Drop the feature that has just been used for the split.
            dat = dataset[flags].drop([fname],axis=1)
            lab = labels[flags]
            # Attach the subtree for this value as a child.
            fdict[k] = self._build_tree(dataset=dat,labels=lab,epsilon=epsilon)
        return Node(fname=fname,fdict=fdict)

    def _get_index(self,feature):
        """Return the positional index of ``feature`` among the training columns."""
        # Index.get_loc replaces int(np.argwhere(...)): converting a 2-D array
        # to a Python int is deprecated in recent NumPy releases.
        return int(self.columns.get_loc(feature))

    def fit(self,dataset=None,labels=None,epsilon=0.1):
        """Build the tree from ``dataset`` and ``labels``.

        Args:
            dataset: DataFrame (or anything ``pd.DataFrame`` accepts) of
                categorical features, one row per sample.
            labels: Series or sequence of class labels, one per row.
            epsilon: minimum split score; smaller scores produce a leaf.
        """
        if not isinstance(dataset,pd.DataFrame):
            dataset = pd.DataFrame(dataset)
        if not isinstance(labels,pd.Series):
            # Positional labels get the rows' own index so the boolean masks
            # built from `dataset` line up with `labels` for any index.
            labels = pd.Series(labels,index=dataset.index)
        self.columns = dataset.columns
        self.root = self._build_tree(dataset,labels,epsilon)

    def _predict(self,X):
        """Predict the class of one sample given as a positional sequence.

        Raises:
            KeyError: a feature value has no branch in the fitted tree.
        """
        node = self.root
        while not node.leaf:
            feature = node.feature
            # Position of the split feature inside the sample.
            value = X[self._get_index(feature)]
            try:
                node = node.fdict[value]
            except KeyError:
                raise KeyError(
                    "value %r of feature %r was not seen during training"
                    % (value, feature)
                ) from None
        return node.tag

    def predict(self,X):
        """Predict the class of every sample in ``X``.

        Args:
            X: DataFrame, or an iterable of positional samples whose values
                follow the training column order.

        Returns:
            A list with one predicted label per sample.
        """
        if isinstance(X,pd.DataFrame):
            # When the frame carries the training columns, select them by name
            # so a different column order (or extra columns) cannot silently
            # shift the positional lookup done in _predict.
            if self.columns.isin(X.columns).all():
                X = X[list(self.columns)]
            X = X.values
        return [self._predict(x) for x in X]

In [83]:
# Fit a tree with type 'C4.5' (features scored by gain ratio) and a split threshold of 0.1.
id_tree = Dicision_Tree(type='C4.5')
id_tree.fit(dataset=X, labels=Y, epsilon=0.1)

feature: 年龄 ent: 0.05237190142858302
feature: 有工作 ent: 0.3524465495205019
feature: 有自己的房子 ent: 0.4325380677663126
feature: 信贷情况 ent: 0.23185388128724224
feature: 年龄 ent: 0.16441052527276862
feature: 有工作 ent: 1.0
feature: 信贷情况 ent: 0.34037448163185724


In [84]:
# Refit with the default type 'ID3' (features scored by information gain), same threshold.
id_tree = Dicision_Tree(type='ID3')
id_tree.fit(dataset=X, labels=Y, epsilon=0.1)

feature: 年龄 ent: 0.08300749985576883
feature: 有工作 ent: 0.32365019815155627
feature: 有自己的房子 ent: 0.4199730940219749
feature: 信贷情况 ent: 0.36298956253708536
feature: 年龄 ent: 0.2516291673878229
feature: 有工作 ent: 0.9182958340544896
feature: 信贷情况 ent: 0.47385138961004514


In [81]:
# Run the tree's predict on the feature frame X; it returns one label per row.
id_tree.predict(X)

['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否']

Unnamed: 0,年龄,有工作,有自己的房子,信贷情况
0,青年,否,否,一般
1,青年,否,否,好
2,青年,是,否,好
3,青年,是,是,一般
4,青年,否,否,一般
5,中年,否,否,一般
6,中年,否,否,好
7,中年,是,是,好
8,中年,否,是,非常好
9,中年,否,是,非常好


In [131]:
# NOTE(review): `build_tree`, `f` and `l` are not defined in any cell visible
# here — presumably leftovers from an earlier standalone tree builder. Confirm
# where they come from, or remove this cell; unless they are defined elsewhere
# it will not run on a fresh kernel.
a = build_tree(f,l)

In [136]:
# Follow the '否' branch twice from `a` and read the `tag` stored on that node.
a.fdict['否'].fdict['否'].tag

'否'

In [55]:
# First entry of value_counts(), i.e. the most frequent value of `l`.
# NOTE(review): `l` is not defined in any visible cell — presumably a pandas
# Series of labels; confirm.
l.value_counts().index[0]

'是'

In [89]:
# A label list with a single distinct value yields one value_counts entry, so the shape is (1,).
ty = ['是'] * 3
pd.Series(ty).value_counts().shape

(1,)

In [85]:
# Number of distinct values in `l`.
# NOTE(review): `l` is not defined in any visible cell — confirm its origin.
l.value_counts().shape[0]

2