In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

### 分类树


In [2]:
#    导入sklearn自带的数据集
from sklearn.datasets import load_iris

In [3]:
iris = load_iris()
X, y = iris.data, iris.target
#    Hold out 40% of the samples; the remaining 60% is the training set
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.4, random_state=5
)
#    Cut the held-out part in half: one half for testing, one for validation
X_test, X_cv, y_test, y_cv = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=1
)

Gini(D)表示集合D的不确定性

In [4]:
def giniD(D):
    '''
    Gini index of a sample set.
    D: sample set as a 2-D numpy array whose last column is the class label
    return: 1 minus the sum of squared class proportions
            (1 when D contains no rows)
    '''
    labels = D[:, -1]
    size = D.shape[0]
    impurity = 1
    #    One pass per distinct class label present in D
    for cls in set(labels):
        #    Proportion of the samples that belong to this class
        share = np.count_nonzero(labels == cls) / size
        impurity -= share ** 2
    return impurity

Gini(D,A)表示经A=a分割后集合D的不确定性

In [5]:
#    Search for the (feature, value) pair that minimises the weighted Gini index
def giniDA(D):
    '''
    Find the best binary split of D by the Gini index.
    D: sample set (training data) as a 2-D numpy array; every column except
       the last is a feature, the last column is the class label
    return: (j, s) where j is the index of the best feature and s is the
            value of that feature used for the "equal / not equal" split;
            (0, 0) when no candidate split is evaluated
    '''
    total = D.shape[0]
    best_score = 1
    best_split = (0, 0)
    for feature in range(D.shape[1] - 1):
        column = D[:, feature]
        #    Every distinct value of the feature is a candidate split point
        for value in set(column):
            equal_part = D[column == value]
            other_part = D[column != value]
            #    Gini index of each part, weighted by its share of the samples
            score = (equal_part.shape[0]/total)*giniD(equal_part) + (other_part.shape[0]/total)*giniD(other_part)
            #    Strict comparison: the first candidate reaching the minimum wins
            if score < best_score:
                best_score = score
                best_split = (feature, value)

    return best_split



    

生成决策树


In [6]:
class Node:
    '''
    Node of the binary decision tree.
    val: payload of the node
    left / right: child nodes (None when absent)
    j: index of the feature this node splits on
    s: value of feature j used for the split
    '''
    def __init__(self, val=None, left=None, right=None, j=None, s=None):
        self.val = val
        self.left, self.right = left, right
        self.j, self.s = j, s

In [7]:
def select_Ck(D):
    '''
    Majority vote: pick the class label for a leaf node.
    D: sample set as a 2-D numpy array whose last column is the class label
    return: the label that occurs most often in D (1 when D has no rows)
    '''
    votes = list(D[:, -1])
    #    max() keeps the first label that reaches the highest count
    return max(set(votes), key=votes.count, default=1)

In [8]:
def produce_cart(node, min_samples=5):
    '''
    Grow a CART classification tree below ``node``, in place.
    node: Node whose ``val`` is a 2-D numpy array of the samples reaching it
          (one row per sample, class label in the last column)
    min_samples: a node holding at most this many samples becomes a leaf
                 instead of being split (default 5)
    return: None; the tree is built by mutating ``node``. Internal nodes get
            ``left``, ``right``, ``j`` and ``s`` set; leaves get ``val``
            replaced by the majority class label.
    '''
    samples = node.val
    #    Stop when few samples remain or when all samples share one label:
    #    splitting a pure node cannot change any prediction.
    if samples.shape[0] <= min_samples or np.unique(samples[:, -1]).size <= 1:
        node.val = select_Ck(samples)
        return None
    #    Best feature and best split value according to the Gini index
    j, s = giniDA(samples)
    #    Samples with feature j equal to s go left ("yes"), the others go right ("no")
    goes_left = samples[:, j] == s
    left_samples = samples[goes_left]
    right_samples = samples[~goes_left]
    #    A split that sends every sample to one side separates nothing and
    #    would recurse on the same data forever, so the node becomes a leaf.
    if left_samples.shape[0] == 0 or right_samples.shape[0] == 0:
        node.val = select_Ck(samples)
        return None
    #    Build the two children and record the split on this node
    right_node = Node(right_samples)
    left_node = Node(left_samples)
    node.right = right_node
    node.left = left_node
    node.j = j
    node.s = s
    #    Recurse into both children
    produce_cart(right_node, min_samples)
    produce_cart(left_node, min_samples)
    

In [9]:

#    Append the labels as one extra column so that every row holds features + label
D = np.column_stack((X_train, y_train))
train_data = D

#    Grow the CART from the training data, starting at the root node
root = Node(train_data)
produce_cart(root)

对未剪枝的分类树进行预测

In [10]:
#    Predictions of the unpruned CART; filled in below, one entry per sample of X_cv
predict = []

def CART_predict(x, node):
    '''
    Predict the class of one sample by walking down the CART.
    x: feature vector of the sample to classify
    node: node to start from (normally the root of the tree)
    return: the ``val`` stored on the leaf that x ends up in
    '''
    current = node
    #    Descend until a node without children (a leaf) is reached
    while not (current.right is None and current.left is None):
        #    "Yes" (feature j equals s) goes left, "no" goes right
        if x[current.j] == current.s:
            current = current.left
        else:
            current = current.right
    return current.val
        
#    Classify every validation sample with the unpruned tree
predict.extend(CART_predict(sample, root) for sample in X_cv)

#    1 for a correct prediction, 0 for a wrong one
correct = [int(guess == truth) for guess, truth in zip(predict, y_cv)]
correct_rate = sum(correct) / len(correct)
print('根据未剪枝的决策树进行预测的精确度: ', correct_rate)

根据未剪枝的决策树进行预测的精确度:  0.8666666666666667


对分类树进行剪枝

In [None]:
# TODO: 剪枝尚未实现（太难了）
