## 作业内容##
<br>
**信息增益率代码实现**

In [11]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter
import math
from math import log

In [12]:
def create_data():
    '''
        xxx
    '''
    datasets = [['青年', '否', '否', '一般', '否'],
               ['青年', '否', '否', '好', '否'],
               ['青年', '是', '否', '好', '是'],
               ['青年', '是', '是', '一般', '是'],
               ['青年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '好', '否'],
               ['中年', '是', '是', '好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '好', '是'],
               ['老年', '是', '否', '好', '是'],
               ['老年', '是', '否', '非常好', '是'],
               ['老年', '否', '否', '一般', '否'],
               ]
    labels = [u'年龄', u'有工作', u'有自己的房子', u'信贷情况', u'类别']
    # 返回数据集和每个维度的名称
    return datasets, labels

In [13]:

class Node:
    def __init__(self, root=True, label=None, feature_name=None, feature=None):
        '''
            1.叶子节点：label
            2.中间节点：条件（特征）[条件1]子节点；[条件2]子节点
            是否为叶子节点
        '''
        self.root = root # 是否为叶子节点
        self.label = label # 叶子节点所有样本的标签
        self.feature_name = feature_name # 切分条件
        self.feature = feature # 切分条件
        self.tree = {} # [条件1]子节点 node_son
        self.result = {
            'label:': self.label,
            'feature': self.feature,
            'tree': self.tree
        }

    def __repr__(self):
        return '{}'.format(self.result)

    def add_node(self, val, node):
        '''训练过程使用'''
        self.tree[val] = node

    def predict(self, features):
        '''
            features->预测数据的特征
            预测过程
        '''
        if self.root is True:
            return self.label
        return self.tree[features[self.feature]].predict(features)

# 树的根节点 Node

class DTree:
    '''
        建树过程
    '''
    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon # 超参数
        self._tree = {}

    # 获得数组元素对应标签的二分类比率
    def calc_p(self,datasets):
        data_length = len(datasets)
        label_count = {}
        for i in range(data_length):
            label = datasets[i][-1]
            if label not in label_count:
                label_count[label] = 0
            label_count[label] += 1
        return label_count[label]/data_length


    # 获得指定特征geni系数
    def get_geni(self,datasets,axis=0):
        data_length = len(datasets)
        feature_sets = {}
        # 指定特征多种分类geni系数集合
        best_div = []
        for i in range(data_length):
            feature = datasets[i][axis]
            if feature not in feature_sets:
                feature_sets[feature] = []
            feature_sets[feature].append(datasets[i])

        for key in feature_sets.keys():
            # key值对应集合中的标签比率
            px = self.calc_p(feature_sets[key])
            di=len(feature_sets[key])/data_length
            # 获得排除当前key所得到的集合
            others=sum((feature_sets[p] for p in feature_sets.keys() if p!=key),[])
            # 获得排除key值所获得集合中的标签比率
            qx=self.calc_p(others)
            geni = di * 2 * px * (1 - px) + (1 - di) * 2 * qx * (1 - qx)
            best_div.append((key,geni))
        # 获得多种分类组合中最小的geni系数
        best_ = min(best_div, key=lambda x: x[-1])
        return best_


    def info_geni_train(self, datasets):
        count = len(datasets[0]) - 1

        best_feature = []
        # 遍历特征
        for c in range(count):
            geni_=  self.get_geni(datasets, axis=c)
            best_feature.append((c, geni_))
        # 比较大小
        best_ = min(best_feature, key=lambda x: x[-1][-1])
        return best_

    def train(self, train_data):
        """
        input:数据集D(DataFrame格式)，特征集A，阈值epc
        output:决策树T
        """
        """
            不断返回子树node，上级调用可以直接将子树node填在自己的.tree的dict里
            递归过程结束：返回叶子节点
            每次递归调用的train_data，都是上一级tree的节点下发下来的子数据集
        """
        _, y_train, features = train_data.iloc[:, :
                                               -1], train_data.iloc[:,
                                                                    -1], train_data.columns[:
                                                                                            -1]
        # 1,若D中实例属于同一类Ck，则T为单节点树，并将类Ck作为结点的类标记，返回T
        if len(y_train.value_counts()) == 1:
            return Node(root=True, label=y_train.iloc[0])

        # 2, 若A为空，则T为单节点树，将D中实例树最大的类Ck作为该节点的类标记，返回T
        if len(features) == 0:
            return Node(
                root=True,
                label=y_train.value_counts().sort_values(
                    ascending=False).index[0])

        # 3,计算最大信息增益 同5.1,Ag为信息增益最大的特征
        max_feature, max_info_gain = self.info_geni_train(np.array(train_data))
        max_feature_name = features[max_feature]

        # 4,Ag的信息增益小于阈值eta,则置T为单节点树，并将D中是实例数最大的类Ck作为该节点的类标记，返回T
        if max_info_gain[1]< self.epsilon:
            return Node(
                root=True,
                label=y_train.value_counts().sort_values(
                    ascending=False).index[0])

        # 5,构建Ag子集
        node_tree = Node(
            root=False, feature_name=max_feature_name, feature=max_feature)

        feature_list = train_data[max_feature_name].value_counts().index
        for f in feature_list:
            sub_train_df = train_data.loc[train_data[max_feature_name] ==
                                          f].drop([max_feature_name], axis=1)

            # 6, 递归生成树
            sub_tree = self.train(sub_train_df)
            node_tree.add_node(f, sub_tree)

        return node_tree

    def fit(self, train_data):
        self._tree = self.train(train_data)
        return self._tree

    def predict(self, X_test):
        return self._tree.predict(X_test)


In [14]:
datasets, labels = create_data()
data_df = pd.DataFrame(datasets, columns=labels)
dt = DTree()
tree = dt.fit(data_df)
tree



{'label:': None, 'feature': 2, 'tree': {'否': {'label:': '否', 'feature': None, 'tree': {}}, '是': {'label:': '是', 'feature': None, 'tree': {}}}}

In [15]:
dt.predict(['老年', '否', '否', '一般'])

'否'