In [184]:
import math
from collections import Counter, defaultdict

import numpy as np
import pandas as pd

class Node:
    """A single node of the decision tree.

    Naming caveat: ``root=True`` marks a *leaf* node (it carries a class
    label), while ``root=False`` marks the root or an internal node (it
    splits on a feature and owns children in ``tree``).
    """

    def __init__(self, root=True, label=None, feature_name=None, feature=None):
        """Create a node.

        Args:
            root: True for a leaf node, False for the root or an internal node.
            label: Class label returned by a leaf; None for splitting nodes.
            feature_name: Name of the feature this node splits on.
            feature: Positional index of the split feature within the feature
                vector that reaches this node.
        """
        self.root = root  # True: leaf node; False: root or internal node
        self.label = label
        self.feature_name = feature_name
        self.feature = feature
        self.tree = {}  # maps a feature value to the child Node for that value
        # Snapshot used by __repr__; "tree" aliases self.tree, so children
        # added later through add_node() show up here as well.
        self.result = {
            'label': self.label,
            'feature': self.feature,
            "tree": self.tree
        }

    def __repr__(self):
        """Return the nested-dict representation of this subtree."""
        return '{}'.format(self.result)

    def add_node(self, val, node):
        """Attach ``node`` as the child followed when the split feature equals ``val``."""
        self.tree[val] = node

    def predict(self, features):
        """Return the label predicted for one sample.

        Args:
            features: Sequence of feature values, indexed by position. It is
                not modified.

        Returns:
            The label stored in the leaf reached by following the splits.

        Raises:
            KeyError: If the sample holds a value for the split feature that
                has no child under this node.
        """
        if self.root is True:
            return self.label
        value = features[self.feature]
        # C4_5DTree.train drops the column it split on before recursing, so a
        # child's ``feature`` index is relative to the *remaining* columns.
        # Remove the consumed position here to keep the indices aligned.
        remaining = [v for pos, v in enumerate(features) if pos != self.feature]
        return self.tree[value].predict(remaining)

class C4_5DTree:
    """C4.5-style decision tree for categorical features.

    Training data is a pandas DataFrame whose last column is the class label
    and whose other columns are features. Each split uses the feature with the
    largest information gain ratio; when the best ratio is below ``epsilon``
    the node becomes a leaf holding the majority label.
    """

    def __init__(self, epsilon=0.1):
        """Create an unfitted tree.

        Args:
            epsilon: Minimum gain ratio required to split a node.
        """
        self.epsilon = epsilon
        self._tree = {}  # replaced by the root Node once fit() has run

    @staticmethod
    def _entropy(counts, total):
        """Return the base-2 entropy of a partition given the sizes of its parts."""
        return -sum((c / total) * math.log2(c / total) for c in counts)

    @staticmethod
    def _group_rows(rows, axis):
        """Group ``rows`` by the value each row holds at position ``axis``."""
        groups = defaultdict(list)
        for row in rows:
            groups[row[axis]].append(row)
        return groups

    @staticmethod
    def _majority_label(y):
        """Return the most frequent value of the label Series ``y``."""
        return y.value_counts().sort_values(ascending=False).index[0]

    @staticmethod
    def cal_ent(ent_array):
        """Return the empirical entropy of the labels (last element of each row)."""
        label_counts = Counter(row[-1] for row in ent_array)
        return C4_5DTree._entropy(label_counts.values(), len(ent_array))

    def calculate_iv(self, iv_array, axis=0):
        """Return the split information (intrinsic value) of column ``axis``.

        This is the entropy of the partition induced by the column's values;
        it is 0 when every row holds the same value.
        """
        groups = self._group_rows(iv_array, axis)
        return self._entropy([len(g) for g in groups.values()], len(iv_array))

    def cond_ent(self, cond_array, axis=0):
        """Return the empirical conditional entropy of the labels given column ``axis``."""
        data_length = len(cond_array)
        groups = self._group_rows(cond_array, axis)
        return sum((len(g) / data_length) * self.cal_ent(g)
                   for g in groups.values())

    def info_gain_ratio(self, ent, cond_ent, feature_ent):
        """Return the information gain ratio.

        Args:
            ent: Entropy of the labels.
            cond_ent: Conditional entropy of the labels given the feature.
            feature_ent: Split information of the feature.

        Returns:
            ``(ent - cond_ent) / feature_ent``, or 0.0 when ``feature_ent`` is
            not positive (a single-valued feature carries no information and
            would otherwise cause a division by zero).
        """
        if feature_ent <= 0:
            return 0.0
        return (ent - cond_ent) / feature_ent

    def info_gain_ratio_train(self, gr_datas):
        """Pick the feature with the largest gain ratio.

        Args:
            gr_datas: 2-D rows; the last element of each row is the label.

        Returns:
            A ``(column_index, gain_ratio)`` tuple. Ties go to the lowest
            column index.
        """
        feature_count = len(gr_datas[0]) - 1
        ent = self.cal_ent(gr_datas)
        candidates = [
            (col, self.info_gain_ratio(ent,
                                       self.cond_ent(gr_datas, col),
                                       self.calculate_iv(gr_datas, col)))
            for col in range(feature_count)
        ]
        return max(candidates, key=lambda item: item[-1])

    def train(self, train_data):
        """Recursively build the (sub)tree for ``train_data``.

        Args:
            train_data: DataFrame with feature columns followed by the label
                column.

        Returns:
            The Node at the top of the generated subtree. For splitting nodes
            the stored ``feature`` index is the column position within
            ``train_data`` as passed to *this* call, i.e. after the columns
            used higher up in the tree have been dropped.
        """
        y_train, feature_names = train_data.iloc[:, -1], train_data.columns[:-1]

        # All samples share one label: nothing left to decide.
        if len(y_train.value_counts()) == 1:
            return Node(root=True, label=y_train.iloc[0])

        # No features left to split on: fall back to the majority label.
        if len(feature_names) == 0:
            return Node(root=True, label=self._majority_label(y_train))

        max_feature, max_info_gain_ratio = self.info_gain_ratio_train(
            np.array(train_data))
        max_feature_name = feature_names[max_feature]

        # Best split is not informative enough: stop with the majority label.
        if max_info_gain_ratio < self.epsilon:
            return Node(root=True, label=self._majority_label(y_train))

        node_tree = Node(root=False, feature_name=max_feature_name,
                         feature=max_feature)

        # One child per observed value of the chosen feature.
        feature_list = train_data[max_feature_name].value_counts().index
        for f in feature_list:
            sub_train_df = train_data.loc[
                train_data[max_feature_name] == f
            ].drop([max_feature_name], axis=1)
            # Recursively grow the subtree for this value.
            sub_tree = self.train(sub_train_df)
            node_tree.add_node(f, sub_tree)

        return node_tree

    def fit(self, train_data):
        """Build the tree from ``train_data`` and return its root Node."""
        self._tree = self.train(train_data)
        return self._tree

    def predict(self, X_test):
        """Return the label predicted for the single sample ``X_test``.

        ``X_test`` is handed to the fitted root node's ``predict``; fit() must
        have been called first.
        """
        return self._tree.predict(X_test)

# Column names: age, has a job, owns a house, credit rating, class.
# The last column is the class label; the others are categorical features.
labels = [u'年龄', u'有工作', u'有自己的房子', u'信贷情况', u'类别']

# Toy training table, one row per sample, in the column order given above.
datasets = [
    ['青年', '否', '否', '一般', '否'],
    ['青年', '否', '否', '好', '否'],
    ['青年', '是', '否', '好', '是'],
    ['青年', '是', '是', '一般', '是'],
    ['青年', '否', '否', '一般', '否'],
    ['中年', '否', '否', '一般', '否'],
    ['中年', '否', '否', '好', '否'],
    ['中年', '是', '是', '好', '是'],
    ['中年', '否', '是', '非常好', '是'],
    ['中年', '否', '是', '非常好', '是'],
    ['老年', '否', '是', '非常好', '是'],
    ['老年', '否', '是', '好', '是'],
    ['老年', '是', '否', '好', '是'],
    ['老年', '是', '否', '非常好', '是'],
    ['老年', '否', '否', '一般', '否'],
]

train_data = pd.DataFrame(datasets, columns=labels)

# Fit the tree with the default epsilon and keep the root node; the bare
# expression on the last line makes the notebook display it.
dt = C4_5DTree()
tree = dt.fit(train_data)
tree

{'label': None, 'feature': 2, 'tree': {'否': {'label': None, 'feature': 1, 'tree': {'否': {'label': '否', 'feature': None, 'tree': {}}, '是': {'label': '是', 'feature': None, 'tree': {}}}}, '是': {'label': '是', 'feature': None, 'tree': {}}}}

In [183]:
# Classify one sample with the fitted tree; the values are listed
# positionally, in the same order as the feature columns of train_data.
dt.predict(['青年', '否', '否', '一般'])

'否'