In [7]:
from math import *
import numpy as np
class DecisionTreeClassifier:
    def __init__(self):
        pass
    
    def fit(self,X,y,feature_labels=None):
        data=np.concatenate([X,y],axis=1).tolist()
        self.feature_labels=list(range(len(X[0][0]))) if feature_labels is None else feature_labels
        self.tree=create_tree(data,self.feature_labels)
        return self
        
    def predict(self,X):
        X=X.tolist()
        preds=[]
        for i in X:
            pred=classify(self.tree,self.feature_labels,i)
            preds.append(pred)
        return np.array(preds)
        

def cal_shannonent(data):
    data_size=len(data)    #计算数据集大小
    label_count={}         #初始化label，number字典
    for i in data:         #迭代每一条数据，并记录每个label出现的次数
        label=i[-1]
        if label not in label_count.keys():
            label_count[label]=1
        else:
            label_count[label]+=1
    shannonent=0            #初始化信息熵
    for label in label_count.keys():      #迭代计算信息熵
        prob=label_count[label]/data_size
        shannonent-=prob*log(prob,2)
    return shannonent


#根据字段(axis)和字段中的value进行切分数据集
def split_data(data,axis,value):
    '''将原始数据集中特征索引位为axis且值为value的数据抽取出来'''
    new_data=[]
    for i in data:
        if i[axis]==value:
            new_i=i[:axis]
            new_i.extend(i[axis+1:])
            new_data.append(new_i)
    return new_data

#选取当前数据集最优的特征，（获得最大信息增益的特征）
def get_bestfeature(data):
    n_features=len(data[0])-1  #最后一列为target字段
    base_shannonent=cal_shannonent(data)  #计算元数据集目标target信息熵
    best_feature=-1
    best_inf_gain=0.000   #初始化信息熵，最优特征
    for feature_index in range(n_features):    #迭代特征，进行特征选取
        value_set=set([i[feature_index] for i in data])
        new_shannonent=0        
        for value in value_set:
            splited_data=split_data(data,feature_index,value)
            prob=len(splited_data)/float(len(data))
            new_shannonent+=prob*cal_shannonent(splited_data)
        inf_gain=base_shannonent-new_shannonent
        if inf_gain>best_inf_gain:
            best_inf_gain=inf_gain
            best_feature=feature_index
    return best_feature

#根据列表中的value，获得计数最多的value
import operator
def majority_vote(class_list):
    value_set=set(class_list)
    class_counts={}     #value,count
    for i in class_list:
        if i not in class_counts.keys():
            class_counts[i]=0
        class_counts[i]+=1
    majority=sorted(class_counts.items(),key=operator.itemgetter(1),reverse=True)[0][0]
    return majority

def create_tree(data,feature_labels):  
    '''输入data和数据的特征label来构建决策树'''
    feature_labels=feature_labels[:]   #feature_lables是一个列表，此行代码是为了防止此函数在全局修改feature_label变量
    class_list=[i[-1] for i in data]  #获取当前数据集的类别列表
    if class_list.count(class_list[0])==len(class_list):  #如果数据集的当前类别等于数据集的行数，直接返回此类别
        return class_list[0]        
    if len(data[0])==1:           #如果此数据只剩下类别特征，返回占比最多的类别
        return majority_vote(class_list)

    best_feature_index=get_bestfeature(data)      #计算当前数据集最优切分特征的index
    best_feature_label=feature_labels[best_feature_index]    #获取当前数据最优切分特征的label
    tree={best_feature_label:{}}           
    del (feature_labels[best_feature_index])      #在feature_labes中删除此特征

    best_feature_values=[i[best_feature_index] for i in data]     #获取此特征不同的value集合 
    unique_values=set(best_feature_values)
    for value in unique_values:       #遍历此特征的value迭代的构建树
        sub_labels=feature_labels[:]
        tree[best_feature_label][value]=create_tree(split_data(data,
                                                               best_feature_index,value),sub_labels)
    return tree

def classify(tree_model,feature_labels,test_vector):
    first_str=list(tree_model.keys())[0]
    second_dict=tree_model[first_str]
    feature_index=feature_labels.index(first_str)
    for key in second_dict.keys():
        if test_vector[feature_index]== key:
            if isinstance(second_dict[key],dict):
                class_lable={}
                class_lable=classify(second_dict[key],feature_labels,test_vector)
            else:
                class_lable=second_dict[key]
    return class_lable

In [8]:
import pandas as pd
data=pd.read_csv('DecisionTree.csv').values
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(data[:,:-1],data[:,-1],test_size=0.3)
model=DecisionTreeClassifier().fit(X_train,y_train.reshape(-1,1))
for i in X_test:
    classify(model.tree,list(range(X_train.shape[1])),i)

UnboundLocalError: local variable 'class_lable' referenced before assignment

In [None]:
model.predict(X_test)