## Decision Tree

### dataset

| fe1 (no surfacing) | fe2 (flippers) | fish |
|-----|-----|------|
| 1 |1      |'yes' |
| 1 |1      |'yes' |
| 1 |0      |'no'  |
| 0 |1      |'no'  |
| 0 |1      |'no'  |

#### step1 create dataSet

In [24]:
import numpy as np
import math
import operator

In [25]:
def create_DateSet():
    """Build the toy "is it a fish?" dataset used by the rest of the notebook.

    return:
        A (dataSet, featureNames) pair. Every row of dataSet holds two binary
        feature values followed by the class label ('yes' or 'no');
        featureNames[i] is the name of the feature stored in column i.
    """
    featureNames = ['no surfacing', 'flippers']
    rows = (
        (1, 1, 'yes'),
        (1, 1, 'yes'),
        (1, 0, 'no'),
        (0, 1, 'no'),
        (0, 1, 'no'),
    )
    # Hand back fresh, independent lists on every call.
    dataSet = [list(row) for row in rows]
    return dataSet, featureNames

#### step2 Shannon entropy of the labels (the function is named `calcula_crossEntropy`)

In [26]:
def calcula_crossEntropy(dataSet):
    """Return the base-2 Shannon entropy of the class labels in dataSet.

    args:
        dataSet: sequence of samples; the last element of each sample is
            its (hashable) class label.

    return:
        The entropy as a float; 0.0 when dataSet is empty.
    """
    sample_total = len(dataSet)

    # Tally the labels; a dict keeps them in first-seen order.
    tallies = {}
    for row in dataSet:
        label = row[-1]
        tallies[label] = tallies.get(label, 0) + 1

    entropy = 0.0
    for occurrences in tallies.values():
        p = float(occurrences) / sample_total
        entropy -= p * math.log(p, 2)
    return entropy

#### step3 split dataSet

In [27]:
def split_dataSet_with_axis(dataSet,tar_feature,exp_val):
    """Keep the samples whose target feature equals exp_val, minus that column.

    args:
        dataSet: list of samples (each sample is a list)
        tar_feature: index of the feature column to test and remove
        exp_val: value the feature must have for a sample to be kept

    return:
        A new list holding the matching samples, each one rebuilt without
        the tar_feature column
    """
    return [
        # Everything left of the column joined with everything right of it.
        row[:tar_feature] + row[tar_feature + 1:]
        for row in dataSet
        if row[tar_feature] == exp_val
    ]
            

#### step4 calculate information gain

In [45]:
def find_bset_feature_to_split(dataSet):
    """Pick the feature whose split yields the largest information gain.

    args:
        dataSet: list of samples; every column except the last is a feature
            and the last column is the class label.

    return:
        Index of the best feature, or -1 when no feature has a gain above 0.
    """
    feature_total = len(dataSet[0]) - 1
    sample_total = len(dataSet)
    entropy_before = calcula_crossEntropy(dataSet)
    winner, winner_gain = -1, 0.0

    for col in range(feature_total):
        # Conditional entropy H(D|A): entropy of each partition, weighted by
        # the share of samples (Ni/N) that falls into it.
        entropy_after = 0.0
        for val in set(row[col] for row in dataSet):
            subset = split_dataSet_with_axis(dataSet, col, val)
            weight = len(subset) / float(sample_total)
            entropy_after += weight * calcula_crossEntropy(subset)

        # Information gain = entropy before the split minus entropy after it.
        gain = entropy_before - entropy_after
        if gain > winner_gain:  # strict comparison: ties keep the earlier feature
            winner, winner_gain = col, gain

    return winner

#### step5 create decision tree

In [46]:
def majority_label(labelList):
    """Return the label that occurs most often in labelList.

    args:
        labelList: non-empty sequence of hashable class labels

    return:
        The most frequent label. When several labels share the highest
        count, the one that appeared first in labelList is returned.

    raises:
        ValueError: if labelList is empty
    """
    if len(labelList) == 0:
        raise ValueError("majority_label() needs at least one label")

    count_labels = {}
    for vote in labelList:
        # .get() supplies the initial 0 the first time a label is seen.
        count_labels[vote] = count_labels.get(vote, 0) + 1

    # max() walks the dict in insertion order and keeps the first maximal
    # key, so ties resolve to the earliest label.
    return max(count_labels, key=count_labels.get)

def create_tree(dataSet,featureName):
    """Recursively build a decision tree, splitting on the best-gain feature.

    args:
        dataSet: list of samples; every column except the last is a feature
            and the last column is the class label.
        featureName: names of the feature columns, where featureName[i]
            describes column i. They are used as the keys of the tree so the
            result is readable. The sequence is not modified.

    return:
        A class label when the node is a leaf, otherwise a nested dict of the
        form {feature_name: {feature_value: subtree_or_label}}.
    """
    labelList=[sample[-1] for sample in dataSet]

    # Stop: every sample carries the same label.
    if labelList.count(labelList[0])==len(labelList):
        return labelList[0]

    # Stop: only the label column is left, so there is nothing to split on.
    if len(dataSet[0])==1:
        return majority_label(labelList)

    best_feature=find_bset_feature_to_split(dataSet)
    if best_feature<0:
        # No feature gives a positive information gain. Indexing with -1
        # would silently split on the label column, so vote instead.
        return majority_label(labelList)

    best_feature_name=featureName[best_feature]
    # Build the reduced name list by slicing; deleting from featureName
    # would corrupt the list owned by the caller.
    sub_feature_name=featureName[:best_feature]+featureName[best_feature+1:]

    myTree={best_feature_name:{}}
    for val in set(sample[best_feature] for sample in dataSet):
        subset=split_dataSet_with_axis(dataSet,best_feature,val)
        myTree[best_feature_name][val]=create_tree(subset,sub_feature_name)
    print(myTree)  # trace of every finished subtree, innermost first
    return myTree
    
        

#### step6 training

In [48]:
# Build the decision tree from the toy dataset and display the result.
dataSet,feature_names=create_DateSet()
tree=create_tree(dataSet,feature_names)
print(tree)

{'flippers': {0: 'no', 1: 'yes'}}
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}


#### step7 classify

In [49]:
def classify(inputTree,featNames,testVec):
    """Walk a trained decision tree and return the label for testVec.

    args:
        inputTree: decision tree in the nested-dict form
            {feature_name: {feature_value: subtree_or_label}}
        featNames: list of feature names; the position of a name is the
            position of that feature's value inside testVec
        testVec: feature values of the sample to classify

    return:
        The label stored at the leaf reached by following testVec
    """
    node = inputTree
    while True:
        # Each internal node has a single key: the feature it splits on.
        split_name = tuple(node.keys())[0]
        column = featNames.index(split_name)
        node = node[split_name][testVec[column]]
        if not isinstance(node, dict):
            # Anything that is not a dict is a leaf label.
            return node


In [52]:
# Re-create the feature-name list so classify() looks up columns against the
# full, original ordering of names.
dataSet,feature_names=create_DateSet()
# Each test vector is ordered like feature_names: [no surfacing, flippers].
print(classify(tree,feature_names,[1,0]))
print(classify(tree,feature_names,[0,1]))
print(classify(tree,feature_names,[1,1]))

no
no
yes
