## 导入必要的库

In [3]:
import math
import numpy as np
from collections import Counter

## 计算数据集的熵

In [4]:
def calculate_entropy(dataset, index=-1):
    """Return the Shannon entropy (in bits) of one column of ``dataset``.

    Parameters:
        dataset: 2-D numpy array; it is indexed as ``dataset[:, index]``.
        index: column whose value distribution is measured. Defaults to
            -1, i.e. the last column.

    Returns:
        The sum of ``-p * log2(p)`` over the distinct values found in the
        column, where ``p`` is a value's count divided by ``len(dataset)``.
        A dataset with no rows yields the integer 0.
    """
    total_rows = len(dataset)
    value_counts = Counter(dataset[:, index].tolist())

    # Relative frequency of every distinct value, in first-seen order.
    probabilities = [float(count) / total_rows for count in value_counts.values()]
    return sum(-p * math.log2(p) for p in probabilities)

In [12]:
# Toy dataset: each row holds two feature values followed by the class label.
dataset_list = [
    [1, 1, 'yes'],
    [1, 1, 'yes'],
    [1, 0, 'no'],
    [0, 1, 'no'],
    [0, 1, 'no']
]
# Presumably the names of the two feature columns, in column order.
labels = ['no surface', 'Flippers']

# np.array turns the mixed int/str rows into an array of strings, so the
# feature values are compared as '0' / '1' from here on.
dataset = np.array(dataset_list)
# Entropy of the last column (index defaults to -1), i.e. the class labels.
print(calculate_entropy(dataset))

0.9709505944546686


## 以某一个特征对数据集进行分割

In [13]:
def split_dataset(dataset, axis, value):
    """Return the rows whose column ``axis`` equals ``value``, without that column.

    Parameters:
        dataset: 2-D numpy array, one sample per row.
        axis: index of the column (feature) to split on.
        value: the feature value that selects which rows are kept.

    Returns:
        A new 2-D numpy array containing the matching rows with column
        ``axis`` removed. It has zero rows when no row matches.
    """
    # Boolean row mask: keep only the samples that take `value` on this feature.
    matching_rows = dataset[dataset[:, axis] == value, :]
    # The split column is constant inside the subset, so it is dropped.
    return np.delete(matching_rows, axis, 1)

In [14]:
# Demo: split the toy dataset on column 1 and print one subset per distinct
# value found in that column.
feature_axis = 1
feature_set = np.unique(dataset[:, feature_axis])
for feature in feature_set:
    print(split_dataset(dataset, feature_axis, feature))

[['1' 'no']]
[['1' 'yes']
 ['1' 'yes']
 ['0' 'no']
 ['0' 'no']]


## 选择最好的划分特征进行数据集的划分

1. 遍历所有的特征
2. 对于每一个特征，拿到该特征可取的所有值
3. 按该特征的每一个取值对数据集进行划分，得到对应的子集
4. 以各子集的样本占比为权重，对子集的熵加权求和得到条件熵，再用数据集的熵减去条件熵得到信息增益
5. 选择信息增益最大的特征进行数据集的划分

In [15]:
def choose_best_feature_to_split(dataset):
    """Return the index of the feature column with the largest information gain.

    The last column of ``dataset`` is treated as the class label and every
    other column as a candidate feature. For a feature A the gain is
    ``H(D) - H(D|A)``, where ``H(D|A)`` is the entropy of each subset obtained
    by splitting on A, weighted by that subset's share of the rows.

    Parameters:
        dataset: 2-D numpy array, one sample per row, label in the last column.

    Returns:
        The column index of the best feature. Ties keep the lowest index,
        and 0 is returned when there is no feature column to evaluate.
    """
    # H(D): entropy of the class labels before any split.
    base_entropy = calculate_entropy(dataset)

    n_samples = dataset.shape[0]
    n_features = dataset.shape[1] - 1  # the last column is the label

    best_feature_axis = 0
    best_entropy_gain = -np.inf
    for feat_axis in range(n_features):
        condition_entropy = 0.0
        for feature in np.unique(dataset[:, feat_axis]):
            # Split on the feature currently being evaluated (the loop
            # variable), never on a name from the enclosing notebook scope.
            subset = split_dataset(dataset, feat_axis, feature)
            # Weight = fraction of samples that fall into this subset, so the
            # divisor must be the row count, not the column count.
            weight = subset.shape[0] / n_samples
            condition_entropy += weight * calculate_entropy(subset)

        entropy_gain = base_entropy - condition_entropy
        if entropy_gain > best_entropy_gain:
            best_entropy_gain = entropy_gain
            best_feature_axis = feat_axis
    return best_feature_axis

In [16]:
# Demo: ask choose_best_feature_to_split which column of the toy dataset to
# split on, and show the returned column index.
best_feature_axis = choose_best_feature_to_split(dataset)
print(best_feature_axis)

0


In [17]:
# Demo: print the subsets produced by splitting on the chosen column, one per
# distinct value in that column.
feature_set = np.unique(dataset[:, best_feature_axis])
for feature in feature_set:
    print(split_dataset(dataset, best_feature_axis, feature))

[['1' 'no']
 ['1' 'no']]
[['1' 'yes']
 ['1' 'yes']
 ['0' 'no']]


In [18]:
# Echo the array so its contents and dtype can be inspected.
dataset

array([['1', '1', 'yes'],
       ['1', '1', 'yes'],
       ['1', '0', 'no'],
       ['0', '1', 'no'],
       ['0', '1', 'no']], dtype='<U21')

## 递归的创建决策树

递归函数的描述:
1. 终止条件：数据集中只剩下一个类别，或者已经没有可用于继续划分的特征（此时返回数量最多的类别）
2. 若不满足终止条件，则选择最优特征，按该特征的各个取值划分出子集，并对每个子集递归地建树

In [19]:
def major_class(class_list):
    """Return the most frequent label in ``class_list``.

    Parameters:
        class_list: numpy array of class labels.

    Returns:
        The label with the highest count; when several labels share that
        count, the one that appears first in ``class_list``. Returns -1 when
        ``class_list`` is empty.
    """
    label_counts = Counter(class_list.tolist())
    if not label_counts:
        return -1
    # max() keeps the first maximal key and Counter iterates in first-seen
    # order, so ties resolve to the earliest label.
    return max(label_counts, key=label_counts.get)

In [None]:
'''
create dtree in a recursive way
parameter:
    dataset: dataset
    labels: labels of the features
'''
def create_dtree(dataset, labels):
    class_list = dataset[:,-1]
    ''' check terminal conditions '''
    ''' if only one class in the dataset '''
    if np.alltrue(class_list==class_list[0]):
        return class_list[0]
    ''' if there is no feature can be split '''
    if len(dataset[0]) == 1:
        return major_class(class_list)
    
    ''' choose best feature to split dataset '''
    feat_axis = choose_best_feature_to_split(dataset)
    