<a href="https://colab.research.google.com/github/xiaochengJF/MachineLearning/blob/master/%E5%86%B3%E7%AD%96%E6%A0%91_CART.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CART_Classification

In [0]:
from numpy import *

In [0]:
def loadDataSet():
    """Return the built-in toy classification data.

    Returns:
        A ``(samples, featureNames)`` pair. ``samples`` is a list of
        three-element rows whose last element is the class label;
        ``featureNames`` names the two leading feature columns.
    """
    rawRows = (
        (0, 0, 0), (0, 1, 1), (0, 2, 0),
        (1, 0, 1), (1, 1, 1), (1, 2, 1),
        (2, 0, 0), (2, 1, 1), (2, 2, 0),
    )
    # Callers expect mutable lists of lists, so convert each tuple.
    samples = [list(row) for row in rawRows]
    featureNames = ['color', 'shape']
    return samples, featureNames

## 计算基尼指数
基尼指数可以理解为熵模型的一阶泰勒展开近似（在 $p=1$ 附近有 $-\ln p \approx 1-p$）  
基尼指数（基尼不纯度）= 样本被选中的概率 $\times$ 样本被分错的概率
$$Gini(D) = \sum_{k=1}^{K}p_{k}(1-p_{k}) = 1 - \sum_{k=1}^{K}p_{k}^{2}=1 - \sum_{k=1}^{K}\left(\frac{|D_{k}|}{|D|}\right)^{2}$$

- $p_k$表示选中的样本属于$k$类别的概率，则这个样本被分错的概率是$1-p_k$

- 样本集合中有$K$个类别，一个随机选中的样本可以属于这$K$个类别中的任意一个，因而对类别求和

- 当为二分类时：$Gini(P) = 2p(1-p)$

对于样本集$D$，如果根据特征$A$的某个值$a$把$D$分成$D_1$和$D_2$两部分，则在特征$A$的条件下，$D$的基尼指数表达式为：
$$Gini(D,A) = \frac{|D_1|}{|D|}Gini(D_1) + \frac{|D_2|}{|D|}Gini(D_2)$$ 

In [0]:
def calcGini(dataSet):
    """Compute the Gini impurity of a data set.

    Args:
        dataSet: sequence of rows; the last element of each row is the
            class label.

    Returns:
        The sum over classes of ``p * (1 - p)``, where ``p`` is the share
        of rows carrying that label. An empty data set yields 0.
    """
    sampleCount = shape(dataSet)[0]

    # Tally how many rows carry each label.
    classCounts = {}
    for row in dataSet:
        classCounts[row[-1]] = classCounts.get(row[-1], 0) + 1

    impurity = 0
    for count in classCounts.values():
        fraction = count / sampleCount
        impurity += fraction * (1 - fraction)
    return impurity

## 选择最优特征

In [0]:
def chooseBestFeatVal2Split(dataSet):
    """Pick the (feature index, value) pair giving the lowest weighted Gini.

    Args:
        dataSet: list of rows; the last element of each row is the label.

    Returns:
        ``(featureIndex, value)`` for the best binary split, or
        ``(None, None)`` when no feature column is left, all labels agree,
        or the best split lowers the Gini index by less than 0.00001.
    """
    # Nothing to split on: only the label column remains.
    if len(dataSet[0]) == 1:
        return None, None
    # Nothing to gain: every sample already has the same label.
    if len(set([row[-1] for row in dataSet])) == 1:
        return None, None

    parentGini = calcGini(dataSet)
    sampleCount = shape(dataSet)[0]
    featureCount = shape(dataSet)[1] - 1

    winner = (0, 0)
    smallestGini = 100000
    for featIdx in range(featureCount):
        column = [row[featIdx] for row in dataSet]
        for candidate in set(column):
            left, right = splitByFeatVal(featIdx, candidate, dataSet)
            leftSize = shape(left)[0]
            rightSize = shape(right)[0]
            # A split that leaves one side empty separates nothing.
            if leftSize == 0 or rightSize == 0:
                continue
            weightedGini = leftSize / sampleCount * calcGini(left) + \
                           rightSize / sampleCount * calcGini(right)
            if weightedGini < smallestGini:
                winner = (featIdx, candidate)
                smallestGini = weightedGini

    # Stop when the improvement over the unsplit data is negligible.
    if parentGini - smallestGini < 0.00001:
        return None, None
    return winner

## 按特征划分数据集
如果 feature 对应的属性值等于 value 值，则将该条数据划分到左子树；如果 feature 对应的属性值不等于 value 值，则将该条数据划分到右子树

In [0]:
def splitByFeatVal(feature, value, dataSet):
    """Split rows by whether column ``feature`` equals ``value``.

    Args:
        feature: index of the column to test.
        value: value the column is compared against.
        dataSet: list of rows (converted to a NumPy matrix internally).

    Returns:
        ``(leftChild, rightChild)`` as lists of rows: ``leftChild`` holds
        the rows whose feature equals ``value``, ``rightChild`` the rows
        whose feature differs from it.
    """
    samples = mat(dataSet)
    column = samples[:, feature]
    # Left subtree: rows where the feature equals the split value.
    equalRows = nonzero(column == value)[0]
    # Right subtree: rows where the feature is not equal to the split value.
    otherRows = nonzero(column != value)[0]
    return samples[equalRows, :].tolist(), samples[otherRows, :].tolist()

## 结束条件

checkIsOneCateg 函数用来判断数据集的目标变量是否为一个分类结果  
majorityCateg 函数用来选出目标变量中的大多数值作为输出变量

In [0]:
def checkIsOneCateg(newDataSet):
    """Return True when every row of ``newDataSet`` has the same label.

    The label is the last element of each row. An empty data set has no
    label at all, so it yields False.
    """
    distinctLabels = {row[-1] for row in newDataSet}
    return len(distinctLabels) == 1

# Rank the labels by how often they occur; e.g. with 2 "male" and 1 "female"
# samples the result is "male".
def majorityCateg(newDataSet):
    """Return the most frequent label (last element of each row).

    Ties go to the label that appears first in ``newDataSet`` because the
    sort is stable. An empty data set raises ``IndexError``.
    """
    tally = {}
    for row in newDataSet:
        tally[row[-1]] = tally.get(row[-1], 0) + 1
    # Sort the labels by descending count; equal counts keep first-seen order.
    ranked = sorted(tally, key=tally.get, reverse=True)
    return ranked[0]

## 创建分类树

In [7]:
def createClassifTree(dataSet):
    """Recursively build a CART classification tree.

    Args:
        dataSet: list of rows; the last element of each row is the label.

    Returns:
        A leaf label when the data cannot be split any further, otherwise
        a dict with the keys ``'featIndex'``, ``'value'``, ``'leftChild'``
        and ``'rightChild'``, where each child is itself a leaf label or a
        nested dict of the same form.
    """
    feature, value = chooseBestFeatVal2Split(dataSet)
    if feature is None:
        # No useful split: the leaf is the shared label if the samples all
        # agree, otherwise the majority label.
        if checkIsOneCateg(dataSet):
            return dataSet[0][-1]
        return majorityCateg(dataSet)

    # A split exists: partition the rows and grow both subtrees.
    leftChild, rightChild = splitByFeatVal(feature, value, dataSet)
    return {
        'featIndex': feature,
        'value': value,
        'leftChild': createClassifTree(leftChild),
        'rightChild': createClassifTree(rightChild),
    }

# Demo: build a classification tree from the built-in toy data and print it.
if __name__ == '__main__':
    # labels holds the feature names; it is not used when building the tree.
    dataSet, labels = loadDataSet()
    classifTree = createClassifTree(dataSet)
    print(classifTree)

{'featIndex': 0, 'value': 1, 'leftChild': 1, 'rightChild': {'featIndex': 1, 'value': 1, 'leftChild': 1, 'rightChild': 0}}


# CART_Regression

In [0]:
from numpy import *

## 数据集
数据集共 25 条数据。第一列数据代表房子的评价得分，此数据集中所有数据的评价得分都是 5.23。第二列数据代表房子的面积（平方数），第三列数据代表房子的价格，以万为单位  
5.23	1	0.1  
5.23	2	0.12  
5.23	3	0.02  
5.23	4	0.03  
5.23	5	0.12  
5.23	6	5.0  
5.23	7	5.2  
5.23	8	5.1  
5.23	9	5.02  
5.23	10	5.03  
5.23	11	10.8  
5.23	12	10.06  
5.23	13	10.03  
5.23	14	10.02  
5.23	15	10.44  
5.23	16	15.88  
5.23	17	15.06  
5.23	18	15.04  
5.23	19	15.30  
5.23	20	15.05  
5.23	21	20.8  
5.23	22	20.16  
5.23	23	20.24  
5.23	24	20.05  
5.23	25	20.09  

In [0]:
def loadDataSet(fileName='regData.txt'):
    """Load a tab-separated numeric data file into a NumPy matrix.

    Args:
        fileName: path of the file to read; defaults to ``'regData.txt'``.

    Returns:
        A matrix with one row per non-blank line and one column per
        tab-separated field, every field parsed as ``float``.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a field is not a valid float.
    """
    rows = []
    # The context manager closes the file even if parsing fails.
    with open(fileName) as dataFile:
        for line in dataFile:
            line = line.strip()
            # Skip blank lines (e.g. a trailing newline at end of file)
            # instead of failing on float('').
            if not line:
                continue
            rows.append([float(field) for field in line.split('\t')])
    return mat(rows)

## 计算平方误差

In [0]:
def calcErr(dataSetMat):
    """Return the total squared error of the target (last) column.

    This is the variance of the last column multiplied by the number of
    rows, i.e. the sum of squared deviations from the column mean.
    """
    targets = dataSetMat[:, -1]
    sampleCount = shape(dataSetMat)[0]
    return sampleCount * var(targets)

## 选择最优特征

对每个特征的每个属性值，计算按该属性值二分（大于该值 / 小于等于该值）后的两个子数据集的平方误差和，选择平方误差和最小的特征及切分值作为最优划分。除了用平方误差代替基尼指数之外，其他过程和分类树基本相同

In [0]:
def chooseBestFeatVal2Split(dataSetMat):
    """Pick the (feature index, value) split with the lowest squared error.

    Args:
        dataSetMat: NumPy matrix whose last column is the regression target.

    Returns:
        ``(featureIndex, value)`` for the best binary split, or
        ``(None, None)`` when all targets are equal, no candidate leaves
        both sides non-empty, or the best split lowers the total squared
        error by less than 1.
    """
    # Stop when every target value is identical.
    if len(set(dataSetMat[:, -1].T.tolist()[0])) == 1:
        return None, None

    bestFeature = None
    bestValue = None
    # Start from infinity so the first valid split is always accepted,
    # however large the squared errors are. A finite sentinel would let a
    # never-evaluated default split be returned for large target values.
    lowestErr = float('inf')
    totalErr = calcErr(dataSetMat)
    rows = dataSetMat.tolist()  # convert once instead of once per feature
    for feature in range(shape(dataSetMat)[1] - 1):
        allValues = [row[feature] for row in rows]
        for value in set(allValues):
            leftChild, rightChild = splitByFeatVal(feature, value, dataSetMat)
            # A split that leaves one side empty separates nothing.
            if shape(leftChild)[0] == 0 or shape(rightChild)[0] == 0:
                continue
            curErr = calcErr(leftChild) + calcErr(rightChild)
            if curErr < lowestErr:
                bestFeature = feature
                bestValue = value
                lowestErr = curErr

    # Stop when no valid split was found or the error reduction is too small.
    if bestFeature is None or totalErr - lowestErr < 1:
        return None, None
    return bestFeature, bestValue

 ## 按特征划分数据集

In [0]:
def splitByFeatVal(feature, value, dataSetMat):
    """Split the rows of ``dataSetMat`` by a threshold on one column.

    Args:
        feature: index of the column to test.
        value: threshold the column is compared against.
        dataSetMat: NumPy matrix of samples.

    Returns:
        ``(leftChild, rightChild)``: ``leftChild`` holds the rows whose
        feature is greater than ``value``, ``rightChild`` the rows whose
        feature is less than or equal to it.
    """
    column = dataSetMat[:, feature]
    # Left subtree: feature value greater than the threshold.
    aboveRows = nonzero(column > value)[0]
    # Right subtree: feature value less than or equal to the threshold.
    notAboveRows = nonzero(column <= value)[0]
    return dataSetMat[aboveRows, :], dataSetMat[notAboveRows, :]

## 创建回归树

In [24]:
def createRegTree(dataSetMat):
    """Recursively build a CART regression tree.

    Args:
        dataSetMat: NumPy matrix whose last column is the regression target.

    Returns:
        The mean of the target column when the data cannot be split any
        further, otherwise a dict with the keys ``'featIndex'``,
        ``'value'``, ``'leftChild'`` and ``'rightChild'``, where each child
        is itself a leaf mean or a nested dict of the same form.
    """
    feature, value = chooseBestFeatVal2Split(dataSetMat)
    # No useful split: the leaf predicts the mean of the remaining targets.
    if feature is None:
        return mean(dataSetMat[:, -1])

    # A split exists: partition the rows and grow both subtrees.
    leftChild, rightChild = splitByFeatVal(feature, value, dataSetMat)
    return {
        'featIndex': feature,
        'value': value,
        'leftChild': createRegTree(leftChild),
        'rightChild': createRegTree(rightChild),
    }

# Demo: load the regression data, build the regression tree, and print it.
if __name__ == '__main__':
    # loadDataSet reads 'regData.txt', opened by relative path.
    dataSetMat = loadDataSet()
    regTree = createRegTree(dataSetMat)
    print(regTree)

{'leftChild': {'leftChild': 20.268, 'featIndex': 1, 'value': 20.0, 'rightChild': {'leftChild': 15.266, 'featIndex': 1, 'value': 15.0, 'rightChild': 10.27}}, 'featIndex': 1, 'value': 10.0, 'rightChild': {'leftChild': 5.07, 'featIndex': 1, 'value': 5.0, 'rightChild': 0.078}}
