# 树回归
**优点**：可以对复杂和非线性的数据建模。  
**缺点**：结果不易理解。  
**适用数据类型**：数值型和标称型数据。

## 树回归的一般方法
1. 收集数据：采用任意方法收集数据。
2. 准备数据：需要数值型的数据，标称型数据应该映射成二值型数据。
3. 分析数据：绘出数据的二维可视化显示结果，以字典方式生成树。
4. 训练算法：大部分时间都花费在叶节点树模型的构建上。
5. 测试算法：使用测试数据上的R²值来分析模型的效果。
6. 使用算法：使用训练出的树做预测，预测结果还可以用来做很多事情。

In [1]:
import numpy as np

## CART算法的实现代码

In [2]:
def loadDataSet(fileName):
    """Load a tab-delimited numeric text file into a list of float rows.

    Args:
        fileName: Path of the text file to read. Every non-blank line must
            contain tab-separated fields that ``float()`` can parse.

    Returns:
        A list with one entry per non-blank line; each entry is a list of
        floats in field order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager guarantees the handle is closed even when a
    # malformed field makes float() raise part-way through the file.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # data; skipping them avoids float('') raising ValueError.
            if not stripped:
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat

In [3]:
def binSplitDataSet(dataSet, feature, value):
    """Partition the rows of ``dataSet`` by thresholding one column.

    Args:
        dataSet: 2-D data supporting ``dataSet[:, feature]`` and row indexing
            with an index array (the notebook passes ``np.matrix`` objects).
        feature: Index of the column to split on.
        value: Threshold compared against that column.

    Returns:
        A pair ``(mat0, mat1)``: ``mat0`` holds the rows whose ``feature``
        value is strictly greater than ``value``; ``mat1`` holds the rows
        whose value is less than or equal to it.
    """
    column = dataSet[:, feature]
    rowsAbove = np.nonzero(column > value)[0]
    rowsAtOrBelow = np.nonzero(column <= value)[0]
    return dataSet[rowsAbove, :], dataSet[rowsAtOrBelow, :]

In [4]:
# 4x4 identity matrix used as a toy data set for binSplitDataSet.
# np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
testMat = np.asmatrix(np.eye(4))

In [5]:
testMat

matrix([[1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.]])

In [6]:
# Split the identity matrix on column 1 at threshold 0.5: rows whose value is
# > 0.5 go to mat0, the remaining rows go to mat1.
mat0, mat1 = binSplitDataSet(testMat, 1, 0.5)

In [7]:
mat0

matrix([[0., 1., 0., 0.]])

In [8]:
mat1

matrix([[1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.]])

## 构建树

chooseBestSplit()函数伪代码：  
对每个特征：  
&emsp;&emsp;对每个特征值：  
&emsp;&emsp;&emsp;&emsp;将数据集切分成两份  
&emsp;&emsp;&emsp;&emsp;计算切分的误差  
&emsp;&emsp;&emsp;&emsp;如果当前误差小于当前最小误差，那么将当前切分设定为最佳切分并更新最小误差  
返回最佳切分的特征和阈值

In [9]:
def regLeaf(dataSet):
    """Build a regression-tree leaf: the mean of the last column of ``dataSet``.

    The last column is used as the target variable throughout this notebook.
    """
    targets = dataSet[:, -1]
    return np.mean(targets)

In [10]:
def regErr(dataSet):
    """Return the total squared error of the target (last) column.

    ``np.var`` (population variance by default) multiplied by the number of
    rows gives the sum of squared deviations from the column mean.
    """
    targets = dataSet[:, -1]
    rowCount = np.shape(dataSet)[0]
    return np.var(targets) * rowCount

In [11]:
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Find the binary split of ``dataSet`` with the lowest total error.

    Every distinct value of every column except the last (the target) is
    tried as a threshold for ``binSplitDataSet``.

    Args:
        dataSet: 2-D data whose last column is the target. The column
            extraction uses ``.T.tolist()[0]``, so an ``np.matrix`` is
            expected.
        leafType: Callable that builds a leaf value from a data set.
        errType: Callable that returns the error of a data set.
        ops: ``(tolS, tolN)``; ``tolS`` is the minimum error reduction a
            split must achieve, ``tolN`` the minimum number of rows on each
            side of a split.

    Returns:
        ``(featureIndex, splitValue)`` for the best split, or
        ``(None, leafType(dataSet))`` when no split should be made: all
        targets are equal, the best error reduction is below ``tolS``, or a
        side of the best split has fewer than ``tolN`` rows.
    """
    minGain, minRows = ops[0], ops[1]

    def tooSmall(part):
        # A side of a split is rejected when it has fewer than tolN rows.
        return np.shape(part)[0] < minRows

    # Nothing to split when every target value is identical.
    distinctTargets = set(dataSet[:, -1].T.tolist()[0])
    if len(distinctTargets) == 1:
        return None, leafType(dataSet)

    _, numCols = np.shape(dataSet)
    # Error of the unsplit data; candidate splits are compared against it.
    baseErr = errType(dataSet)
    # (lowest error so far, feature index, split value)
    best = (np.inf, 0, 0)
    for col in range(numCols - 1):
        for candidate in set(dataSet[:, col].T.tolist()[0]):
            above, rest = binSplitDataSet(dataSet, col, candidate)
            if tooSmall(above) or tooSmall(rest):
                continue
            splitErr = errType(above) + errType(rest)
            if splitErr < best[0]:
                best = (splitErr, col, candidate)
    lowestErr, bestIndex, bestValue = best

    # Give up when the improvement is too small. This also covers the case
    # where no admissible split was found and lowestErr is still inf.
    if (baseErr - lowestErr) < minGain:
        return None, leafType(dataSet)

    # Re-check the size of both sides of the chosen split.
    above, rest = binSplitDataSet(dataSet, bestIndex, bestValue)
    if tooSmall(above) or tooSmall(rest):
        return None, leafType(dataSet)
    return bestIndex, bestValue

In [12]:
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Recursively build a tree as nested dictionaries.

    Args:
        dataSet: 2-D data whose last column is the target.
        leafType: Callable that builds a leaf value from a data set.
        errType: Callable that returns the error of a data set.
        ops: ``(tolS, tolN)`` thresholds forwarded to ``chooseBestSplit``.

    Returns:
        The leaf value when ``chooseBestSplit`` declines to split; otherwise
        a dict with keys ``'spInd'`` (split feature index), ``'spVal'``
        (split threshold), ``'left'`` (subtree for rows above the threshold)
        and ``'right'`` (subtree for the remaining rows).
    """
    splitFeature, splitValue = chooseBestSplit(dataSet, leafType, errType, ops)
    if splitFeature is None:
        # chooseBestSplit returned a leaf value in the second slot.
        return splitValue
    leftRows, rightRows = binSplitDataSet(dataSet, splitFeature, splitValue)
    return {
        'spInd': splitFeature,
        'spVal': splitValue,
        'left': createTree(leftRows, leafType, errType, ops),
        'right': createTree(rightRows, leafType, errType, ops),
    }

In [13]:
# Read ex00.txt into a list of float rows.
myDat = loadDataSet('ex00.txt')

In [14]:
# Convert the row lists to a matrix, as the tree functions expect.
# np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
myMat = np.asmatrix(myDat)

In [15]:
# Grow a regression tree with the default leaf/error functions and ops=(1, 4).
createTree(myMat)

{'spInd': 0,
 'spVal': 0.48813,
 'left': 1.0180967672413792,
 'right': -0.04465028571428572}

In [16]:
# Read ex0.txt into a list of float rows.
myDat1 = loadDataSet('ex0.txt')

In [17]:
# Convert the row lists to a matrix, as the tree functions expect.
# np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
myMat1 = np.asmatrix(myDat1)

In [18]:
# Grow a regression tree with the default leaf/error functions and ops=(1, 4).
createTree(myMat1)

{'spInd': 1,
 'spVal': 0.39435,
 'left': {'spInd': 1,
  'spVal': 0.582002,
  'left': {'spInd': 1,
   'spVal': 0.797583,
   'left': 3.9871632,
   'right': 2.9836209534883724},
  'right': 1.980035071428571},
 'right': {'spInd': 1,
  'spVal': 0.197834,
  'left': 1.0289583666666666,
  'right': -0.023838155555555553}}

## 树剪枝

### 预剪枝

In [19]:
# ops=(0, 1): any error reduction is accepted (tolS=0) and each side of a
# split needs only one row (tolN=1), so pre-pruning is effectively off.
createTree(myMat, ops=(0, 1))

{'spInd': 0,
 'spVal': 0.48813,
 'left': {'spInd': 0,
  'spVal': 0.620599,
  'left': {'spInd': 0,
   'spVal': 0.625336,
   'left': {'spInd': 0,
    'spVal': 0.625791,
    'left': {'spInd': 0,
     'spVal': 0.643601,
     'left': {'spInd': 0,
      'spVal': 0.651376,
      'left': {'spInd': 0,
       'spVal': 0.6632,
       'left': {'spInd': 0,
        'spVal': 0.683921,
        'left': {'spInd': 0,
         'spVal': 0.819823,
         'left': {'spInd': 0,
          'spVal': 0.837522,
          'left': {'spInd': 0,
           'spVal': 0.846455,
           'left': {'spInd': 0,
            'spVal': 0.919384,
            'left': {'spInd': 0,
             'spVal': 0.976414,
             'left': {'spInd': 0,
              'spVal': 0.985425,
              'left': {'spInd': 0,
               'spVal': 0.989888,
               'left': {'spInd': 0,
                'spVal': 0.993349,
                'left': 1.035533,
                'right': 1.077553},
               'right': {'spInd': 0,
        

In [20]:
# Read ex2.txt into a list of float rows.
myDat2 = loadDataSet('ex2.txt')

In [21]:
# Convert the row lists to a matrix, as the tree functions expect.
# np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
myMat2 = np.asmatrix(myDat2)

In [22]:
# Grow a regression tree with the default leaf/error functions and ops=(1, 4).
createTree(myMat2)

{'spInd': 0,
 'spVal': 0.499171,
 'left': {'spInd': 0,
  'spVal': 0.729397,
  'left': {'spInd': 0,
   'spVal': 0.952833,
   'left': {'spInd': 0,
    'spVal': 0.958512,
    'left': 105.24862350000001,
    'right': 112.42895575000001},
   'right': {'spInd': 0,
    'spVal': 0.759504,
    'left': {'spInd': 0,
     'spVal': 0.790312,
     'left': {'spInd': 0,
      'spVal': 0.833026,
      'left': {'spInd': 0,
       'spVal': 0.944221,
       'left': 87.3103875,
       'right': {'spInd': 0,
        'spVal': 0.85497,
        'left': {'spInd': 0,
         'spVal': 0.910975,
         'left': 96.452867,
         'right': {'spInd': 0,
          'spVal': 0.892999,
          'left': 104.825409,
          'right': {'spInd': 0,
           'spVal': 0.872883,
           'left': 95.181793,
           'right': 102.25234449999999}}},
        'right': 95.27584316666666}},
      'right': {'spInd': 0,
       'spVal': 0.811602,
       'left': 81.110152,
       'right': 88.78449880000001}},
     'right': 102.

In [23]:
# ops=(10000, 4): a split is kept only if it lowers the total error by at
# least 10000 (tolS) and leaves at least 4 rows on each side (tolN).
createTree(myMat2, ops=(10000, 4))

{'spInd': 0,
 'spVal': 0.499171,
 'left': 101.35815937735848,
 'right': -2.637719329787234}

### 后剪枝
基于已有的树切分测试数据：  
&emsp;&emsp;如果存在任一子集是一棵树，则在该子集上递归执行剪枝过程  
&emsp;&emsp;计算将当前两个叶节点合并后的误差  
&emsp;&emsp;计算不合并的误差  
&emsp;&emsp;如果合并会降低误差的话，就将叶节点合并

In [24]:
def isTree(obj):
    """Return True when ``obj`` is a tree node (a dict), False for a leaf.

    Trees built by ``createTree`` are nested dicts, and leaves are any other
    value. ``isinstance`` is used instead of comparing the type's name so that
    dict subclasses count as trees and an unrelated class that merely happens
    to be named ``dict`` does not.
    """
    return isinstance(obj, dict)

In [25]:
def getMean(tree):
    """Collapse ``tree`` and return the average of its two branches.

    Each branch that is itself a tree is first replaced, in place, by its own
    collapsed mean (right branch first, then left). The result is the plain
    average of the two resulting branch values.
    """
    for side in ('right', 'left'):
        if isTree(tree[side]):
            tree[side] = getMean(tree[side])
    return (tree['left'] + tree['right']) / 2.0

In [26]:
def prune(tree, testData):
    """Post-prune a regression tree against held-out data.

    Args:
        tree: A tree dict as built by ``createTree``; it is modified in place
            as subtrees are pruned.
        testData: 2-D test data whose last column is the target.

    Returns:
        The collapsed mean of ``tree`` when ``testData`` has no rows; the
        average of the two leaves when both children are leaves and merging
        them lowers the squared error on ``testData`` (``"merging"`` is
        printed in that case); otherwise ``tree`` itself.
    """
    # With no test rows there is nothing to evaluate against: collapse.
    if np.shape(testData)[0] == 0:
        return getMean(tree)

    # Route the test rows down the same split the tree uses.
    lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])

    # Prune the subtrees first, left before right, each with its own rows.
    for side, subset in (('left', lSet), ('right', rSet)):
        if isTree(tree[side]):
            tree[side] = prune(tree[side], subset)

    # Merging is only considered once both children are leaves.
    if isTree(tree['left']) or isTree(tree['right']):
        return tree

    leftLeaf, rightLeaf = tree['left'], tree['right']
    errorNoMerge = np.sum(np.power(lSet[:, -1] - leftLeaf, 2)) + np.sum(np.power(rSet[:, -1] - rightLeaf, 2))
    treeMean = (leftLeaf + rightLeaf) / 2.0
    errorMerge = np.sum(np.power(testData[:, -1] - treeMean, 2))
    if errorMerge < errorNoMerge:
        print("merging")
        return treeMean
    return tree

In [27]:
# Grow a tree with the loosest thresholds (tolS=0, tolN=1); it is post-pruned
# below.
myTree = createTree(myMat2, ops=(0, 1))

In [28]:
# Read the test rows from ex2test.txt into a list of float rows.
myDatTest = loadDataSet('ex2test.txt')

In [29]:
# Convert the test rows to a matrix, as prune() expects.
# np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
myMat2Test = np.asmatrix(myDatTest)

In [30]:
# Post-prune myTree against the test rows. prune() prints "merging" each time
# it collapses a pair of leaves, and it modifies myTree in place.
prune(myTree, myMat2Test)

merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging


{'spInd': 0,
 'spVal': 0.499171,
 'left': {'spInd': 0,
  'spVal': 0.729397,
  'left': {'spInd': 0,
   'spVal': 0.952833,
   'left': {'spInd': 0,
    'spVal': 0.965969,
    'left': 92.5239915,
    'right': {'spInd': 0,
     'spVal': 0.956951,
     'left': {'spInd': 0,
      'spVal': 0.958512,
      'left': {'spInd': 0,
       'spVal': 0.960398,
       'left': 112.386764,
       'right': 123.559747},
      'right': 135.837013},
     'right': 111.2013225}},
   'right': {'spInd': 0,
    'spVal': 0.759504,
    'left': {'spInd': 0,
     'spVal': 0.763328,
     'left': {'spInd': 0,
      'spVal': 0.769043,
      'left': {'spInd': 0,
       'spVal': 0.790312,
       'left': {'spInd': 0,
        'spVal': 0.806158,
        'left': {'spInd': 0,
         'spVal': 0.815215,
         'left': {'spInd': 0,
          'spVal': 0.833026,
          'left': {'spInd': 0,
           'spVal': 0.841547,
           'left': {'spInd': 0,
            'spVal': 0.841625,
            'left': {'spInd': 0,
            

## 模型树

### 模型树的叶节点生成函数

In [31]:
def linearSolve(dataSet):
    """Fit an ordinary-least-squares line to ``dataSet``.

    The data set is separated into a design matrix ``X`` (a leading column of
    ones for the intercept followed by every column except the last) and a
    target column ``Y`` (the last column).

    Args:
        dataSet: 2-D ``np.matrix`` whose last column is the target.

    Returns:
        A tuple ``(ws, X, Y)`` where ``ws`` is the column of regression
        weights ``(X^T X)^-1 X^T Y``.

    Raises:
        NameError: If ``X^T X`` has a determinant of exactly 0.0 and therefore
            cannot be inverted.
    """
    m, n = np.shape(dataSet)
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
    X = np.asmatrix(np.ones((m, n)))
    # Column 0 stays all ones (intercept); the features fill the rest.
    X[:, 1:n] = dataSet[:, 0:n-1]
    Y = dataSet[:, -1]
    xTx = X.T * X
    if np.linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse, try increasing the second value of ops')
    # Normal-equation solution for the weight vector.
    ws = xTx.I * (X.T * Y)
    return ws, X, Y

In [32]:
def modelLeaf(dataSet):
    """Build a model-tree leaf: the linear-regression weights for ``dataSet``.

    Only the weight column returned by ``linearSolve`` is kept.
    """
    return linearSolve(dataSet)[0]

In [33]:
def modelErr(dataSet):
    """Return the squared error of a linear fit to ``dataSet``.

    A linear model is fitted with ``linearSolve`` and the sum of squared
    differences between the targets and the model's predictions is returned.
    """
    weights, design, targets = linearSolve(dataSet)
    residuals = targets - design * weights
    return np.sum(np.power(residuals, 2))

In [34]:
# Load exp2.txt as a matrix; this rebinds myMat2, replacing the ex2.txt data.
# np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
myMat2 = np.asmatrix(loadDataSet('exp2.txt'))

In [35]:
# Build a model tree: leaves hold linear-regression weights (modelLeaf), the
# split error is the squared residual of a linear fit (modelErr), and
# ops=(1, 10) sets tolS=1 and tolN=10.
createTree(myMat2, modelLeaf, modelErr, (1, 10))

{'spInd': 0, 'spVal': 0.285477, 'left': matrix([[1.69855694e-03],
         [1.19647739e+01]]), 'right': matrix([[3.46877936],
         [1.18521743]])}