# 树回归与标准回归的比较

In [1]:
import numpy as np

In [2]:
def loadDataSet(fileName):
    """Read a tab-separated numeric text file into a list of float rows.

    Args:
        fileName: Path of a text file whose lines hold tab-separated numbers.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        floats parsed from that line.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    dataMat = []
    # The context manager closes the handle even when parsing raises.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank line carries no record; float('') would raise.
                continue
            # Convert every tab-separated field of the line to a float.
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat

In [3]:
def binSplitDataSet(dataSet, feature, value):
    """Partition the rows of ``dataSet`` by thresholding one column.

    Args:
        dataSet: 2-D NumPy matrix/array of samples, one sample per row.
        feature: Index of the column to split on.
        value: Threshold compared against that column.

    Returns:
        A pair ``(mat0, mat1)``: the rows whose ``feature`` entry is strictly
        greater than ``value``, then the rows whose entry is less than or
        equal to it.
    """
    column = dataSet[:, feature]
    # np.nonzero returns one index array per axis; [0] keeps the row indices.
    above_rows = np.nonzero(column > value)[0]
    not_above_rows = np.nonzero(column <= value)[0]
    return dataSet[above_rows, :], dataSet[not_above_rows, :]

In [4]:
def regLeaf(dataSet):
    """Build a regression-tree leaf: the mean of the last column of ``dataSet``."""
    targets = dataSet[:, -1]
    return np.mean(targets)

In [5]:
def regErr(dataSet):
    """Return the total squared error of the last column of ``dataSet``.

    Computed as the variance of the last column multiplied by the number of
    rows, i.e. the sum of squared deviations from the column mean.
    """
    variance = np.var(dataSet[:, -1])
    row_count = np.shape(dataSet)[0]
    return variance * row_count

In [6]:
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Search every feature/value pair for the binary split with lowest error.

    Args:
        dataSet: Samples, one per row, with the target in the last column.
            The ``.T.tolist()[0]`` calls below rely on column slices staying
            2-D, as they do for ``np.matrix`` inputs.
        leafType: Callable that builds a leaf value from a data subset.
        errType: Callable that returns the error of a data subset.
        ops: ``(tolS, tolN)`` where ``tolS`` is the minimum error reduction
            required to accept a split and ``tolN`` is the minimum number of
            rows allowed on each side of a split.

    Returns:
        ``(featureIndex, splitValue)`` for the chosen split, or
        ``(None, leafType(dataSet))`` when no split is made.
    """
    tolS, tolN = ops[0], ops[1]

    # Nothing to split when every target value is the same.
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)

    _, n = np.shape(dataSet)
    # Error of the unsplit data; candidate splits are compared against it.
    S = errType(dataSet)

    def tooSmall(left, right):
        """Tell whether either side of a split has fewer than tolN rows."""
        return (np.shape(left)[0] < tolN) or (np.shape(right)[0] < tolN)

    bestS, bestIndex, bestValue = np.inf, 0, 0
    # Exhaustive search: every distinct value of every feature column
    # (the last column is the target and is excluded).
    for featIndex in range(n - 1):
        candidates = set(dataSet[:, featIndex].T.tolist()[0])
        for splitVal in candidates:
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if tooSmall(mat0, mat1):
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestS, bestIndex, bestValue = newS, featIndex, splitVal

    # Give up when the best split does not reduce the error by at least tolS
    # (also the case when no admissible split was found and bestS is inf).
    if (S - bestS) < tolS:
        return None, leafType(dataSet)

    # Give up when the chosen split leaves too few rows on either side.
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if tooSmall(mat0, mat1):
        return None, leafType(dataSet)
    return bestIndex, bestValue

In [7]:
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    """Recursively build a binary tree from ``dataSet``.

    Args:
        dataSet: Samples, one per row, with the target in the last column.
        leafType: Callable that builds a leaf value from a data subset.
        errType: Callable that returns the error of a data subset.
        ops: Tuple of stopping parameters forwarded to ``chooseBestSplit``.

    Returns:
        The leaf value when ``chooseBestSplit`` declines to split; otherwise
        a dict with keys ``'spInd'`` (split feature index), ``'spVal'``
        (split value), ``'left'`` and ``'right'`` (sub-trees or leaves).
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    if feat is None:
        # No split was chosen: ``val`` already holds the leaf.
        return val

    # Rows above the split value go left, the rest go right.
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    return {
        'spInd': feat,
        'spVal': val,
        'left': createTree(lSet, leafType, errType, ops),
        'right': createTree(rSet, leafType, errType, ops),
    }

In [8]:
def linearSolve(dataSet):
    """Fit ordinary least squares on ``dataSet`` and return the pieces used.

    The last column of ``dataSet`` is the target; the remaining columns are
    the features. A column of ones is prepended to the features so that the
    first coefficient acts as the intercept.

    Args:
        dataSet: 2-D samples, one per row (matrix, array or nested list).

    Returns:
        A tuple ``(ws, X, Y)``: the coefficient column ``ws``, the design
        matrix ``X`` (ones column followed by the features) and the target
        column ``Y``, all as ``np.matrix`` objects.

    Raises:
        NameError: If ``X.T * X`` has a determinant of exactly zero and so
            cannot be inverted. The exception type is kept as-is for
            backward compatibility with existing callers.
    """
    # np.asmatrix replaces np.mat, which no longer exists in NumPy 2.0; it
    # also lets plain arrays/lists be used with the matrix algebra below.
    dataSet = np.asmatrix(dataSet)
    m, n = np.shape(dataSet)
    # Column 0 stays all ones (intercept term); columns 1..n-1 get features.
    X = np.asmatrix(np.ones((m, n)))
    X[:, 1:n] = dataSet[:, 0:n-1]
    Y = dataSet[:, -1]
    xTx = X.T * X
    if np.linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse, try increasing the second value of ops')
    # Normal-equation solution: ws = (X^T X)^-1 X^T Y.
    ws = xTx.I * (X.T * Y)
    return ws, X, Y

In [9]:
def modelLeaf(dataSet):
    """Build a model-tree leaf: the linear coefficients fitted on ``dataSet``."""
    coefficients = linearSolve(dataSet)[0]
    return coefficients

In [10]:
def modelErr(dataSet):
    """Return the sum of squared residuals of a linear fit on ``dataSet``."""
    ws, X, Y = linearSolve(dataSet)
    # Difference between the targets and the fitted line's predictions.
    residuals = Y - X * ws
    return np.sum(np.power(residuals, 2))

In [11]:
def regTreeEval(model, inDat):
    """Predict from a regression-tree leaf.

    Args:
        model: The leaf value.
        inDat: Unused; present so the signature matches ``modelTreeEval``.

    Returns:
        ``model`` converted to a Python float.
    """
    prediction = float(model)
    return prediction

In [12]:
def modelTreeEval(model, inDat):
    """Predict from a model-tree leaf by applying its linear coefficients.

    Args:
        model: Coefficient column of shape ``(n + 1, 1)``; the first entry
            multiplies the constant term.
        inDat: A single sample as a ``1 x n`` row (matrix or 2-D array).

    Returns:
        The prediction as a Python float.
    """
    n = np.shape(inDat)[1]
    # np.asmatrix replaces np.mat, which no longer exists in NumPy 2.0.
    X = np.asmatrix(np.ones((1, n + 1)))
    # Column 0 stays 1 for the constant term; the rest hold the features.
    X[:, 1:n + 1] = inDat
    # Pull out the single element explicitly: calling float() on a 1x1
    # matrix is deprecated in recent NumPy releases.
    return float((X * model)[0, 0])

In [13]:
def isTree(obj):
    """Return True when ``obj`` is a tree node (a dict), False for a leaf.

    ``isinstance`` is used instead of comparing the type's name so that dict
    subclasses count as trees and unrelated classes that merely happen to be
    named ``dict`` do not.
    """
    return isinstance(obj, dict)

In [14]:
def treeForeCast(tree, inData, modelEval=regTreeEval):
    """Walk ``tree`` from the root to a leaf and predict for one sample.

    Args:
        tree: A tree node (dict with ``'spInd'``, ``'spVal'``, ``'left'`` and
            ``'right'``) or a bare leaf.
        inData: One sample, as a ``1 x n`` matrix row or a flat sequence of
            feature values.
        modelEval: Callable ``(leaf, inData) -> prediction`` applied at the
            leaf that is reached.

    Returns:
        Whatever ``modelEval`` returns for the leaf reached by ``inData``.
    """
    if not isTree(tree):
        return modelEval(tree, inData)

    # Flatten once so features[k] is always the k-th feature value. Indexing
    # a 1 x n matrix row with [k] selects a row rather than a column, which
    # fails for k > 0 and yields an ambiguous comparison when n > 1.
    features = np.asarray(inData).ravel()

    node = tree
    while isTree(node):
        # Values above the split go left, matching binSplitDataSet.
        if features[node['spInd']] > node['spVal']:
            node = node['left']
        else:
            node = node['right']
    # The evaluator still receives the sample in its original form.
    return modelEval(node, inData)

In [15]:
def createForeCast(tree, testData, modelEval=regTreeEval):
    """Predict every sample in ``testData`` with ``tree``.

    Args:
        tree: Tree (or bare leaf) to evaluate.
        testData: Sequence of samples supporting ``len()`` and integer
            indexing; each sample is converted to a matrix before evaluation.
        modelEval: Leaf evaluator forwarded to ``treeForeCast``.

    Returns:
        An ``m x 1`` matrix of predictions, one row per sample.
    """
    m = len(testData)
    # np.asmatrix replaces np.mat, which no longer exists in NumPy 2.0.
    yHat = np.asmatrix(np.zeros((m, 1)))
    for i in range(m):
        yHat[i, 0] = treeForeCast(tree, np.asmatrix(testData[i]), modelEval)
    return yHat

In [16]:
# Load the training and test sets as matrices. np.asmatrix replaces np.mat,
# which no longer exists in NumPy 2.0.
trainMat = np.asmatrix(loadDataSet('bikeSpeedVsIq_train.txt'))
testMat = np.asmatrix(loadDataSet('bikeSpeedVsIq_test.txt'))

In [17]:
# Fit a tree with the default leaf/error functions (regLeaf / regErr),
# using tolS=1 and tolN=20 as stopping parameters.
myTree = createTree(trainMat, ops=(1, 20))

In [18]:
# Predict from the first column of the test set with the default leaf
# evaluator (regTreeEval).
yHat = createForeCast(myTree, testMat[:, 0])

In [19]:
# Correlation coefficient between the predictions and the second column of
# the test set (rowvar=0 treats each column as a variable).
np.corrcoef(yHat, testMat[:, 1], rowvar=0)[0, 1]

0.964085231822215

In [20]:
# Refit with linear-model leaves (modelLeaf) and their squared-error measure
# (modelErr), keeping tolS=1 and tolN=20.
myTree = createTree(trainMat, modelLeaf, modelErr, (1, 20))

In [21]:
# Predict from the first column of the test set, evaluating each leaf as a
# linear model (modelTreeEval).
yHat = createForeCast(myTree, testMat[:, 0], modelTreeEval)

In [22]:
# Correlation coefficient between the model-tree predictions and the second
# column of the test set.
np.corrcoef(yHat, testMat[:, 1], rowvar=0)[0, 1]

0.9760412191380629

我们知道，相关系数越接近1.0越好（上面 `np.corrcoef` 计算的是相关系数 $R$，而不是 $R^2$），所以从上面的结果可以看出，这里模型树的结果（0.976）比回归树（0.964）好。

In [23]:
# Fit a single linear regression on the whole training set for comparison.
ws, X, Y = linearSolve(trainMat)

In [24]:
# Display the fitted coefficients: row 0 multiplies the constant column of
# ones built by linearSolve, row 1 multiplies the feature.
ws

matrix([[37.58916794],
        [ 6.18978355]])

In [25]:
# Overwrite yHat row by row with the linear-regression prediction:
# intercept ws[0, 0] plus slope ws[1, 0] times the test feature.
for i in range(testMat.shape[0]):
    yHat[i] = ws[0, 0] + ws[1, 0] * testMat[i, 0]

In [26]:
# Correlation coefficient between the linear-regression predictions and the
# second column of the test set.
np.corrcoef(yHat, testMat[:, 1], rowvar=0)[0, 1]

0.9434684235674766

标准线性回归的相关系数（0.943）低于上面两种树回归方法，即在该数据集上的表现不如树回归。