In [1]:
'''
Created on Feb 4, 2011
Tree-Based Regression Methods
@author: Peter Harrington
@eitor:Jude.wang
'''
from numpy import *

def loadDataSet(fileName):      #general function to parse tab -delimited floats
    dataMat = []                #assume last column is target value
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = map(float,curLine) 
        dataMat.append(fltLine)
    return dataMat

def binSplitDataSet(dataSet, feature, value):
    """对数据集中的某一特征根据value切分成两个数据集"""
    mat0 = dataSet[nonzero(dataSet[:,feature] > value)[0],:]
    mat1 = dataSet[nonzero(dataSet[:,feature] <= value)[0],:]
    return mat0,mat1

def regLeaf(dataSet):
    '''返回数据集的叶子节点【叶子节点即为当前叶子节点样本中的均值】'''
    #returns the value used for each leaf
    return mean(dataSet[:,-1])

def regErr(dataSet):
    """返回数据集最后一切的方差*此数据集的样本数"""
    return var(dataSet[:,-1]) * shape(dataSet)[0]

def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    '''返回数据集，连续特征的最优可切分的feature_index和切分的value'''
    tolS = ops[0]   #容许的误差下降值
    tolN = ops[1]   #切分的最少样本数
    
    #if all the target variables are the same value: quit and return value
    if len(set(dataSet[:,-1].T.tolist())) == 1: #exit cond 1
        return None, leafType(dataSet)
    
    m,n = shape(dataSet)
    #the choice of the best feature is driven by Reduction in RSS error from mean
    S = errType(dataSet)  #当前数据集未切分之前的RSS
    bestS = inf   #最大方差
    bestIndex = 0   #初始化最优索引特征
    bestValue = 0   #初始化最有特征的最优切分值
    
    for featIndex in range(n-1):  #遍历特征
        for splitVal in set(dataSet[:,featIndex]):     #遍历当前特征的可切分的unique_values
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):  
                continue   #如果当前特征，切分值不满于预先设定的切分门槛，跳出当前循环，继续执行下一循环
            newS = errType(mat0) + errType(mat1)   #满足预先设定的切分门槛,返回切分后的SSR之和  
            if newS < bestS:    #如果此次切分后的SSR小于之前未切分的SSR，更新最优切分
                bestIndex = featIndex   
                bestValue = splitVal
                bestS = newS            
    if (S - bestS) < tolS:     #if the decrease (S-bestS) is less than a threshold don't do the split
        return None, leafType(dataSet) 
    
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):  #如果切分后的样本数量小于约定的返回None
        return None, leafType(dataSet)
    return bestIndex,bestValue    #returns the best feature to split onand the value used for that split
              

def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    #assume dataSet is NumPy Mat so we can array filtering
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)   #choose the best split
    if feat == None: 
        return val #如果还有可切分的特征，直接返回叶子节点的均值
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)  #left,value>split_value
    retTree['right'] = createTree(rSet, leafType, errType, ops)  #right,value<=split_value
    return retTree  

def isTree(obj):
    '''用于判断当前处理的节点是否为叶子节点'''
    return isinstance(obj,dict)

def getMean(tree):
    '''递归函数，从上往下遍历树直到叶子节点位置，如果找到两个叶子节点，就计算平均值'''
    if isTree(tree['right']): 
        tree['right'] = getMean(tree['right'])
    if isTree(tree['left']): 
        tree['left'] = getMean(tree['left'])
    return (tree['left']+tree['right'])/2.0
    
def prune(tree, testData):
    """使用测试集对模型进行后剪枝"""
    if shape(testData)[0] == 0: #如果测试集已经没有了数据，模型不再分割 直接返回剩余的均值
        return getMean(tree)        
    if (isTree(tree['right']) or isTree(tree['left'])):#如果左右分支其中有一个是树,进行分割
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])        
    if isTree(tree['left']):        #如果左分支为树，对左分支进行后剪枝
        tree['left'] = prune(tree['left'], lSet)       
    if isTree(tree['right']):     #如果右分支为树，对右分支进行分割
        tree['right'] =  prune(tree['right'], rSet)       
    if not isTree(tree['left']) and not isTree(tree['right']): #如果左右分支都为叶子节点，尝试合并        
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal']) #按照左右分支进行拆分数据集     
        errorNoMerge = sum(power(lSet[:,-1] - tree['left'],2))+sum(power(rSet[:,-1] - tree['right'],2))    #计算没有合并的SSR      
        treeMean = (tree['left']+tree['right'])/2.0      #计算合并后的左右叶子的值       
        errorMerge = sum(power(testData[:,-1] - treeMean,2))  #计算合并后的SSR    
        if errorMerge < errorNoMerge:   #判断是否满足合并条件
            print ("merging")
            return treeMean
        else: 
            return tree
    else: 
        return tree
        
def linearSolve(dataSet):   #helper function used in two places
    m,n = shape(dataSet)
    X = ones((m,n))
    Y = ones((m,1))    #create a copy of data with 1 in 0th postion，偏置项b
    X[:,1:n] = dataSet[:,0:n-1]
    Y = dataSet[:,-1]    #and strip out Y
    xTx=linalg.inv(dot(X.T,X))
    if linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n\
        try increasing the second value of ops')
    ws = dot(xTx ,dot(X.T , Y))
    return mat(ws),mat(X),mat(Y)

def modelLeaf(dataSet):#create linear model and return coeficients
    ws,X,Y = linearSolve(dataSet)
    return ws

def modelErr(dataSet):
    ws,X,Y = linearSolve(dataSet)
    yHat = X * ws.T
    return sum(power(Y - yHat,2))
    
def regTreeEval(model, inDat):
    return model.astype(float)

def modelTreeEval(model, inDat): 
    model=array(model).flatten()
    inDat=array(inDat).flatten()
    intercept=model[0]
    coef=model[1:]
    value=coef.dot(inDat.T)+intercept
    return value

def treeForeCast(tree, inData, modelEval=regTreeEval):
    if not isTree(tree):              #如果不为树，返回结果
        return modelEval(tree, inData) 
    if inData[tree['spInd']] > tree['spVal']:    #如果大于，走左分支
        if isTree(tree['left']):                 #如果左分支为树，递归继续调用此方法
            return treeForeCast(tree['left'], inData, modelEval)    
        else:                  #如果左分支已经不为树，返回此节点的值
            return modelEval(tree['left'], inData)
    else:              #如果小于等于，走又分支
        if isTree(tree['right']):     #如果为树，持续调用
            return treeForeCast(tree['right'], inData, modelEval)
        else:                     #如果为叶子节点，返回模型数据
            return modelEval(tree['right'], inData)
        
def createForeCast(tree, testData, modelEval=regTreeEval):
    m=len(testData)
    yHat = zeros(m)
    for i in range(m):
        yHat[i] = treeForeCast(tree, testData[i], modelEval)
    return yHat

#### 使用sklearn中的回归树

In [2]:
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
X,y=make_regression(n_features=6,n_samples=300)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)
sklearn_regressiontree=DecisionTreeRegressor().fit(X_train,y_train)
pre=sklearn_regressiontree.predict(X_test)
mean_squared_error(pre,y_test)

6185.83778735477

#### 使用sklearn中的随机森林

In [3]:
from sklearn.ensemble import RandomForestRegressor
sklearn_randomForest=RandomForestRegressor().fit(X_train,y_train)
pre=sklearn_randomForest.predict(X_test)
mean_squared_error(pre,y_test)

3105.0344932511844

#### 使用自写的CARD算法

In [4]:
train_data=concatenate([X_train,y_train.reshape(-1,1)],axis=1)
self_tree=createTree(train_data,leafType=modelLeaf,errType=modelErr,ops=(2,20))
treeForeCast(self_tree,X_test[1],modelEval=modelTreeEval)
pre=createForeCast(self_tree,X_test,modelEval=modelTreeEval)
mean_squared_error(pre,y_test)

1.3920293439025163e-25

#### 使用自写的回归树

In [5]:
self_tree=createTree(train_data,ops=(2,20))
pure_data=concatenate([X_test,y_test.reshape(-1,1)],axis=1)
self_tree=prune(self_tree,pure_data)
pre=createForeCast(self_tree,X_test)
mean_squared_error(pre,y_test)

9244.556536860908

#### 3.使用简单线性回归

In [6]:
from sklearn.linear_model import LinearRegression
sklearn_LR=LinearRegression().fit(X_train,y_train)
res_LR=sklearn_LR.predict(X_test)
mean_squared_error(res_LR,y_test)

4.366218892653465e-26

#### 使用sklearn中的SVR

In [7]:
from sklearn.svm import SVR
sklearn_SVR=SVR().fit(X_train,y_train)
pre=sklearn_SVR.predict(X_test)
mean_squared_error(pre,y_test)

17375.256784483907