In [1]:
from numpy import *

In [4]:
def load_dataset(fileName):
    """Load a tab-separated text file of numbers into a list of rows.

    Parameters
    ----------
    fileName : str or path-like
        Path to a text file in which every non-blank line holds
        tab-separated fields that ``float`` can parse.

    Returns
    -------
    list of list of float
        One inner list per non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager closes the handle even if parsing fails part-way.
    with open(fileName) as fr:
        # Iterating the handle streams the file instead of loading every
        # line into memory first.
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at end of file)
                # carry no data and would make float('') raise.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat

In [6]:
# Load the 'ex00.txt' sample file with load_dataset and preview the first
# five rows (the slice is the cell's last expression, so it is displayed).
dataMat=load_dataset('ex00.txt')
dataMat[0:5]

[[0.036098, 0.155096],
 [0.993349, 1.077553],
 [0.530897, 0.893462],
 [0.712386, 0.564858],
 [0.343554, -0.3717]]

In [7]:
# Binary split of a data set on one feature column.
def binSplitDataSet(dataSet, features, value):
    """Partition the rows of ``dataSet`` by comparing one column to ``value``.

    Parameters
    ----------
    dataSet : 2-D numpy array or matrix
        Rows are samples, columns are features.
    features : int
        Index of the column to compare.
    value : number
        Threshold for the comparison.

    Returns
    -------
    (small, big)
        ``small`` holds the rows whose column entry is ``<= value``,
        ``big`` the rows whose entry is ``> value``.
    """
    column = dataSet[:, features]
    # Both comparisons are evaluated explicitly (rather than negating one
    # mask) so that a row failing both tests lands in neither half.
    lowRows = nonzero(column <= value)[0]
    highRows = nonzero(column > value)[0]
    return dataSet[lowRows, :], dataSet[highRows, :]

In [40]:
# Split the loaded data on feature 0 at the threshold 0.5 and print both
# halves. asmatrix replaces the mat alias, which NumPy 2.0 removed;
# asmatrix is available in older and newer NumPy releases alike.
small,big=binSplitDataSet(asmatrix(dataMat),0,0.5)
print(small)
print(big)

[[  3.60980000e-02   1.55096000e-01]
 [  3.43554000e-01  -3.71700000e-01]
 [  9.80160000e-02  -3.32760000e-01]
 [  9.13580000e-02   9.99350000e-02]
 [  1.46366000e-01   3.42830000e-02]
 [  1.83510000e-01   1.84843000e-01]
 [  3.39563000e-01   2.06783000e-01]
 [  8.30700000e-03   6.99760000e-02]
 [  2.43909000e-01  -2.94670000e-02]
 [  3.06964000e-01  -1.77321000e-01]
 [  3.64920000e-02   4.08155000e-01]
 [  2.95511000e-01   2.88200000e-03]
 [  2.02054000e-01  -8.77440000e-02]
 [  3.77201000e-01  -2.43550000e-01]
 [  7.22430000e-02  -4.20983000e-01]
 [  4.10230000e-01   3.31722000e-01]
 [  1.01149000e-01   6.88340000e-02]
 [  4.88130000e-01  -9.77910000e-02]
 [  4.75976000e-01  -1.63707000e-01]
 [  2.73147000e-01  -4.55219000e-01]
 [  7.47950000e-02  -3.49692000e-01]
 [  9.84900000e-03   5.65940000e-02]
 [  3.02217000e-01  -1.48650000e-01]
 [  1.80506000e-01   1.03676000e-01]
 [  1.93641000e-01  -3.27589000e-01]
 [  3.43479000e-01   1.75264000e-01]
 [  1.45809000e-01   1.36979000e-01]
 

In [41]:
# Leaf model: the mean of the target values of the samples in a leaf.
def meanLeaf(dataSet):
    """Return the mean of the last column of ``dataSet``.

    ``dataSet`` is a 2-D numpy array or matrix; its last column is
    treated as the target (label) column.
    """
    targets = dataSet[:, -1]
    return mean(targets)

In [42]:
# Check meanLeaf on the full data set and on the two halves produced by the
# earlier split. asmatrix replaces the mat alias, which NumPy 2.0 removed.
print('全部数据的均值',meanLeaf(asmatrix(dataMat) ) )
print('均值1',meanLeaf(small) )
print('均值2',meanLeaf(big))

全部数据的均值 0.571743005
均值1 -0.0446502857143
均值2 1.01809676724


In [46]:
# Error measure: total squared deviation of the targets in a data set.
def regErr(dataSet):
    """Return the variance of the last column times the number of rows.

    This equals the sum of squared deviations of the target column from
    its mean, which is the quantity the tree tries to reduce when splitting.
    """
    targets = dataSet[:, -1]
    rowCount = shape(dataSet)[0]
    return var(targets) * rowCount

In [47]:
# Check regErr on the full data set and on the two halves of the earlier
# split. The printed value is the total squared error (variance times row
# count). asmatrix replaces the mat alias, which NumPy 2.0 removed.
print('全部数据的方差',regErr(asmatrix(dataMat) ) )
print('方差1',regErr(small))
print('方差2',regErr(big))

全部数据的方差 63.380639386
方差1 3.38241970178
方差2 4.97232681808


In [48]:
def regErr2(rows):
    """Sum of squared deviations of the last entry of each row.

    Pure-Python counterpart of ``regErr`` that works on any sequence of
    indexable rows (for example a list of lists or a 2-D array).

    Parameters
    ----------
    rows : sequence of sequences
        The last element of every row is read as the target value.

    Returns
    -------
    number
        ``0`` for an empty input, otherwise the sum over rows of
        ``(target - mean_of_targets) ** 2``.
    """
    if len(rows) == 0:
        return 0
    targets = [float(row[-1]) for row in rows]
    center = sum(targets) / len(targets)
    squaredDeviations = [(t - center) ** 2 for t in targets]
    return sum(squaredDeviations)

In [49]:
# Check regErr2 on the full data set and on the two halves of the earlier
# split. small and big are converted with array() before being passed to
# regErr2, which indexes each row individually.
print('全部数据的方差',regErr2(dataMat)  )
print('方差1',regErr2(array(small)))
print('方差2',regErr2(array(big)))

全部数据的方差 63.380639386
方差1 3.38241970178
方差2 4.97232681808


In [50]:
def chooseBestSplit(dataSet,leafType=meanLeaf,errType=regErr,ops=(1,4)):
    """Find the best binary split of ``dataSet``, or decide it is a leaf.

    Every feature column and every distinct value in it is tried as a
    threshold; the candidate with the smallest summed error of the two
    halves wins.

    Steps:
        for each feature column
            for each distinct value in that column
                split the data set in two on that value
                compute the summed error of the two halves
                if it is below the best error so far, remember the
                column index, the value and the error

    Parameters
    ----------
    dataSet : 2-D numpy matrix or array
        Rows are samples; the last column holds the target values.
    leafType : callable
        Called with ``dataSet`` to produce the leaf value when no split
        is made.
    errType : callable
        Called with a data set, returns its error.
    ops : (errorLoss, minSample)
        ``errorLoss`` is the minimum error reduction a split must achieve;
        ``minSample`` is the minimum number of rows each half must keep.

    Returns
    -------
    (featureIndex, splitValue)
        For a split. ``(None, leafValue)`` is returned instead when the
        targets are all identical, when no candidate leaves at least
        ``minSample`` rows on both sides while lowering the error, or when
        the best reduction is smaller than ``errorLoss``.
    """
    # If all targets are identical there is nothing left to separate.
    # asarray(...).ravel() flattens the column for matrices and plain arrays.
    targets = asarray(dataSet[:, -1]).ravel().tolist()
    if len(set(targets)) == 1:
        return None, leafType(dataSet)

    errorLoss = ops[0]  # minimum required error reduction
    minSample = ops[1]  # minimum number of rows on each side of a split

    originalError = errType(dataSet)  # error of the unsplit data set
    bestError = originalError
    # None marks "no admissible split found yet"; a numeric placeholder
    # could be mistaken for a real candidate that was never evaluated.
    bestSplitIndex = None
    bestSplitValue = None

    featureCount = shape(dataSet)[1] - 1  # the last column is the target
    for featIndex in range(featureCount):
        uniqueValues = set(asarray(dataSet[:, featIndex]).ravel().tolist())
        for splitValue in uniqueValues:
            small, big = binSplitDataSet(dataSet, featIndex, splitValue)
            if shape(small)[0] < minSample or shape(big)[0] < minSample:
                continue
            totalError = errType(small) + errType(big)  # summed error
            if totalError < bestError:
                bestError = totalError
                bestSplitIndex = featIndex
                bestSplitValue = splitValue

    # No candidate satisfied the size constraint while lowering the error.
    if bestSplitIndex is None:
        return None, leafType(dataSet)
    # The best split does not reduce the error enough to be worth making.
    if originalError - bestError < errorLoss:
        return None, leafType(dataSet)
    # The winning candidate already passed the minSample check inside the
    # loop, so no second size check is needed here.
    return bestSplitIndex, bestSplitValue

In [51]:
# Recursively build the regression tree.
def createTree(dataSet,leafType=meanLeaf,errType=regErr,ops=(1,4)):
    """Build a regression tree for ``dataSet`` as nested dictionaries.

    Parameters
    ----------
    dataSet
        The data set to split; the last column holds the target values.
    leafType : callable
        Produces the value stored at a leaf. ``meanLeaf`` stores the mean
        of the targets of the samples in the leaf; another model could be
        supplied instead.
    errType : callable
        Error measure used to judge a split (variance-based for
        regression; entropy or Gini would play this role for
        classification).
    ops : (errorLoss, minSample)
        Minimum error reduction required for a split, and the minimum
        number of samples each side of a split must keep.

    Returns
    -------
    dict or leaf value
        A leaf value as produced by ``leafType`` when no split is made,
        otherwise a dict with keys ``'spInd'`` (feature index),
        ``'spVal'`` (threshold), ``'left'`` (subtree for rows
        ``<= spVal``) and ``'right'`` (subtree for rows ``> spVal``).
    """
    bestFeatureIndex, bestSplitValue = chooseBestSplit(dataSet, leafType, errType, ops)
    # chooseBestSplit signals "make a leaf" with a None index; in that case
    # the second element is the leaf value itself.
    if bestFeatureIndex is None:
        return bestSplitValue

    small, big = binSplitDataSet(dataSet, bestFeatureIndex, bestSplitValue)
    leftTree = createTree(small, leafType, errType, ops)
    rightTree = createTree(big, leafType, errType, ops)

    return {
        'spInd': bestFeatureIndex,
        'spVal': bestSplitValue,
        'left': leftTree,
        'right': rightTree,
    }

In [52]:
# Build a tree on 'ex00.txt' with a required error reduction of 0.1 and a
# minimum of 1 sample per side. asmatrix replaces the mat alias, which
# NumPy 2.0 removed.
dataMat=load_dataset('ex00.txt')
myMat=asmatrix(dataMat)
createTree(myMat,ops=(0.1,1))

{'left': {'left': {'left': {'left': 0.097105625000000001,
    'right': {'left': -0.23431800000000003,
     'right': {'left': 0.042778315789473685,
      'right': {'left': -0.17738871428571426,
       'right': -0.033359055555555552,
       'spInd': 0,
       'spVal': 0.23807},
      'spInd': 0,
      'spVal': 0.188218},
     'spInd': 0,
     'spVal': 0.081931},
    'spInd': 0,
    'spVal': 0.048014},
   'right': 0.15461575,
   'spInd': 0,
   'spVal': 0.325412},
  'right': {'left': -0.19291407692307688,
   'right': {'left': 0.33172200000000002,
    'right': -0.066808375000000003,
    'spInd': 0,
    'spVal': 0.41023},
   'spInd': 0,
   'spVal': 0.406649},
  'spInd': 0,
  'spVal': 0.343479},
 'right': {'left': {'left': 1.082699551724138,
   'right': 1.4777560000000001,
   'spInd': 0,
   'spVal': 0.61127},
  'right': {'left': 0.69501299999999999,
   'right': 0.99223373493975886,
   'spInd': 0,
   'spVal': 0.625336},
  'spInd': 0,
  'spVal': 0.620599},
 'spInd': 0,
 'spVal': 0.48813}

In [53]:
# Build a tree on 'ex0.txt' with the default ops. Note that this rebinds
# dataMat and myMat, so later cells that use them see this data set.
# asmatrix replaces the mat alias, which NumPy 2.0 removed.
dataMat=load_dataset('ex0.txt')
myMat=asmatrix(dataMat)
createTree(myMat)

{'left': {'left': -0.023838155555555553,
  'right': 1.0289583666666666,
  'spInd': 1,
  'spVal': 0.197834},
 'right': {'left': 1.980035071428571,
  'right': {'left': 2.9836209534883724,
   'right': 3.9871631999999999,
   'spInd': 1,
   'spVal': 0.797583},
  'spInd': 1,
  'spVal': 0.582002},
 'spInd': 1,
 'spVal': 0.39435}

In [54]:
# Rebuild the tree from the current myMat with ops=(0,4): no minimum error
# reduction is required, and each side of a split must keep at least 4 rows.
createTree(myMat,ops=(0,4))

{'left': {'left': {'left': {'left': {'left': {'left': -0.033111799999999997,
      'right': -0.14647450000000001,
      'spInd': 1,
      'spVal': 0.016241},
     'right': {'left': 0.12067520000000001,
      'right': {'left': -0.066625399999999987,
       'right': -0.0093720000000000001,
       'spInd': 1,
       'spVal': 0.105279},
      'spInd': 1,
      'spVal': 0.081306},
     'spInd': 1,
     'spVal': 0.052031},
    'right': -0.14759542857142857,
    'spInd': 1,
    'spVal': 0.129291},
   'right': {'left': 0.10609520000000001,
    'right': 0.043393999999999995,
    'spInd': 1,
    'spVal': 0.176202},
   'spInd': 1,
   'spVal': 0.148654},
  'right': {'left': {'left': 1.1841675,
    'right': {'left': 0.97744150000000007,
     'right': {'left': 1.1668217999999999,
      'right': 1.0299635,
      'spInd': 1,
      'spVal': 0.303094},
     'spInd': 1,
     'spVal': 0.238619},
    'spInd': 1,
    'spVal': 0.217152},
   'right': {'left': 0.80301928571428582,
    'right': 1.10786949999999