# 车贷模型决策树与SVM等各类算法对比报告

In [83]:
import numpy as np

from pymongo import MongoClient
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import cross_validation
from sklearn.svm import SVR

with open('Data/data.csv') as f:
    line = f.readline()
    header = line.split(',')
data = np.genfromtxt('Data/data.csv', delimiter=',', skip_header=1, dtype=np.int)
targetData = np.genfromtxt('Data/TargetData.csv', delimiter=',', skip_header=1, dtype=np.int)

这里的数据是从mongo导入的，这处省去了所有的预处理流程，直接取出做算法选择的最终数据版本

In [84]:
print(header)
print(data[:5])

['carRegisterCertHosting', 'carTicketHosting', 'isDriversLicense', 'isGps', 'loanPurpose', 'marryCodition', 'carSpareKeyHosting', 'returnWay', 'gender', 'totalLoanAmount', 'totalLoanNum', 'rate', 'financeAmount', 'houseNum', 'due', 'totalReturnNum', 'toBeReturnAmoun', 'occupationYear', 'carBuyDate', 'age', 'carEstimatePrice', 'personalIncome\n']
[[ 0  0  0  0 -9  1  0  3  0  0 -1  3  0  0  0 -1  0  0 -1  0  0  0]
 [ 0  1  1  1  0  1  1  0  0  0  2  1  0 -2  1  2  0  0  0  0  0  0]
 [ 0  1  0  1  0  1  1  0  0  0 -1  1  0  0  0 -1  0  0  0  0  0  0]
 [ 0  0  0  0  0  1  0  0  0  0 -1  1  0 -2  0 -1  0  0 -1  0  0  0]
 [ 0  0  0  0  0  1  0  0  0  0 -1  1  0  0  0 -1  0  0 -1  0  0  0]]


## regression tree by cart

In [85]:
tree = DecisionTreeRegressor()

使用scikit中自带的交叉验证包进行交叉验证，默认使用R2值来评价算法。The coefficient R^2 is defined as (1 - u/v), where u is the regression sum of squares ((y_true - y_pred) \*\* 2).sum() and v is the residual sum of squares ((y_true - y_true.mean()) \*\* 2).sum(). Best possible score is 1.0, lower values are worse.

In [86]:
scores = cross_validation.cross_val_score(tree,data,targetData,cv=4);
scores

array([ 0.10911144,  0.36370299,  0.52843294, -0.23443236])

In [87]:
"R**2 accuracy:%0.2f (+/- %0.2f)" %(scores.mean(),scores.std())

'R**2 accuracy:0.19 (+/- 0.29)'

前面我们做了R方值的计算，下面我们利用sickit的接口重载score函数，实现方差总和的计算和评估。下面的几个算法类似，不再一一赘述

In [88]:
def VarScore(estimator,x,y):
    sy = estimator.predict(x)
    return np.sum((sy-y)**2)

In [89]:
scores = cross_validation.cross_val_score(tree,data,targetData,scoring=VarScore,cv=4);
scores

array([ 243393.94937135,   58211.02218424,   22614.31289472,
         62435.93252859])

In [90]:
"Variance accuracy:%0.2f (+/- %0.2f)" %(scores.mean(),scores.std())

'Variance accuracy:96663.80 (+/- 86115.08)'

## forests of randomized tree

In [91]:
tree = RandomForestRegressor(n_estimators=500)
scores = cross_validation.cross_val_score(tree,data,targetData,cv=4);
scores

array([ 0.10565607,  0.64645602,  0.65586768,  0.38465229])

In [92]:
"R**2 accuracy:%0.2f (+/- %0.2f)" %(scores.mean(),scores.std())

'R**2 accuracy:0.45 (+/- 0.23)'

In [93]:
scores = cross_validation.cross_val_score(tree,data,targetData,scoring=VarScore,cv=4);
scores

array([ 241536.22287113,   31645.98191715,   13160.12661786,
         30222.03699442])

In [94]:
"Variance accuracy:%0.2f (+/- %0.2f)" %(scores.mean(),scores.std())

'Variance accuracy:79141.09 (+/- 94040.58)'

##extremely randomized trees

In [95]:
tree = ExtraTreesRegressor(n_estimators=500)
scores = cross_validation.cross_val_score(tree,data,targetData,cv=4);
scores

array([ 0.11025522,  0.60804014,  0.65517982,  0.4768993 ])

In [96]:
"R**2 accuracy:%0.2f (+/- %0.2f)" %(scores.mean(),scores.std())

'R**2 accuracy:0.46 (+/- 0.21)'

In [97]:
scores = cross_validation.cross_val_score(tree,data,targetData,scoring=VarScore,cv=4);
scores

array([ 240351.99637109,   37086.08763625,   13254.39868946,
         27560.39992551])

In [98]:
"Variance accuracy:%0.2f (+/- %0.2f)" %(scores.mean(),scores.std())

'Variance accuracy:79563.22 (+/- 93218.15)'

##supported vector regression

In [99]:
svm = SVR()
scores = cross_validation.cross_val_score(svm,data,targetData);
scores

array([ 0.05248411,  0.27286523,  0.25357791])

In [100]:
"R**2 accuracy:%0.2f (+/- %0.2f)" %(scores.mean(),scores.std())

'R**2 accuracy:0.19 (+/- 0.10)'

In [101]:
scores = cross_validation.cross_val_score(tree,data,targetData,scoring=VarScore,cv=4);
scores

array([ 240475.15664861,   36545.88761531,   13077.56598608,
         26755.13160821])

In [102]:
"Variance accuracy:%0.2f (+/- %0.2f)" %(scores.mean(),scores.std())

'Variance accuracy:79213.44 (+/- 93476.85)'

#结论

根据上述计算结果，我们可以看到，从预测值与实际值的差值方差的总和的角度来验证时，各类算法效果差距不大，而从最流行的R方检验方法进行模型选择时，我们的结论是extra trees >= forest > cart > SVR，这里注意由于我们采用了交叉验证来做验证（将数据均分为4，每次取3份train，1份test），我们比较的是每一次验证结果的平均值，而且由于森林算法的随机性，这个结果并不是每次都相同，但R方值基本变动不大，所以我们可以大致得出上述的结论

以上只是针对车贷数据进行的模型效果对比，其中并不考虑计算性能的范畴，虽然我们都知道cart的性能肯定会高于随机森林（例子中的forests和extra trees均包含500个tree）