In [1]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

In [2]:
train_data = pd.read_csv('data/train_data.csv')
test_data = pd.read_csv('data/test_data.csv')

In [3]:
y_train = train_data['SalePrice']
x_train = train_data.drop(['Id', 'SalePrice'], axis=1)

x_test = test_data.drop(['Id'], axis=1)

In [4]:
# 定义均方根误差计算函数

from sklearn.model_selection import cross_val_score

def rmse_cv(model, x, y):
    rmse = np.sqrt(-cross_val_score(model, x, y, scoring='neg_mean_squared_error', cv=5))
    return rmse

In [5]:
# 计算各个模型的均方根误差

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR, LinearSVR
from xgboost import XGBRegressor

models = [LinearRegression(), Ridge(), Lasso(alpha=0.01,max_iter=10000), RandomForestRegressor(),
          GradientBoostingRegressor(), SVR(), LinearSVR(), XGBRegressor()]

names = ['LR', 'Ridge', 'Lasso', 'RF', 'GBR', 'SVR', 'LinSVR', 'Xgb']

for name, model in zip(names, models):
    score = rmse_cv(model, x_train, y_train)
    print('{}:{:.6f},{:.4f}'.format(name, score.mean(), score.std()))

LR:8449784977.720961,16899569955.1328
Ridge:0.149114,0.0270
Lasso:0.151502,0.0250
RF:0.153000,0.0103
GBR:0.128597,0.0092
SVR:0.173867,0.0148
LinSVR:0.268582,0.2228
Xgb:0.130202,0.0117


In [6]:
# 可以看到，普通线性回归的均方根误差太大，排除这个模型

# 查找最佳参数
from sklearn.model_selection import GridSearchCV
def grid_get(model, x, y, param):
    grid_search = GridSearchCV(model, param, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(x,y)
    print(grid_search.best_params_, np.sqrt(-grid_search.best_score_))
    print('*'*50)
    grid_search.cv_results_['mean_test_score'] = np.sqrt(-grid_search.cv_results_['mean_test_score'])
    print(pd.DataFrame(grid_search.cv_results_)[['params','mean_test_score','std_test_score']])

In [7]:
# Lasso
grid_get(Lasso(), x_train, y_train, {'alpha': [0.0004,0.0005,0.0007,0.0009],'max_iter':[10000]})

{'alpha': 0.0009, 'max_iter': 10000} 0.15101373974000018
**************************************************
                                 params  mean_test_score  std_test_score
0  {'alpha': 0.0004, 'max_iter': 10000}         0.151260        0.008695
1  {'alpha': 0.0005, 'max_iter': 10000}         0.151199        0.008695
2  {'alpha': 0.0007, 'max_iter': 10000}         0.151101        0.008695
3  {'alpha': 0.0009, 'max_iter': 10000}         0.151014        0.008698


In [8]:
# Ridge
grid_get(Ridge(), x_train, y_train, {'alpha':[35,40,45,50,55,60,65,70,80,90]})

{'alpha': 90} 0.14996932119190648
**************************************************
          params  mean_test_score  std_test_score
0  {'alpha': 35}         0.150670        0.008635
1  {'alpha': 40}         0.150579        0.008625
2  {'alpha': 45}         0.150495        0.008614
3  {'alpha': 50}         0.150417        0.008603
4  {'alpha': 55}         0.150345        0.008592
5  {'alpha': 60}         0.150279        0.008580
6  {'alpha': 65}         0.150217        0.008569
7  {'alpha': 70}         0.150160        0.008558
8  {'alpha': 80}         0.150057        0.008535
9  {'alpha': 90}         0.149969        0.008511


In [9]:
# SVR
grid_get(SVR(), x_train, y_train, {'C':[11,13,15],'kernel':["rbf"],"gamma":[0.0003,0.0004],"epsilon":[0.008,0.009]})

{'C': 15, 'epsilon': 0.008, 'gamma': 0.0004, 'kernel': 'rbf'} 0.1331892139655336
**************************************************
                                               params  mean_test_score  \
0   {'C': 11, 'epsilon': 0.008, 'gamma': 0.0003, '...         0.138261   
1   {'C': 11, 'epsilon': 0.008, 'gamma': 0.0004, '...         0.134483   
2   {'C': 11, 'epsilon': 0.009, 'gamma': 0.0003, '...         0.138338   
3   {'C': 11, 'epsilon': 0.009, 'gamma': 0.0004, '...         0.134423   
4   {'C': 13, 'epsilon': 0.008, 'gamma': 0.0003, '...         0.137734   
5   {'C': 13, 'epsilon': 0.008, 'gamma': 0.0004, '...         0.133830   
6   {'C': 13, 'epsilon': 0.009, 'gamma': 0.0003, '...         0.137759   
7   {'C': 13, 'epsilon': 0.009, 'gamma': 0.0004, '...         0.133838   
8   {'C': 15, 'epsilon': 0.008, 'gamma': 0.0003, '...         0.137198   
9   {'C': 15, 'epsilon': 0.008, 'gamma': 0.0004, '...         0.133189   
10  {'C': 15, 'epsilon': 0.009, 'gamma': 0.0003, '... 

In [10]:
#xgboost
grid_get(XGBRegressor(), x_train, y_train, {'max_depth':[1,2,3,4,5,6]})

{'max_depth': 6} 0.13040865513956423
**************************************************
             params  mean_test_score  std_test_score
0  {'max_depth': 1}         0.151889        0.002506
1  {'max_depth': 2}         0.135204        0.002240
2  {'max_depth': 3}         0.130727        0.003025
3  {'max_depth': 4}         0.130763        0.002979
4  {'max_depth': 5}         0.130754        0.003078
5  {'max_depth': 6}         0.130409        0.002254


In [11]:
# 综合对比，xgboost效果最好

xgb = XGBRegressor(max_depth=6)
xgb.fit(x_train, y_train)
y_xgb = np.expm1(xgb.predict(x_test))
submission = pd.DataFrame(data= {'Id' : test_data.Id, 'SalePrice': y_xgb})

submission.to_csv('data/submission.csv', index=False)

In [12]:
# 提交之后评分0.13438，2012th，46%，这里只做了简单的数据处理和建模，如果做了特征工程和PCA降维，在进行参数调优应该会有提升