In [2]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib 
import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats.stats import pearsonr

# 读取数据集
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 处理 SalePrice 
prices = pd.DataFrame({'price':train['SalePrice'], 'log(price+1)':np.log1p(train['SalePrice'])})
# prices.hist()
train['SalePrice'] = np.log1p(train['SalePrice'])

# GrLivArea 
train['GrLivArea'] = train['GrLivArea'][train['GrLivArea']<4500]

# 合并训练集和测试机
all_data = pd.concat((train.loc[:,"MSSubClass":"SaleCondition"],
                     test.loc[:, "MSSubClass":"SaleCondition"]))

# 处理 偏斜>0.5
numeric_feats = all_data.dtypes[all_data.dtypes!='object'].index
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > 0.5]
skewed_feats = skewed_feats.index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# 转换成虚拟变量
all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(all_data.mean())

X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train["SalePrice"]





## Models

In [7]:
from sklearn.linear_model import Ridge, RidgeCV, ElasticNetCV, LassoCV, LassoLarsCV
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline 

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


kfolds = KFold(n_splits=10, shuffle=True, random_state=42)


def rmse_cv(model):
    rmse = np.sqrt(-cross_val_score(model, X_train, y, scoring='neg_mean_squared_error', cv=5))
    return rmse





"""Ridge"""
alpha_ridge = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alpha_ridge, cv=kfolds))

"""Lasso"""
alpha_lasso = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
lasso = make_pipeline(RobustScaler(), LassoCV(max_iter=1e6,
                                               alphas=alpha_lasso,
                                               random_state=42,
                                               cv=kfolds))
"""Elasticnet"""
alpha_ela = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]
ela = make_pipeline(RobustScaler(), ElasticNetCV(max_iter=1e6,
                                                   alphas=alpha_ela,
                                                   cv=kfolds,
                                                   random_state=42,
                                                    l1_ratio = e_l1ratio
                                                   ))
"""SVR"""
svr = make_pipeline(RobustScaler(), SVR(C=20, epsilon=0.008, gamma=0.0003))

"""xgboost"""
xgboost = XGBRegressor(learning_rate=0.01, n_estimators=3460,
                      max_depth=3, min_child_weight=0,
                      gamma = 0, subsample=0.7,
                      colsample_bytree=0.7,
                      objective='reg:linear', nthread=-1,
                      scale_pos_weight=1, seed=27,
                      reg_alpha=0.00006, random_state=42)

"""gbr"""
gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                          max_depth=4, max_features='sqrt',
                          min_samples_leaf=15, min_samples_split=10,
                          loss='huber', random_state=42)

"""lightgbm"""
lightgbm = LGBMRegressor(objective='regression', num_leaves=4,
                        learning_rate=0.01, n_estimators=5000,
                        max_bin=200, bagging_fraction=0.75,
                        bagging_freq=5, bagging_seed=7,
                        feature_fraction=0.2, feature_fraction_seed=7,
                        verbose=-1)


score = rmse_cv(ridge)
print('Ridge: ', score.mean(), score.std())
score = rmse_cv(lasso)
print('Lasso: ', score.mean(), score.std())
score = rmse_cv(ela)
print('ela: ', score.mean(), score.std())
score = rmse_cv(svr)
print('svr: ', score.mean(), score.std())
score = rmse_cv(xgboost)
print('xgboost: ', score.mean(), score.std())
score = rmse_cv(gbr)
print('gbr: ', score.mean(), score.std())
score = rmse_cv(lightgbm)
print('lightgbm: ', score.mean(), score.std())


ridge = ridge.fit(X_train, y)
lasso = lasso.fit(X_train, y)
ela = ela.fit(X_train, y)
xgboost = xgboost.fit(X_train, y)
gbr = gbr.fit(X_train, y)
lightgbm = lightgbm.fit(X_train, y)


pred = 0.3*lasso.predict(X_test) + \
        0.1*ridge.predict(X_test) + \
        0.1*ela.predict(X_test) + \
        0.3*xgboost.predict(X_test) + \
        0.1*gbr.predict(X_test) + \
        0.1*lightgbm.predict(X_test)


res = pd.DataFrame({'id':test.Id, 'SalePrice': np.expm1(pred)})
res.to_csv('submission_2019_12_19.csv', index=False)

Ridge:  0.11960542780292541 0.009911592210712727
Lasso:  0.11586094759249402 0.00876720013648625
ela:  0.11585838827804329 0.008719850771803548
svr:  0.1178344144911537 0.013002576968564682


  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \


xgboost:  0.11600221080228863 0.007935821866155839
gbr:  0.12105995120268738 0.010724166654190187
lightgbm:  0.11982326360008769 0.00814791588128775
