### Description：
> 这是从kaggle题解上找到的一个版本， 这个版本可以将准确度提升到0.11473,比之前的版本强了一些，这次主要是分析一下人家的处理方式以及看看能不能进行改进

In [33]:
"""导入包"""
import pandas as pd
import numpy as np
import pandas_profiling as ppf

# 数据预处理
from sklearn.preprocessing import RobustScaler, StandardScaler#去除异常值与数据标准化
from sklearn.preprocessing import Imputer
from scipy.stats import skew  # for some statistics  偏度
from scipy.stats import boxcox_normmax
from scipy.special import boxcox1p
from mlxtend.regressor import StackingCVRegressor

# 模型选择
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

# 创建模型
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNetCV, LassoCV, RidgeCV, SGDRegressor, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# 管道机制
from sklearn.pipeline import Pipeline, make_pipeline

# 集成技术
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

In [4]:
"""导入数据集，并查看"""
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')
print("Train set size:", train.shape)
print("Test set size:", test.shape)

Train set size: (1460, 81)
Test set size: (1459, 80)


In [6]:
"""数据预处理"""
print('START data processing', datetime.now(), )
train_ID = train['Id']
test_ID = test['Id']

# Now drop the  'Id' colum since it's unnecessary for  the prediction process.
train.drop(['Id'], axis=1, inplace=True)
test.drop(['Id'], axis=1, inplace=True)

# Deleting outliers
train = train[train.GrLivArea < 4500]   # 数据清洗时，会将带空值的行删除，此时DataFrame或Series类型的数据不再是连续的索引
train.reset_index(drop=True, inplace=True)   # 可以使用reset_index()重置索引。

START data processing 2019-11-17 19:11:28.685282


KeyError: 'Id'

In [7]:
train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [8]:
# We use the numpy fuction log1p which  applies log(1+x) to all elements of the column
train["SalePrice"] = np.log1p(train["SalePrice"])      #  对于回归的， 一般会将最后标签转成对数的， 经验
y = train.SalePrice.reset_index(drop=True)
train_features = train.drop(['SalePrice'], axis=1)
test_features = test

features = pd.concat([train_features, test_features]).reset_index(drop=True)
print(features.shape)

(2917, 79)


In [10]:
"""FeatureEDA分析"""
ppf.ProfileReport(features)



In [12]:
"""数据的清洗与处理"""
# Some of the non-numeric predictors are stored as numbers; we convert them into strings 
## 一些明明不代表数值类型的一些属性，是数值型的，需要转成字符串
features['MSSubClass'] = features['MSSubClass'].apply(str)
features['YrSold'] = features['YrSold'].astype(str)
features['MoSold'] = features['MoSold'].astype(str)

## 缺失值的处理
features['Functional'] = features['Functional'].fillna('Typ')
features['Electrical'] = features['Electrical'].fillna("SBrkr")
features['KitchenQual'] = features['KitchenQual'].fillna("TA")
features['Exterior1st'] = features['Exterior1st'].fillna(features['Exterior1st'].mode()[0])   # 众数填充
features['Exterior2nd'] = features['Exterior2nd'].fillna(features['Exterior2nd'].mode()[0])
features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])

features["PoolQC"] = features["PoolQC"].fillna("None")

for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    features[col] = features[col].fillna(0)
for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    features[col] = features[col].fillna('None')
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    features[col] = features[col].fillna('None')

features['MSZoning'] = features.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))

objects = []
for i in features.columns:
    if features[i].dtype == object:
        objects.append(i)

features.update(features[objects].fillna('None'))

features['LotFrontage'] = features.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

# Filling in the rest of the NA's

numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics = []
for i in features.columns:
    if features[i].dtype in numeric_dtypes:
        numerics.append(i)
features.update(features[numerics].fillna(0))

numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics2 = []
for i in features.columns:
    if features[i].dtype in numeric_dtypes:
        numerics2.append(i)

skew_features = features[numerics2].apply(lambda x: skew(x)).sort_values(ascending=False)

high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index

for i in skew_index:
    features[i] = boxcox1p(features[i], boxcox_normmax(features[i] + 1))    # 偏态的转成正态

features = features.drop(['Utilities', 'Street', 'PoolQC',], axis=1)

In [13]:
"""特征工程"""
features['YrBltAndRemod']=features['YearBuilt']+features['YearRemodAdd']
features['TotalSF']=features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']

features['Total_sqr_footage'] = (features['BsmtFinSF1'] + features['BsmtFinSF2'] +
                                 features['1stFlrSF'] + features['2ndFlrSF'])

features['Total_Bathrooms'] = (features['FullBath'] + (0.5 * features['HalfBath']) +
                               features['BsmtFullBath'] + (0.5 * features['BsmtHalfBath']))

features['Total_porch_sf'] = (features['OpenPorchSF'] + features['3SsnPorch'] +
                              features['EnclosedPorch'] + features['ScreenPorch'] +
                              features['WoodDeckSF'])

# simplified features
features['haspool'] = features['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
features['has2ndfloor'] = features['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
features['hasgarage'] = features['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
features['hasbsmt'] = features['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
features['hasfireplace'] = features['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

print(features.shape)
features = StandardScaler().fit_transform(features)
final_features = pd.get_dummies(features).reset_index(drop=True)
print(final_features.shape)

X = final_features.iloc[:len(y), :]
X_sub = final_features.iloc[len(X):, :]

print('X', X.shape, 'y', y.shape, 'X_sub', X_sub.shape)

outliers = [30, 88, 462, 631, 1322]
X = X.drop(X.index[outliers])
y = y.drop(y.index[outliers])

overfit = []
for i in X.columns:
    counts = X[i].value_counts()
    zeros = counts.iloc[0]
    if zeros / len(X) * 100 > 99.94:
        overfit.append(i)

overfit = list(overfit)
overfit.append('MSZoning_C (all)')

X = X.drop(overfit, axis=1).copy()
X_sub = X_sub.drop(overfit, axis=1).copy()

print('X', X.shape, 'y', y.shape, 'X_sub', X_sub.shape)

(2917, 86)
(2917, 333)
X (1458, 333) y (1458,) X_sub (1459, 333)
X (1453, 331) y (1453,) X_sub (1459, 331)


In [14]:
"""评估算法"""
models = {}

models['LR'] = LinearRegression()
models['Ridge'] = Ridge()
models['Lasso'] = Lasso(alpha=0.01,max_iter=10000)
models['RF'] = RandomForestRegressor()
models['GBR'] = GradientBoostingRegressor()
models['LinSVR'] = LinearSVR()
models['SGD'] = SGDRegressor(max_iter=1000,tol=1e-3)
models['Extra'] = ExtraTreesRegressor()
models['Xgb'] = XGBRegressor(n_estimators=400)
models['lgb'] = LGBMRegressor()

# 评估算法
results = []
for key in models:
    kfold = KFold(n_splits=10, random_state=7)   # Kfold 交叉验证函数
    cv_results = np.sqrt(-cross_val_score(models[key], X, y, cv=kfold, scoring='neg_mean_squared_error'))
    results.append(cv_results)
    print('%s: %f (%f)' %(key, cv_results.mean(), cv_results.std()))

LR: 5795.946437 (8639.292496)
Ridge: 0.105527 (0.011103)
Lasso: 0.131867 (0.006973)
RF: 0.140327 (0.013461)
GBR: 0.115423 (0.012350)
LinSVR: 0.209274 (0.057978)
SGD: 7878698944594970.000000 (5816493195796967.000000)
Extra: 0.139130 (0.017945)
Xgb: 0.109480 (0.010629)
lgb: 0.115645 (0.012453)


In [16]:
"""优化模型"""
cv_params = {'n_estimators': [400, 500, 600, 700, 800]}
other_params = {'learning_rate': 0.1, 'n_estimators': 500, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
                    'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}

model = XGBRegressor(**other_params)
grid = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=4)
grid.fit(X, y)

print("最高得分：%.3f" % grid.best_score_)
print("最优参数: %s" % grid.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:   45.3s finished


最高得分：0.915
最优参数: {'n_estimators': 400}


In [17]:
##定义交叉方式，先指定模型后指定参数，方便测试多个模型，网格交叉验证
class grid():
    def __init__(self,model):
        self.model = model
    
    def grid_get(self,X,y,param_grid):
        grid_search = GridSearchCV(self.model,param_grid,cv=5, scoring="neg_mean_squared_error")
        grid_search.fit(X,y)
        print(grid_search.best_params_, np.sqrt(-grid_search.best_score_))
        grid_search.cv_results_['mean_test_score'] = np.sqrt(-grid_search.cv_results_['mean_test_score'])
        print(pd.DataFrame(grid_search.cv_results_)[['params','mean_test_score','std_test_score']])

In [18]:
grid(Ridge()).grid_get(X,y,{'alpha':[35,40,45,50,55,60,65,70,80,90]})

{'alpha': 35} 0.10680472377879473
          params  mean_test_score  std_test_score
0  {'alpha': 35}         0.106805        0.001019
1  {'alpha': 40}         0.107146        0.001013
2  {'alpha': 45}         0.107487        0.001008
3  {'alpha': 50}         0.107823        0.001003
4  {'alpha': 55}         0.108153        0.000998
5  {'alpha': 60}         0.108476        0.000993
6  {'alpha': 65}         0.108791        0.000988
7  {'alpha': 70}         0.109098        0.000983
8  {'alpha': 80}         0.109690        0.000972
9  {'alpha': 90}         0.110254        0.000962


In [21]:
"""验证模型"""
model = XGBRegressor(n_estimators=400)
model.fit(X, y)
pred = np.expm1(model.predict(X_sub))  #进行预测

result=pd.DataFrame({'Id':test_ID, 'SalePrice':pred})
result.to_csv("result/submission1.csv",index=False)
"""这个误差率达到0.1263"""



In [23]:
model = Ridge(alpha=35)
model.fit(X, y)
pred = np.expm1(model.predict(X_sub))  #进行预测

result=pd.DataFrame({'Id':test_ID, 'SalePrice':pred})
result.to_csv("result/submission2.csv",index=False)

"""这个误差率达到0.11833"""

### 下面使用模型的集成技术， 单个模型的效果不大好
> 思路： 也是先训练，然后进行模型集成

#### 模型集成技术版本一 

In [24]:
# rmsle
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))


# build our model scoring function
def cv_rmse(model, X=X):
    rmse = np.sqrt(-cross_val_score(model, X, y,
                                    scoring="neg_mean_squared_error",
                                    cv=kfolds))
    return (rmse)

In [27]:
kfolds = KFold(n_splits=10, random_state=7)
# setup models    
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

ridge = make_pipeline(RobustScaler(),
                      RidgeCV(alphas=alphas_alt, cv=kfolds))

lasso = make_pipeline(RobustScaler(),
                      LassoCV(max_iter=1e7, alphas=alphas2,
                              random_state=42, cv=kfolds))

elasticnet = make_pipeline(RobustScaler(),
                           ElasticNetCV(max_iter=1e7, alphas=e_alphas,
                                        cv=kfolds, l1_ratio=e_l1ratio))
                                        
svr = make_pipeline(RobustScaler(),
                      SVR(C= 20, epsilon= 0.008, gamma=0.0003,))


gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =42)
                                   

lightgbm = LGBMRegressor(objective='regression', 
                                       num_leaves=4,
                                       learning_rate=0.01, 
                                       n_estimators=5000,
                                       max_bin=200, 
                                       bagging_fraction=0.75,
                                       bagging_freq=5, 
                                       bagging_seed=7,
                                       feature_fraction=0.2,
                                       feature_fraction_seed=7,
                                       verbose=-1,
                                       #min_data_in_leaf=2,
                                       #min_sum_hessian_in_leaf=11
                                       )
                                       

xgboost = XGBRegressor(learning_rate=0.01, n_estimators=3460,
                                     max_depth=3, min_child_weight=0,
                                     gamma=0, subsample=0.7,
                                     colsample_bytree=0.7,
                                     objective='reg:linear', nthread=-1,
                                     scale_pos_weight=1, seed=27,
                                     reg_alpha=0.00006)

#stack
stack_gen = StackingCVRegressor(regressors=(ridge, lasso, elasticnet,
                                            gbr, xgboost, lightgbm),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)
     
    
stack_gen_model = stack_gen.fit(np.array(X), np.array(y))
elastic_model_full_data = elasticnet.fit(X, y)
lasso_model_full_data = lasso.fit(X, y)
ridge_model_full_data = ridge.fit(X, y)
svr_model_full_data = svr.fit(X, y)
gbr_model_full_data = gbr.fit(X, y)
xgb_model_full_data = xgboost.fit(X, y)
lgb_model_full_data = lightgbm.fit(X, y)
 

score = cv_rmse(ridge)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )

score = cv_rmse(lasso)
print("Lasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

score = cv_rmse(elasticnet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

score = cv_rmse(svr)
print("SVR score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )

score = cv_rmse(lightgbm)
print("Lightgbm score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )

score = cv_rmse(gbr)
print("GradientBoosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )

score = cv_rmse(xgboost)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()) )

Kernel Ridge score: 0.1027 (0.0097)

Lasso score: 0.1024 (0.0106)

ElasticNet score: 0.1025 (0.0106)

SVR score: 0.1011 (0.0123)

Lightgbm score: 0.1060 (0.0112)

GradientBoosting score: 0.1066 (0.0109)

Xgboost score: 0.1061 (0.0116)



In [28]:
def blend_models_predict(X):
    return ((0.1 * elastic_model_full_data.predict(X)) + \
            (0.05 * lasso_model_full_data.predict(X)) + \
            (0.1 * ridge_model_full_data.predict(X)) + \
            (0.1 * svr_model_full_data.predict(X)) + \
            (0.1 * gbr_model_full_data.predict(X)) + \
            (0.15 * xgb_model_full_data.predict(X)) + \
            (0.1 * lgb_model_full_data.predict(X)) + \
            (0.3 * stack_gen_model.predict(np.array(X))))

In [29]:
print('RMSLE score on train data:')
print(rmsle(y, blend_models_predict(X)))

RMSLE score on train data:
0.054877057774034385


In [31]:
pred =np.expm1(blend_models_predict(X_sub))

result=pd.DataFrame({'Id':test_ID, 'SalePrice':pred})
result.to_csv("result/submission3.csv",index=False)

"""这个误差率达到0.11473"""

####  模型集成技术版本二

In [39]:
def rmse_cv(model,X,y):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5))
    return rmse

In [34]:
##定义加权平均值，就相当于自己写fit_transform（）
class AverageWeight(BaseEstimator, RegressorMixin):
    def __init__(self,mod,weight):
        self.mod = mod##模型的个数
        self.weight = weight##权重
        
    def fit(self,X,y):
        self.models_ = [clone(x) for x in self.mod]
        for model in self.models_:
            model.fit(X,y)
        return self
    
    def predict(self,X):
        w = list()
        pred = np.array([model.predict(X) for model in self.models_])
        # 针对于每一个数据点，单一的模型是乘以权重，然后加起来
        for data in range(pred.shape[1]):
            single = [pred[model,data]*weight for model,weight in zip(range(pred.shape[0]),self.weight)]
            w.append(np.sum(single))
        return w

In [35]:
#指定每一个算法的参数
lasso = Lasso(alpha=0.0005,max_iter=10000)
ridge = Ridge(alpha=60)
svr = SVR(gamma= 0.0004,kernel='rbf',C=13,epsilon=0.009)
ker = KernelRidge(alpha=0.2 ,kernel='polynomial',degree=3 , coef0=0.8)
ela = ElasticNet(alpha=0.005,l1_ratio=0.08,max_iter=10000)
bay = BayesianRidge()

In [36]:
##6个权重
w1 = 0.02
w2 = 0.2
w3 = 0.25
w4 = 0.3
w5 = 0.03
w6 = 0.2

In [37]:
weight_avg = AverageWeight(mod = [lasso,ridge,svr,ker,ela,bay],weight=[w1,w2,w3,w4,w5,w6])

In [41]:
rmse_cv(weight_avg,X,y),  rmse_cv(weight_avg,X,y).mean()##计算出交叉验证的均值

"""这个效果不太好"""

(array([0.18867356, 0.18366984, 0.17452098, 0.15902207, 0.18394709]),
 0.17796670996092384)

###  下面使用模型堆叠技术

In [43]:
class stacking(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self,mod,meta_model):
        self.mod = mod
        self.meta_model = meta_model#元模型
        self.kf = KFold(n_splits=5, random_state=42, shuffle=True)##这就是堆叠的最大特征进行了几折的划分
        
    def fit(self,X,y):
        self.saved_model = [list() for i in self.mod]
        oof_train = np.zeros((X.shape[0], len(self.mod)))
        
        for i,model in enumerate(self.mod):#返回的是索引和模型本身
            for train_index, val_index in self.kf.split(X,y):##返回的是数据本省
                renew_model = clone(model)##模型的复制
                renew_model.fit(X[train_index], y[train_index])#对数据进行训练
                self.saved_model[i].append(renew_model)##把模型添加进去
                oof_train[val_index,i] = renew_model.predict(X[val_index])##用来预测验证集
        
        self.meta_model.fit(oof_train,y)#元模型
        return self
    
    def predict(self,X):
        whole_test = np.column_stack([np.column_stack(model.predict(X) for model in single_model).mean(axis=1) 
                                      for single_model in self.saved_model]) ##得到的是整个测试集
        return self.meta_model.predict(whole_test)#返回的是利用元模型来对整个测试集进行预测
    
    def get_oof(self,X,y,test_X):
        oof = np.zeros((X.shape[0],len(self.mod)))##初始化为0
        test_single = np.zeros((test_X.shape[0],5))##初始化为0 
        test_mean = np.zeros((test_X.shape[0],len(self.mod)))
        for i,model in enumerate(self.mod):##i是模型
            for j, (train_index,val_index) in enumerate(self.kf.split(X,y)):##j是所有划分好的的数据
                clone_model = clone(model)##克隆模块，相当于把模型复制一下
                clone_model.fit(X[train_index],y[train_index])##把分割好的数据进行训练
                oof[val_index,i] = clone_model.predict(X[val_index])##对验证集进行预测
                test_single[:,j] = clone_model.predict(test_X)##对测试集进行预测
            test_mean[:,i] = test_single.mean(axis=1)##测试集算好均值
        return oof, test_mean

In [42]:
X_train_scaled = StandardScaler().fit_transform(X)
X_test_scaled = StandardScaler().fit_transform(X_sub)
##经过预处理之后才能放到堆叠的模型里面去计算
a = Imputer().fit_transform(X_train_scaled)#相当于x
b = Imputer().fit_transform(y.values.reshape(-1,1)).ravel()#相当于y

In [44]:
stack_model = stacking(mod=[lasso,ridge,svr,ker,ela,bay],meta_model=ker)#定义了第一层的和第二层的模型

In [45]:
print(rmse_cv(stack_model,a,b))##运用了评估函数
print(rmse_cv(stack_model,a,b).mean())

[0.09740888 0.10577647 0.11208303 0.10062961 0.1058816 ]
0.10435591623893319


In [47]:
X_train_stack, X_test_stack = stack_model.get_oof(a,b,X_test_scaled)#将数据进行变换
X_train_stack.shape, a.shape

((1453, 6), (1453, 331))

In [49]:
X_train_add = np.hstack((a,X_train_stack))
X_test_add = np.hstack((X_test_scaled,X_test_stack))
X_train_add.shape, X_test_add.shape

((1453, 337), (1459, 337))

In [50]:
print(rmse_cv(stack_model,X_train_add,b))
print(rmse_cv(stack_model,X_train_add,b).mean())

[0.08822923 0.09267678 0.10094907 0.08755438 0.09936194]
0.09375427988390461


In [51]:
stack_model = stacking(mod=[lasso,ridge,svr,ker,ela,bay],meta_model=ker)

In [52]:
stack_model.fit(a,b)#模型进行训练

stacking(meta_model=KernelRidge(alpha=0.2, coef0=0.8, degree=3, gamma=None,
                                kernel='polynomial', kernel_params=None),
         mod=[Lasso(alpha=0.0005, copy_X=True, fit_intercept=True,
                    max_iter=10000, normalize=False, positive=False,
                    precompute=False, random_state=None, selection='cyclic',
                    tol=0.0001, warm_start=False),
              Ridge(alpha=60, copy_X=True, fit_intercept=True, max_iter...
              ElasticNet(alpha=0.005, copy_X=True, fit_intercept=True,
                         l1_ratio=0.08, max_iter=10000, normalize=False,
                         positive=False, precompute=False, random_state=None,
                         selection='cyclic', tol=0.0001, warm_start=False),
              BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False,
                            copy_X=True, fit_intercept=True, lambda_1=1e-06,
                            lambda_2=1e-06, n_iter=300, n

In [53]:
pred = np.exp(stack_model.predict(X_test_scaled))#进行预测

In [54]:
result=pd.DataFrame({'Id':test_ID, 'SalePrice':pred})
result.to_csv("result/submission4.csv",index=False)
"""这个误差率0.12049"""

###  Conclusion：
> [log和box-cox变换](https://www.jianshu.com/p/744284f866fb)
>> * 通常对于y进行log变换，因为诸如线性模型、SVM等要求target variable是服从正态分布的
>> * 对于特征值服从偏态分布的，可以使用box-cox进行变换，可使用  from scipy.special import boxcox1p 