# stacking集成算法
## 1 底层算法

In [1]:
import numpy as np
from mlxtend.regressor import StackingCVRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import KFold
import xgboost as xgb

In [2]:
kfolds=KFold(n_splits=10,shuffle=True,random_state=123)

In [3]:
alphas_alt = np.logspace(-10,2.8,150)

In [4]:
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))

In [5]:
lasso = make_pipeline(RobustScaler(), LassoCV(alphas=alphas_alt, cv=kfolds))

In [6]:
elasticnet = make_pipeline(RobustScaler(), ElasticNetCV(alphas=alphas_alt, cv=kfolds))

In [7]:
xgboost = make_pipeline(RobustScaler(), xgb.XGBRegressor(objective='reg:linear',colsample_bytree=0.7,learning_rate=0.01,max_depth=3,n_estimators=3460,subsample=0.7,reg_alpha=0.00006,gamma=0,nthread=6,scale_pos_weight=1,seed=27))

## 2 上层算法

In [8]:
stack_alg = StackingCVRegressor(regressors=(ridge,lasso,elasticnet,xgboost),meta_regressor=xgboost)

## 3 训练

In [9]:
import pandas as pd

In [10]:
def shuffle_data(X, y, seed=None):
    "将数据集中的X和y打乱"
    if seed:
        np.random.seed(seed)
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)
    return X[idx], y[idx]

In [11]:
def train_test_split(X, y, test_size=0.5, shuffle=True,seed=None):
    "将数据集根据test_size分成训练集和测试集，可以指定是否随机洗牌"
    if shuffle:
        X, y = shuffle_data(X, y, seed=seed)
    split_i = len(y) - int(len(y) // (1/test_size))
    X_train, X_test = X[ :split_i], X[split_i: ]
    y_train, y_test = y[ :split_i], y[split_i: ]
    return X_train, y_train, X_test, y_test

In [12]:
data = pd.read_csv("../../kaggle/forecast_losa_house_prices/data/train_1.csv")

In [13]:
y=np.asarray(data['SalePrice'])
train1=data.drop(['Id','SalePrice'],axis=1)
X=np.asarray(pd.get_dummies(train1).reset_index(drop=True))
X_train,y_train, X_test,y_test=train_test_split(X,y,test_size=0.2)

In [14]:
stackX = np.array(X_train)

In [15]:
stacky = np.array(y_train)

In [16]:
stack_alg.fit(stackX, stacky)

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number1.897844e-18
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number2.179720e-18
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number2.211815e-18
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number1.807141e-18
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number2.063271e-18
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number2.196996e-18
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number1.824585e-18
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not g

StackingCVRegressor(cv=5,
          meta_regressor=Pipeline(memory=None,
     steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('xgbregressor', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_de...    reg_alpha=6e-05, reg_lambda=1, scale_pos_weight=1, seed=27,
       silent=True, subsample=0.7))]),
          refit=True,
          regressors=(Pipeline(memory=None,
     steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('ridgecv', RidgeCV(alphas=array([1.00000e-10, 1.21873e-10, ..., 5.17719e+02, 6.30957e+02]),
    cv=KFold(n_splits=10, random_state=123...  reg_alpha=6e-05, reg_lambda=1, scale_pos_weight=1, seed=27,
       silent=True, subsample=0.7))])),
          shuffle=True, store_train_meta_features=False,
          use_features

In [22]:
from sklearn.metrics import mean_squared_error

In [25]:
def benchmark(model,testset,label):
    pred=model.predict(testset)
    if pred[pred<0].shape[0]>0:
        print("Neg Value")
        return -1
    rmse=np.sqrt(mean_squared_error(label,pred))
    lrmse=np.sqrt(mean_squared_error(np.log(label),np.log(abs(pred))))
    print("RMSE:",rmse)
    print("LRMSE:",lrmse)
    return lrmse

In [26]:
benchmark(stack_alg, X_test, y_test)

RMSE: 22003.297724605203
LRMSE: 0.11883883339001983


0.11883883339001983