In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Modelling Algorithms

from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin, clone
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor

from sklearn.impute import SimpleImputer as Imputer
from sklearn.model_selection import KFold, StratifiedKFold

In [2]:
# Load the pre-processed arrays. `np.load` on an .npz archive returns a lazy
# NpzFile that keeps the underlying file open, so each archive is opened in a
# `with` block and the array is materialised before the handle is closed.
with np.load('../data/x_train.npz') as x_train_archive:
    x_train = x_train_archive['x_train']
with np.load('../data/y_train.npz') as y_train_archive:
    y_train = y_train_archive['y_train']
with np.load('../data/x_test.npz') as x_test_archive:
    x_test = x_test_archive['x_test']

We choose 13 models and evaluate each of them with 5-fold cross-validation.
Models include:

+ LinearRegression
+ Ridge
+ Lasso
+ Random Forest
+ Gradient Boosting Tree
+ Support Vector Regression
+ Linear Support Vector Regression
+ ElasticNet
+ Stochastic Gradient Descent
+ BayesianRidge
+ KernelRidge
+ ExtraTreesRegressor
+ XGBoost

In [3]:
# define cross validation strategy
def rmse_cv(model, X, y, cv=5):
    """Return the per-fold RMSE of `model` under cross-validation.

    Parameters
    ----------
    model : estimator
        Estimator passed to `cross_val_score`.
    X, y : array-like
        Training features and target.
    cv : int or cross-validation splitter, default=5
        Forwarded unchanged to `cross_val_score`; the default keeps the
        original 5-fold behaviour.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold.
    """
    # The scorer returns *negated* MSE, so flip the sign before the square root.
    neg_mse = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=cv)
    return np.sqrt(-neg_mse)

In [4]:
# Baseline candidates as (estimator class, constructor kwargs) pairs, in the
# order they are scored. An empty dict means library defaults.
# NOTE(review): no random_state is set on any estimator, so scores of the
# randomised ones may vary between runs — confirm whether that matters here.
model_specs = [
    (LinearRegression, {}),
    (Ridge, {}),
    (Lasso, {'alpha': 0.01, 'max_iter': 10000}),
    (RandomForestRegressor, {}),
    (GradientBoostingRegressor, {}),
    (SVR, {}),
    (LinearSVR, {}),
    (ElasticNet, {'alpha': 0.001, 'max_iter': 10000}),
    (SGDRegressor, {'max_iter': 1000, 'tol': 1e-3}),
    (BayesianRidge, {}),
    (KernelRidge, {'alpha': 0.6, 'kernel': 'polynomial', 'degree': 2, 'coef0': 2.5}),
    (ExtraTreesRegressor, {}),
    (XGBRegressor, {}),
]
models = [estimator_cls(**kwargs) for estimator_cls, kwargs in model_specs]

In [5]:
# Short labels, positionally aligned with `models`.
names = ["LR", "Ridge", "Lasso", "RF", "GBR", "SVR", "LinSVR", "Ela","SGD","Bay","Ker","Extra","Xgb"]

# Print "<label>: <mean RMSE>, <std RMSE>" for every baseline model.
for label, estimator in zip(names, models):
    fold_rmse = rmse_cv(estimator, x_train, y_train)
    mean_rmse = fold_rmse.mean()
    std_rmse = fold_rmse.std()
    print("{}: {:.6f}, {:.4f}".format(label, mean_rmse, std_rmse))

LR: 337794275.207902, 123884034.9788
Ridge: 0.117474, 0.0091
Lasso: 0.120971, 0.0061
RF: 0.131366, 0.0054
GBR: 0.123845, 0.0076
SVR: 0.136137, 0.0126
LinSVR: 0.121072, 0.0086
Ela: 0.110987, 0.0061
SGD: 0.149181, 0.0137
Bay: 0.110506, 0.0060
Ker: 0.109154, 0.0055
Extra: 0.128953, 0.0056
Xgb: 0.123728, 0.0080


In [6]:
class grid():
    """Small helper that grid-searches one estimator and prints an RMSE report."""

    def __init__(self,model):
        # Estimator whose hyper-parameters will be searched.
        self.model = model
    
    def grid_get(self,X,y,param_grid,cv=5):
        """Run `GridSearchCV` and print the best setting plus a per-candidate table.

        Parameters
        ----------
        X, y : array-like
            Training features and target.
        param_grid : dict
            Parameter grid forwarded to `GridSearchCV`.
        cv : int or cross-validation splitter, default=5
            Forwarded to `GridSearchCV`; the default keeps the original 5 folds.

        Notes
        -----
        The search is scored with negated MSE. The printed `mean_test_score`
        column is `sqrt(-mean_test_score)`, i.e. the square root of the mean
        fold MSE. `std_test_score` is printed exactly as `GridSearchCV`
        reports it (it is not converted).
        """
        grid_search = GridSearchCV(self.model,param_grid,cv=cv, scoring="neg_mean_squared_error")
        grid_search.fit(X,y)
        print(grid_search.best_params_, np.sqrt(-grid_search.best_score_))
        # Build the report on a copy so the fitted search object's cv_results_
        # is left exactly as GridSearchCV produced it.
        report = pd.DataFrame(grid_search.cv_results_)[['params','mean_test_score','std_test_score']].copy()
        report['mean_test_score'] = np.sqrt(-report['mean_test_score'])
        print(report)

In [7]:
# Search the Lasso penalty strength; max_iter is held at a single value.
lasso_param_grid = {
    'alpha': [0.0004, 0.0005, 0.0007, 0.0006, 0.0009, 0.0008],
    'max_iter': [10000],
}
lasso_search = grid(Lasso())
lasso_search.grid_get(x_train, y_train, lasso_param_grid)

{'alpha': 0.0008, 'max_iter': 10000} 0.11110479398191903
                                 params  mean_test_score  std_test_score
0  {'alpha': 0.0004, 'max_iter': 10000}         0.111468        0.001369
1  {'alpha': 0.0005, 'max_iter': 10000}         0.111178        0.001376
2  {'alpha': 0.0007, 'max_iter': 10000}         0.111122        0.001386
3  {'alpha': 0.0006, 'max_iter': 10000}         0.111146        0.001400
4  {'alpha': 0.0009, 'max_iter': 10000}         0.111162        0.001324
5  {'alpha': 0.0008, 'max_iter': 10000}         0.111105        0.001354


In [8]:
# Search the Ridge penalty strength.
ridge_param_grid = {'alpha': [35, 40, 45, 50, 55, 60, 65, 70, 80, 90]}
ridge_search = grid(Ridge())
ridge_search.grid_get(x_train, y_train, ridge_param_grid)

{'alpha': 60} 0.11015360313430884
          params  mean_test_score  std_test_score
0  {'alpha': 35}         0.110311        0.001276
1  {'alpha': 40}         0.110245        0.001257
2  {'alpha': 45}         0.110201        0.001242
3  {'alpha': 50}         0.110173        0.001230
4  {'alpha': 55}         0.110158        0.001220
5  {'alpha': 60}         0.110154        0.001212
6  {'alpha': 65}         0.110158        0.001205
7  {'alpha': 70}         0.110169        0.001200
8  {'alpha': 80}         0.110208        0.001191
9  {'alpha': 90}         0.110266        0.001185


In [9]:
# Search C, gamma and epsilon for an RBF-kernel SVR.
svr_param_grid = {
    'C': [11, 12, 13, 14, 15],
    'kernel': ["rbf"],
    "gamma": [0.0003, 0.0004],
    "epsilon": [0.008, 0.009],
}
svr_search = grid(SVR())
svr_search.grid_get(x_train, y_train, svr_param_grid)

{'C': 12, 'epsilon': 0.009, 'gamma': 0.0004, 'kernel': 'rbf'} 0.10803302837011246
                                               params  mean_test_score  \
0   {'C': 11, 'epsilon': 0.008, 'gamma': 0.0003, '...         0.108479   
1   {'C': 11, 'epsilon': 0.008, 'gamma': 0.0004, '...         0.108089   
2   {'C': 11, 'epsilon': 0.009, 'gamma': 0.0003, '...         0.108450   
3   {'C': 11, 'epsilon': 0.009, 'gamma': 0.0004, '...         0.108066   
4   {'C': 12, 'epsilon': 0.008, 'gamma': 0.0003, '...         0.108431   
5   {'C': 12, 'epsilon': 0.008, 'gamma': 0.0004, '...         0.108082   
6   {'C': 12, 'epsilon': 0.009, 'gamma': 0.0003, '...         0.108415   
7   {'C': 12, 'epsilon': 0.009, 'gamma': 0.0004, '...         0.108033   
8   {'C': 13, 'epsilon': 0.008, 'gamma': 0.0003, '...         0.108409   
9   {'C': 13, 'epsilon': 0.008, 'gamma': 0.0004, '...         0.108082   
10  {'C': 13, 'epsilon': 0.009, 'gamma': 0.0003, '...         0.108376   
11  {'C': 13, 'epsilon': 0.009

In [10]:
# Search alpha and coef0 for a degree-3 polynomial KernelRidge.
kernel_ridge_param_grid = {
    'alpha': [0.2, 0.3, 0.4, 0.5],
    'kernel': ["polynomial"],
    'degree': [3],
    'coef0': [0.8, 1, 1.2],
}
kernel_ridge_search = grid(KernelRidge())
kernel_ridge_search.grid_get(x_train, y_train, kernel_ridge_param_grid)

{'alpha': 0.3, 'coef0': 1, 'degree': 3, 'kernel': 'polynomial'} 0.10820434272136836
                                               params  mean_test_score  \
0   {'alpha': 0.2, 'coef0': 0.8, 'degree': 3, 'ker...         0.108232   
1   {'alpha': 0.2, 'coef0': 1, 'degree': 3, 'kerne...         0.108463   
2   {'alpha': 0.2, 'coef0': 1.2, 'degree': 3, 'ker...         0.108896   
3   {'alpha': 0.3, 'coef0': 0.8, 'degree': 3, 'ker...         0.108340   
4   {'alpha': 0.3, 'coef0': 1, 'degree': 3, 'kerne...         0.108204   
5   {'alpha': 0.3, 'coef0': 1.2, 'degree': 3, 'ker...         0.108426   
6   {'alpha': 0.4, 'coef0': 0.8, 'degree': 3, 'ker...         0.108695   
7   {'alpha': 0.4, 'coef0': 1, 'degree': 3, 'kerne...         0.108213   
8   {'alpha': 0.4, 'coef0': 1.2, 'degree': 3, 'ker...         0.108267   
9   {'alpha': 0.5, 'coef0': 0.8, 'degree': 3, 'ker...         0.109174   
10  {'alpha': 0.5, 'coef0': 1, 'degree': 3, 'kerne...         0.108339   
11  {'alpha': 0.5, 'coef0': 

In [11]:
# Search alpha and l1_ratio for ElasticNet; max_iter is held at a single value.
elastic_net_param_grid = {
    'alpha': [0.0005, 0.0008, 0.004, 0.005],
    'l1_ratio': [0.08, 0.1, 0.3, 0.5, 0.7],
    'max_iter': [10000],
}
elastic_net_search = grid(ElasticNet())
elastic_net_search.grid_get(x_train, y_train, elastic_net_param_grid)

{'alpha': 0.005, 'l1_ratio': 0.1, 'max_iter': 10000} 0.11104696892501967
                                               params  mean_test_score  \
0   {'alpha': 0.0005, 'l1_ratio': 0.08, 'max_iter'...         0.116630   
1   {'alpha': 0.0005, 'l1_ratio': 0.1, 'max_iter':...         0.116082   
2   {'alpha': 0.0005, 'l1_ratio': 0.3, 'max_iter':...         0.113106   
3   {'alpha': 0.0005, 'l1_ratio': 0.5, 'max_iter':...         0.112053   
4   {'alpha': 0.0005, 'l1_ratio': 0.7, 'max_iter':...         0.111674   
5   {'alpha': 0.0008, 'l1_ratio': 0.08, 'max_iter'...         0.114770   
6   {'alpha': 0.0008, 'l1_ratio': 0.1, 'max_iter':...         0.114250   
7   {'alpha': 0.0008, 'l1_ratio': 0.3, 'max_iter':...         0.111991   
8   {'alpha': 0.0008, 'l1_ratio': 0.5, 'max_iter':...         0.111415   
9   {'alpha': 0.0008, 'l1_ratio': 0.7, 'max_iter':...         0.111160   
10  {'alpha': 0.004, 'l1_ratio': 0.08, 'max_iter':...         0.111287   
11  {'alpha': 0.004, 'l1_ratio': 0.1, '

Model ensembling (模型融合)

In [12]:
class AverageWeight(BaseEstimator, RegressorMixin):
    """Regressor that blends several base models with fixed weights.

    Parameters
    ----------
    mod : list of estimators
        Base models; each one is cloned and fitted in `fit`.
    weight : list of numbers
        One weight per entry of `mod`, in the same order.

    Attributes
    ----------
    models_ : list of estimators
        Fitted clones of `mod`, set by `fit`.
    """

    def __init__(self,mod,weight):
        self.mod = mod
        self.weight = weight
        
    def fit(self,X,y):
        """Fit a fresh clone of every base model on (X, y) and return self.

        Raises
        ------
        ValueError
            If `mod` and `weight` do not have the same length. Previously a
            mismatch was silently truncated to the shorter of the two.
        """
        if len(self.mod) != len(self.weight):
            raise ValueError(
                "mod and weight must have the same length, got {} and {}".format(
                    len(self.mod), len(self.weight)))
        self.models_ = [clone(x) for x in self.mod]
        for model in self.models_:
            model.fit(X,y)
        return self
    
    def predict(self,X):
        """Return the weighted sum of the base-model predictions as an ndarray."""
        # Rows are models, columns are data points.
        pred = np.array([model.predict(X) for model in self.models_])
        weights = np.asarray(self.weight, dtype=float)
        # For every data point: single model prediction times weight, summed over models.
        blended = np.tensordot(weights, pred, axes=1)
        if blended.ndim > 1:
            # Collapse any trailing prediction axes so there is one value per data point.
            blended = blended.sum(axis=tuple(range(1, blended.ndim)))
        return blended

In [13]:
# Base estimators used by the blending and stacking cells below.
# NOTE(review): several values differ from the best_params_ printed by the
# grid searches above (Lasso alpha, SVR C, KernelRidge alpha/coef0,
# ElasticNet l1_ratio) — confirm these settings are intentional.
lasso = Lasso(max_iter=10000, alpha=0.0005)
ridge = Ridge(alpha=60)
svr = SVR(kernel='rbf', C=13, gamma=0.0004, epsilon=0.009)
ker = KernelRidge(kernel='polynomial', degree=3, alpha=0.2, coef0=0.8)
ela = ElasticNet(max_iter=10000, alpha=0.005, l1_ratio=0.08)
bay = BayesianRidge()

In [14]:
# Blend weights, assigned based on each model's grid-search score.
# The six values sum to 1.
w1, w2, w3 = 0.02, 0.2, 0.25
w4, w5, w6 = 0.3, 0.03, 0.2

In [15]:
# Weighted blend of all six tuned models; weights are positionally matched to the models.
blend_models = [lasso, ridge, svr, ker, ela, bay]
blend_weights = [w1, w2, w3, w4, w5, w6]
weight_avg = AverageWeight(mod=blend_models, weight=blend_weights)

In [16]:
# Run the cross-validation once and reuse the fold scores for the mean,
# instead of fitting every fold twice.
weight_avg_scores = rmse_cv(weight_avg, x_train, y_train)
weight_avg_scores, weight_avg_scores.mean()

(array([0.10421222, 0.10936372, 0.11835569, 0.1001449 , 0.1058588 ]),
 0.1075870668393643)

In [17]:
# Using only the two best-performing models gives a better result.
top_two_models = [svr, ker]
equal_weights = [0.5, 0.5]
weight_avg = AverageWeight(mod=top_two_models, weight=equal_weights)

In [18]:
# Run the cross-validation once and reuse the fold scores for the mean,
# instead of fitting every fold twice.
weight_avg_scores = rmse_cv(weight_avg, x_train, y_train)
weight_avg_scores, weight_avg_scores.mean()

(array([0.10272309, 0.10899485, 0.11761903, 0.09857332, 0.10492267]),
 0.1065665894784528)

In [19]:
class stacking(BaseEstimator, RegressorMixin, TransformerMixin):
    """Two-level stacking regressor.

    Every base model is fitted once per fold of a shuffled 5-fold split; its
    out-of-fold predictions form one feature column for the meta model.

    Parameters
    ----------
    mod : list of estimators
        Base models. They are cloned before fitting and never fitted in place.
    meta_model : estimator
        Second-level model. A clone of it is fitted on the out-of-fold
        predictions, so the instance passed in is left unfitted.

    Attributes
    ----------
    saved_model : list of lists
        `saved_model[i][j]` is base model `i` fitted on the training part of fold `j`.
    meta_model_ : estimator
        Fitted clone of `meta_model`, set by `fit`.

    Notes
    -----
    No `transform` method is defined, so `fit_transform` from
    `TransformerMixin` is not usable on this class.
    """

    def __init__(self,mod,meta_model):
        self.mod = mod
        self.meta_model = meta_model
        # Fixed, seeded splitter so fit() and get_oof() use identical folds.
        self.kf = KFold(n_splits=5, random_state=42, shuffle=True)

    @staticmethod
    def _positional(data):
        """Return pandas containers as ndarrays so fold indices select rows by position."""
        if isinstance(data, (pd.DataFrame, pd.Series)):
            return data.values
        return data
        
    def fit(self,X,y):
        """Fit the per-fold base models and the meta model; return self."""
        X = self._positional(X)
        y = self._positional(y)
        self.saved_model = [list() for _ in self.mod]
        oof_train = np.zeros((X.shape[0], len(self.mod)))
        
        for i,model in enumerate(self.mod):
            for train_index, val_index in self.kf.split(X,y):
                renew_model = clone(model)
                renew_model.fit(X[train_index], y[train_index])
                self.saved_model[i].append(renew_model)
                # Each row is predicted by the model that did not see it.
                oof_train[val_index,i] = renew_model.predict(X[val_index])
        
        # Fit a clone: the caller's meta_model may be the very same object as
        # one of the base models, and fit() should not mutate its parameters.
        self.meta_model_ = clone(self.meta_model)
        self.meta_model_.fit(oof_train,y)
        return self
    
    def predict(self,X):
        """Predict by averaging each base model's fold models, then applying the meta model."""
        # Lists, not generators: NumPy's stacking functions require a sequence.
        whole_test = np.column_stack([
            np.column_stack([model.predict(X) for model in fold_models]).mean(axis=1)
            for fold_models in self.saved_model
        ])
        return self.meta_model_.predict(whole_test)
    
    def get_oof(self,X,y,test_X):
        """Return out-of-fold train predictions and fold-averaged test predictions.

        Does not require or modify a previous `fit`; the base models are
        cloned and fitted again here.

        Returns
        -------
        oof : ndarray of shape (n_train, n_models)
        test_mean : ndarray of shape (n_test, n_models)
        """
        X = self._positional(X)
        y = self._positional(y)
        n_splits = self.kf.get_n_splits()
        oof = np.zeros((X.shape[0],len(self.mod)))
        test_mean = np.zeros((test_X.shape[0],len(self.mod)))
        for i,model in enumerate(self.mod):
            # One column per fold, sized from the splitter rather than a literal 5.
            test_single = np.zeros((test_X.shape[0],n_splits))
            for j, (train_index,val_index) in enumerate(self.kf.split(X,y)):
                clone_model = clone(model)
                clone_model.fit(X[train_index],y[train_index])
                oof[val_index,i] = clone_model.predict(X[val_index])
                test_single[:,j] = clone_model.predict(test_X)
            test_mean[:,i] = test_single.mean(axis=1)
        return oof, test_mean

In [20]:
# The stacking model is run on imputed copies of the training data; the
# original author found that stacking failed without this step and did not
# identify the cause.
# NOTE(review): `stacking.fit` indexes X and y positionally (X[train_index]),
# so it presumably needs plain ndarrays — confirm whether the imputer is
# required for missing values or only for the array conversion.
feature_imputer = Imputer()
a = feature_imputer.fit_transform(x_train)

# The target is imputed as a single column and flattened back to 1-D.
target_imputer = Imputer()
target_column = y_train.reshape(-1, 1)
b = target_imputer.fit_transform(target_column).ravel()

In [21]:
# Stack the six tuned models; the KernelRidge instance `ker` is passed both
# as a base model and as the meta model.
stack_base_models = [lasso, ridge, svr, ker, ela, bay]
stack_model = stacking(mod=stack_base_models, meta_model=ker)

In [22]:
# Cross-validate the stacking model once and reuse the fold scores for the
# mean, instead of refitting every fold a second time.
stack_scores = rmse_cv(stack_model, a, b)
print(stack_scores)
print(stack_scores.mean())

[0.1027179  0.10890383 0.11692561 0.09781165 0.10390294]
0.10605238917525298


In [23]:
# Out-of-fold predictions for the training rows and fold-averaged predictions
# for the test rows, one column per base model.
oof_features = stack_model.get_oof(a, b, x_test)
x_train_stack, x_test_stack = oof_features

In [24]:
# Show the shapes of the stacked feature matrix and the imputed training matrix.
x_train_stack.shape, a.shape

((1458, 6), (1458, 410))

In [25]:
# Append the stacked predictions as extra feature columns.
# NOTE(review): `a` is the imputed training matrix while `x_test` is used as
# loaded — confirm x_test needs no imputation.
x_train_add = np.concatenate([a, x_train_stack], axis=1)
x_test_add = np.concatenate([x_test, x_test_stack], axis=1)

In [27]:
# Cross-validate on the augmented features once and reuse the fold scores for
# the mean, instead of refitting every fold a second time.
# NOTE(review): the appended columns were produced by get_oof using the full
# target `b`, so this score may be optimistic — confirm before relying on it.
stack_add_scores = rmse_cv(stack_model, x_train_add, b)
print(stack_add_scores)
print(stack_add_scores.mean())

[0.09768973 0.10428458 0.11231522 0.09333416 0.09868679]
0.10126209427505639


In [28]:
# Fresh stacking model for the final fit, with the same base models and meta
# model as the one cross-validated earlier.
final_base_models = [lasso, ridge, svr, ker, ela, bay]
stack_model = stacking(mod=final_base_models, meta_model=ker)

In [29]:
# Fit the stacking model on the imputed training features `a` and target `b`.
stack_model.fit(a, b)

stacking(meta_model=KernelRidge(alpha=0.2, coef0=0.8, degree=3, gamma=None,
                                kernel='polynomial', kernel_params=None),
         mod=[Lasso(alpha=0.0005, copy_X=True, fit_intercept=True,
                    max_iter=10000, normalize=False, positive=False,
                    precompute=False, random_state=None, selection='cyclic',
                    tol=0.0001, warm_start=False),
              Ridge(alpha=60, copy_X=True, fit_intercept=True, max_iter...
                         l1_ratio=0.08, max_iter=10000, normalize=False,
                         positive=False, precompute=False, random_state=None,
                         selection='cyclic', tol=0.0001, warm_start=False),
              BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None,
                            compute_score=False, copy_X=True,
                            fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06,
                            lambda_init=None, n_iter=300, normalize

In [31]:
# Predict on the test set and exponentiate the predictions.
# NOTE(review): np.exp presumably undoes a log transform applied to the target
# upstream — confirm (a log1p transform would need np.expm1 instead).
raw_predictions = stack_model.predict(x_test)
pred_list = np.exp(raw_predictions)

# Pair the predictions with the Id column of the test CSV and write the submission.
test_df = pd.read_csv("../data/test.csv")
submission_columns = {'Id': test_df['Id'], 'SalePrice': pred_list}
submission = pd.DataFrame(submission_columns)
submission.to_csv("../data/submission.csv", index=False)