In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV,train_test_split
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
from sklearn.feature_selection import SelectKBest,f_regression,mutual_info_regression
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Feature matrices: the first CSV column is used as the row index.
X_train = pd.read_csv('X_train.csv', index_col=0)
X_predict = pd.read_csv('X_predict.csv', index_col=0)

# y.csv is read without a header row: its two columns are named here, 'Id'
# becomes the index, and the first remaining column is kept as a Series.
y = pd.read_csv(
    'y.csv',
    header=None,
    names=['Id', 'price'],
    index_col=0,
).iloc[:, 0]

# Validation function and grid search 

In [3]:
n_folds = 5

def rmsle_cv(model, x_in, y_in=None, random_state=42):
    """Return the per-fold cross-validated RMSE of `model`.

    Parameters
    ----------
    model : estimator passed straight to `cross_val_score`.
    x_in : feature matrix to cross-validate on.
    y_in : optional target values. When omitted, the notebook-level
        `y_train.values` is used (looked up at call time).
    random_state : seed for the shuffled K-fold split, so repeated runs
        score every model on the same folds.

    Returns
    -------
    numpy array with one RMSE value per fold (`n_folds` entries), on the
    same scale as the target that was supplied.
    """
    target = y_train.values if y_in is None else np.asarray(y_in)
    # Pass the splitter object itself. `KFold(...).get_n_splits(x)` only returns
    # the integer fold count, which makes `cross_val_score` fall back to an
    # unshuffled split and silently drops `shuffle=True`.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    neg_mse = cross_val_score(model, x_in, target, scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)

class grid():
    """Thin wrapper that runs a `GridSearchCV` for one estimator and prints a report.

    The search is scored with negative mean squared error on the notebook-level
    `y_train`, using `n_folds` shuffled folds with a fixed seed.
    """

    def __init__(self, model):
        # Estimator (unfitted) whose hyper-parameters will be searched.
        self.model = model

    def grid_get(self, x_in, param_grid):
        """Fit the grid search on `x_in` and print the best setting plus a per-candidate table.

        Parameters
        ----------
        x_in : feature matrix handed to `GridSearchCV.fit`.
        param_grid : dict (or list of dicts) of parameter candidates.

        Prints the best parameters with their RMSE, then one row per candidate.
        Returns nothing.
        """
        # Hand GridSearchCV the splitter itself; `.get_n_splits()` would reduce it
        # to a bare integer and the shuffle/seed would be ignored.
        kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
        grid_search = GridSearchCV(self.model, param_grid, cv=kf,
                                   scoring="neg_mean_squared_error", n_jobs=-1, verbose=2)
        grid_search.fit(x_in, y_train.values)
        print(grid_search.best_params_, np.sqrt(-grid_search.best_score_))

        # Build the report on a copy so the fitted search's cv_results_ stay untouched.
        report = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_test_score', 'std_test_score']].copy()
        # mean_test_score is converted from negative MSE to RMSE; std_test_score is
        # left as reported by GridSearchCV (spread of the negative-MSE scores).
        report['mean_test_score'] = np.sqrt(-report['mean_test_score'])
        print(report)


# Examining the number of features used in the test

In [4]:
# Use every labelled row for training. `train_test_split(..., test_size=0)` is
# rejected by current scikit-learn releases, so the row shuffle it used to
# perform is done explicitly with a seeded, positional permutation.
shuffle_order = np.random.RandomState(42).permutation(len(X_train))
train, y_train = X_train.iloc[shuffle_order], y.iloc[shuffle_order]
# Empty hold-out objects are kept so the four original names still exist.
test, y_test = X_train.iloc[:0], y.iloc[:0]

In [5]:
#lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0006))

In [6]:
# np.seterr(divide='ignore', invalid='ignore')
# num_feats_ = np.arange(100,286,2)
# cv_score = []
# test_score = []
# for num_feats in num_feats_: 
#     X_new = SelectKBest(f_regression, k= num_feats).fit_transform(train.values, y_train.values)
    
#     kf = KFold(n_folds, shuffle=True).get_n_splits(X_new)
#     score= np.sqrt(-cross_val_score(lasso, X_new, y_train.values, scoring="neg_mean_squared_error", cv = kf))
#     cv_score.append(score.mean())
    

# plt.plot(num_feats_,cv_score,'r')

In [7]:
# Using 580 features to do the training 

# Lasso 

In [8]:
grid(Lasso()).grid_get(train.values,{'alpha': [0.0001,0.0002,0.0003,0.0004,0.0005,0.0006,0.0007]})

# `lasso` is consumed by the averaging and stacking cells further down, so it has
# to be a live definition rather than commented-out code. alpha=0.0004 is the
# grid-search winner recorded in this cell's saved output; the estimator is left
# unscaled because that is how the search above evaluated it.
lasso = Lasso(alpha=0.0004, random_state=1)
score = rmsle_cv(lasso, train.values)
print("Lasso score: {:.5f} ({:.5f})\n".format(score.mean(), score.std()))

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:    9.0s finished


{'alpha': 0.0004} 0.11722603418075245
              params  mean_test_score  std_test_score
0  {'alpha': 0.0001}         0.121113        0.002922
1  {'alpha': 0.0002}         0.118798        0.002595
2  {'alpha': 0.0003}         0.117754        0.002374
3  {'alpha': 0.0004}         0.117226        0.002259
4  {'alpha': 0.0005}         0.117259        0.002184
5  {'alpha': 0.0006}         0.117444        0.002154
6  {'alpha': 0.0007}         0.117666        0.002123




# Elastic Net Regression :

In [None]:
# Search on the same matrix as the Lasso grid: `X_new` is only assigned inside
# commented-out code, so it does not exist on a fresh kernel.
# l1_ratio must lie in [0, 1], so the out-of-range 1.1 candidate is dropped.
grid(ElasticNet()).grid_get(train.values,{'alpha': [0.0006,0.0007,0.0008,0.0009],'l1_ratio':[0.8,0.9,1],'max_iter':[700,800,900,1000,1100]})

# `ENet` is consumed by the averaging and stacking cells further down, so it has
# to be a live definition.
# NOTE(review): no search output is saved for this cell; the values below are
# placeholders taken from inside the searched grid. Replace them with the
# winning combination printed above.
ENet = ElasticNet(alpha=0.0006, l1_ratio=0.9, max_iter=1000)

score = rmsle_cv(ENet, train.values)
print("ElasticNet score: {:.5f} ({:.5f})\n".format(score.mean(), score.std()))

# Kernel Ridge Regression :

In [None]:
# grid(KernelRidge()).grid_get(X_new,[{'kernel': ['linear'],'alpha':np.logspace(-4,4,8)},{'kernel': ['polynomial'],'alpha':np.logspace(-4,4,8),'degree':np.logspace(-4,4,8),'coef0':np.logspace(-4,4,8)}])

# Kernel ridge regressor with a degree-1 polynomial kernel and fixed hyper-parameters.
KRR = KernelRidge(
    alpha=0.04,
    kernel='polynomial',
    degree=1,
    coef0=0.0008,
)

# Cross-validated RMSE per fold; the mean and standard deviation are reported.
score = rmsle_cv(KRR, train.values)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(
    score.mean(),
    score.std(),
))

# KNeighborsRegressor

In [None]:
#grid(KNeighborsRegressor()).grid_get(X_new,{'n_neighbors':np.arange(5,50,5), 'weights':['uniform','distance'], 'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'], 'leaf_size':np.arange(5,50,5), 'p':np.arange(2,20,2)})


# Robust-scaled k-nearest-neighbours regressor: 6 neighbours, distance-weighted,
# brute-force search with p=1.
KNN = make_pipeline(
    RobustScaler(),
    KNeighborsRegressor(
        n_neighbors=6,
        weights='distance',
        algorithm='brute',
        p=1,
        leaf_size=1,
    ),
)

# Cross-validated RMSE per fold; the mean and standard deviation are reported.
score = rmsle_cv(KNN, train.values)
print("KNN score: {:.5f} ({:.5f})\n".format(
    score.mean(),
    score.std(),
))

# SVR

In [None]:
#grid(SVR()).grid_get(X_new,{'kernel':['linear', 'poly', 'rbf', 'sigmoid'], 'degree':[1,2,3,4,5], 'coef0':[0.0,0.1,0.2,1,10],'C':[0.01,0.1,1.0,10], 'epsilon':[0.01,0.1,1,10]})

#{'coef0': 0.0, 'epsilon': 0.02, 'kernel': 'linear', 'C': 0.004, 'degree': 1} 0.11325162650394656

# SVR = make_pipeline(RobustScaler(), SVR(kernel='poly', degree=1, coef0=1,C=9.0, epsilon=0.01))

# score = rmsle_cv(SVR,X_new)
# print("SVR score: {:.5f} ({:.5f})\n".format(score.mean(), score.std()))

# Random forest Regression

In [None]:
#grid(RandomForestRegressor()).grid_get(X_new,{'n_estimators':[100,1000,5000], 'max_depth':[None,2,5,10,20], 'min_samples_split':[2,5,10], 'min_samples_leaf':[1,2,3,5,10], 'max_features':[0.01,0.05,0.1,0.5,1]})


# Random forest of 500 unrestricted-depth trees, each split drawing from 30% of
# the features. The forest is seeded so the reported score is reproducible
# across kernel restarts.
RF = make_pipeline(
    RobustScaler(),
    RandomForestRegressor(
        n_estimators=500,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features=0.3,
        random_state=42,
    ),
)

# Cross-validated RMSE per fold; the mean and standard deviation are reported.
score = rmsle_cv(RF, train.values)
print("RF score: {:.5f} ({:.5f})\n".format(score.mean(), score.std()))


# Gradient Boosting Regression :

In [None]:
#{'min_samples_leaf': 4, 'max_features': 0.04, 'min_samples_split': 30, 'n_estimators': 3700, 'learning_rate': 0.01, 'max_depth': 3, 'loss': 'ls'} 0.1122755329486732

# Gradient-boosted trees with the default squared-error loss. The explicit
# loss='ls' spelling is not accepted by newer scikit-learn releases, and the
# default selects the same loss, so the argument is simply omitted.
# Feature sub-sampling (max_features < 1) is random, so the model is seeded.
# NOTE(review): the tuned parameters recorded in the comment above list
# learning_rate 0.01, while this model uses 0.04 - confirm which is intended.
GBoost = GradientBoostingRegressor(n_estimators=3700, learning_rate=0.04,
                                   max_depth=3, max_features=0.04,
                                   min_samples_leaf=4, min_samples_split=30,
                                   random_state=42)

# Cross-validated RMSE per fold; the mean and standard deviation are reported.
score = rmsle_cv(GBoost,train.values)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

# XGBoost:

In [None]:
#grid(xgb.XGBRegressor()).grid_get({'colsample_bytree':[0.1,0.4603], 'gamma':[0.01,0.0468], 'learning_rate':[0.01,0.05], 'max_depth':[3,10], 'min_child_weight':[1.7817,10], 'n_estimators':[2000,5200],'reg_alpha':[0.4640,1], 'reg_lambda':[0.5,0.9],'subsample':[0.1,0.5213]})

# XGBoost regressor with fixed hyper-parameters.
# `silent` and `nthread` are deprecated spellings in the scikit-learn wrapper;
# `verbosity=0` and `n_jobs=-1` are their current equivalents.
model_xgb = xgb.XGBRegressor(
    colsample_bytree=0.4603,
    gamma=0.0468,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    n_estimators=5200,
    reg_alpha=0.4640,
    reg_lambda=0.9,
    subsample=0.5213,
    verbosity=0,
    n_jobs=-1,
)

# Cross-validated RMSE per fold; the mean and standard deviation are reported.
score = rmsle_cv(model_xgb,train.values)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

# LightGBM :

In [None]:
#grid(lgb.LGBMRegressor()).grid_get(X_new,{'num_leaves':[3],'learning_rate':[0.05], 'n_estimators':[1000],'max_bin' : [220,250,270], 'bagging_fraction' : [0.8],'bagging_freq' : [12,14,16,18], 'feature_fraction' : [0.1],'min_data_in_leaf' :[4,5,6], 'min_sum_hessian_in_leaf' : [2,4,6,8]})

# LightGBM regressor with fixed hyper-parameters (3 leaves per tree, 1000 trees).
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=3,
    learning_rate=0.05,
    n_estimators=1000,
    max_bin=250,
    bagging_fraction=0.8,
    bagging_freq=16,
    feature_fraction=0.1,
    bagging_seed=9,
    min_data_in_leaf=5,
    min_sum_hessian_in_leaf=2,
)

# Cross-validated RMSE per fold; the mean and standard deviation are reported.
score = rmsle_cv(model_lgb, train.values)
print("LGBM score: {:.4f} ({:.4f})\n".format(
    score.mean(),
    score.std(),
))

### Stacking models

#### Simplest Stacking approach : Averaging base models

In [None]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble that predicts the unweighted mean of several regressors.

    `models` is the collection of (unfitted) estimators; `fit` trains clones
    of them, so the objects passed in are never modified.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Clone every base model, fit each clone on (X, y) and return self."""
        # Fresh copies are stored in `models_`; the originals stay unfitted.
        self.models_ = list(map(clone, self.models))
        for fitted in self.models_:
            fitted.fit(X, y)
        return self

    def predict(self, X):
        """Return the row-wise mean of the fitted clones' predictions for X."""
        # One column per base model, one row per sample.
        per_model = np.column_stack([fitted.predict(X) for fitted in self.models_])
        return per_model.mean(axis=1)

In [None]:
# Equal-weight blend of six base regressors. Every name in the tuple, including
# `lasso` and `ENet`, must already be defined by earlier cells in the kernel.
base_regressors = (lasso, ENet, KRR, GBoost, model_xgb, model_lgb)
averaged_models = AveragingModels(models=base_regressors)

# Cross-validated RMSE per fold; the mean and standard deviation are reported.
score = rmsle_cv(averaged_models, train.values)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(
    score.mean(),
    score.std(),
))

#### Less simple Stacking : Adding a Meta-model

In [None]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Two-level stacking regressor.

    Each base model is fitted once per fold; its out-of-fold predictions form
    one feature column for the meta-model. At prediction time the per-fold
    copies of each base model are averaged to produce that column.

    Parameters
    ----------
    base_models : sequence of unfitted estimators used at the first level.
    meta_model : unfitted estimator trained on the out-of-fold predictions.
    n_folds : number of folds used to build the out-of-fold predictions.
    """

    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    @staticmethod
    def _positional(data):
        """Unwrap pandas objects to their underlying array; return anything else unchanged."""
        if isinstance(data, (pd.DataFrame, pd.Series)):
            return data.values
        return data

    def fit(self, X, y):
        """Fit per-fold clones of every base model, then the meta-model; return self."""
        # Rows are selected with integer fold indices below, which only works
        # positionally on arrays. Label-indexed pandas inputs are unwrapped first.
        X = self._positional(X)
        y = self._positional(y)

        # base_models_[i] collects the n_folds fitted clones of base model i.
        self.base_models_ = [list() for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)

        # Column i holds base model i's predictions for rows it was NOT trained on.
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                out_of_fold_predictions[holdout_index, i] = instance.predict(X[holdout_index])

        # The meta-model learns to combine the base models from those held-out predictions.
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        """Average each base model's per-fold clones, then let the meta-model combine them."""
        # Same unwrapping as in fit, so the clones see the input type they were trained on.
        X = self._positional(X)
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in fold_models]).mean(axis=1)
            for fold_models in self.base_models_
        ])
        return self.meta_model_.predict(meta_features)

In [None]:
# Stack three base regressors under an ElasticNet meta-model. `lasso`, `GBoost`,
# `KRR` and `ENet` must already be defined by earlier cells in the kernel.
stacked_averaged_models = StackingAveragedModels(
    base_models=(lasso, GBoost, KRR),
    meta_model=ENet,
)

# Cross-validated RMSE per fold; the mean and standard deviation are reported.
score = rmsle_cv(stacked_averaged_models, train.values)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(
    score.mean(),
    score.std(),
))

# submission

In [None]:
# Display the shape of the training frame that the final fits below are run on.
train.shape

In [None]:
# Final fit of the stacked model on all training rows.
stacked_averaged_models.fit(train.values, y_train.values)
# Predict on the raw array, matching how the model was fitted and how the
# XGBoost/LightGBM cells call predict; expm1 maps the predictions back from
# the log1p scale.
stacked_pred = np.expm1(stacked_averaged_models.predict(X_predict.values))

In [None]:
# Final XGBoost fit on all training rows.
model_xgb.fit(train.values, y_train.values)

# Predict, then apply expm1 to the raw model output.
xgb_raw_pred = model_xgb.predict(X_predict.values)
model_xgb_pred = np.expm1(xgb_raw_pred)

In [None]:
# Final LightGBM fit on all training rows.
model_lgb.fit(train.values, y_train.values)

# Predict, then apply expm1 to the raw model output.
lgb_raw_pred = model_lgb.predict(X_predict.values)
model_lgb_pred = np.expm1(lgb_raw_pred)

In [None]:
# Fixed-weight blend of the three prediction vectors (weights sum to 1.0).
ensemble = stacked_pred*0.8 + model_xgb_pred*0.1 + model_lgb_pred*0.1

# One row per prediction sample, keyed by the index of X_predict.
submission_df = pd.DataFrame(data= {'Id' :X_predict.index, 'SalePrice': ensemble})

submission_df.to_csv('submit.csv', index=False)

# The preview goes last: a notebook cell only renders its final expression, so a
# bare `.head()` in the middle of the cell was never displayed.
submission_df.head()