In [255]:
import pandas as pd    
import numpy as np
from scipy.stats import skew

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge, RidgeCV, LassoCV

import xgboost as xgb
from hyperopt import hp, tpe, fmin, Trials

from sklearn.ensemble import GradientBoostingRegressor


In [256]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                      test.loc[:,'MSSubClass':'SaleCondition']))



In [257]:
train["SalePrice"] = np.log1p(train["SalePrice"])
num_features = data.dtypes[data.dtypes != "object"].index
for x in num_features:
    if skew(data[x].dropna()) > 0.75:
        data[x] = np.log1p(data[x])
data = pd.get_dummies(data)
data = data.fillna(data.mean())
X_train = data[:train.shape[0]]
X_test = data[train.shape[0]:]
y = train.SalePrice


In [248]:
kf = KFold(n_splits = 10, shuffle = True, random_state = 42)

def cv_rmse(model, X_train):
    rmse = np.sqrt(-cross_val_score(model, X_train, y, scoring = "neg_mean_squared_error", cv = kf))
    return rmse;

 

# Linear regression - ridge

In [280]:
ridges_alphas = [0.1, 0.5, 1, 3, 5, 7, 9, 10, 12, 14, 16, 18, 20, 22, 25,
                30]
ridge = RidgeCV(alphas = ridges_alphas, cv = kf)
model_ridge = ridge.fit(X_train, y)
lassocv_alpha = ridge.alpha_
lassocv_alpha


10.0

In [281]:
cv_rmse(model_ridge, X_train).mean()


0.12749688124644226

# Linear regression - Lasso

In [279]:
lasso_alphas = [0.0001, 0.0002, 0.0004, 0.0006, 0.0008, 0.001, 0.002, 
               0.004, 0.006, 0.008, 0.01, 0.03, 0.07, 0.1, 0.3, 0.5, 0.7]
lasso = LassoCV(alphas = lasso_alphas, cv = kf)
model_lasso = lasso.fit(X_train, y)
lassocv_alpha = lasso.alpha_
lassocv_alpha

  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


0.0004

In [282]:
cv_rmse(model_lasso, X_train).mean()

  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


0.12235646631445088

In [271]:
coefficients = pd.Series(model_lasso.coef_, index = X_train.columns)
selected_features = coefficients.sort_values(ascending = False).head(20)
selected_features


GrLivArea               0.420654
Neighborhood_StoneBr    0.118314
Neighborhood_Crawfor    0.108035
Neighborhood_NoRidge    0.088526
Functional_Typ          0.074537
Neighborhood_NridgHt    0.073322
LotArea                 0.072525
KitchenQual_Ex          0.069277
Exterior1st_BrkFace     0.068871
RoofMatl_WdShngl        0.062292
BsmtQual_Ex             0.051444
OverallQual             0.051296
Condition1_Norm         0.045385
OverallCond             0.042387
Neighborhood_BrkSide    0.040869
Neighborhood_Somerst    0.040237
BsmtExposure_Gd         0.038670
GarageCars              0.036823
1stFlrSF                0.031050
LotConfig_CulDSac       0.028828
dtype: float64

# xgboost

In [241]:
#xgb
model_xgb = xgb.XGBRegressor().fit(X_train, y)

cv_rmse(model_xgb, X_train).mean()

0.13858015819132832

### Note: if the following code crashes, remove the "n_job=-1" parameter. This is sklearn's parallization bug. If the parameter is removed, the execution time will be very long since it will only use a single thread.

In [249]:
#xgb+tuning 
xgboost_hyerparameters = {
    'learning_rate': hp.quniform('learning_rate', 0, 0.3, 0.001),
    'max_depth':hp.quniform('max_depth', 3,10,1),
    'n_estimators':hp.quniform('n_estimators', 1000, 5000, 100),
    #'gamma':hp.uniform('gamma', 0, 0.4),
    'reg_lambda': hp.quniform('reg_lambda', 0, 25, 1),
    'subsample':hp.uniform('subsample', 0.60, 0.95),
    'colsample_bytree':hp.uniform('colsample_bytree', 0.5, 1),
    'colsample_bylevel':hp.uniform('colsample_bylevel', 0.5, 1),     
}

def hyperparameter_tuning(hyerparameters):
    hyerparameters = {
        'learning_rate': hyerparameters['learning_rate'],
        'max_depth': int(hyerparameters['max_depth']),
        'n_estimators': int(hyerparameters['n_estimators']),
        #'gamma': hyerparameters['gamma'],
        'reg_lambda': hyerparameters['reg_lambda'],
        'subsample': hyerparameters['subsample'],
        'colsample_bytree': hyerparameters['colsample_bytree'],
        'colsample_bylevel': hyerparameters['colsample_bylevel']     
    }

    model_xgboost = xgb.XGBRegressor(**hyerparameters)
    return -cross_val_score(model_xgboost, X_train, y, scoring = 'neg_mean_squared_error', cv = 5, n_jobs = -1).mean()

    
        
xgboost_final_hyerparameters = fmin(
    fn = hyperparameter_tuning, 
    space = xgboost_hyerparameters, 
    max_evals = 20, 
    rstate = np.random.RandomState(1), 
    algo = tpe.suggest)

print(xgboost_final_hyerparameters)

100%|██████████| 20/20 [10:58<00:00, 32.93s/trial, best loss: 0.015351419072119624]
{'colsample_bylevel': 0.5089307368584586, 'colsample_bytree': 0.6660313658284291, 'learning_rate': 0.037, 'max_depth': 7.0, 'n_estimators': 3500.0, 'reg_lambda': 8.0, 'subsample': 0.6716479399150603}


In [133]:
model_xgb = xgb.XGBRegressor(random_state=0,
                        n_estimators=int(xgboost_final_hyerparameters['n_estimators']), 
                        colsample_bytree= xgboost_final_hyerparameters['colsample_bytree'],
                        #gamma= xgboost_final_hyerparameters['gamma'],
                        learning_rate= xgboost_final_hyerparameters['learning_rate'],
                        max_depth= int(xgboost_final_hyerparameters['max_depth']),
                        subsample= xgboost_final_hyerparameters['subsample'],
                        colsample_bylevel= xgboost_final_hyerparameters['colsample_bylevel'],
                        reg_lambda= xgboost_final_hyerparameters['reg_lambda']
                       ).fit(X_train, y)

cv_rmse(model_xgb, X_train).mean()


0.12574854029667773

# boost-GradientBoostingRegressor

In [242]:
#original gbr
model_gbr = GradientBoostingRegressor().fit(X_train, y)
cv_rmse(model_gbr, X_train).mean()

0.12584558500268123

### Note: if the following code crashes, remove the "n_job=-1" parameter. This is sklearn's parallization bug. If the parameter is removed, the execution time will be very long since it will only use a single thread.

In [253]:
#gbr+tuning
gbr_hyerparameters = {
    'learning_rate': hp.quniform('learning_rate', 0, 0.3, 0.0001),
    'n_estimators': hp.quniform('n_estimators', 1000, 5000, 100),
    'max_depth': hp.quniform('max_depth', 3,10,1),
    #'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 10, 1)
}
def hyperparameter_tuning(hyerparameters):
    hyerparameters = {
        'learning_rate': hyerparameters['learning_rate'],
        'max_depth': int(hyerparameters['max_depth']),
        'n_estimators': int(hyerparameters['n_estimators']),
             
    }
    model_gbr = GradientBoostingRegressor(**hyerparameters)
    return -cross_val_score(model_gbr, X_train, y, scoring ='neg_mean_squared_error', cv = 5, n_jobs=-1).mean()
    
        
gbr_final_hyerparameters = fmin(
    fn= hyperparameter_tuning, 
    space = gbr_hyerparameters,
    max_evals = 20, 
    rstate = np.random.RandomState(1), 
    algo = tpe.suggest
)

print(gbr_final_hyerparameters)




100%|██████████| 20/20 [17:31<00:00, 52.57s/trial, best loss: 1.412144088192948e-06] 
{'learning_rate': 0.0193, 'max_depth': 4.0, 'n_estimators': 4300.0}


In [258]:
model_gbr = GradientBoostingRegressor(
    n_estimators = int(gbr_final_hyerparameters['n_estimators']), 
    learning_rate = gbr_final_hyerparameters['learning_rate'], 
    max_depth = gbr_final_hyerparameters['max_depth'], 
    max_features = 'sqrt', 
    #min_samples_leaf = int(gbr_final_hyerparameters['min_samples_leaf']), 
    min_samples_leaf = 15,
    min_samples_split = 10, 
    loss='ls', 
    random_state = 42).fit(X_train, y)
cv_rmse(model_gbr, X_train).mean()


0.12027023494988247

In [244]:
#final output
gbr_preds = np.expm1(model_gbr.predict(X_test))
lasso_preds = np.expm1(model_lasso.predict(X_test))
xgb_preds = np.expm1(model_xgb.predict(X_test))

preds = 0.4 * lasso_preds + 0.6 * gbr_preds
solution = pd.DataFrame({"id":test.Id, "SalePrice":preds})
solution.to_csv("final.csv", index = False)


In [1]:
!pip install imblearn
!pip install delayed
from imblearn.over_sampling import SMOTE

You should consider upgrading via the '/Users/lilythegirl/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/lilythegirl/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
