In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
sys.path.append('../tools/')
import tools
import numpy as np

# Load Data

In [2]:
# Read the train/test CSVs; paths are relative to the notebook's working directory.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# 'SalePrice' is the regression target; 'Id' is dropped from both feature frames.
y_train = train['SalePrice']
x_train = train.drop(columns=['SalePrice', 'Id'])
x_test = test.drop(columns=['Id'])

In [3]:
# Wrap the train/test features and the target in the project's `tools.Process` helper;
# every preprocessing cell below goes through `prc`.
prc = tools.Process(x_train,x_test,y_train)
# Second instance built from the same inputs; it is not referenced by any later cell shown here.
# NOTE(review): presumably kept as an untouched baseline — confirm that Process copies its inputs.
prc_init = tools.Process(x_train,x_test,y_train)

# Preprocess

## Reject Outliers

In [None]:
# Keep only the training rows whose above-ground living area is below 4500,
# and apply the same row selection to the target so features and labels stay aligned.
keep_mask = prc.x_train['GrLivArea'] < 4500
save_index = prc.x_train.index[keep_mask].to_list()

prc.x_train = prc.x_train.loc[save_index].reset_index(drop=True)
prc.y_train = prc.y_train.loc[save_index].reset_index(drop=True)

# NOTE(review): update() presumably rebuilds prc.x_all from x_train/x_test — confirm in tools.Process.
prc.update()

## Numeric to Categoric

In [None]:
# These columns hold numeric codes, not quantities: convert them to strings
# so that they are one-hot encoded later instead of being treated as numbers.
prc.x_all['MSSubClass'] = prc.x_all['MSSubClass'].apply(str)
for sale_date_col in ('YrSold', 'MoSold'):
    prc.x_all[sale_date_col] = prc.x_all[sale_date_col].astype(str)

# NOTE(review): split()/update() presumably push x_all back into x_train/x_test and resync — confirm.
prc.split()
prc.update()

## Fill Missing Data

In [None]:
# Inspect missing values through the project helper.
# NOTE(review): presumably get_return=True returns the summary so the cell displays it,
# and bar=False turns off a bar chart — confirm against tools.Process.NANs.
prc.NANs(get_return=True,bar=False)

In [None]:
# Same missing-value helper, this time with plot=True.
# NOTE(review): presumably draws a plot of the missing values — confirm against tools.Process.NANs.
prc.NANs(bar=False, plot=True)

In [None]:
### Fill these with the MODE, i.e. the most frequent value of each column.
for col in ('Functional', 'Electrical', 'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType'):
    prc.x_all[col] = prc.x_all[col].fillna(prc.x_all[col].mode()[0])

### Missing garage data most probably means there is no garage:
### numeric garage columns become 0, categorical ones become 'None'.
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    prc.x_all[col] = prc.x_all[col].fillna(0)
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    prc.x_all[col] = prc.x_all[col].fillna('None')

### Same with basement
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    prc.x_all[col] = prc.x_all[col].fillna('None')

### Same with pool
prc.x_all['PoolQC'] = prc.x_all['PoolQC'].fillna('None')

### Fill missing MSZoning with the mode (most frequent category) within each MSSubClass.
### A group with no observed MSZoning has an empty mode(), so indexing [0] would raise;
### such groups fall back to the overall mode instead.
zoning_overall_mode = prc.x_all['MSZoning'].mode()[0]
prc.x_all['MSZoning'] = prc.x_all.groupby('MSSubClass')['MSZoning'].transform(
    lambda x: x.fillna(x.mode()[0] if x.notna().any() else zoning_overall_mode))

### Fill missing LotFrontage with the MEDIAN LotFrontage of the same Neighborhood.
prc.x_all['LotFrontage'] = prc.x_all.groupby('Neighborhood')['LotFrontage'].transform(
    lambda x: x.fillna(x.median()))

# NOTE(review): split()/update() presumably push x_all back into x_train/x_test and resync — confirm.
prc.split()
prc.update()

In [None]:
# Catch-all imputation for whatever is still missing after the targeted fills.
# NOTE(review): presumably numeric columns get 0 and object columns get 'None' — confirm in tools.Process.fill.
prc.fill(fill_method_num=0, fill_method_object='None', inplace=True)

## Fix skews

In [None]:
# Transform the target with the 'log' method.
prc.transY(method='log', inplace=True)

# Transform the features with the 'boxcox' method at threshold=0.5.
# NOTE(review): presumably only features whose skew exceeds the threshold are transformed — confirm in tools.Process.transF.
prc.transF(threshold=0.5, method='boxcox', inplace=True)

## Make Features

In [None]:
# Drop three columns that are not used as features from here on.
prc.x_all = prc.x_all.drop(['Utilities', 'Street', 'PoolQC'], axis=1)

# Combined year / area / bathroom / porch features (plain column arithmetic).
prc.x_all['YrBltAndRemod'] = prc.x_all['YearBuilt'] + prc.x_all['YearRemodAdd']
prc.x_all['TotalSF'] = prc.x_all['TotalBsmtSF'] + prc.x_all['1stFlrSF'] + prc.x_all['2ndFlrSF']

prc.x_all['Total_sqr_footage'] = (prc.x_all['BsmtFinSF1'] + prc.x_all['BsmtFinSF2'] +
                                 prc.x_all['1stFlrSF'] + prc.x_all['2ndFlrSF'])

prc.x_all['Total_Bathrooms'] = (prc.x_all['FullBath'] + (0.5 * prc.x_all['HalfBath']) +
                               prc.x_all['BsmtFullBath'] + (0.5 * prc.x_all['BsmtHalfBath']))

prc.x_all['Total_porch_sf'] = (prc.x_all['OpenPorchSF'] + prc.x_all['3SsnPorch'] +
                              prc.x_all['EnclosedPorch'] + prc.x_all['ScreenPorch'] +
                              prc.x_all['WoodDeckSF'])

# Binary "has X" indicators: 1 where the source quantity is strictly positive, else 0.
# Vectorised comparison replaces the per-row apply(lambda ...) and yields the same
# int64 0/1 values (NaN compares as False, i.e. 0, exactly as before).
for flag_col, source_col in (
        ('haspool', 'PoolArea'),
        ('has2ndfloor', '2ndFlrSF'),
        ('hasgarage', 'GarageArea'),
        ('hasbsmt', 'TotalBsmtSF'),
        ('hasfireplace', 'Fireplaces'),
):
    prc.x_all[flag_col] = (prc.x_all[source_col] > 0).astype('int64')

# NOTE(review): split()/update() presumably push x_all back into x_train/x_test and resync — confirm.
prc.split()
prc.update()

## Get Dummies

In [None]:
# One-hot encode the categorical columns of the combined frame and renumber the rows from 0.
encoded_all = pd.get_dummies(prc.x_all)
prc.x_all = encoded_all.reset_index(drop=True)

prc.split()
prc.update()

# Displayed as the cell output: (rows, columns) after encoding.
prc.x_all.shape

## Split

In [None]:
# Pull the processed train features, test features and target out of the helper
# under the short names used by the modelling cells.
X, X_test, y = prc.x_train, prc.x_test, prc.y_train

# Displayed as the cell output.
X.shape, y.shape, X_test.shape

## Option

In [None]:
# Hand-picked training rows to discard, given as row POSITIONS (not index labels).
outliers = [30, 88, 462, 631, 1322]
X = X.drop(X.index[outliers])
y = y.drop(y.index[outliers])

# Collect columns in which the single most frequent value covers more than 99.94 %
# of the training rows; these near-constant features are removed from both splits.
n_train_rows = len(X)
overfit = [
    column
    for column in X.columns
    if X[column].value_counts().iloc[0] / n_train_rows * 100 > 99.94
]

X = X.drop(columns=overfit)
X_test = X_test.drop(columns=overfit)

# Displayed as the cell output: the names of the dropped columns.
overfit

In [None]:
# Shapes after dropping the outlier rows and the near-constant columns.
X.shape, y.shape, X_test.shape

# Regression

## Modules

In [None]:
from datetime import datetime
from scipy.stats import skew  # for some statistics
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

## Scaling

In [None]:
# Fit one RobustScaler on train and test stacked together, then scale each split with it.
# After this cell X and X_test are NumPy arrays, no longer DataFrames.
# NOTE(review): fitting on train+test lets test-set statistics influence the training
# features; fit on X alone if strict train/test separation is required.
tmp_all = pd.concat([X, X_test], axis=0, sort=False, ignore_index=True)

transformer = RobustScaler()
transformer.fit(tmp_all)

X, X_test = transformer.transform(X), transformer.transform(X_test)

## Modeling

### Setup for Cross Validation

In [None]:
# 10-fold splitter shared by cv_rmse and the RidgeCV/LassoCV/ElasticNetCV estimators;
# shuffled with a fixed random_state so the folds are the same on every run.
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

def rmsle(y, y_pred):
    """Return the root of the mean squared error between `y` and `y_pred`.

    No logarithm is taken here: the value is a root-mean-squared *log* error
    only when both arguments are already on a log scale.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

def cv_rmse(model, X=X):
    """Return the per-fold cross-validated RMSE of `model` as an array.

    `model` is scored with `cross_val_score` on `X` against the notebook-level
    target `y`, using the shared `kfolds` splitter.

    Note: the default for `X` is captured when this cell is executed, so
    rebinding the global `X` afterwards does not change the default; `y` and
    `kfolds` are looked up at call time.
    """
    neg_mse_per_fold = cross_val_score(
        model, X, y, scoring="neg_mean_squared_error", cv=kfolds)
    # The scorer returns negated MSE; flip the sign before taking the root.
    return np.sqrt(-neg_mse_per_fold)

### Models

In [None]:
# Candidate regularisation strengths searched by the built-in CV of the linear models.
# Alpha grid for RidgeCV.
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
# Alpha grid for LassoCV.
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
# Alpha grid and L1/L2 mixing ratios for ElasticNetCV.
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

In [None]:
# Linear models and SVR, each wrapped in a pipeline that robust-scales its input first.
# The *CV estimators pick their regularisation strength from the grids defined above,
# using the shared `kfolds` splitter.
# max_iter is passed as an int: it is an iteration count, and recent scikit-learn
# releases validate its type and reject the float 1e7.
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))
lasso = make_pipeline(RobustScaler(), LassoCV(max_iter=int(1e7), alphas=alphas2, random_state=42, cv=kfolds))
elasticnet = make_pipeline(RobustScaler(), ElasticNetCV(max_iter=int(1e7), alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio))
svr = make_pipeline(RobustScaler(), SVR(C=20, epsilon=0.008, gamma=0.0003))

In [None]:
# Gradient boosting with Huber loss and a fixed seed.
gbr = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=42,
)

In [None]:
# LightGBM regressor: many small trees (num_leaves=4) at a low learning rate,
# with seeded row bagging and feature sub-sampling.
# Note: the variable is named `lightgbm`, like the package the class is imported from.
lightgbm_params = dict(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=5000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    verbose=-1,
)
lightgbm = LGBMRegressor(**lightgbm_params)

# XGBoost regressor with squared-error loss.
# 'reg:squarederror' is the current name of the objective formerly spelled 'reg:linear';
# the old alias is deprecated (it triggers a warning, and newer releases may reject it).
# Note: the variable is named `xgboost`, like the package the class is imported from.
xgboost = XGBRegressor(
    learning_rate=0.01,
    n_estimators=3460,
    max_depth=3,
    min_child_weight=0,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:squarederror',
    nthread=-1,
    scale_pos_weight=1,
    seed=27,
    reg_alpha=0.00006,
)

In [None]:
# Stacked ensemble: six base regressors (svr is not among them) with XGBoost as the
# meta-regressor. use_features_in_secondary=True is passed so the meta-regressor is
# given the original features in addition to the base models' predictions.
base_regressors = (ridge, lasso, elasticnet, gbr, xgboost, lightgbm)
stack_gen = StackingCVRegressor(
    regressors=base_regressors,
    meta_regressor=xgboost,
    use_features_in_secondary=True,
)

### Each Model's Scores

In [None]:
# Report the cross-validated RMSE (mean and std over the folds) of every individual model,
# followed by a timestamp so the run time of each model can be read off the output.
# Ridge is now reported too: its score used to be computed and then overwritten by the
# LASSO score without being printed. X is passed explicitly for every model so that all
# of them are scored on the same, current feature matrix.
for label, model in (
        ("Ridge", ridge),
        ("LASSO", lasso),
        ("Elastic net", elasticnet),
        ("SVR", svr),
        ("LightGBM", lightgbm),
        ("GBM", gbr),
        ("XGBoost", xgboost),
):
    score = cv_rmse(model, X)
    print("{}: {:.4f} ({:.4f})\n".format(label, score.mean(), score.std()), datetime.now(), )

## Fit

In [None]:
print('START Fit')

# Fit the stacked ensemble and every individual model on the full training set,
# printing each label just before the corresponding fit starts.
fit_plan = (
    ('stack_gen', stack_gen),
    ('Elastic net', elasticnet),
    ('Lasso', lasso),
    ('Ridge', ridge),
    ('Svr', svr),
    ('GradientBoosting', gbr),
    ('XGBoost', xgboost),
    ('LightGBM', lightgbm),
)

fitted_models = []
for label, estimator in fit_plan:
    print(label)
    fitted_models.append(estimator.fit(X, y))

# Bind the results of fit() to the names used by the blending and scoring cells
# (same order as fit_plan).
(stack_gen_model,
 elastic_model_full_data,
 lasso_model_full_data,
 ridge_model_full_data,
 svr_model_full_data,
 gbr_model_full_data,
 xgb_model_full_data,
 lgb_model_full_data) = fitted_models

## Blending

In [None]:
def blend_models_predict(X):
    """Return the weighted blend of all fitted models' predictions for `X`.

    Weights (they sum to 1.0): elastic net 0.1, lasso 0.05, ridge 0.1, SVR 0.1,
    gradient boosting 0.1, XGBoost 0.15, LightGBM 0.1, stacked model 0.3.
    The fitted models are read from the notebook-level names set in the Fit cell.
    """
    weighted_models = (
        (0.1, elastic_model_full_data),
        (0.05, lasso_model_full_data),
        (0.1, ridge_model_full_data),
        (0.1, svr_model_full_data),
        (0.1, gbr_model_full_data),
        (0.15, xgb_model_full_data),
        (0.1, lgb_model_full_data),
    )

    blended = None
    for weight, fitted_model in weighted_models:
        contribution = weight * fitted_model.predict(X)
        blended = contribution if blended is None else blended + contribution

    # The stacked model is handed a plain ndarray rather than X itself.
    return blended + (0.3 * stack_gen_model.predict(np.array(X)))

In [None]:
# def blend_models_predict(X):
#     return (
#             (0.4 * ridge_model_full_data.predict(X)) + \
#             (0.3 * svr_model_full_data.predict(X)) + \
#             (0.3 * lgb_model_full_data.predict(X)))

In [None]:
# In-sample error of every fitted model, the stacked model and the final blend.
# These scores are computed on the data the models were fitted on.
print('RMSLE score on train data:')

train_predictors = (
    ('Elastic : ', elastic_model_full_data.predict),
    ('Lasso : ', lasso_model_full_data.predict),
    ('Ridge : ', ridge_model_full_data.predict),
    ('SVR : ', svr_model_full_data.predict),
    ('GBR : ', gbr_model_full_data.predict),
    ('XGB : ', xgb_model_full_data.predict),
    ('LightGB : ', lgb_model_full_data.predict),
    # The stacked model is given a plain ndarray.
    ('Staking model : ', lambda features: stack_gen_model.predict(np.array(features))),
    ('Blending model : ', blend_models_predict),
)
for label, predict in train_predictors:
    print(label, rmsle(y, predict(X)))

# Submit

In [None]:
print('Predict submission')
submission = pd.read_csv("../data/sample_submission.csv")

# Blend the models on the test features and apply expm1 to the result.
# NOTE(review): expm1 presumably inverts the 'log' target transform (i.e. log1p) — confirm in tools.Process.transY.
predicted_prices = np.expm1(blend_models_predict(X_test))

# Overwrite the second column of the sample submission with the predictions.
submission.iloc[:, 1] = predicted_prices
# submission.iloc[:,1] = (np.expm1(ridge_model_full_data.predict(X_test)))

In [None]:
# q1 = submission['SalePrice'].quantile(0.0042)
# q2 = submission['SalePrice'].quantile(0.99)
# # Quantiles helping us get some extreme values for extremely low or high values 
# submission['SalePrice'] = submission['SalePrice'].apply(lambda x: x if x > q1 else x*0.77)
# submission['SalePrice'] = submission['SalePrice'].apply(lambda x: x if x < q2 else x*1.1)

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv("../data/submission_modified.csv", index=False)

In [None]:
# Quick visual check of the first rows of the submission.
submission.head()

# References

- referenced kernel\
https://www.kaggle.com/niteshx2/top-50-beginners-stacking-lgb-xgb