# Importing Required Packages

In [18]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import BaggingRegressor

# Reading Data

In [3]:
# Raw competition files. The hold-out file is test.csv; the original cell read
# train.csv twice, so test_df was just a second copy of the training data.
train_df = pd.read_csv("../Data/train.csv")
test_df = pd.read_csv("../Data/test.csv")

# Pre-computed feature tables used by every model below.
train_features_df = pd.read_csv("../Data/train_features_df.csv")
test_features_df = pd.read_csv("../Data/test_features_df.csv")

In [4]:
# Preview the first rows of the training feature table (includes SalePrice).
train_features_df.head()

Unnamed: 0,OverallQual,ExterQual,Foundation,BsmtQual,KitchenQual,FireplaceQu,GarageFinish,YearDiff,YearRemodAdd,1stFlrSF,...,GarageType,OverallCond,LotArea,BsmtFinSF1,LotFrontage,2ndFlrSF,SalePrice,no_garage,is_BsmtFinSF1_0,is_2ndFlrSF_0
0,7,2,6,3,2,1,2,2.472136,87.509776,6.75227,...,5,8,9.041922,706,4.174387,854,208500,0,0,0
1,6,1,3,3,1,3,2,9.135529,86.904443,7.140453,...,5,4,9.169518,978,4.382027,0,181500,0,0,1
2,7,2,6,3,2,3,2,3.291503,87.487429,6.824374,...,5,8,9.328123,486,4.219508,866,223500,0,0,0
3,7,1,2,2,2,4,1,17.078784,86.769364,6.867974,...,2,8,9.164296,216,4.094345,756,140000,0,0,0
4,8,2,6,3,2,3,2,3.656854,87.442719,7.04316,...,5,8,9.565214,655,4.430817,1053,250000,0,0,0


In [5]:
# Preview the first rows of the test feature table (no SalePrice column).
test_features_df.head()

Unnamed: 0,OverallQual,ExterQual,Foundation,BsmtQual,KitchenQual,FireplaceQu,GarageFinish,YearDiff,YearRemodAdd,1stFlrSF,...,BsmtFinType1,GarageType,OverallCond,LotArea,BsmtFinSF1,LotFrontage,2ndFlrSF,no_garage,is_BsmtFinSF1_0,is_2ndFlrSF_0
0,5,1,3,2,1,1,1,12.0,86.566359,6.79794,...,3,5,6,9.360655,468.0,4.382027,0,0,0,1
1,6,1,3,2,2,1,1,12.422205,86.498588,7.192182,...,4,5,6,9.565704,923.0,4.394449,0,0,0,1
2,5,1,6,3,1,3,3,5.211103,87.397987,6.833032,...,6,5,5,9.534595,791.0,4.304065,701,0,0,0
3,6,1,6,2,2,4,3,4.928203,87.397987,6.830874,...,6,5,6,9.208138,602.0,4.356709,678,0,0,0
4,8,2,6,3,2,1,2,6.485281,87.263654,7.154615,...,4,5,5,8.518193,263.0,3.7612,0,0,0,1


# Train Test Split Function

In [6]:
def split_data(df, test_size = 0.2, random_state = 42, target = "SalePrice"):
    """Split a feature table into train/validation predictors and targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the predictor columns plus the `target` column.
    test_size : float, default 0.2
        Passed through to `train_test_split`.
    random_state : int, default 42
        Passed through to `train_test_split` so the split is reproducible.
    target : str, default "SalePrice"
        Name of the column to predict; it is removed from the predictors.

    Returns
    -------
    The value of `train_test_split(X, y, ...)`, which callers in this notebook
    unpack as `X_train, X_test, y_train, y_test`.
    """
    X, y = df.drop([target], axis = 1), df[target]
    return train_test_split(X, y, test_size = test_size, random_state = random_state)

# Linear, Lasso, Ridge Regressions

## Linear Regression

In [38]:
# Hold-out split shared by the models in this section.
X_train, X_test, y_train, y_test = split_data(train_features_df)

# Standardize the features, then fit ordinary least squares on log(SalePrice).
lr_pipe = Pipeline(steps = [
    ("scaler", StandardScaler()),
    ("lr", LinearRegression()),
])
lr_pipe.fit(X_train, np.log(y_train))

# Undo the log transform before scoring with RMSLE.
lr_preds = np.exp(lr_pipe.predict(X_test))
lr_rmsle = np.sqrt(mean_squared_log_error(y_test, lr_preds))

In [39]:
# Hold-out RMSLE of the linear-regression pipeline.
lr_rmsle

0.14414189518443585

## Lasso Regression

**Base Model**

In [9]:
# Same hold-out split as the other models (fixed random_state inside split_data).
X_train, X_test, y_train, y_test = split_data(train_features_df)

# Untuned Lasso used as the estimator for the alpha grid searches.
lasso_model = Lasso(random_state = 42)

**Grid Search Over Large Range of Parameters**

In [10]:
# Coarse sweep: 100 log-spaced alphas between 1e-8 and 1e8, 10-fold CV,
# fitted against log(SalePrice).
params = dict(alpha = np.logspace(start = -8, stop = 8, num = 100))

lasso_grid = GridSearchCV(
    estimator = lasso_model,
    param_grid = params,
    cv = 10,
    n_jobs = -1,
).fit(X_train, np.log(y_train))
lasso_grid.best_estimator_

Lasso(alpha=0.0014849682622544665, copy_X=True, fit_intercept=True,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=42, selection='cyclic', tol=0.0001, warm_start=False)

**Grid Search over smaller range**

In [11]:
# Narrowed sweep: 100 log-spaced alphas between 1e-4 and 1e-2, 10-fold CV.
params = dict(alpha = np.logspace(start = -4, stop = -2, num = 100))

lasso_grid = GridSearchCV(
    estimator = lasso_model,
    param_grid = params,
    cv = 10,
    n_jobs = -1,
).fit(X_train, np.log(y_train))
lasso_grid.best_estimator_

Lasso(alpha=0.0014174741629268048, copy_X=True, fit_intercept=True,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=42, selection='cyclic', tol=0.0001, warm_start=False)

**Grid Search over the same range with a finer grid (500 points)**

In [12]:
# Same 1e-4 .. 1e-2 interval as the previous search, sampled at 500 points.
params = dict(alpha = np.logspace(start = -4, stop = -2, num = 500))

lasso_grid = GridSearchCV(
    estimator = lasso_model,
    param_grid = params,
    cv = 10,
    n_jobs = -1,
).fit(X_train, np.log(y_train))
lasso_grid.best_estimator_

Lasso(alpha=0.0014135155848353958, copy_X=True, fit_intercept=True,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=42, selection='cyclic', tol=0.0001, warm_start=False)

**Final Lasso Model**

In [19]:
# Final Lasso: standardize the features, then fit on log(SalePrice) with alpha
# rounded from the grid-search results.
# NOTE(review): the grid searches were run on a bare Lasso without
# StandardScaler, so this alpha was tuned on unscaled features - confirm it is
# still appropriate inside the scaled pipeline.
# NOTE(review): this rebinds lasso_model from a Lasso to a Pipeline; re-running
# the grid-search cells afterwards would search over the pipeline instead.
lasso_model = make_pipeline(
    StandardScaler(),
    Lasso(alpha = 0.0014, random_state = 42),
)
lasso_model.fit(X_train, np.log(y_train))

# Undo the log transform before scoring with RMSLE.
lasso_preds = np.exp(lasso_model.predict(X_test))
lasso_rmsle = np.sqrt(mean_squared_log_error(y_test, lasso_preds))

In [20]:
# Hold-out RMSLE of the final Lasso pipeline.
lasso_rmsle

0.14265225668187792

## Ridge Regression

**Base Model**

In [21]:
# Same hold-out split as the other models (fixed random_state inside split_data).
X_train, X_test, y_train, y_test = split_data(train_features_df)

# Untuned Ridge used as the estimator for the alpha grid searches.
ridge_model = Ridge(random_state = 42)

**Grid Search Over Large Range of Parameters**

In [22]:
# Coarse sweep: 100 log-spaced alphas between 1e-8 and 1e8, 10-fold CV,
# fitted against log(SalePrice).
params = dict(alpha = np.logspace(start = -8, stop = 8, num = 100))

ridge_grid = GridSearchCV(
    estimator = ridge_model,
    param_grid = params,
    cv = 10,
    n_jobs = -1,
).fit(X_train, np.log(y_train))
ridge_grid.best_estimator_

Ridge(alpha=11.233240329780312, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=42, solver='auto', tol=0.001)

**Concentrated Grid Search (alpha in [1, 100), step 0.5)**

In [23]:
# Linear sweep around the coarse optimum: alpha from 1 up to (not including) 100
# in steps of 0.5.
params = dict(alpha = np.arange(1, 100, 0.5))

ridge_grid = GridSearchCV(
    estimator = ridge_model,
    param_grid = params,
    cv = 10,
    n_jobs = -1,
).fit(X_train, np.log(y_train))
ridge_grid.best_estimator_

Ridge(alpha=9.5, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=42, solver='auto', tol=0.001)

**Concentrated Grid Search (alpha in [1, 10), step 0.05)**

In [24]:
# Finer sweep: alpha from 1 up to (not including) 10 in steps of 0.05.
params = dict(alpha = np.arange(1, 10, 0.05))

ridge_grid = GridSearchCV(
    estimator = ridge_model,
    param_grid = params,
    cv = 10,
    n_jobs = -1,
).fit(X_train, np.log(y_train))
ridge_grid.best_estimator_

Ridge(alpha=9.700000000000008, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=42, solver='auto', tol=0.001)

**Concentrated Grid Search (alpha in [9, 10), step 0.005)**

In [25]:
# Finest sweep: alpha from 9 up to (not including) 10 in steps of 0.005.
params = dict(alpha = np.arange(9, 10, 0.005))

ridge_grid = GridSearchCV(
    estimator = ridge_model,
    param_grid = params,
    cv = 10,
    n_jobs = -1,
).fit(X_train, np.log(y_train))
ridge_grid.best_estimator_

Ridge(alpha=9.680000000000106, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=42, solver='auto', tol=0.001)

**Final Ridge Model**

In [27]:
# Final Ridge: standardize the features, then fit on log(SalePrice) with alpha
# rounded from the grid-search results.
# NOTE(review): the grid searches were run on a bare Ridge without
# StandardScaler, so this alpha was tuned on unscaled features - confirm it is
# still appropriate inside the scaled pipeline.
# NOTE(review): this rebinds ridge_model from a Ridge to a Pipeline; re-running
# the grid-search cells afterwards would search over the pipeline instead.
ridge_model = make_pipeline(
    StandardScaler(),
    Ridge(alpha = 9.7, random_state = 42),
)
ridge_model.fit(X_train, np.log(y_train))

# Undo the log transform before scoring with RMSLE.
ridge_preds = np.exp(ridge_model.predict(X_test))
ridge_rmsle = np.sqrt(mean_squared_log_error(y_test, ridge_preds))

In [28]:
# Hold-out RMSLE of the final Ridge pipeline.
ridge_rmsle

0.14394055317563353

# Ensembling 

**Models Scores**

In [40]:
# Hold-out RMSLE per single model; lower is better, so sort ascending.
model_perf = pd.DataFrame(
    data = {
        "Model": ["Linear", "Lasso", "Ridge"],
        "Performance": [lr_rmsle, lasso_rmsle, ridge_rmsle],
    }
)
model_perf.sort_values("Performance")

Unnamed: 0,Model,Performance
1,Lasso,0.142652
2,Ridge,0.143941
0,Linear,0.144142


## Bagging

In [30]:
def bagging(estimator, X, y, test_set, n_estimators = 10, random_state = 42):
    """Fit a bagged ensemble of `estimator` on log(y) and predict `test_set`.

    Parameters
    ----------
    estimator : scikit-learn regressor or pipeline
        Model handed to `BaggingRegressor` as the estimator to bag.
    X : predictors used for fitting.
    y : target on its original scale; the ensemble is fitted on `np.log(y)`.
    test_set : predictors to predict for.
    n_estimators : int, default 10
        Number of bagged estimators.
    random_state : int, default 42
        Seed passed to `BaggingRegressor`.

    Returns
    -------
    tuple
        `(fitted BaggingRegressor, predictions)`, where the predictions have
        been mapped back to the original scale with `np.exp`.
    """
    # The estimator is passed positionally on purpose: its keyword was renamed
    # from `base_estimator` to `estimator` in scikit-learn 1.2 (and the old
    # name later removed), while the first positional slot is the same in both.
    bagg_reg = BaggingRegressor(estimator,
                                n_estimators = n_estimators,
                                max_samples = 1.0,
                                bootstrap = True,
                                n_jobs = -1,
                                random_state = random_state)
    bagg_reg.fit(X, np.log(y))

    return (bagg_reg, np.exp(bagg_reg.predict(test_set)))

### Linear Regression

In [41]:
# Bag the linear-regression pipeline and score it on the hold-out split.
X_train, X_test, y_train, y_test = split_data(train_features_df)

lr_bagg, lr_bagg_pred = bagging(estimator = lr_pipe,
                                X = X_train,
                                y = y_train,
                                test_set = X_test)
lr_bagg_rmsle = np.sqrt(mean_squared_log_error(y_test, lr_bagg_pred))
lr_bagg_rmsle

0.14327890285835243

### Lasso Regression

In [32]:
# Show the Lasso pipeline that is passed to bagging() in the next cell.
lasso_model

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('lasso',
                 Lasso(alpha=0.0014, copy_X=True, fit_intercept=True,
                       max_iter=1000, normalize=False, positive=False,
                       precompute=False, random_state=42, selection='cyclic',
                       tol=0.0001, warm_start=False))],
         verbose=False)

In [33]:
# Bag the Lasso pipeline and score it on the hold-out split.
X_train, X_test, y_train, y_test = split_data(train_features_df)

lasso_bagg, lasso_bagg_pred = bagging(estimator = lasso_model,
                                      X = X_train,
                                      y = y_train,
                                      test_set = X_test)
lasso_bagg_rmsle = np.sqrt(mean_squared_log_error(y_test, lasso_bagg_pred))
lasso_bagg_rmsle

0.14243046564636333

### Ridge Regression

In [34]:
# Show the Ridge pipeline that is passed to bagging() in the next cell.
ridge_model

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('ridge',
                 Ridge(alpha=9.7, copy_X=True, fit_intercept=True,
                       max_iter=None, normalize=False, random_state=42,
                       solver='auto', tol=0.001))],
         verbose=False)

In [35]:
# Bag the Ridge pipeline and score it on the hold-out split.
X_train, X_test, y_train, y_test = split_data(train_features_df)

ridge_bagg, ridge_bagg_pred = bagging(estimator = ridge_model,
                                      X = X_train,
                                      y = y_train,
                                      test_set = X_test)
ridge_bagg_rmsle = np.sqrt(mean_squared_log_error(y_test, ridge_bagg_pred))
ridge_bagg_rmsle

0.1432034173279199

### Mean of Bagged Predictions

In [36]:
# One column per bagged model; the blend is the unweighted row-wise mean.
bagged_preds_arr = np.column_stack((lr_bagg_pred, lasso_bagg_pred, ridge_bagg_pred))
bagged_preds = bagged_preds_arr.mean(axis = 1)

bagged_rmsle = np.sqrt(mean_squared_log_error(y_test, bagged_preds))
bagged_rmsle

0.14289520625193708

In [78]:
# Hold-out scores of the bagged models and of their unweighted blend.
df = pd.DataFrame({"Model": ["Bagg Linear", "Bagg Lasso", "Bagg Ridge", "Bagg Overall"], 
                   "Performance": [lr_bagg_rmsle, lasso_bagg_rmsle, ridge_bagg_rmsle, bagged_rmsle]})

# Drop any bagged rows left over from an earlier run of this cell before
# appending, so that re-running the cell does not duplicate them in model_perf.
is_stale_row = model_perf["Model"].isin(df["Model"])
model_perf = pd.concat([model_perf[~is_stale_row], df], axis = 0, ignore_index = True)
model_perf.sort_values(by = "Performance")

Unnamed: 0,Model,Performance
4,Bagg Lasso,0.14243
1,Lasso,0.142652
6,Bagg Overall,0.142895
5,Bagg Ridge,0.143203
3,Bagg Linear,0.143279
2,Ridge,0.143941
0,Linear,0.144142


# Submission File

## Non-Ensembled Models

**Linear Model**

In [50]:
# Refit the scaled linear model on the full training table for the submission.
X, y = train_features_df.drop(["SalePrice"], axis = 1), train_features_df["SalePrice"]
sub_linear = make_pipeline(StandardScaler(), LinearRegression()).fit(X, np.log(y))

# Select the test columns by name, in training order. The original passed
# `.values`, which drops the column names, so a missing or reordered column
# would have produced wrong predictions silently instead of an error.
sub_linear_preds = np.exp(sub_linear.predict(test_features_df[X.columns]))

# Fill the sample submission with the predictions and write it out.
sub3_linear_standard_scaler = pd.read_csv("../Data/sample_submission.csv")
sub3_linear_standard_scaler["SalePrice"] = sub_linear_preds
sub3_linear_standard_scaler.to_csv("sub3_linear_standard_scaler.csv", index = False)

**Lasso Model**

In [53]:
# Refit the scaled Lasso on the full training table for the submission.
X, y = train_features_df.drop(["SalePrice"], axis = 1), train_features_df["SalePrice"]
sub_lasso = make_pipeline(StandardScaler(), 
                          Lasso(alpha = 0.0014, random_state = 42)).fit(X, np.log(y))

# Select the test columns by name, in training order. The original passed
# `.values`, which drops the column names, so a missing or reordered column
# would have produced wrong predictions silently instead of an error.
sub_lasso_preds = np.exp(sub_lasso.predict(test_features_df[X.columns]))

# Fill the sample submission with the predictions and write it out.
sub3_lasso_standard_scaler = pd.read_csv("../Data/sample_submission.csv")
sub3_lasso_standard_scaler["SalePrice"] = sub_lasso_preds
sub3_lasso_standard_scaler.to_csv("sub3_lasso_standard_scaler.csv", index = False)

**Ridge Model**

In [57]:
# Refit the scaled Ridge on the full training table for the submission.
X, y = train_features_df.drop(["SalePrice"], axis = 1), train_features_df["SalePrice"]
sub_ridge = make_pipeline(StandardScaler(), 
                          Ridge(alpha = 9.7, random_state = 42)).fit(X, np.log(y))

# Select the test columns by name, in training order. The original passed
# `.values`, which drops the column names, so a missing or reordered column
# would have produced wrong predictions silently instead of an error.
sub_ridge_preds = np.exp(sub_ridge.predict(test_features_df[X.columns]))

# Fill the sample submission with the predictions and write it out.
sub3_ridge_standard_scaler = pd.read_csv("../Data/sample_submission.csv")
sub3_ridge_standard_scaler["SalePrice"] = sub_ridge_preds
sub3_ridge_standard_scaler.to_csv("sub3_ridge_standard_scaler.csv", index = False)

## Bagged

**Bagged Lasso**

In [66]:
# Bag the scaled Lasso on the full training table and predict the test set.
X, y = train_features_df.drop(["SalePrice"], axis = 1), train_features_df["SalePrice"]

# The test columns are selected by name, in training order. The original passed
# `.values`, which drops the column names, so a missing or reordered column
# would have produced wrong predictions silently instead of an error.
sub_lasso_std_scale_bagg, sub_lasso_std_scale_bagg_preds = bagging(
    make_pipeline(StandardScaler(), Lasso(alpha = 0.0014, random_state = 42)),
    X, y, test_features_df[X.columns])

# Fill the sample submission with the predictions and write it out.
sub3_lasso_std_scale_bagg = pd.read_csv("../Data/sample_submission.csv")
sub3_lasso_std_scale_bagg["SalePrice"] = sub_lasso_std_scale_bagg_preds
sub3_lasso_std_scale_bagg.to_csv("sub3_lasso_std_scale_bagg.csv", index = False)

**Bagged Linear**

In [69]:
# Bag the scaled linear model on the full training table and predict the test set.
X, y = train_features_df.drop(["SalePrice"], axis = 1), train_features_df["SalePrice"]

# The test columns are selected by name, in training order. The original passed
# `.values`, which drops the column names, so a missing or reordered column
# would have produced wrong predictions silently instead of an error.
sub_linear_std_scale_bagg, sub_linear_std_scale_bagg_preds = bagging(
    make_pipeline(StandardScaler(), LinearRegression()),
    X, y, test_features_df[X.columns])

# Fill the sample submission with the predictions and write it out.
sub3_linear_std_scale_bagg = pd.read_csv("../Data/sample_submission.csv")
sub3_linear_std_scale_bagg["SalePrice"] = sub_linear_std_scale_bagg_preds
sub3_linear_std_scale_bagg.to_csv("sub3_linear_std_scale_bagg.csv", index = False)

**Bagged Ridge**

In [103]:
# Show the bagged Ridge ensemble fitted in the validation section.
# NOTE(review): the recorded output below lists a bare Ridge as the base
# estimator, while the validation cell builds ridge_bagg from the scaled
# ridge_model pipeline - the output looks stale; re-run the notebook in order.
ridge_bagg

BaggingRegressor(base_estimator=Ridge(alpha=9.7, copy_X=True,
                                      fit_intercept=True, max_iter=None,
                                      normalize=False, random_state=42,
                                      solver='auto', tol=0.001),
                 bootstrap=True, bootstrap_features=False, max_features=1.0,
                 max_samples=1.0, n_estimators=10, n_jobs=-1, oob_score=False,
                 random_state=42, verbose=0, warm_start=False)

In [72]:
# Bag the scaled Ridge on the full training table and predict the test set.
X, y = train_features_df.drop(["SalePrice"], axis = 1), train_features_df["SalePrice"]

# The test columns are selected by name, in training order. The original passed
# `.values`, which drops the column names, so a missing or reordered column
# would have produced wrong predictions silently instead of an error.
sub_ridge_std_scale_bagg, sub_ridge_std_scale_bagg_preds = bagging(
    make_pipeline(StandardScaler(), Ridge(alpha = 9.7, random_state = 42)),
    X, y, test_features_df[X.columns])

# Fill the sample submission with the predictions and write it out.
sub3_ridge_std_scale_bagg = pd.read_csv("../Data/sample_submission.csv")
sub3_ridge_std_scale_bagg["SalePrice"] = sub_ridge_std_scale_bagg_preds
sub3_ridge_std_scale_bagg.to_csv("sub3_ridge_std_scale_bagg.csv", index = False)

**Bagged Overall**

In [82]:
# One column per bagged submission model (linear, lasso, ridge); the blend is
# their unweighted row-wise mean.
sub_bagged_std_scale_preds_arr = np.column_stack((
    sub_linear_std_scale_bagg_preds,
    sub_lasso_std_scale_bagg_preds,
    sub_ridge_std_scale_bagg_preds,
))
sub_bagged_std_scale_preds = sub_bagged_std_scale_preds_arr.mean(axis = 1)

# Fill the sample submission with the blended predictions and write it out.
sub3_std_scale_bagged = pd.read_csv("../Data/sample_submission.csv")
sub3_std_scale_bagged["SalePrice"] = sub_bagged_std_scale_preds
sub3_std_scale_bagged.to_csv("sub3_std_scale_bagged.csv", index = False)

**Weighted Bagged Overall**

In [80]:
# Re-display the hold-out scores, best (lowest RMSLE) first - presumably the
# basis for the blend weights chosen in the following cells.
model_perf.sort_values(by = "Performance")

Unnamed: 0,Model,Performance
4,Bagg Lasso,0.14243
1,Lasso,0.142652
6,Bagg Overall,0.142895
5,Bagg Ridge,0.143203
3,Bagg Linear,0.143279
2,Ridge,0.143941
0,Linear,0.144142


In [86]:
# Columns, in order: bagged linear, bagged lasso, bagged ridge.
sub_bagged_std_scale_preds_wt_arr = np.column_stack([sub_linear_std_scale_bagg_preds, 
                                                     sub_lasso_std_scale_bagg_preds, 
                                                     sub_ridge_std_scale_bagg_preds])

# Weight each model by its hold-out ranking in model_perf, larger weight for
# lower RMSLE: Bagg Lasso (0.14243) -> 3, Bagg Ridge (0.143203) -> 2,
# Bagg Linear (0.143279) -> 1. The original weights [2, 1, 3] gave the best
# model the smallest weight, unlike the "Weighted Overall" blend that follows.
sub_bagged_std_scale_preds_wt = np.average(sub_bagged_std_scale_preds_wt_arr, axis = 1, weights = [1, 3, 2])

# Fill the sample submission with the blended predictions and write it out.
sub3_std_scale_bagged_wt = pd.read_csv("../Data/sample_submission.csv")
sub3_std_scale_bagged_wt["SalePrice"] = sub_bagged_std_scale_preds_wt
sub3_std_scale_bagged_wt.to_csv("sub3_std_scale_bagged_wt.csv", index = False)

**Weighted Overall**

In [90]:
# Weighted blend of three submission models. An earlier draft of this cell
# stacked all six prediction vectors (bagged and plain linear/lasso/ridge);
# only the three below are kept.
# Columns, in order: bagged linear, bagged ridge, plain lasso - weighted 1, 2, 3.
sub_std_scale_preds_wt_arr = np.column_stack((
    sub_linear_std_scale_bagg_preds,
    sub_ridge_std_scale_bagg_preds,
    sub_lasso_preds,
))
sub_std_scale_preds_wt = np.average(a = sub_std_scale_preds_wt_arr,
                                    weights = [1, 2, 3],
                                    axis = 1)

# Fill the sample submission with the blended predictions and write it out.
sub3_std_scale_wt = pd.read_csv("../Data/sample_submission.csv")
sub3_std_scale_wt["SalePrice"] = sub_std_scale_preds_wt
sub3_std_scale_wt.to_csv("sub3_std_scale_wt.csv", index = False)

In [89]:
# Inspect the weighted blend that was written to sub3_std_scale_wt.csv.
sub_std_scale_preds_wt

array([118881.31167434, 153830.43797096, 183875.92540568, ...,
       169871.28194223, 125365.95642198, 233340.50915023])