# <ins> Imports

In [546]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from scipy.stats import chi2
from scipy.stats import chi2_contingency
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor



# <ins> Load Data

In [547]:
test = pd.read_csv('./test.csv')
train = pd.read_csv('./train.csv')
test_ID = test.Id
# test.drop(['Id'],axis=1,inplace=True)
# train.drop(['Id'],axis=1,inplace=True)


# <ins> EDA

# <ins> Simple Imputation of Missing Data

In [548]:
fill_with_zero =['MasVnrArea','LotFrontage','GarageYrBlt','TotalBsmtSF','GarageCars','GarageArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','BsmtFullBath','BsmtHalfBath',]
fill_with_none = ['Fence','MiscFeature','MasVnrType','BsmtExposure','BsmtFinType2','BsmtFinType1','BsmtQual','BsmtCond','PoolQC','Alley','FireplaceQu','GarageType','GarageFinish','GarageQual','GarageCond']


for col in fill_with_zero:
    train[col].fillna(0,inplace=True)
    test[col].fillna(0,inplace=True)

for col in fill_with_none:
    train[col].fillna("None",inplace=True)
    test[col].fillna("None",inplace=True)



# Specific
test = test.fillna({"MSZoning": "RL"})
test = test.fillna({"Exterior1st": "VinylSd"})
test = test.fillna({"Exterior2nd": "VinylSd"})
train = train.fillna({"Electrical": "SBrkr"})
test = test.fillna({"KitchenQual": "TA"})
test = test.fillna({"Functional": "Typ"})
test = test.fillna({"SaleType": "WD"})
test.Utilities.fillna('AllPub',inplace=True)

train = train.drop(["Fence", "MiscFeature", "Utilities"], axis=1)
test = test.drop(["Fence", "MiscFeature", "Utilities"], axis=1)

y = train["SalePrice"]


# <ins> Simple Handling of categorical with get_dummies And Aligning dataframes

In [549]:
train_dummies = pd.get_dummies(pd.concat((train.drop(["SalePrice", "Id"], axis=1), test.drop(["Id"], axis=1)), axis=0)).iloc[: train.shape[0]]
test_dummies = pd.get_dummies(pd.concat((train.drop(["SalePrice", "Id"], axis=1), test.drop(["Id"], axis=1)), axis=0)).iloc[train.shape[0]:]



# <ins> Creating Baseline Model

In [550]:

# Define the parameter grid
param_grid = {'max_depth': [2, 4, 6, 8, 10],
              'min_samples_split': [2, 4, 6, 8, 10],
              'min_samples_leaf': [15,30,50],
              'random_state': [42]}

# Create the regressor
regressor = DecisionTreeRegressor()

# Create the grid search object
grid_search = GridSearchCV(regressor, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)

# Fit the grid search to the data
grid_search.fit(train_dummies, y)

# Get the best parameters
best_params = grid_search.best_params_

# Get the best score
best_score = grid_search.best_score_

# Print the best parameters and best score
print("Best parameters:", best_params)
baseline_model = grid_search.best_estimator_

Best parameters: {'max_depth': 10, 'min_samples_leaf': 15, 'min_samples_split': 2, 'random_state': 42}


# <ins> Save Submission

In [551]:
baseline_model.fit(train_dummies,y)
pred = baseline_model.predict(test_dummies)
output = pd.DataFrame({'Id': test_ID,
                       'SalePrice': pred})

output.to_csv('Baseline_submission.csv', index=False)

# <ins> Improving Data Preprocessing

## <ins> Removing Outliers

In [552]:
train.drop(train[train.LotFrontage> 200].index, inplace= True)
train.drop(train[train.LotArea > 100000].index, inplace= True)
train.drop(train[train.BsmtFinSF1 > 4000].index, inplace= True)
train.drop(train[train.TotalBsmtSF > 6000].index, inplace= True)
train.drop(train[train['1stFlrSF'] > 4000].index, inplace= True)
train.drop(train[(train.GrLivArea > 4000) & (y<300000)].index, inplace= True)
train.drop( train[ train.LowQualFinSF> 550].index, inplace= True)
y = train.SalePrice

  train.drop(train[(train.GrLivArea > 4000) & (y<300000)].index, inplace= True)


## <ins> Converting Ordinal Categories

In [553]:
# ordinal_cols = ['ExterCond','ExterQual', 'BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','HeatingQC','KitchenQual','Functional','FireplaceQu','GarageFinish','GarageQual']


# mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0,
#     'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1,
#     'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1,
#     'Typ': 7, 'Min1': 6, 'Min2': 5, 'Mod': 4, 'Maj1': 3, 'Maj2': 2, 'Sev': 1, 'Sal': 0,
#     'Fin': 3, 'RFn': 2, 'Unf': 1}


# for col in ordinal_cols:
#     train[col] = train[col].map(mapping)
#     test[col] = test[col].map(mapping)

# train.MSZoning = train.MSZoning.astype(str)
# test.MSZoning = test.MSZoning.astype(str)


## <ins> Feature Engineering

In [554]:
# n_rooms = ['BedroomAbvGr','TotRmsAbvGrd']
# n_bathrooms = ['BsmtFullBath', 'BsmtHalfBath', 'FullBath','HalfBath']
# porch_area = [ 'OpenPorchSF','EnclosedPorch', '3SsnPorch', 'ScreenPorch']

# train['n_rooms'] = train[n_rooms].sum(axis=1)
# train['n_bathrooms'] = train.BsmtFullBath + train.BsmtHalfBath * 0.5 + train.FullBath + train.HalfBath * 0.5
# train['porch_area'] = train[porch_area].sum(axis=1)
# train['years_till_remod'] = train.YearBuilt - train.YearRemodAdd 
# train['years_since_remod'] = train.YrSold - train.YearRemodAdd 
# train['age_at_sell'] = train.YrSold - train.YearRemodAdd
# train['LowQualSF'] = train.LowQualFinSF + train.BsmtUnfSF


# test['n_rooms'] = test[n_rooms].sum(axis=1)
# test['n_bathrooms'] = test.BsmtFullBath + test.BsmtHalfBath * 0.5 + test.FullBath + test.HalfBath * 0.5
# test['porch_area'] = test[porch_area].sum(axis=1)
# test['years_till_remod'] = test.YearBuilt - test.YearRemodAdd 
# test['years_since_remod'] = test.YrSold - test.YearRemodAdd 
# test['age_at_sell'] = test.YrSold - test.YearBuilt
# test['LowQualSF'] = test.LowQualFinSF + test.BsmtUnfSF

<ins> Convert y to Log to reduce skew for regression models and create dummies

In [555]:
y = np.log(y)
train_dummies = pd.get_dummies(pd.concat((train.drop(["SalePrice", "Id"], axis=1), test.drop(["Id"], axis=1)), axis=0)).iloc[: train.shape[0]]
test_dummies = pd.get_dummies(pd.concat((train.drop(["SalePrice", "Id"], axis=1), test.drop(["Id"], axis=1)), axis=0)).iloc[train.shape[0]:]


## <ins> Normalizing Data

In [556]:
cols_to_scale = train.select_dtypes(exclude='object').drop(['Id','SalePrice'],axis=1).columns

# create StandardScaler object for train
scaler = StandardScaler()
train_dummies[cols_to_scale] = scaler.fit_transform(train_dummies[cols_to_scale])
# scale the test columns using the scaler object from train
test_dummies[cols_to_scale] = scaler.transform(test_dummies[cols_to_scale])

# <ins> Moving From Basic Model to Ensemble

# <ins> Hypertuning models

<ins> Gradient Boosting

In [557]:
param_grid = {
    'n_estimators': [100,150,200],
    'learning_rate': [0.05,0.01, 0.1],
    'max_depth': [2,4, 5,10],
    'random_state': [42]}
best_GB = GradientBoostingRegressor()

# -------------------------------------------------------
grid_search = GridSearchCV(best_GB, param_grid, cv=5, 
                           scoring='neg_mean_squared_error')
grid_search.fit(train_dummies, y)
best_GB = GradientBoostingRegressor(**grid_search.best_params_)
print(grid_search.best_params_)
print("Best Score: ", -grid_search.best_score_)



{'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 200, 'random_state': 42}
Best Score:  0.013922279403980617


<ins> Ridge Regression

In [558]:

param_grid = {
    'alpha': [0.01,0.1, 1.0, 10.0],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    'random_state': [42]}
best_RR = Ridge()
grid_search = GridSearchCV(best_RR, param_grid, cv=5, 
                           scoring='neg_mean_squared_error')
grid_search.fit(train_dummies, y)
best_RR = Ridge(**grid_search.best_params_)
print(grid_search.best_params_)
print("Best Score: ", -grid_search.best_score_)


{'alpha': 10.0, 'random_state': 42, 'solver': 'lsqr'}
Best Score:  0.012903122225878546


<ins> Elastic Network

In [559]:
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 0.01],
    'l1_ratio': [0.1, 0.5, 0.7, 0.9, 1.0,0.1,0.05],
    'random_state': [42]
}

best_EN = ElasticNet()

grid_search = GridSearchCV(best_EN, param_grid, cv=5, 
                           scoring='neg_mean_squared_error')

grid_search.fit(train_dummies, y)

best_EN = ElasticNet(**grid_search.best_params_)
print(grid_search.best_params_)
print("Best Score: ", -grid_search.best_score_)



{'alpha': 0.01, 'l1_ratio': 0.05, 'random_state': 42}
Best Score:  0.012631858699742126


<ins> RandomForest

In [560]:

param_grid = {
    'n_estimators': [100,150,200],
    'max_depth': [2, 5, 10, 15,20],
    'max_features': [10,20,30,'auto'],
    'random_state': [42]
}


best_RF = RandomForestRegressor()

grid_search = GridSearchCV(best_RF, param_grid, cv=5, 
                           scoring='neg_mean_squared_error')

grid_search.fit(train_dummies, y)

best_RF = RandomForestRegressor(**grid_search.best_params_)
print(grid_search.best_params_)
print("Best Score: ", -grid_search.best_score_)



{'max_depth': 15, 'max_features': 30, 'n_estimators': 150, 'random_state': 42}
Best Score:  0.01782439590799286


<ins> Lasso

In [561]:
param_grid = [{'alpha': [0.1,0.01,0.5,0.05,0.005,0.0005], 'random_state':[42]}]
best_lasso = Lasso()
grid_search = GridSearchCV(best_lasso, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(train_dummies, y)
best_lasso = Lasso(**grid_search.best_params_)
print(grid_search.best_params_)
print("Best Score: ", -grid_search.best_score_)


{'alpha': 0.0005, 'random_state': 42}
Best Score:  0.012166685090905991


### <ins> Create best models (so i dont run gridsearch each time) 

In [562]:
# best_RF =  RandomForestRegressor(**{'max_depth': 15, 'max_features': 30, 'n_estimators': 200, 'random_state': 42})
# best_EN = ElasticNet(**{'alpha': 0.01, 'l1_ratio': 0.05, 'random_state': 42})
# best_RR = Ridge(**{'alpha': 10.0, 'max_iter': 1000, 'random_state': 42, 'solver': 'sparse_cg'})
# # best_GB = GradientBoostingRegressor(**{'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 200, 'random_state': 42})
# best_lasso = Lasso(**{'alpha': 0.0005, 'random_state': 42})

# <ins> Fit Best Models

In [563]:
best_EN.fit(train_dummies,y)
best_GB.fit(train_dummies,y)
best_lasso.fit(train_dummies,y)
best_RR.fit(train_dummies,y)
best_RF.fit(train_dummies,y)

RandomForestRegressor(max_depth=15, max_features=30, n_estimators=150,
                      random_state=42)

# <ins> Predict

In [564]:
EN_pred = np.exp(best_EN.predict(test_dummies))
GB_pred = np.exp(best_GB.predict(test_dummies))
RR_pred = np.exp(best_RR.predict(test_dummies))
lasso_pred = np.exp(best_lasso.predict(test_dummies))
RF_pred = np.exp(best_RF.predict(test_dummies))
final_pred = (EN_pred +  + lasso_pred + +RR_pred+GB_pred)/5

# <ins> Save Submission

In [565]:
output = pd.DataFrame({'Id': test_ID,
                       'SalePrice': final_pred})

output.to_csv('all_models_normalized_OrdinalEncoding.csv', index=False)