# 1. Business Objective

This notebook is for the Kaggle competition hosted by Aston University. In this competition, I came in first place. The competition's results can be found here: https://www.kaggle.com/competitions/bnm861-2022/leaderboard

# 2. Setting up Environment

In [None]:
# import the required libraries

# base libraries
import os

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#statistics package
from scipy import stats
from scipy.stats import norm, skew

# Data manipulation
import numpy as np
import pandas as pd

# Data Pre-processing and transformation
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from functools import reduce

# do not print warnings
import warnings
warnings.filterwarnings("ignore")

# 3. Data Loading


In [None]:
# Read the training set and the Kaggle test set from the working directory.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ("CW_training_final.csv", "CW_testing_kaggle.csv")
)

In [None]:
# Keep the 'id' columns for the submission files.
# pop() returns the column and removes it from the frame in one step.
train_ID = train.pop('id')
test_ID = test.pop('id')

# 4. Exploratory Data Analysis

## 4.1 Numerical Data Exploration

In [None]:
# Histograms of the numerical variables, leaving out the columns listed below.
numerical_vis = train.drop(
    columns=['YearBuilt', 'MoSold', 'YrSold', 'SalePrice', 'Fireplaces', 'YearRemodAdd']
)
numerical_vis.hist(histtype='bar', bins=60, figsize=(30, 30))

## 4.2 Categorical Data Exploration

In [None]:
# Bar charts of the level counts for the first group of categorical variables,
# one panel per variable.
categorical_features = ["MSZoning", "Street", "Alley", "LotShape",
                        "LandContour", "Utilities", "LotConfig", "LandSlope"]
fig, ax = plt.subplots(1, len(categorical_features))
for i, categorical_feature in enumerate(train[categorical_features]):
    level_counts = train[categorical_feature].value_counts()
    panel = level_counts.plot(kind='bar', figsize=(25, 4), ax=ax[i])
    panel.set_title(categorical_feature)
fig.show()

In [None]:
# Level counts for the exterior, condition, building and roof variables.
categorical_features = ["Exterior2nd", "Condition1", "Condition2", "BldgType",
                        "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st"]
fig, ax = plt.subplots(1, len(categorical_features))
for i, categorical_feature in enumerate(train[categorical_features]):
    level_counts = train[categorical_feature].value_counts()
    panel = level_counts.plot(kind='bar', figsize=(25, 4), ax=ax[i])
    panel.set_title(categorical_feature)
fig.show()

In [None]:
# Number of rows per neighbourhood.
neighborhood_sizes = train.groupby("Neighborhood").size()
neighborhood_sizes.plot(kind='bar')

In [None]:
# Level counts for the masonry, exterior quality, foundation and basement variables.
categorical_features = ["MasVnrType", "ExterQual", "ExterCond", "Foundation",
                        "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1"]
fig, ax = plt.subplots(1, len(categorical_features))
for i, categorical_feature in enumerate(train[categorical_features]):
    level_counts = train[categorical_feature].value_counts()
    panel = level_counts.plot(kind='bar', figsize=(25, 4), ax=ax[i])
    panel.set_title(categorical_feature)
fig.show()

In [None]:
# Level counts for the heating, electrical, kitchen, functional and fireplace variables.
categorical_features = ["BsmtFinType2", "Heating", "HeatingQC", "CentralAir",
                        "Electrical", "KitchenQual", "Functional", "FireplaceQu"]
fig, ax = plt.subplots(1, len(categorical_features))
for i, categorical_feature in enumerate(train[categorical_features]):
    level_counts = train[categorical_feature].value_counts()
    panel = level_counts.plot(kind='bar', figsize=(25, 4), ax=ax[i])
    panel.set_title(categorical_feature)
fig.show()

In [None]:
# Level counts for the garage, driveway, pool, fence, misc. and sale variables.
categorical_features = ["GarageType", "GarageFinish", "GarageQual", "GarageCond",
                        "PavedDrive", "PoolQC", "Fence", "MiscFeature",
                        "SaleType", "SaleCondition"]
fig, ax = plt.subplots(1, len(categorical_features))
for i, categorical_feature in enumerate(train[categorical_features]):
    level_counts = train[categorical_feature].value_counts()
    panel = level_counts.plot(kind='bar', figsize=(25, 4), ax=ax[i])
    panel.set_title(categorical_feature)
fig.show()

In [None]:
#Check the data
train.info()

In [None]:
test.info()

# 5. Data Cleaning and Transformation

## 5.1 Feature Engineering

In [None]:
# Derived features for the training set.

# Differences between the sale year and the build / remodel / garage-build years.
for age_col, year_col in [('Age Sold', 'YearBuilt'),
                          ('Age Sold Renovation', 'YearRemodAdd'),
                          ('Age Sold Garage', 'GarageYrBlt')]:
    train[age_col] = train['YrSold'] - train[year_col]

# Above-ground living area divided by the number of rooms, bathrooms and kitchens.
room_count = (train["TotRmsAbvGrd"] + train["FullBath"]
              + train["HalfBath"] + train["KitchenAbvGr"])
train["SqFtPerRoom"] = train["GrLivArea"] / room_count

# All bathrooms (including basement ones) and the combined basement + floor area.
train['Total_Bathrooms'] = (train['FullBath'] + train['HalfBath']
                            + train['BsmtFullBath'] + train['BsmtHalfBath'])
train['Total Area'] = train['TotalBsmtSF'] + train['1stFlrSF'] + train['2ndFlrSF']

In [None]:
# Derived features for the test set (same definitions as for the training set).

# Differences between the sale year and the build / remodel / garage-build years.
for age_col, year_col in [('Age Sold', 'YearBuilt'),
                          ('Age Sold Renovation', 'YearRemodAdd'),
                          ('Age Sold Garage', 'GarageYrBlt')]:
    test[age_col] = test['YrSold'] - test[year_col]

# Above-ground living area divided by the number of rooms, bathrooms and kitchens.
room_count = (test["TotRmsAbvGrd"] + test["FullBath"]
              + test["HalfBath"] + test["KitchenAbvGr"])
test["SqFtPerRoom"] = test["GrLivArea"] / room_count

# All bathrooms (including basement ones) and the combined basement + floor area.
test['Total_Bathrooms'] = (test['FullBath'] + test['HalfBath']
                           + test['BsmtFullBath'] + test['BsmtHalfBath'])
test['Total Area'] = test['TotalBsmtSF'] + test['1stFlrSF'] + test['2ndFlrSF']

## 5.2 Handle Missing Values

### 5.2.1 Numerical Missing Values

Since Lot Frontage has a lot of missing values, I'll build a regression model to predict the missing values in Lot Frontage.

In [None]:
# Use a supervised model to predict the missing LotFrontage values
# (the CatBoost regressor is fitted in the next cell).

# Only numerical variables are used as predictors, and SalePrice is excluded.
# `columns=` is used because pandas 2.0 no longer accepts `axis` positionally
# (the original `train.drop('SalePrice', 1)` raises TypeError there).
train1 = train.drop(columns='SalePrice')
train1 = train1.select_dtypes(include=['float64','int64'])

# Rows whose LotFrontage is missing are the ones to be predicted.
test_lotfrontage = train1[train1['LotFrontage'].isnull()]

# Fit only on rows without any missing numerical value.
# Reassigning avoids mutating a derived frame in place.
train1 = train1.dropna()

# The observed LotFrontage values are the regression target ...
y_train_lotfrontage = train1['LotFrontage']

# ... and every other numerical column is a predictor.
x_train_lotfrontage = train1.drop('LotFrontage', axis=1)

# Predictors for the rows whose LotFrontage has to be filled in.
x_test_lotfrontage = test_lotfrontage.drop('LotFrontage', axis=1)

In [None]:
# Tune a CatBoost regressor that predicts LotFrontage from the other numerical columns.
from sklearn.pipeline import Pipeline
from tune_sklearn import TuneGridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor

# NOTE(review): no random seed is set on the regressor, so reruns may differ.
model = CatBoostRegressor()

# Hyperparameter grid: 3 depths x 3 learning rates x 3 iteration counts.
parameters = {'depth' : [6,8,10],
              'learning_rate' : [0.01, 0.05, 0.1],
              'iterations'    : [30, 50, 80]
              }

# Pipeline: robust scaling of the predictors, then a 10-fold grid search over the grid above.
# `grid` is reused by the following cell to predict the missing values.
grid = make_pipeline(RobustScaler(),TuneGridSearchCV(estimator=model, param_grid = parameters, cv = 10, n_jobs=-1,refit=True))
grid.fit(x_train_lotfrontage, y_train_lotfrontage)

In [None]:
# Predict LotFrontage for the training rows where it is missing and write the
# predictions back; the predictor rows were selected with the same null mask,
# so predictions and target rows are in the same order.
lf_pred_train = grid.predict(x_test_lotfrontage)
train.loc[train.LotFrontage.isnull(),'LotFrontage']=lf_pred_train

In [None]:
# Repeat the LotFrontage imputation set-up on the test set.

# Only numerical variables are used as predictors.
test1 = test.select_dtypes(include=['float64','int64'])

# Test rows whose LotFrontage is missing are the ones to be predicted.
lotfrontage_missing_te = test1['LotFrontage'].isnull()
test_lotfrontage_te = test1[lotfrontage_missing_te]

# Fit only on test rows without any missing numerical value.
test1 = test1.dropna()

# Observed LotFrontage values are the target; the remaining columns are predictors.
y_train_lotfrontage_te = test1['LotFrontage']
x_train_lotfrontage_te = test1.drop(columns='LotFrontage')

# Predictors for the test rows whose LotFrontage has to be filled in.
x_test_lotfrontage_te = test_lotfrontage_te.drop(columns='LotFrontage')

In [None]:
# Same tuning set-up as for the training set, now fitted on the test-set rows
# that have an observed LotFrontage.
from sklearn.pipeline import Pipeline
from tune_sklearn import TuneGridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor

# NOTE(review): no random seed is set on the regressor, so reruns may differ.
model = CatBoostRegressor()

# Hyperparameter grid: 3 depths x 3 learning rates x 3 iteration counts.
parameters = {'depth' : [6,8,10],
              'learning_rate' : [0.01, 0.05, 0.1],
              'iterations'    : [30, 50, 80]
              }

# This rebinds `grid`, replacing the pipeline fitted on the training set.
grid = make_pipeline(RobustScaler(),TuneGridSearchCV(estimator=model, param_grid = parameters, cv = 10, n_jobs=-1,refit=True))
grid.fit(x_train_lotfrontage_te, y_train_lotfrontage_te)

In [None]:
# Predict LotFrontage for the test rows where it is missing and write the
# predictions back into the test set.
lf_pred_test = grid.predict(x_test_lotfrontage_te)
test.loc[test.LotFrontage.isnull(),'LotFrontage']=lf_pred_test

Since the other numerical variables do not have too many missing values, I'll impute the median value of each column for all the NaNs.

In [None]:
# drop labels for training set, but keep all others
Xtrain = train.drop("SalePrice", axis=1)
# NOTE: Xtest is a second name for the `test` frame, not a copy of it.
Xtest = test


# the regression target, copied so it is independent of `train`
ytrain = train["SalePrice"].copy()


In [None]:
# import the SimpleImputer class and instantiate the object
from sklearn.impute import SimpleImputer

# instantiate imputer object with median
imputer = SimpleImputer(strategy ='median')

def imputefunc(Xtrain, Xtest):
    """Median-impute the numerical columns of a train/test pair.

    The module-level ``imputer`` is fitted on the float64/int64 columns of
    ``Xtrain`` and then applied to the same columns of ``Xtest``. Object
    columns are passed through unchanged. Columns of any other dtype are not
    carried over.

    NOTE: a later cell defines another function with this same name, which
    replaces this one once that cell has run.

    Parameters
    ----------
    Xtrain, Xtest : pandas.DataFrame
        Feature frames; neither input is modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New train and test frames with a fresh 0..n-1 index: the imputed
        numerical columns first, followed by the object columns.
    """
    numeric_dtypes = ['float64', 'int64']

    # separate features into numerical and categorical sets
    x_train_num = Xtrain.select_dtypes(include=numeric_dtypes)
    x_test_num = Xtest.select_dtypes(include=numeric_dtypes)

    # The imputed frames below get a default RangeIndex, so the categorical
    # parts are given the same index. reset_index(drop=True) does this in one
    # step and, unlike reset_index() followed by drop('index'), also works
    # when the index is named or a column called 'index' already exists.
    x_train_cat = Xtrain.select_dtypes(include=['object']).reset_index(drop=True)
    x_test_cat = Xtest.select_dtypes(include=['object']).reset_index(drop=True)

    # fit the imputer on the training set only, then transform both sets
    train_imputed = pd.DataFrame(imputer.fit_transform(x_train_num),
                                 columns=x_train_num.columns)
    test_imputed = pd.DataFrame(imputer.transform(x_test_num),
                                columns=x_test_num.columns)

    # join the numerical and categorical parts back together
    return (pd.concat([train_imputed, x_train_cat], axis=1),
            pd.concat([test_imputed, x_test_cat], axis=1))


Xtrain, Xtest = imputefunc(Xtrain, Xtest)


In this dataset, some houses do not have certain features, so the corresponding categorical values were left blank. I will fill these NaNs with descriptive string values.

In [None]:
# Fill the NA with a descriptive string for the trainset: in these columns a
# missing value means the house does not have the feature.
absent_feature_labels = {
    'PoolQC': "No Pool",
    'Alley': "No Alley Access",
    'FireplaceQu': "No Fireplace",
    'Fence': "No Fence",
    'MiscFeature': "No MiscFeature",
}
# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# a column selection acts on a temporary object under pandas Copy-on-Write and
# then leaves Xtrain unchanged (the warning is silenced in this notebook).
for column, label in absent_feature_labels.items():
    Xtrain[column] = Xtrain[column].fillna(label)


In [None]:
# Fill the NA with a descriptive string for the testset: in these columns a
# missing value means the house does not have the feature.
absent_feature_labels = {
    'PoolQC': "No Pool",
    'Alley': "No Alley Access",
    'FireplaceQu': "No Fireplace",
    'Fence': "No Fence",
    'MiscFeature': "No MiscFeature",
}
# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# a column selection acts on a temporary object under pandas Copy-on-Write and
# then leaves Xtest unchanged (the warning is silenced in this notebook).
for column, label in absent_feature_labels.items():
    Xtest[column] = Xtest[column].fillna(label)

The remaining missing categorical values will be imputed with the mode (most frequent value) of each column.

In [None]:
# import the SimpleImputer class and instantiate the object
from sklearn.impute import SimpleImputer

# instantiate imputer object with the most frequent value (mode)
imputer = SimpleImputer(strategy ='most_frequent')

# define a function that imputes missing values on a given train,test dataset pair.
def imputefunc(Xtrain, Xtest):
    """Mode-impute the categorical (object) columns of a train/test pair.

    The module-level ``imputer`` is fitted on the object columns of ``Xtrain``
    and then applied to the same columns of ``Xtest``. float64/int64 columns
    are passed through unchanged. Columns of any other dtype are not carried
    over.

    NOTE: this definition replaces the earlier function of the same name.

    Parameters
    ----------
    Xtrain, Xtest : pandas.DataFrame
        Feature frames; neither input is modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New train and test frames with a fresh 0..n-1 index: the imputed
        categorical columns first, followed by the numerical columns.
    """
    # separate features into numerical and categorical sets
    Xtrain_cat = Xtrain.select_dtypes(include=['object'])
    Xtest_cat = Xtest.select_dtypes(include=['object'])

    # The imputed frames below get a default RangeIndex, so the numerical
    # parts are given the same index. reset_index(drop=True) does this in one
    # step and, unlike reset_index() followed by drop('index'), also works
    # when the index is named or a column called 'index' already exists.
    numeric_dtypes = ['float64', 'int64']
    Xtrain_num = Xtrain.select_dtypes(include=numeric_dtypes).reset_index(drop=True)
    Xtest_num = Xtest.select_dtypes(include=numeric_dtypes).reset_index(drop=True)

    # fit the imputer on the training set only, then transform both sets
    train_imputed = pd.DataFrame(imputer.fit_transform(Xtrain_cat),
                                 columns=Xtrain_cat.columns)
    test_imputed = pd.DataFrame(imputer.transform(Xtest_cat),
                                columns=Xtest_cat.columns)

    # join the categorical and numerical parts back together
    return (pd.concat([train_imputed, Xtrain_num], axis=1),
            pd.concat([test_imputed, Xtest_num], axis=1))


Xtrain, Xtest = imputefunc(Xtrain, Xtest)

## 5.3 Encoding Dummy Variables

In [None]:
# import the onehotencoder class to implement encoding
from sklearn.preprocessing import OneHotEncoder


# set aside the categorical columns as a list object 
catcols = Xtrain.select_dtypes(['object']).columns.tolist()

# define the get dummies function to return encoded train and test sets
def get_dummies(Xtrain, Xtest, old_col_name):
    """Add one-hot dummy columns for one categorical column to both sets.

    An encoder is fitted on ``Xtrain[old_col_name]`` only and then applied to
    ``Xtest[old_col_name]``. The first category (in the encoder's category
    order) is the reference level and gets no dummy column. Every other
    category ``c`` becomes a column named ``"<old_col_name>_<c>"``. The
    prefix keeps categories that occur in several features (e.g. the same
    quality grade) from overwriting each other's dummy columns.

    Categories seen only in the testset are ignored by the encoder
    (``handle_unknown='ignore'``), so such rows get 0 in every dummy column.

    Parameters
    ----------
    Xtrain, Xtest : pandas.DataFrame
        Both are modified in place (columns are added); the original
        categorical column is left untouched.
    old_col_name : str
        Name of the categorical column to encode.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The same ``Xtrain`` and ``Xtest`` objects, for convenience.
    """
    # `sparse` was renamed to `sparse_output` in newer scikit-learn releases;
    # fall back to the old keyword when the new one is not accepted.
    try:
        one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

    # the input to the encoder must be a 2-d array, so reshape the column values
    train_vals = Xtrain[old_col_name].values.reshape(-1, 1)
    test_vals = Xtest[old_col_name].values.reshape(-1, 1)

    # fit on the trainset only, then encode both sets with the fitted encoder
    train_encoded = one_hot_encoder.fit_transform(train_vals)
    test_encoded = one_hot_encoder.transform(test_vals)

    categories = one_hot_encoder.categories_[0].tolist()

    # Column k of the encoded arrays belongs to categories[k]. Start at 1 so
    # that the reference level is skipped and every dummy name stays paired
    # with its own column of data.
    for position, category in enumerate(categories[1:], start=1):
        new_col_name = f"{old_col_name}_{category}"
        Xtrain[new_col_name] = train_encoded[:, position]
        Xtest[new_col_name] = test_encoded[:, position]

    return Xtrain, Xtest


for col_name in catcols:
    Xtrain, Xtest = get_dummies(Xtrain, Xtest, col_name)

# check if the dummies are produced correctly in the trainset
Xtrain.head()


In [None]:
# with our dummy variables in place, we no longer need old columns
def dropfunc(data, column_list):
    """Remove, in place, every column of ``data`` whose label is in ``column_list``.

    Labels in ``column_list`` that are not columns of ``data`` are ignored.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify in place.
    column_list : container of labels
        Labels of the columns to remove.

    Returns
    -------
    None
    """
    # Collect the labels first instead of deleting while iterating over the
    # frame; unique() keeps a repeated column label from being deleted twice,
    # which would raise KeyError on the second attempt.
    doomed = [label for label in data.columns.unique() if label in column_list]
    data.drop(columns=doomed, inplace=True)

# implement the above function on the categorical columns list previously defined
dropfunc(Xtrain, catcols)
dropfunc(Xtest, catcols)

Xtrain.head()

## 5.4 Feature scaling

In [None]:
#trainset
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# set the year/month columns aside so that they are not standardised
Xtrain_YearBuilt = Xtrain["YearBuilt"].values
Xtrain_YearRemodAdd = Xtrain["YearRemodAdd"].values
Xtrain_GarageYrBlt = Xtrain["GarageYrBlt"].values
Xtrain_YrSold = Xtrain["YrSold"].values
Xtrain_MoSold = Xtrain["MoSold"].values
Xtrain_new = Xtrain.drop(columns=["YearBuilt", "YearRemodAdd", "GarageYrBlt", "YrSold","MoSold"], axis=1)

# fit the scaler on the remaining training columns and standardise them
scaled_vals = scaler.fit_transform(Xtrain_new)
Xtrain = pd.DataFrame(scaled_vals, columns=Xtrain_new.columns)

# put the unscaled year/month columns back in (they end up as the last columns)
Xtrain["YearBuilt"] = Xtrain_YearBuilt
Xtrain["YearRemodAdd"] = Xtrain_YearRemodAdd
Xtrain["GarageYrBlt"] = Xtrain_GarageYrBlt
Xtrain["YrSold"] = Xtrain_YrSold
Xtrain["MoSold"] = Xtrain_MoSold
# inspect the data
Xtrain.head()

In [None]:
#testset
from sklearn.preprocessing import StandardScaler

# The testset is standardised with the scaler that was fitted on the trainset
# in the previous cell (`scaler`). Fitting a second scaler on the testset
# would use different means and variances than the ones the models are
# trained on.

# set the year/month columns aside so that they are not standardised
Xtest_YearBuilt = Xtest["YearBuilt"].values
Xtest_YearRemodAdd = Xtest["YearRemodAdd"].values
Xtest_GarageYrBlt = Xtest["GarageYrBlt"].values
Xtest_YrSold = Xtest["YrSold"].values
Xtest_MoSold = Xtest["MoSold"].values
Xtest_new = Xtest.drop(columns=["YearBuilt", "YearRemodAdd", "GarageYrBlt", "YrSold","MoSold"])

# put the columns in the order of the training matrix the scaler was fitted on
Xtest_new = Xtest_new[Xtrain_new.columns]

# transform only: the scaler must not be re-fitted on the testset
scaled_vals = scaler.transform(Xtest_new)
Xtest = pd.DataFrame(scaled_vals, columns=Xtest_new.columns)

# put the unscaled year/month columns back in (they end up as the last columns)
Xtest["YearBuilt"] = Xtest_YearBuilt
Xtest["YearRemodAdd"] = Xtest_YearRemodAdd
Xtest["GarageYrBlt"] = Xtest_GarageYrBlt
Xtest["YrSold"] = Xtest_YrSold
Xtest["MoSold"] = Xtest_MoSold
# inspect the data
Xtest.head()

# 6. Modeling

1. Random Forest
2. SVR
3. Adaboost
4. Decision Tree
5. Catboost
6. Elastic Net
7. Ridge
8. Xgboost
9. Lightgbm
10. Stacking method and blending

## 6.1 Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor

# specify the hyperparameters and their values
# (only the number of trees varies; max_depth is left at None)
param_grid = [
    {'n_estimators': [30, 50, 100, 150, 200], 'max_depth': [None]},
]

# fixed seed so the forest gives the same result on every run
forest_reg = RandomForestRegressor(random_state=2022)

# we'll use 10-fold cross-validation
# the score is the negated RMSE, so it has to be sign-flipped when reported
rf_grid_search = GridSearchCV(forest_reg, param_grid, cv=10, 
                              scoring='neg_root_mean_squared_error',
                              return_train_score=True, verbose=2)

rf_grid_search.fit(Xtrain, ytrain)

In [None]:
# the best model
rf_grid_search.best_estimator_

In [None]:
# the best model's RMSE
-rf_grid_search.best_score_

In [None]:
# The search was scored with 'neg_root_mean_squared_error', so best_score_ is
# already the negated RMSE: flip the sign only. Taking a square root as well
# would report the root of the RMSE.
rf_rmse_score = -rf_grid_search.best_score_
print(f'The best Random Forest model has a RMSE of: {rf_rmse_score}')

In [None]:
yhat = rf_grid_search.predict(Xtest)

In [None]:
# Write the Random Forest predictions in the submission format (id, Predicted).
sub = pd.DataFrame({'id': test_ID, 'Predicted': yhat})
sub.to_csv('submission_rf.csv', index=False)

In [None]:
#looking at the 20 most important features
# The forest was fitted on Xtrain, so its columns label the importances
# (the previously used name `x_train` is not defined in this notebook).
feature_scores = pd.Series(rf_grid_search.best_estimator_.feature_importances_, index=Xtrain.columns)
feature_scores.nlargest(20).plot(kind='barh')
plt.show()

In [None]:
feature_scores.nlargest(20)

In [None]:
# Keep only the features listed below; the ones with very low importance
# scores are left out. The same list is applied to the train and test sets.
selected_features = [
    'OverallQual', 'FullBath', 'GarageCars', 'TotRmsAbvGrd', 'LotArea',
    'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', '1stFlrSF', 'BsmtFinSF1',
    'ScreenPorch', 'GarageArea', 'YearBuilt', '2ndFlrSF', 'BsmtFinSF2',
    'MasVnrArea', 'YearRemodAdd', 'OverallCond', 'WoodDeckSF',
]
x1_train = Xtrain[selected_features]
x1_test = Xtest[selected_features]


In [None]:
#run random forest again, this time on the reduced feature set x1_train
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor

# specify the hyperparameters and their values
param_grid = [
    {'n_estimators': [30, 50, 100, 150, 200], 'max_depth': [None]},
]

forest_reg = RandomForestRegressor(random_state=2022)

# we'll use 10-fold cross-validation
# NOTE: this rebinds rf_grid_search, replacing the search fitted on all features
rf_grid_search = GridSearchCV(forest_reg, param_grid, cv=10, 
                              scoring='neg_root_mean_squared_error',
                              return_train_score=True, verbose=2)

rf_grid_search.fit(x1_train, ytrain)

In [None]:
# the best model

rf_grid_search.best_estimator_

In [None]:
# the best model's RMSE

-rf_grid_search.best_score_

The score slightly decreases. 

## 6.2 Support Vector Machine

Testing between using the original dataset and the dataset that has some columns removed, I found that the new dataset generates a much lower score.

In [None]:
from sklearn.svm import SVR
# grid over the regularisation strength C and the RBF kernel coefficient gamma
svr_param_grid = [
 {'C': [1.0, 10, 100,10000],
 'gamma': ["scale", "auto", 0.01, 0.1, 1, 3, 5, 10]
 },
]
svr = SVR(kernel="rbf")

#10 fold cross_validation and access to train score for later
sv_grid_search = GridSearchCV(svr, svr_param_grid, cv=10, scoring='neg_root_mean_squared_error',
 return_train_score=True, verbose=2)
# run the search on the reduced feature set (x1_train)
sv_grid_search.fit(x1_train, ytrain)

In [None]:
#Best svr model
best_sv = sv_grid_search.best_estimator_
best_sv

In [None]:
# the best model's RMSE
-sv_grid_search.best_score_

In [None]:
svr_yhat = best_sv.predict(x1_test)

In [None]:
# Write the SVR predictions in the submission format (id, Predicted).
sub = pd.DataFrame({'id': test_ID, 'Predicted': svr_yhat})
sub.to_csv('submission.csv', index=False)

## 6.3 Adaboost

In [None]:
#adaboost
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import AdaBoostRegressor

# candidate values sampled by the randomised search
adb_param_grid = {'n_estimators': [3, 10, 20, 50],
                  'learning_rate': [0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1],
                  'loss' : ['linear', 'square', 'exponential']}

# A single, seeded estimator. The extra unseeded AdaBoostRegressor() that used
# to be created first was overwritten before use and has been removed.
adb_reg = AdaBoostRegressor(random_state=2022)

# we'll use 10-fold cross-validation and want to have access to the train score;
# n_iter=10 parameter combinations are sampled from the grid
adb_random_grid_search = RandomizedSearchCV(adb_reg, adb_param_grid, cv=10, n_iter=10,
 scoring='neg_root_mean_squared_error', random_state=2022, return_train_score=True)
# run the search on the training dataset
adb_random_grid_search.fit(Xtrain, ytrain)

In [None]:
 # the best model
best_adb = adb_random_grid_search.best_estimator_
best_adb

In [None]:
# the best model's RMSE
-adb_random_grid_search.best_score_

## 6.4 Decision Tree

In [None]:
#decision tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
#We will search for the best hyperparameters for the decision trees, using GridSearch
# and thus cross-validation. We give here several combinations for the hyperparameters to compare.
dt_param_grid= {'min_samples_split': [2, 3, 4, 5], 'max_depth': [2, 4, 6, 8, None]}

#max_depth: Maximum number of levels in tree
# min_samples_split: Minimum number of samples required to split a node
# NOTE(review): no random_state is set on the tree, unlike the other sklearn models here.
Dec_tree_reg = DecisionTreeRegressor()

#Cross-validation with 10 splits
# we also want it to return the train score later
dt_grid_search = GridSearchCV(Dec_tree_reg, dt_param_grid, cv=10,
 scoring='neg_root_mean_squared_error', return_train_score=True)
#Run the search on the training data
dt_grid_search.fit(Xtrain, ytrain)

In [None]:
# details on the best model for the decision tree algorithm
best_dt=dt_grid_search.best_estimator_
best_dt

In [None]:
# the best model's RMSE
-dt_grid_search.best_score_

## 6.5 Catboost

In [None]:
from sklearn.model_selection import GridSearchCV
from tune_sklearn import TuneGridSearchCV
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV
# NOTE(review): no random seed is set on the regressor, so reruns may differ.
model = CatBoostRegressor()
# grid: 3 depths x 3 learning rates x 3 iteration counts
parameters = {'depth' : [6,8,10],
              'learning_rate' : [0.02,0.05,0.1],
              'iterations'    : [70,80,500],
              }

# 10-fold grid search scored by negated RMSE.
# NOTE: `catboost` here names the fitted search object, not the catboost package.
catboost = TuneGridSearchCV(estimator=model, scoring="neg_root_mean_squared_error", param_grid = parameters, cv = 10, n_jobs=-1,refit=True)
catboost.fit(Xtrain, ytrain)

In [None]:
#the best model
# Use the fitted-attribute name `best_estimator_` (trailing underscore), the
# same attribute that is read from the second CatBoost search further down.
best_cb = catboost.best_estimator_

In [None]:
# the best model's RMSE
-catboost.best_score_

In [None]:
#check the important features for catboost
feature_scores = pd.Series(best_cb.feature_importances_, index=Xtrain.columns)
feature_scores.nlargest(40)

In [None]:
#predict
cat_yhat = catboost.predict(Xtest)

In [None]:
# Write the CatBoost predictions in the submission format (id, Predicted).
sub = pd.DataFrame({'id': test_ID, 'Predicted': cat_yhat})
sub.to_csv('submission_catboost.csv', index=False)

In [None]:
#test catboost when removing an unimportant feature
x2_train =Xtrain.drop(columns=['MasVnrArea'])
x2_test =Xtest.drop(columns=['MasVnrArea'])

In [None]:
# Same CatBoost search as before, fitted on the frame without 'MasVnrArea'.
# TuneGridSearchCV is available from the import in the earlier CatBoost cell.
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
model = CatBoostRegressor()
parameters = {'depth' : [6,8,10],
              'learning_rate' : [0.02,0.05,0.1],
              'iterations'    : [70,80,500]
              }

# NOTE: this rebinds `grid`, which earlier held the LotFrontage pipeline.
grid = TuneGridSearchCV(estimator=model, scoring="neg_root_mean_squared_error", param_grid = parameters, cv = 10, n_jobs=-1)
grid.fit(x2_train, ytrain)

In [None]:
#the best score
-grid.best_score_

In [None]:
best_cb = grid.best_estimator_
best_cb 
cat1_yhat = best_cb.predict(x2_test) 

In [None]:
# Write the predictions of the second CatBoost model (trained without
# 'MasVnrArea'). These are in cat1_yhat; cat_yhat holds the predictions of the
# first model, which are already saved in submission_catboost.csv.
sub = pd.DataFrame()
sub['id'] = test_ID
sub['Predicted'] = cat1_yhat
sub.to_csv('submission_catboost2.csv',index=False)

## 6.6 Elastic Net

In [None]:
#elastic net
# tune an elastic net model on the dataset with a cross-validated grid search
from numpy import mean
from numpy import std
from numpy import absolute
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from tune_sklearn import TuneGridSearchCV

eNet = ElasticNet()

# grid: 3 iteration caps x 7 regularisation strengths x 10 L1/L2 mixing ratios (0.0 to 0.9)
# NOTE(review): max_iter values of 1-10 are very small for this solver — confirm they are intended.
parametersGrid = {"max_iter": [1, 5, 10],
                      "alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                      "l1_ratio": np.arange(0.0, 1.0, 0.1)}

# 10-fold grid search scored by negated RMSE
elastic = TuneGridSearchCV(estimator=eNet, scoring="neg_root_mean_squared_error", param_grid = parametersGrid, cv = 10, n_jobs=-1,refit=True)
elastic.fit(Xtrain, ytrain)

In [None]:
#the best score
-elastic.best_score_

In [None]:
ypred = elastic.predict(Xtest)

In [None]:
# Write the Elastic Net predictions in the submission format (id, Predicted).
sub = pd.DataFrame({'id': test_ID, 'Predicted': ypred})
sub.to_csv('submission_Enet.csv', index=False)

## 6.7 Ridge

In [None]:
# Ridge and TuneGridSearchCV are available from the imports in the Elastic Net cell.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.kernel_ridge import KernelRidge
model_ridge = Ridge()
# candidate regularisation strengths
parameters = {'alpha':[0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]}



# 10-fold grid search scored by negated RMSE.
# NOTE: model_ridge is rebound from the plain Ridge estimator to the fitted search object.
model_ridge= TuneGridSearchCV(estimator=model_ridge, scoring="neg_root_mean_squared_error", param_grid = parameters, cv = 10, n_jobs=-1,refit=True).fit(Xtrain, ytrain)


In [None]:
#the best score
-model_ridge.best_score_

In [None]:
# prediction 
model_ridge_pred = model_ridge.predict(Xtest)

In [None]:
# Write the Ridge predictions in the submission format (id, Predicted).
sub = pd.DataFrame({'id': test_ID, 'Predicted': model_ridge_pred})
sub.to_csv('submission_ridge.csv', index=False)

## 6.8 XGBoost

Since xgboost takes a lot of time to run, I'll test the value of the parameters beforehand.

In [None]:
#testing grid search: try min_child_weight on its own before the full run
from xgboost import XGBRegressor
import xgboost as xgb
# 'reg:squarederror' is the current name of the squared-error objective;
# 'reg:linear' is its deprecated alias.
model = xgb.XGBRegressor(objective ='reg:squarederror',tree_method = "hist")
XGBRegressor_search = GridSearchCV(model, {'min_child_weight': [1, 5, 10]}, cv=10, scoring="neg_root_mean_squared_error")
XGBRegressor_search.fit(Xtrain, ytrain)
XGBRegressor_search.best_estimator_

In [None]:
from xgboost import XGBRegressor
import xgboost as xgb
# 'reg:squarederror' is the current name of the squared-error objective;
# 'reg:linear' is its deprecated alias.
model = xgb.XGBRegressor(objective ='reg:squarederror',tree_method = "hist", random_state=2022)
# every parameter has a single value: the values were chosen beforehand,
# so this search only cross-validates that one configuration
parameters = {'max_depth': [5],
              'gamma': [0.5],
              'colsample_bytree': [1.0],
              'subsample': [1.0],
              'min_child_weight': [5]
              }

# NOTE: `xgboost` here names the fitted search object; the package is imported as `xgb`.
xgboost = TuneGridSearchCV(estimator=model,scoring="neg_root_mean_squared_error", param_grid = parameters, cv = 10, n_jobs=-1, refit=True)
xgboost.fit(Xtrain, ytrain)

In [None]:
# The best score
-xgboost.best_score_

## 6.9 Light GBM

In [None]:
#testing grid search: try num_leaves on its own before the full run
import scipy as scipy
import numpy as np
from scipy import stats
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from lightgbm import LGBMRegressor, LGBMClassifier, Booster
import lightgbm
from lightgbm import LGBMRegressor
model = lightgbm.LGBMRegressor()
# 10-fold grid search over three num_leaves values, scored by negated RMSE
lightgbm_search = GridSearchCV(model, {'num_leaves': [6,50,100]}, cv=10, scoring="neg_root_mean_squared_error")
lightgbm_search.fit(Xtrain, ytrain)
lightgbm_search.best_estimator_

In [None]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from lightgbm import LGBMRegressor, LGBMClassifier, Booster
import lightgbm
from lightgbm import LGBMRegressor
#light GBM

model = lightgbm.LGBMRegressor()
# every parameter has a single value, so the search below only
# cross-validates this one configuration
# NOTE(review): 'reg_alpha'/'lambda_l1' and 'reg_lambda'/'lambda_l2' look like
# alias pairs for the same LightGBM settings but are given different values — confirm which should apply.
# NOTE(review): 'drop_rate' and 'skip_drop' appear to be DART-specific while 'boosting' is 'gbdt' — confirm.
parameters = {'num_leaves': [50],
    'reg_alpha': [5],
    'min_data_in_leaf': [30],
    'lambda_l1': [0.7000000000000001],
    'lambda_l2':[0.2],
    'reg_lambda': [0],
    'min_child_weight': [1e-5],
    'boosting':['gbdt'],
    'learning_rate':[0.02],
    'drop_rate':[0.1],
    'subsample':[0.3],
    'extra_trees':[True],
    'skip_drop':[0],
            
        
        
              }



# NOTE: from here on `lightgbm` names the fitted search object and no longer
# the imported lightgbm module.
lightgbm = TuneGridSearchCV(estimator=model, scoring="neg_root_mean_squared_error", param_grid = parameters, cv = 10, n_jobs=-1,refit=True)
lightgbm.fit(Xtrain, ytrain)

In [None]:
#The best score
-lightgbm.best_score_

In [None]:
lgbm_yhat =lightgbm.predict(Xtest)

In [None]:
# Write the LightGBM predictions in the submission format (id, Predicted).
sub = pd.DataFrame({'id': test_ID, 'Predicted': lgbm_yhat})
sub.to_csv('submission_lgbm.csv', index=False)

## 6.10 Stack Regressor

In [None]:
#stack
from mlxtend.regressor import StackingCVRegressor

# Stack the tuned base models, with the tuned CatBoost model as meta-regressor.
# - The previous statement had unbalanced parentheses (SyntaxError) and wrapped
#   the stack in TuneGridSearchCV without any parameter grid; the stack is now
#   built directly.
# - model_ridge, catboost, xgboost and lightgbm are fitted search objects, so
#   their tuned models (best_estimator_) are used. Passing the search objects
#   themselves would repeat every hyperparameter search inside each stacking fold.
# - forest_reg is the plain seeded RandomForestRegressor defined earlier.
stack_gen = StackingCVRegressor(
    regressors=(model_ridge.best_estimator_,
                catboost.best_estimator_,
                xgboost.best_estimator_,
                lightgbm.best_estimator_,
                forest_reg),
    meta_regressor=catboost.best_estimator_,
    use_features_in_secondary=True,
)

In [None]:
stack_gen.fit(np.array(Xtrain), np.array(ytrain))


In [None]:
n_folds = 5

def rmse_cv(model):
    """Return the cross-validated RMSE of ``model`` on the training data.

    Parameters
    ----------
    model : estimator
        The estimator to evaluate; it is the one passed in, not the global
        ``stack_gen``.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold (``n_folds`` values), computed on the
        module-level ``Xtrain`` / ``ytrain``.
    """
    # Pass the KFold object itself as `cv`. Calling .get_n_splits() on it
    # returns the plain integer n_folds, which discards shuffle and random_state.
    kf = KFold(n_folds, shuffle=True, random_state=2022)
    rmse = -cross_val_score(model, Xtrain.values, ytrain,
                            scoring="neg_root_mean_squared_error", cv=kf)
    return rmse
rmse_cv(stack_gen)

In [None]:
#Prediction
y_pred_stack = stack_gen.predict(np.array(Xtest))

In [None]:
# Write the stacked-model predictions in the submission format (id, Predicted).
sub = pd.DataFrame({'id': test_ID, 'Predicted': y_pred_stack})
sub.to_csv('submission_stack.csv', index=False)

In [None]:
#Blending the Ridge and CatBoost submissions and saving the result
sub1 = pd.read_csv("submission_ridge.csv")
sub2 = pd.read_csv("submission_catboost.csv")
sub3 = pd.read_csv("submission_stack.csv")
# The read of "blend_submission2.csv" was dropped: no cell in this notebook
# writes that file and its contents were never used, so a fresh run stopped
# here with FileNotFoundError.

# Both files must list the same ids in the same order for the row-wise blend.
assert sub1['id'].equals(sub2['id']), "submission files list different ids"

# Take the ids from one of the blended files instead of reading the
# unrelated SVR submission just for its id column.
blend = sub1[['id']].copy()
# weighted average: 30% Ridge, 70% CatBoost
blend['Predicted'] = sub1['Predicted']*0.3 + sub2['Predicted']*0.7
blend.to_csv('blend_submission1.csv', index=False )