In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Project-local EDA helpers; each name is imported exactly once.
from utils import plot_box_and_dist, trim_outliers_by_percentile

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, ShuffleSplit, cross_validate

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from xgboost import XGBRegressor, XGBRFRegressor, DMatrix
from catboost import CatBoostRegressor, Pool
from lightgbm import LGBMRegressor

from scipy import stats

import os
# Print the path of every file found under the local 'data' directory tree.
for root, _subdirs, names in os.walk('data'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Apply matplotlib's 'ggplot' style sheet first, then seaborn's 'white' axes style on top of it.
plt.style.use('ggplot')
sns.set_style('white')
# NOTE(review): disabled figure-size override; `pylab` is not imported in the imports cell above.
# pylab.rcParams['figure.figsize'] = 12,8

In [None]:
# Load the train/test CSVs from the local data/ directory (default integer index, no index_col).
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Accumulators filled in by the analysis cells below:
# column names to drop, and row labels to exclude from training.
drop_features = []
drop_rows = set()

# Data analysis
## Find missing numerical data

In [None]:
# Per-column count of missing values in the training frame, keeping only columns
# that have at least one missing value, sorted from most to least missing.
missing_data = (
    train_data.isnull().sum()
    .loc[lambda counts: counts > 0]
    .rename_axis('Columns')
    .reset_index(name='Missing Value Count')
    .sort_values(by='Missing Value Count', ascending=False)
)

print(missing_data)

In [None]:
# Queue these columns for removal (they are actually dropped later, in the feature-engineering section).
# NOTE(review): presumably chosen from the missing-value report above — confirm the selection criteria.
drop_features.extend(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'GarageYrBlt', 'GarageCond', 'BsmtFinType2'])

In [None]:
# Show the non-object (numeric) columns and their dtypes.
train_data.dtypes[train_data.dtypes != 'object']

In [None]:
# Distribution of MSSubClass via the project helper from utils.
plot_box_and_dist(train_data, 'MSSubClass')

In [None]:
# Trim by percentile; the helper presumably returns (labels of trimmed rows, trimmed frame)
# with 0.05 / 0.95 as lower / upper percentile bounds — confirm in utils.
# NOTE(review): MSSubClass is later one-hot encoded as a categorical code, so percentile
# trimming on it may not be meaningful — confirm this is intended.
MSSubClass_drop_ids, MSSubClass_percentile_trimmed_train_data = trim_outliers_by_percentile(train_data, 'MSSubClass', 0.05, 0.95)
drop_rows.update(MSSubClass_drop_ids)
print(drop_rows)

In [None]:
# Re-plot on the trimmed frame to inspect the effect of the trim.
plot_box_and_dist(MSSubClass_percentile_trimmed_train_data, 'MSSubClass')

In [None]:
# Distribution of LotFrontage.
plot_box_and_dist(train_data, 'LotFrontage')

In [None]:
# Asymmetric bounds (0.03 / 0.99); the returned row labels are queued for removal.
LotFrontage_drop_ids, LotFrontage_percentile_trimmed_train_data = trim_outliers_by_percentile(train_data, 'LotFrontage', 0.03, 0.99)

drop_rows.update(LotFrontage_drop_ids)

print(LotFrontage_drop_ids)

In [None]:
# Re-plot on the trimmed frame.
plot_box_and_dist(LotFrontage_percentile_trimmed_train_data, 'LotFrontage')

In [None]:
# Distribution of LotArea.
plot_box_and_dist(train_data, 'LotArea')

In [None]:
# Inspect the rows with very large lots.
train_data.query('LotArea > 100000')

In [None]:
# Relationship between LotArea and the target.
sns.scatterplot(data=train_data, x='LotArea', y='SalePrice')

In [None]:
# Trim with 0.01 / 0.99 bounds and queue the returned row labels for removal.
LotArea_drop_ids, LotArea_trimmed_train_data = trim_outliers_by_percentile(train_data, 'LotArea', 0.01, 0.99)
drop_rows.update(LotArea_drop_ids)

In [None]:
# Re-plot on the trimmed frame.
plot_box_and_dist(LotArea_trimmed_train_data, 'LotArea')

In [None]:
# Distribution of OverallQual and its relationship to the target.
plot_box_and_dist(train_data, 'OverallQual')

In [None]:
sns.scatterplot(data=train_data, x='OverallQual', y='SalePrice')

In [None]:
# Distribution of OverallCond and its relationship to the target.
plot_box_and_dist(train_data, 'OverallCond')

In [None]:
sns.scatterplot(data=train_data, x='OverallCond', y='SalePrice')

In [None]:
# Inspect high-priced rows with OverallCond == 6.
train_data.query('OverallCond == 6 and SalePrice > 500000')

In [None]:
# Hand-picked row identifiers (presumably read off the plot/query above — confirm).
# NOTE(review): drop_rows is later applied with `drop(index=...)` on a frame read with the
# default integer index. If these numbers are `Id` values rather than index labels,
# the wrong rows are removed — confirm which one they are.
OverallCond_drop_ids = [ 379, 692 ]
drop_rows.update(OverallCond_drop_ids)

In [None]:
print(drop_rows)

In [None]:
# Distribution of YearBuilt and its relationship to the target.
plot_box_and_dist(train_data, 'YearBuilt')

In [None]:
sns.scatterplot(data=train_data, x='YearBuilt', y='SalePrice')

In [None]:
# Inspect recent, very expensive houses.
train_data.query('YearBuilt > 1985 and SalePrice > 700000')

In [None]:
# Hand-picked row identifiers (presumably from the plot/query above — confirm);
# repeats across features are harmless because drop_rows is a set.
YearBuilt_drop_ids = [ 186, 692, 1183 ]
drop_rows.update(YearBuilt_drop_ids)

In [None]:
# Distribution of YearRemodAdd and its relationship to the target.
plot_box_and_dist(train_data, 'YearRemodAdd')

In [None]:
sns.scatterplot(data=train_data, x='YearRemodAdd', y='SalePrice')

In [None]:
# Inspect expensive houses remodelled in the 1990s.
train_data.query('YearRemodAdd > 1990 and YearRemodAdd < 2000 and SalePrice > 600000')

In [None]:
# Hand-picked row identifiers (presumably from the plot/query above — confirm).
YearRemodAdd_drop_ids = [ 692, 1183, 1170 ]
drop_rows.update(YearRemodAdd_drop_ids)

In [None]:
# Distribution of MasVnrArea and its relationship to the target.
plot_box_and_dist(train_data, 'MasVnrArea')

In [None]:
sns.scatterplot(data=train_data, x='MasVnrArea', y='SalePrice')

In [None]:
# Trim with 0.01 / 0.997 bounds; the result is inspected before the labels are queued below.
MasVnrArea_drop_ids, MasVnrArea_trimmed_train_data = trim_outliers_by_percentile(train_data, 'MasVnrArea', 0.01, 0.997)

In [None]:
plot_box_and_dist(MasVnrArea_trimmed_train_data, 'MasVnrArea')

In [None]:
sns.scatterplot(data=MasVnrArea_trimmed_train_data, x='MasVnrArea', y='SalePrice')

In [None]:
# Queue the trimmed row labels for removal.
drop_rows.update(MasVnrArea_drop_ids)

In [None]:
print(drop_rows)

In [None]:
# Distribution of BsmtFinSF1 and its relationship to the target.
plot_box_and_dist(train_data, 'BsmtFinSF1')

In [None]:
sns.scatterplot(data=train_data, x='BsmtFinSF1', y='SalePrice')

In [None]:
# Inspect the extreme BsmtFinSF1 rows.
train_data.query('BsmtFinSF1 > 4000')

In [None]:
# Hand-picked row identifier (presumably from the query above — confirm).
BsmtFinSF1_drop_ids = [ 1299 ]
drop_rows.update(BsmtFinSF1_drop_ids)

In [None]:
# Distribution of BsmtFinSF2 and its relationship to the target.
plot_box_and_dist(train_data, 'BsmtFinSF2')

In [None]:
sns.scatterplot(data=train_data, x='BsmtFinSF2', y='SalePrice')

In [None]:
# Inspect the extreme BsmtFinSF2 rows.
train_data.query('BsmtFinSF2 > 1200')

In [None]:
# Hand-picked row identifier (presumably from the query above — confirm).
BsmtFinSF2_drop_ids = [ 323 ]
drop_rows.update(BsmtFinSF2_drop_ids)

In [None]:
# Distribution of BsmtUnfSF and its relationship to the target.
plot_box_and_dist(train_data, 'BsmtUnfSF')

In [None]:
sns.scatterplot(data=train_data, x='BsmtUnfSF', y='SalePrice')

In [None]:
# Inspect expensive houses with a small unfinished basement area.
train_data.query('BsmtUnfSF < 500 and SalePrice > 600000')

In [None]:
# Hand-picked row identifiers (presumably from the query above — confirm).
BsmtUnfSF_drop_ids = [ 692, 1183 ]
drop_rows.update(BsmtUnfSF_drop_ids)

In [None]:
# Distribution of TotalBsmtSF and its relationship to the target.
plot_box_and_dist(train_data, 'TotalBsmtSF')

In [None]:
sns.scatterplot(data=train_data, x='TotalBsmtSF', y='SalePrice')

In [None]:
# Inspect the extreme TotalBsmtSF rows.
train_data.query('TotalBsmtSF > 6000')

In [None]:
# Hand-picked row identifier (presumably from the query above — confirm).
TotalBsmtSF_drop_ids = [ 1299 ]
drop_rows.update(TotalBsmtSF_drop_ids)

In [None]:
# Distribution of 1stFlrSF and its relationship to the target.
plot_box_and_dist(train_data, '1stFlrSF')

In [None]:
sns.scatterplot(data=train_data, x='1stFlrSF', y='SalePrice')

In [None]:
# Backticks are required in query() because the column name starts with a digit.
train_data.query('`1stFlrSF` > 4000')

In [None]:
# Hand-picked row identifier (presumably from the query above — confirm).
FirstFlrSF_drop_ids = [ 1299 ]
drop_rows.update(FirstFlrSF_drop_ids)

In [None]:
# Distribution of 2ndFlrSF and its relationship to the target (no rows queued).
plot_box_and_dist(train_data, '2ndFlrSF')

In [None]:
sns.scatterplot(data=train_data, x='2ndFlrSF', y='SalePrice')

In [None]:
# Distribution of LowQualFinSF and its relationship to the target.
plot_box_and_dist(train_data, 'LowQualFinSF')

In [None]:
sns.scatterplot(data=train_data, x='LowQualFinSF', y='SalePrice')

In [None]:
# Inspect the extreme LowQualFinSF rows.
train_data.query('LowQualFinSF > 550')

In [None]:
# Hand-picked row identifier (presumably from the query above — confirm).
LowQualFinSF_drop_ids = [ 186 ]
drop_rows.update(LowQualFinSF_drop_ids)

In [None]:
# Distribution of GrLivArea and its relationship to the target.
plot_box_and_dist(train_data, 'GrLivArea')

In [None]:
sns.scatterplot(data=train_data, x='GrLivArea', y='SalePrice')

In [None]:
# Inspect the very large houses.
train_data.query('GrLivArea > 4000')

In [None]:
# Hand-picked row identifiers (presumably a subset of the query output above — confirm).
GrLivArea_drop_ids = [ 524, 1299 ]
drop_rows.update(GrLivArea_drop_ids)

In [None]:
# Distribution of BsmtFullBath and its relationship to the target (no rows queued).
plot_box_and_dist(train_data, 'BsmtFullBath')

In [None]:
sns.scatterplot(data=train_data, x='BsmtFullBath', y='SalePrice')

In [None]:
# Distribution of BsmtHalfBath and its relationship to the target.
plot_box_and_dist(train_data, 'BsmtHalfBath')

In [None]:
sns.scatterplot(data=train_data, x='BsmtHalfBath', y='SalePrice')    

In [None]:
# Inspect expensive houses with exactly one basement half bath.
train_data.query('BsmtHalfBath == 1 and SalePrice > 600000')

In [None]:
# Hand-picked row identifier (presumably from the query above — confirm).
BsmtHalfBath_drop_ids = [ 692 ]
drop_rows.update(BsmtHalfBath_drop_ids)

In [None]:
# Count-like features: each gets a distribution plot and a scatter against SalePrice.
# No rows are queued for removal in this run of cells.
plot_box_and_dist(train_data, 'FullBath')

In [None]:
sns.scatterplot(data=train_data, x='FullBath', y='SalePrice')

In [None]:
plot_box_and_dist(train_data, 'HalfBath')

In [None]:
sns.scatterplot(data=train_data, x='HalfBath', y='SalePrice')

In [None]:
plot_box_and_dist(train_data, 'BedroomAbvGr')

In [None]:
sns.scatterplot(data=train_data, x='BedroomAbvGr', y='SalePrice')

In [None]:
plot_box_and_dist(train_data, 'KitchenAbvGr')

In [None]:
sns.scatterplot(data=train_data, x='KitchenAbvGr', y='SalePrice')

In [None]:
plot_box_and_dist(train_data, 'TotRmsAbvGrd')

In [None]:
sns.scatterplot(data=train_data, x='TotRmsAbvGrd', y='SalePrice')

In [None]:
plot_box_and_dist(train_data, 'Fireplaces')

In [None]:
sns.scatterplot(data=train_data, x='Fireplaces', y='SalePrice')

In [None]:
# GarageYrBlt is already queued in drop_features, but it is still present in train_data here
# and is used only to spot outlier rows.
plot_box_and_dist(train_data, 'GarageYrBlt')

In [None]:
sns.scatterplot(data=train_data, x='GarageYrBlt', y='SalePrice')

In [None]:
# Inspect expensive houses with a garage built after 1980.
train_data.query('GarageYrBlt > 1980 and SalePrice > 700000')

In [None]:
# Hand-picked row identifiers (presumably from the query above — confirm).
GarageYrBlt_drop_ids = [ 692, 1183 ]
drop_rows.update(GarageYrBlt_drop_ids)

In [None]:
# Distribution of GarageCars and its relationship to the target (no rows queued).
plot_box_and_dist(train_data, 'GarageCars')

In [None]:
sns.scatterplot(data=train_data, x='GarageCars', y='SalePrice')

In [None]:
# Distribution of GarageArea and its relationship to the target.
plot_box_and_dist(train_data, 'GarageArea')

In [None]:
sns.scatterplot(data=train_data, x='GarageArea', y='SalePrice')

In [None]:
# Trim GarageArea with 0.025 / 0.9 bounds for inspection. The returned labels are stored under a
# GarageArea-specific name so they no longer overwrite GarageYrBlt_drop_ids (copy-paste slip).
# As in the original cell, they are not added to drop_rows.
GarageArea_drop_ids, GarageArea_trimmed_train_data = trim_outliers_by_percentile(train_data, 'GarageArea', 0.025, 0.9)

In [None]:
# Inspect the GarageArea distribution and scatter on the percentile-trimmed frame.
plot_box_and_dist(GarageArea_trimmed_train_data, 'GarageArea')

In [None]:
sns.scatterplot(data=GarageArea_trimmed_train_data, x='GarageArea', y='SalePrice')

In [None]:
# Look at the rows with a very small garage area in the untrimmed data.
train_data.query('GarageArea < 100')

In [None]:
# Distribution of WoodDeckSF and its relationship to the target (no rows queued).
plot_box_and_dist(train_data, 'WoodDeckSF')

In [None]:
sns.scatterplot(data=train_data, x='WoodDeckSF', y='SalePrice')

In [None]:
# Distribution of OpenPorchSF and its relationship to the target (no rows queued).
plot_box_and_dist(train_data, 'OpenPorchSF')

In [None]:
sns.scatterplot(data=train_data, x='OpenPorchSF', y='SalePrice')

In [None]:
# Distribution of EnclosedPorch and its relationship to the target.
plot_box_and_dist(train_data, 'EnclosedPorch')

In [None]:
sns.scatterplot(data=train_data, x='EnclosedPorch', y='SalePrice')

In [None]:
# Inspect the extreme EnclosedPorch rows.
train_data.query('EnclosedPorch > 500')

In [None]:
# Hand-picked row identifier (presumably from the query above — confirm).
EnclosedPorch_drop_ids = [ 198 ]
drop_rows.update(EnclosedPorch_drop_ids)

In [None]:
# Distribution of 3SsnPorch and its relationship to the target (no rows queued).
plot_box_and_dist(train_data, '3SsnPorch')

In [None]:
sns.scatterplot(data=train_data, x='3SsnPorch', y='SalePrice')

In [None]:
# Distribution of ScreenPorch and its relationship to the target (no rows queued).
plot_box_and_dist(train_data, 'ScreenPorch')

In [None]:
sns.scatterplot(data=train_data, x='ScreenPorch', y='SalePrice')

In [None]:
# Distribution of PoolArea and its relationship to the target.
plot_box_and_dist(train_data, 'PoolArea')

In [None]:
sns.scatterplot(data=train_data, x='PoolArea', y='SalePrice')

In [None]:
# Inspect expensive houses with a large pool.
train_data.query('PoolArea > 500 and SalePrice > 700000')

In [None]:
# Hand-picked row identifier (presumably from the query above — confirm).
PoolArea_drop_ids = [ 1183 ]
drop_rows.update(PoolArea_drop_ids)

In [None]:
# Distribution of MiscVal and its relationship to the target (no rows queued).
plot_box_and_dist(train_data, 'MiscVal')

In [None]:
sns.scatterplot(data=train_data, x='MiscVal', y='SalePrice')

In [None]:
# Distribution of MoSold and its relationship to the target (no rows queued).
plot_box_and_dist(train_data, 'MoSold')

In [None]:
sns.scatterplot(data=train_data, x='MoSold', y='SalePrice')

In [None]:
# Distribution of YrSold and its relationship to the target.
plot_box_and_dist(train_data, 'YrSold')

In [None]:
sns.scatterplot(data=train_data, x='YrSold', y='SalePrice')

In [None]:
# Inspect very expensive houses sold in 2007.
train_data.query('YrSold == 2007 and SalePrice > 700000')

In [None]:
# Hand-picked row identifiers (presumably from the query above — confirm).
YrSold_drop_ids = [ 692, 1183 ]
drop_rows.update(YrSold_drop_ids)

In [None]:
# Number of distinct row labels queued for removal.
print(len(drop_rows))

In [None]:
# Columns queued for removal so far.
print(drop_features)

In [None]:
# Show fully duplicated rows in the training frame, if any.
train_data.loc[train_data.duplicated()]

In [None]:
# Column names of the test frame, for reference.
test_data.columns.to_list()

# Feature Engineering

In [None]:
# Add the same engineered columns to both frames so train and test stay aligned.
# Plain `+` is used throughout, so a missing value in any operand yields a missing result.
for frame in (train_data, test_data):
    # Years between construction / last remodel and the sale.
    frame['houseage'] = frame['YrSold'] - frame['YearBuilt']
    frame['houseremodelage'] = frame['YrSold'] - frame['YearRemodAdd']

    # Floor areas plus finished basement areas.
    frame['totalsf'] = frame['1stFlrSF'] + frame['2ndFlrSF'] + frame['BsmtFinSF1'] + frame['BsmtFinSF2']

    # Above-ground living area plus total basement area.
    frame['totalarea'] = frame['GrLivArea'] + frame['TotalBsmtSF']

    # Full baths count as 1, half baths as 0.5.
    frame['totalbaths'] = frame['BsmtFullBath'] + frame['FullBath'] + 0.5 * (frame['BsmtHalfBath'] + frame['HalfBath'])

    # All porch areas plus the wood deck.
    frame['totalporchsf'] = frame['OpenPorchSF'] + frame['3SsnPorch'] + frame['EnclosedPorch'] + frame['ScreenPorch'] + frame['WoodDeckSF']

In [None]:
# Queue the raw columns that were folded into the engineered features above.
# NOTE(review): re-running this cell appends the same names to drop_features again.
drop_features.extend(['YrSold', 'YearBuilt', 'YearRemodAdd', '1stFlrSF', '2ndFlrSF', 'BsmtFinSF1', 'BsmtFinSF2', 'GrLivArea', 'TotalBsmtSF','BsmtFullBath', 'FullBath', 'BsmtHalfBath', 'HalfBath', 'OpenPorchSF', '3SsnPorch', 'EnclosedPorch', 'ScreenPorch','WoodDeckSF'])

In [None]:
# List the columns that will remain after the drop.
for column in train_data.columns:
    if column not in drop_features:
        print(column)

In [None]:
print(drop_features)

In [None]:
# Remove the queued columns from both frames. 'Id' is removed from the training frame only;
# the test frame keeps it. Not idempotent: a second run fails because the columns are gone.
columns_to_drop = list(drop_features)
train_data = train_data.drop(columns=columns_to_drop).drop(columns='Id')
test_data = test_data.drop(columns=columns_to_drop)

In [None]:
# Pairwise correlations between the numeric columns (including SalePrice),
# drawn as an annotated heatmap on a large square figure.
correlation_matrix = train_data.corr(numeric_only=True)
plt.figure(figsize=(20, 20))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)

In [None]:
# Integer-coded columns that really describe categories or ordered levels.
# NOTE(review): this list is not referenced by the preprocessing cells below.
categorical_but_numerical_values = [
    'OverallQual',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'TotRmsAbvGrd',
    'Fireplaces',
    'GarageCars',
    'MoSold',
]

# Every int64/float64 column except the target goes through the numeric pipeline.
numerical_features = train_data.select_dtypes(include=['int64', 'float64']).columns
numerical_features = numerical_features.drop('SalePrice')

# NOTE(review): several columns are listed in more than one group (e.g. 'LotShape' and
# 'ExterQual' are in both the one-hot and the ordinal list; 'OverallQual' is in the ordinal
# list and, if integer-typed, also in numerical_features). The ColumnTransformer encodes
# such columns once per group — confirm this overlap is intended.

# Columns to one-hot encode. Each name appears once: a name repeated inside a single list
# makes the ColumnTransformer select and encode that column twice.
categorical_features_onehot = [
    'MSSubClass',
    'MSZoning',
    'Street',
    'LotConfig',
    'LotShape',
    'LandContour',
    'Utilities',
    'LandSlope',
    'Neighborhood',
    'Condition1',
    'Condition2',
    'BldgType',
    'HouseStyle',
    'RoofStyle',
    'Exterior1st',
    'Exterior2nd',
    'MasVnrType',
    'Foundation',
    'Electrical',
    'SaleType',
    'SaleCondition',
    'Heating',
    'GarageType',
    'RoofMatl',
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'HeatingQC',
    'CentralAir',
    'KitchenQual',
    'Functional',
    'GarageFinish',
    'PavedDrive',
]

# Columns to ordinal-encode. Each name appears once (see the note on the one-hot list).
categorical_features_ordinal = [
    'LotShape',
    'LandContour',
    'Utilities',
    'LandSlope',
    'BsmtQual',
    'BsmtFinType1',
    'CentralAir',
    'Functional',
    'FireplaceQu',
    'GarageFinish',
    'GarageQual',
    'PavedDrive',
    'ExterCond',
    'KitchenQual',
    'BsmtExposure',
    'HeatingQC',
    'ExterQual',
    'BsmtCond',
    'OverallQual',
    'OverallCond',
    'MoSold',
]

## Build Data Preprocessing pipelines

In [None]:
def _imputed_pipeline(strategy, step_name, step):
    """Return a two-step Pipeline: SimpleImputer(strategy) followed by the named step."""
    return Pipeline([
        ('imputer', SimpleImputer(strategy=strategy)),
        (step_name, step),
    ])


# Numeric columns: fill gaps with the column mean, then standardize.
numerical_pipeline = _imputed_pipeline('mean', 'scaler', StandardScaler())

# Nominal columns: fill gaps with the most frequent value, then one-hot encode (dense output);
# categories unseen during fit are ignored at transform time.
categorical_pipeline_onehot = _imputed_pipeline(
    'most_frequent',
    'onehot',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
)

# Ordered columns: fill gaps with the most frequent value, then integer-encode;
# categories unseen during fit are encoded as -1.
categorical_pipeline_ordinal = _imputed_pipeline(
    'most_frequent',
    'ordinal',
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
)

In [None]:
# Route each feature group through its pipeline; columns not named in any group
# are passed through unchanged.
feature_groups = [
    ('num_processed', numerical_pipeline, numerical_features),
    ('ohe_processed', categorical_pipeline_onehot, categorical_features_onehot),
    ('ord_processed', categorical_pipeline_ordinal, categorical_features_ordinal),
]
preprocessor = ColumnTransformer(feature_groups, remainder='passthrough', n_jobs=-1)

# Remove the flagged rows once, then split into features and the log1p-transformed target.
# NOTE(review): drop_rows is applied as index labels here, while train_data uses the default
# integer index. If the collected values are `Id` values, different rows are removed than
# intended — confirm.
kept_rows = train_data.drop(index=list(drop_rows))
X = kept_rows.drop('SalePrice', axis=1)
y = np.log1p(kept_rows['SalePrice'])

pipeline = Pipeline([('preprocessor', preprocessor)])

In [None]:
# Fit the preprocessing pipeline on X and transform it in one step.
# NOTE(review): the pipeline is fitted on all rows before the train/test split in the next
# cell, so the imputation and scaling statistics also include the hold-out rows.
X_train_preprocessed = pipeline.fit_transform(X)

In [None]:
# Hold out 20% of the preprocessed rows; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_train_preprocessed, y, test_size=0.2, random_state=25)

In [None]:
# Summary statistics of the numeric columns (train_data still includes the rows in drop_rows).
train_data.describe()

# Model Selection

In [None]:
# Candidate regressors, all with default hyperparameters.
# NOTE(review): no random_state/seed is passed, so the stochastic models may give
# slightly different scores on each run.
MLA_algorithms = [
    # LinearRegression(),
    Ridge(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    XGBRegressor(),
    XGBRFRegressor(),
    CatBoostRegressor(),
    LGBMRegressor()
]

## Cross Validation to assess the model performance

In [None]:
# 10 random 60% / 30% train/test splits of the training portion (10% is left out of each split).
cv_split = ShuffleSplit(n_splits=10, train_size=0.6, test_size=0.3, random_state=25)

# Column order of the comparison table.
# The "Accuracy" columns hold each estimator's default score, because cross_validate is
# called without `scoring`; for these regressors that is R^2, not a classification accuracy.
MLA_columns = [
    'MLA Name',
    'MLA Parameters',
    'MLA Train Accuracy Mean',
    'MLA Test Accuracy Mean',
    'MLA Test Accuracy 3*STD',
    'MLA Time'
]

# Collect one record per algorithm and build the frame once, rather than growing an empty
# DataFrame cell by cell (which is slow and leaves every column with object dtype).
MLA_records = []
for alg in MLA_algorithms:
    cv_results = cross_validate(alg, X_train, y_train, cv=cv_split, return_train_score=True)
    MLA_records.append({
        'MLA Name': alg.__class__.__name__,
        'MLA Parameters': str(alg.get_params()),
        'MLA Train Accuracy Mean': cv_results['train_score'].mean(),
        'MLA Test Accuracy Mean': cv_results['test_score'].mean(),
        # Three standard deviations of the test score across the splits.
        'MLA Test Accuracy 3*STD': cv_results['test_score'].std() * 3,
        'MLA Time': cv_results['fit_time'].mean(),
    })

# Best mean test score first; sort_values returns a new frame, so no in-place mutation.
MLA_compare = (
    pd.DataFrame(MLA_records, columns=MLA_columns)
    .sort_values(by=['MLA Test Accuracy Mean'], ascending=False)
)
MLA_compare

In [None]:
# Bar chart of the mean cross-validated test score per algorithm.
# The scores come from cross_validate without `scoring`, i.e. the regressors' default R^2,
# so the labels say R^2 rather than "accuracy (%)".
sns.barplot(x='MLA Test Accuracy Mean', y='MLA Name', data=MLA_compare, color='m')

plt.title('Machine Learning Algorithm Cross-Validated R$^2$ Score \n')
plt.xlabel('Mean Test R$^2$ Score')
plt.ylabel('Algorithm')

# Hyperparameter tuning

### Cat Boost Regressor

In [None]:
# Search space for CatBoost: 3 * 3 * 4 * 4 = 144 combinations.
cat_boost_regressor_params_grid = {
    'iterations': [100, 500, 1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8, 10],
    'l2_leaf_reg': [1, 3, 5, 7]
}

# Wrap the training split in CatBoost's Pool container.
train_pool = Pool(X_train, y_train)

# Requires a GPU (device 0); verbose=1 prints training progress.
cat_boost_regressor = CatBoostRegressor(task_type='GPU', devices='0', verbose=1)

# CatBoost's built-in grid search with 5-fold CV; refit=True leaves cat_boost_regressor
# fitted, and it is reused by the ensemble cells further down.
cat_boost_grid_search_result = cat_boost_regressor.grid_search(cat_boost_regressor_params_grid, train_pool, cv=5, refit=True, plot=True)

In [None]:
# RMSE stored under the 'learn' key of best_score_.
# NOTE(review): presumably the training-set RMSE of the refitted model, not a validation score — confirm.
print(cat_boost_regressor.best_score_.get('learn').get('RMSE'))

### Gradient Boost Regressor

In [None]:
# Search space for scikit-learn's GradientBoostingRegressor: 3^5 = 243 combinations.
gradient_boosting_regressor_params_grid = {
    'max_depth': [12, 15, 20],
    'n_estimators': [200, 300, 1000],
    'min_samples_leaf': [10, 25, 50],
    'learning_rate': [0.001, 0.01, 0.1],
    'max_features': [0.01, 0.1, 0.7]
}

gradient_boosting_regressor = GradientBoostingRegressor()

# 5-fold grid search on all cores. Scored with negative MSE like the LightGBM, Ridge,
# RandomForest and XGBoost searches in this notebook, so every tuned model is selected by the
# same criterion (previously this search alone fell back to the estimator's default R^2).
grad_boost_grid_search_result = GridSearchCV(gradient_boosting_regressor, gradient_boosting_regressor_params_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1, refit=True)

grad_boost_grid_search_result.fit(X_train, y_train)

### Light GBM Regressor

In [None]:
# Search space for LightGBM: 4 * 4 * 4 * 3 * 3 * 2 = 1152 combinations.
lgbm_regressor_params_grid = {
    'n_estimators': [100, 200, 300, 500],
    'num_leaves': [20, 30, 50, 100],
    'max_depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'min_child_samples': [20, 30, 40],
    'boosting_type': ['gbdt', 'dart']
}

# verbose=-1 silences LightGBM's own logging.
lgbm_regressor = LGBMRegressor(verbose=-1)

# 5-fold grid search scored by negative MSE, run in a single process (n_jobs=1).
lgbm_grid_search_result = GridSearchCV(lgbm_regressor, lgbm_regressor_params_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=1, verbose=0)

lgbm_grid_search_result.fit(X_train, y_train)

In [None]:
# Square root of the best mean CV MSE, in log1p(SalePrice) units.
np.sqrt(-1 * lgbm_grid_search_result.best_score_)

### Ridge

In [None]:
# Search space for Ridge: 5 regularization strengths * 7 solvers = 35 combinations.
ridge_regressor_params_grid = {
    'alpha': [0.1, 0.5, 1.0, 5.0, 10.0],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

ridge_regressor = Ridge()

# 5-fold grid search scored by negative MSE.
ridge_grid_search_result = GridSearchCV(ridge_regressor, ridge_regressor_params_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=1, verbose=0)

ridge_grid_search_result.fit(X_train, y_train)

In [None]:
# Square root of the best mean CV MSE, in log1p(SalePrice) units.
np.sqrt(-1 * ridge_grid_search_result.best_score_)

### Random Forest Regressor

In [None]:
# Search space for RandomForestRegressor: 4 * 4 * 3 * 3 * 3 = 432 combinations.
# max_features=1.0 (use all features) replaces the former 'auto' option. For regressors
# 'auto' meant exactly that, and newer scikit-learn releases reject the string, so using
# it made every fit with that setting fail.
random_forest_regressor_params_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [4, 6, 8, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [1.0, 'sqrt', 'log2']
}

random_forest_regressor = RandomForestRegressor()

# 5-fold grid search scored by negative MSE, run in a single process (n_jobs=1).
random_forest_grid_search_result = GridSearchCV(random_forest_regressor, random_forest_regressor_params_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=1, verbose=0)

random_forest_grid_search_result.fit(X_train, y_train)


In [None]:
# Square root of the best mean CV MSE, in log1p(SalePrice) units.
np.sqrt(-1 * random_forest_grid_search_result.best_score_)

### XGB Regressor

In [None]:
# Search space for XGBoost: 4 * 4 * 3 * 3 * 3 = 432 combinations.
xgb_regressor_params_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0]
}

# Requires a CUDA GPU. `device='cuda'` selects the GPU for both training and prediction;
# the legacy `predictor='gpu_predictor'` argument belongs to the older API and is dropped
# instead of being mixed with `device`.
xgb_regressor = XGBRegressor(device='cuda')

# 5-fold grid search scored by negative MSE, run in a single process (n_jobs=1).
xgb_grid_search_result = GridSearchCV(xgb_regressor, xgb_regressor_params_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=1, verbose=0)

xgb_grid_search_result.fit(X_train, y_train)

### Voting Regressor

In [None]:
# Ensemble of the tuned LightGBM and XGBoost estimators and the CatBoost model that was
# refitted by its own grid search; no weights are passed.
voting_regressor = VotingRegressor(
    estimators=[
        ('lgbm', lgbm_grid_search_result.best_estimator_),
        ('catboost', cat_boost_regressor),
        ('xgb', xgb_grid_search_result.best_estimator_),
    ]
)

voting_regressor.fit(X_train, y_train)

In [None]:
# Hold-out RMSE in log1p(SalePrice) units. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error is deprecated and absent from newer
# scikit-learn releases; this form works on every version.
np.sqrt(mean_squared_error(y_test, voting_regressor.predict(X_test)))

### Stacking Regressor

In [None]:
# Base learners for the stack: the CatBoost model plus the best estimator from each grid search.
estimators = [
    ('catboost', cat_boost_regressor),
    ('lgbm', lgbm_grid_search_result.best_estimator_),
    ('xgb', xgb_grid_search_result.best_estimator_),
    ('ridge', ridge_grid_search_result.best_estimator_),
    ('random_forest', random_forest_grid_search_result.best_estimator_),
    ('gradient_boosting', grad_boost_grid_search_result.best_estimator_)
]

# The meta-learner is the VotingRegressor defined above, trained on the base learners' predictions.
# NOTE(review): using a voting ensemble of boosted trees as final estimator is unusual — confirm intended.
stacking_regressor = StackingRegressor(
    estimators=estimators,
    final_estimator=voting_regressor
)

stacking_regressor.fit(X_train, y_train)

In [None]:
# Hold-out predictions of the stacked model, in log1p(SalePrice) units.
y_pred = stacking_regressor.predict(X_test)

# RMSE via an explicit square root: the `squared=False` argument of mean_squared_error is
# deprecated and absent from newer scikit-learn releases; this form works on every version.
np.sqrt(mean_squared_error(y_test, y_pred))

# Final Test

In [None]:
# Sanity checks comparing the test frame's columns with the training features.
print(test_data.columns.to_list())

In [None]:
# NOTE(review): identical to the previous cell.
print(test_data.columns.to_list())

In [None]:
# Number of columns in the test frame (it still includes 'Id').
print(len(test_data.columns.tolist()))

In [None]:
# Number of feature columns the pipeline was fitted on.
print(len(X.columns.tolist()))

In [None]:
# Keep the test Ids as a one-column frame for the submission file.
# NOTE(review): this is a selection from test_data; take a .copy() before adding columns to it.
submission = test_data[['Id']]

In [None]:
# Transform the test set with the already-fitted pipeline.
# test_data still carries 'Id' (needed for the submission), which X does not have, so the
# frame is aligned by name to exactly the columns, in the order, the pipeline was fitted on,
# rather than relying on the transformer tolerating an extra column.
transformed_test_data = pipeline.transform(test_data[X.columns])

In [None]:
# The model was trained on log1p(SalePrice), so predictions are mapped back with the exact
# inverse, expm1. Plain exp() would overstate every predicted price by 1.
y_pred_test = np.expm1(stacking_regressor.predict(transformed_test_data))

In [None]:
# submission = test_data[['Id']]
# submission['SalePrice'] = y_pred_test

# submission.to_csv('submission.csv', index=False)