# Kaggle House Price Prediction 

##  Step 1: Load Cleaned Data


#### **Why?**
- Load the cleaned training and test data with pandas for model training.

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb

In [4]:
# Read the cleaned train and test CSV files into DataFrames
cleaned_train_df, cleaned_test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/cleaned_train_data.csv", "data/cleaned_test_data.csv")
)


## Step 2: Train and Tune Models

#### **Why?**
- Train different ML algorithms/models on the cleaned data.
- Tune the hyperparameters with GridSearchCV.
- Compare the models on the held-out split and keep the one with the lowest RMSE.

In [6]:
# Separate the target from the features, then hold out 20% of the rows
# as an evaluation split (fixed seed so the split is reproducible)
y = cleaned_train_df['SalePrice']
X_preprocessed = cleaned_train_df.drop(columns='SalePrice')
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
# 3-fold cross-validation splitter (shuffled, fixed seed). It is passed to the
# stacking model below and reused by the grid search in the training cell.
cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Base estimators for the stacking model. Their hyperparameters are fixed here:
# the 'StackingRegressor' grid further down is empty, so they are not tuned.
base_estimators = [
    ('catboost', CatBoostRegressor(
        iterations=500,
        learning_rate=0.1,
        depth=6,
        random_state=42,
        verbose=0
    )),
    ('xgboost', XGBRegressor(
        n_estimators=500,
        learning_rate=0.1,
        max_depth=3,
        random_state=42
    )),
    ('lgbm', lgb.LGBMRegressor(
        n_estimators=100,
        learning_rate=0.1,
        num_leaves=50,
        random_state=42,
        verbose=0  # same logging level as the standalone LightGBM model below
    ))
]

# Meta-model that combines the base estimators' predictions
meta_model = Ridge(alpha=1.0)

# Stacking regressor: base estimators feed the Ridge meta-model, using `cv`
# to split the data when producing the base predictions
stacking_model = StackingRegressor(
    estimators=base_estimators,
    final_estimator=meta_model,
    cv=cv,
)

# Candidate models, keyed by the display name used in the printed results.
# The keys must match the keys of `param_grids`.
models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'CatBoost': CatBoostRegressor(random_state=42, verbose=0),
    'LightGBM': lgb.LGBMRegressor(random_state=42, verbose=0),
    'StackingRegressor': stacking_model,
}

# Hyperparameter grids for tuning each model.
# An empty grid means the grid search evaluates a single candidate:
# the estimator exactly as configured above.
param_grids = {
    'LinearRegression': {},
    'RandomForest': {
        'n_estimators': [100, 200, 500],
        'max_depth': [None, 10, 30],
        'min_samples_split': [2, 5, 10],
    },
    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.1, 0.3],
        'max_depth': [3, 6, 10],
    },
    'CatBoost': {
        'iterations': [100, 200, 500],
        'learning_rate': [0.01, 0.1, 0.3],
        'depth': [3, 6, 10],
    },
    'LightGBM': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.1, 0.3],
        'num_leaves': [31, 50, 70]
    },
    'StackingRegressor': {}
}

In [8]:
# Run a grid search for every candidate model and report its best
# cross-validated result
grids = {}
for model_name, model in models.items():
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        cv=cv,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=2,
    )
    grids[model_name] = search
    search.fit(X_train, y_train)

    best_params = search.best_params_
    # The scorer is negated MSE, so flip the sign before taking the root
    best_score = np.sqrt(-search.best_score_)

    print(f'Best parameters for {model_name}: {best_params}')
    print(f'Best RMSE for {model_name}: {best_score}\n')


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best parameters for LinearRegression: {}
Best RMSE for LinearRegression: 0.12604582227243558

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best parameters for RandomForest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 500}
Best RMSE for RandomForest: 0.13903844592654707

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500}
Best RMSE for XGBoost: 0.12674960989045744

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best parameters for CatBoost: {'depth': 3, 'iterations': 500, 'learning_rate': 0.1}
Best RMSE for CatBoost: 0.12071856697062064

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   3.0s
[CV] END max_depth=None, min_samples_split=5, n_estimators=500; total time=   6.2s
[CV] END max_depth=10, min_

In [9]:
# Score every tuned model on the held-out split and keep the one with the
# lowest RMSE. `best_model` ends up holding the fitted GridSearchCV object.
best_rmse = float('inf')
best_model = None
best_model_name = None
for model_name, search in grids.items():
    # scikit-learn metrics take (y_true, y_pred), in that order
    rmse = np.sqrt(mean_squared_error(y_test, search.predict(X_test)))
    print(f'{model_name}: {rmse}')

    if rmse < best_rmse:
        best_rmse = rmse
        best_model = search
        best_model_name = model_name

print(f"Best model: {best_model_name}, with RMSE: {best_rmse}")


LinearRegression: 0.12086778603314495
RandomForest: 0.12874220798702674
XGBoost: 0.12302234828438113
CatBoost: 0.11745166828390521
LightGBM: 0.1266565378590037
StackingRegressor: 0.11715661218238796
Best model: StackingRegressor, with RMSE: 0.11715661218238796


In [10]:
# Export predictions from the best model to submission.csv

# Select exactly the columns the models were fitted on, in the same order,
# so the test frame matches the training features. A column missing from the
# test data raises a KeyError here rather than an error inside predict().
test_features = cleaned_test_df[X_train.columns]

# NOTE(review): np.exp assumes SalePrice was transformed with np.log during
# cleaning — confirm; if np.log1p was used, np.expm1 is the matching inverse.
y_pred = np.exp(best_model.predict(test_features))

df_out = pd.DataFrame({
    "Id": cleaned_test_df['Id'],
    "SalePrice": y_pred
})
df_out.to_csv('submission.csv', index=False)
