<font face="Times New Roman" size=5>
<div dir=rtl align="center">
<font face="Times New Roman" size=5>
</font>
<br>
<img src="https://static.tildacdn.one/tild3639-3035-4131-a461-363737393037/noroot.png" alt="University Logo" width="400" height="224">
<br>
<font face="Times New Roman" size=5 align=center>
Sharif University of Technology
<br>
Electrical Engineering Department
</font>
<br>
<font size=6>
Kaggle Project: House Prices - Advanced Regression Techniques
</font>
<br>
<font size=4>
Zahra Helalizadeh 400102193
<br>
</font>
<font size=4>
Spring 2025
<br>
</font>
<font face="Times New Roman" size=4>
</font>
</div></font>

# Installations

In [3]:
!pip3 install catboost



# Libraries

In [4]:
# =========================
# LIBRARIES & PATH SETUP
# =========================
import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, BayesianRidge, ElasticNet, Lasso, Ridge, HuberRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
from mlxtend.regressor import StackingRegressor
import warnings

warnings.filterwarnings("ignore")

# Data Import

In [5]:
# =========================
# DATA IMPORT
# =========================
train_df = pd.read_csv('AmesHousing.csv')
test_df = pd.read_csv('test.csv')

train_df.columns = train_df.columns.str.replace(' ', '')
y_target = train_df['SalePrice']
test_ids = test_df['Id']
test_df = test_df.drop(columns=['Id'])
train_features = train_df.drop(columns=['SalePrice','Order','PID'])

# Combine datasets for preprocessing
combined = pd.concat([train_features, test_df], axis=0, sort=False)

# Handle Missing Values

In [6]:
# =========================
# HANDLE MISSING VALUES
# =========================
# Convert numeric codes to categorical
for col in ['MSSubClass','YrSold','MoSold']:
    combined[col] = combined[col].astype(str)

fill_map = {
    'Functional':'Typ','Electrical':'SBrkr','KitchenQual':'TA',
    'Exterior1st':combined['Exterior1st'].mode()[0],'Exterior2nd':combined['Exterior2nd'].mode()[0],
    'SaleType':combined['SaleType'].mode()[0],'PoolQC':'None','Alley':'None',
    'FireplaceQu':'None','Fence':'None','MiscFeature':'None'
}
combined.fillna(value=fill_map, inplace=True)

for col in ['GarageArea','GarageCars']:
    combined[col].fillna(0, inplace=True)

for col in ['GarageType','GarageFinish','GarageQual','GarageCond']:
    combined[col].fillna('None', inplace=True)

for col in ['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2']:
    combined[col].fillna('None', inplace=True)

combined.drop(columns=['GarageYrBlt','YearRemodAdd'], inplace=True)

# KNN Imputation for remaining numeric NaNs
def knn_impute(df):
    num_df = df.select_dtypes(include=[np.number])
    cat_df = df.select_dtypes(exclude=[np.number])
    nan_cols = num_df.columns[num_df.isna().any()]
    not_nan_cols = num_df.columns.difference(nan_cols)
    for col in nan_cols:
        test_subset = num_df[num_df[col].isna()]
        train_subset = num_df.dropna()
        knn = KNeighborsRegressor(n_neighbors=5)
        knn.fit(train_subset[not_nan_cols], train_subset[col])
        num_df.loc[num_df[col].isna(), col] = knn.predict(test_subset[not_nan_cols])
    return pd.concat([num_df, cat_df], axis=1)

combined = knn_impute(combined)
cat_cols = combined.select_dtypes(include=['object']).columns
combined[cat_cols] = combined[cat_cols].fillna('None')

# Feature Engineering

In [7]:
# =========================
# FEATURE ENGINEERING
# =========================
combined['SqFtPerRoom'] = combined['GrLivArea'] / (combined['TotRmsAbvGrd'] + combined['FullBath'] + combined['HalfBath'] + combined['KitchenAbvGr'])
combined['Total_Home_Quality'] = combined['OverallQual'] + combined['OverallCond']
combined['Total_Bathrooms'] = combined['FullBath'] + 0.5*combined['HalfBath'] + combined['BsmtFullBath'] + 0.5*combined['BsmtHalfBath']
combined['HighQualSF'] = combined['1stFlrSF'] + combined['2ndFlrSF']
combined['Age_House'] = 2025 - combined['YearBuilt']  # house age feature

# Encoding and Skewness

In [8]:
# =========================
# ENCODING & SKEWNESS
# =========================
combined = pd.get_dummies(combined)
num_feats = combined.select_dtypes(include=['float64','int64']).columns
skewed = combined[num_feats].apply(lambda x: skew(x)).sort_values(ascending=False)
skewed_feats = skewed[skewed>0.5].index
for feat in skewed_feats:
    combined[feat] = np.log1p(combined[feat])

# Split Back Train and Test

In [9]:
# =========================
# SPLIT BACK TRAIN & TEST
# =========================
X_train_all = combined[:len(y_target)]
X_test_all = combined[len(y_target):]

# RMSE and CV Functions

In [10]:
# =========================
# RMSE & CV FUNCTION
# =========================
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

kf = KFold(n_splits=5, shuffle=True, random_state=42)
def cv_rmse(model):
    scores = -cross_val_score(model, X_train_all, np.log1p(y_target), scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(scores)

# Model Stacking

In [11]:
# =========================
# MODEL STACKING
# =========================
# Base models
cat_model = CatBoostRegressor(iterations=6000, learning_rate=0.005, depth=4, l2_leaf_reg=1, eval_metric='RMSE', verbose=0, random_seed=42)
lgb_model = LGBMRegressor(n_estimators=5000, learning_rate=0.01, max_depth=6)
xgb_model = xgb.XGBRegressor(n_estimators=5000, learning_rate=0.01, max_depth=6, verbosity=0)

stack_model = StackingRegressor(
    regressors=[cat_model, lgb_model, xgb_model],
    meta_regressor=CatBoostRegressor(iterations=3000, learning_rate=0.01, depth=4, verbose=0),
    use_features_in_secondary=True
)

# Train-test split for validation
X_train, X_val, y_train, y_val = train_test_split(X_train_all, np.log1p(y_target), test_size=0.1, random_state=42)

# Fit stacked model
stack_model.fit(X_train, y_train)

# Validation predictions
y_val_pred = stack_model.predict(X_val)
print("Stacked Ensemble RMSE:", rmse(y_val, y_val_pred))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002703 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4555
[LightGBM] [Info] Number of data points in the train set: 2637, number of used features: 257
[LightGBM] [Info] Start training from score 12.015622
Stacked Ensemble RMSE: 0.10402060376375544


# Submission

In [12]:
# =========================
# SUBMISSION
# =========================
y_test_pred = np.expm1(stack_model.predict(X_test_all))
submission_df = pd.DataFrame({'Id': test_ids, 'SalePrice': y_test_pred})
submission_df.to_csv("submission.csv", index=False)
submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,106448.127961
1,1462,171736.773638
2,1463,187953.306192
3,1464,194481.578564
4,1465,191395.068885
