In [1]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Load datasets
# The data directory can be overridden with the HOUSE_PRICES_DATA_DIR
# environment variable so the notebook is runnable on machines other than
# the author's; when the variable is unset the original location is used.
DATA_DIR = os.environ.get(
    'HOUSE_PRICES_DATA_DIR',
    '/Users/tanishq/Desktop/Projects/home-data-for-ml-course',
)
train_path = os.path.join(DATA_DIR, 'train.csv')
test_path = os.path.join(DATA_DIR, 'test.csv')
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

# Handle Missing Values
# Columns the author flagged as having excessive missing values or as
# irrelevant are removed from both frames. The test-set Ids are kept aside
# first because the submission file is keyed on them.
columns_to_drop = ['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature']
test_ids = test_data['Id']
data, test_data = (
    frame.drop(columns=columns_to_drop) for frame in (train_data, test_data)
)

# Impute missing numerical values with median and categorical with mode
# Column groups are derived from the training frame; the target column is
# excluded so the same numeric column list also applies to the test frame.
num_cols = data.select_dtypes(include=['float64', 'int64']).columns.drop('SalePrice')
cat_cols = data.select_dtypes(include=['object']).columns

num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Each imputer learns its fill values from the training frame and then
# applies those same values to the test frame.
for imputer, cols in ((num_imputer, num_cols), (cat_imputer, cat_cols)):
    data[cols] = imputer.fit_transform(data[cols])
    test_data[cols] = imputer.transform(test_data[cols])

# Feature Engineering
# Age is the number of years between construction and sale. Once it has
# been derived, the original date-related columns are dropped from both
# frames.
date_cols = ['YearBuilt', 'YearRemodAdd', 'YrSold', 'MoSold']

data = data.assign(Age=data['YrSold'] - data['YearBuilt']).drop(columns=date_cols)
test_data = test_data.assign(Age=test_data['YrSold'] - test_data['YearBuilt']).drop(columns=date_cols)

# Encode categorical variables using OneHotEncoder
# drop='first' removes one indicator per feature. handle_unknown='ignore'
# keeps transform() from raising if the test set contains a category that
# never occurs in the training data; such a value is encoded as all zeros
# for that feature (scikit-learn emits a warning when this happens).
encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoded_names = encoder.fit(data[cat_cols]).get_feature_names_out(cat_cols)

# The encoded frames reuse the index of the frame they were built from so
# the column-wise concat below aligns rows by label rather than relying on
# both sides happening to carry a default RangeIndex.
encoded_cats = pd.DataFrame(
    encoder.transform(data[cat_cols]),
    columns=encoded_names,
    index=data.index,
)
encoded_test_cats = pd.DataFrame(
    encoder.transform(test_data[cat_cols]),
    columns=encoded_names,
    index=test_data.index,
)

# Combine encoded categorical features with the rest of the data
data = pd.concat([data.drop(columns=cat_cols), encoded_cats], axis=1)
test_data = pd.concat([test_data.drop(columns=cat_cols), encoded_test_cats], axis=1)

# Scale features
# Every column except the target is standardised (this includes the one-hot
# indicator columns, not only the numeric ones). The scaler is fitted on the
# training frame and the fitted statistics are reused for the test frame.
scaler = StandardScaler()
train_features = data.drop(columns=['SalePrice'])
scaled_data = pd.DataFrame(
    scaler.fit_transform(train_features),
    columns=train_features.columns,
)
scaled_test_data = pd.DataFrame(
    scaler.transform(test_data),
    columns=test_data.columns,
)

# Split data into features and target
X = scaled_data
y_price = data['SalePrice']

# Train-Test Split
# 20% of the rows are held out for evaluation; the fixed random_state keeps
# the partition identical across runs.
split_parts = train_test_split(X, y_price, test_size=0.2, random_state=42)
X_train, X_test, y_train_price, y_test_price = split_parts

def _fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` on the training split and score it on the held-out split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_fit, y_fit : features and target used for fitting
    X_eval, y_eval : features and target used for evaluation

    Returns
    -------
    tuple
        ``(predictions, rmse, r2)`` where ``predictions`` is the output of
        ``model.predict(X_eval)``, ``rmse`` is the square root of the mean
        squared error and ``r2`` is the coefficient of determination.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    rmse = np.sqrt(mean_squared_error(y_eval, predictions))
    r2 = r2_score(y_eval, predictions)
    return predictions, rmse, r2


# The three boosters share the same budget (500 rounds, learning rate 0.05,
# depth 7) and the same seed, and are fitted and scored in this order.

# Train XGBoost Model
xgb_model = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=7, random_state=42)
y_pred_xgb, rmse_xgb, r2_xgb = _fit_and_score(
    xgb_model, X_train, y_train_price, X_test, y_test_price
)

# Train LightGBM Model
lgbm_model = LGBMRegressor(n_estimators=500, learning_rate=0.05, max_depth=7, random_state=42)
y_pred_lgbm, rmse_lgbm, r2_lgbm = _fit_and_score(
    lgbm_model, X_train, y_train_price, X_test, y_test_price
)

# Train CatBoost Model
cat_model = CatBoostRegressor(iterations=500, learning_rate=0.05, depth=7, verbose=0, random_state=42)
y_pred_cat, rmse_cat, r2_cat = _fit_and_score(
    cat_model, X_train, y_train_price, X_test, y_test_price
)

# Results
# One line per model: the raw hold-out RMSE followed by R² shown as a
# percentage with two decimals.
for model_label, model_rmse, model_r2 in (
    ("XGBoost", rmse_xgb, r2_xgb),
    ("LightGBM", rmse_lgbm, r2_lgbm),
    ("CatBoost", rmse_cat, r2_cat),
):
    print(f"{model_label} RMSE: {model_rmse}, R²: {model_r2 * 100:.2f}%")

# Test Data Predictions (Average of all models)
# Each fitted model scores the scaled test frame; the final prediction is
# the unweighted mean of the three outputs.
xgb_test_pred = xgb_model.predict(scaled_test_data)
lgbm_test_pred = lgbm_model.predict(scaled_test_data)
cat_test_pred = cat_model.predict(scaled_test_data)
test_sale_price = (xgb_test_pred + lgbm_test_pred + cat_test_pred) / 3

# Save Predictions
# The submission pairs each test-set Id with its averaged prediction and is
# written without the DataFrame index.
submission_path = '/Users/tanishq/Desktop/Projects/home-data-for-ml-course/submission.csv'
submission = test_ids.to_frame(name='Id').assign(SalePrice=test_sale_price)
submission.to_csv(submission_path, index=False)
print("Predictions saved to submission.csv")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005135 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3191
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 148
[LightGBM] [Info] Start training from score 181441.541952
XGBoost RMSE: 26656.926002302273, R²: 90.74%
LightGBM RMSE: 28984.715238377703, R²: 89.05%
CatBoost RMSE: 27045.088092213595, R²: 90.46%
Predictions saved to submission.csv
