In [1]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [39]:
# Load the cleaned training data. `postal_code` is forced to str so pandas
# does not infer a numeric dtype for it.
train_df = pd.read_csv('../data/cleaned_train_with_amenities.csv', dtype={'postal_code': str})
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162691 entries, 0 to 162690
Data columns (total 31 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   town                       162691 non-null  object 
 1   flat_type                  162691 non-null  object 
 2   block                      162691 non-null  object 
 3   street                     162691 non-null  object 
 4   floor_area_sqm             162691 non-null  float64
 5   flat_model                 162691 non-null  object 
 6   lease_commence_data        162691 non-null  int64  
 7   resale_price               162691 non-null  float64
 8   postal_code                162691 non-null  object 
 9   latitude                   162691 non-null  float64
 10  longitude                  162691 non-null  float64
 11  max_floor                  162691 non-null  int64  
 12  subzone                    162691 non-null  object 
 13  height                     16

In [None]:
# Outlier filter: keep only the rows whose floor_area_sqm is at most 200.
within_area_limit = train_df['floor_area_sqm'].le(200)
train_df = train_df.loc[within_area_limit]

# Columns whose names end in `_distance`, kept as their own list so they can
# be referenced as a group.
distance_features = [
    'mrt_distance',
    'mall_distance',
    'primary_school_distance',
    'secondary_school_distance',
    'hawker_distance',
]

# Numeric columns to keep (this list also carries `resale_price`), followed by
# the distance columns in the same order as above.
selected_numeric_features = [
    'resale_price',
    'floor_area_sqm',
    'max_floor',
    'height',
    'index',
    'mrt_status',
    'age_at_sale',
    'flat_type_num',
    'flat_model_num',
    'longitude',
    'latitude',
    *distance_features,
]

In [42]:
# --- Feature selection ------------------------------------------------------
# Columns handed to CatBoost as categorical features.
selected_categorical_features = ['subzone', 'flat_model']
target_variable = 'resale_price'
# Narrow the frame to the modelling columns. Label-based selection raises
# KeyError if any listed column is absent, so every name in both feature lists
# is guaranteed to exist from here on.
train_df = train_df[selected_numeric_features + selected_categorical_features]

# --- Train / validation split -----------------------------------------------
# Random 80% / 20% split; random_state pins the shuffle so reruns match.
training_set, validation_set = train_test_split(train_df, test_size=0.2, random_state=42)

# Separate the predictors from the target for each split.
X_train = training_set.drop(columns=[target_variable])
y_train = training_set[target_variable]
X_val = validation_set.drop(columns=[target_variable])
y_val = validation_set[target_variable]

# Cast the categorical columns in one step. astype() returns a new frame, so
# nothing is assigned into a frame derived from another one.
categorical_dtypes = {col: 'category' for col in selected_categorical_features}
X_train = X_train.astype(categorical_dtypes)
X_val = X_val.astype(categorical_dtypes)

# --- Model ------------------------------------------------------------------
# Positional indices are computed from X_train (not train_df) because dropping
# the target column shifts the positions.
cat_features_indices = [X_train.columns.get_loc(col) for col in selected_categorical_features]
print('cat_features_indices used for CatBoost:', cat_features_indices)

model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    eval_metric='RMSE',
    random_seed=42,
    logging_level='Silent',
)
# use_best_model=True keeps the iteration that scores best on eval_set.
model.fit(
    X_train,
    y_train,
    cat_features=cat_features_indices,
    eval_set=(X_val, y_val),
    use_best_model=True,
)

# --- Evaluation -------------------------------------------------------------
# NOTE(review): the RMSE below is computed on the same (X_val, y_val) that was
# passed as eval_set for best-iteration selection, so it is likely optimistic.
# A separate held-out test split would give an unbiased estimate.
y_val_pred = model.predict(X_val)
# RMSE is derived via sqrt(MSE) so this also works on scikit-learn versions
# whose mean_squared_error does not accept the `squared` keyword.
mse = mean_squared_error(y_val, y_val_pred)
rmse = np.sqrt(mse)
print(f"Validation RMSE: {rmse:.2f}")

cat_features_indices used for CatBoost: [13, 14]
Validation RMSE: 26744.63
Validation RMSE: 26744.63
