In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train_df = pd.read_csv('../data/cleaned_train_with_amenities.csv', dtype={'postal_code': str})
print(train_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162691 entries, 0 to 162690
Data columns (total 28 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   town                       162691 non-null  object 
 1   flat_type                  162691 non-null  object 
 2   block                      162691 non-null  object 
 3   street                     162691 non-null  object 
 4   floor_area_sqm             162691 non-null  float64
 5   flat_model                 162691 non-null  object 
 6   lease_commence_data        162691 non-null  int64  
 7   resale_price               162691 non-null  float64
 8   postal_code                162691 non-null  object 
 9   latitude                   162691 non-null  float64
 10  longitude                  162691 non-null  float64
 11  max_floor                  162691 non-null  int64  
 12  subzone                    162691 non-null  object 
 13  height                     16

In [3]:
# Split the data into training and validation sets (80% train, 20% val) using random sampling
train_indices = train_df.sample(frac=0.8, random_state=42).index
val_indices = train_df.drop(train_indices).index
training_set = train_df.loc[train_indices]
validation_set = train_df.loc[val_indices]

In [4]:
# create column 'loc_rank' = average (resale price / floor_area_sqm) for each location postal_code
training_set['psf'] = training_set['resale_price'] / training_set['floor_area_sqm'] / training_set['index']
training_set['loc_rank'] = training_set.groupby('postal_code')['psf'].transform('mean')
# create postal_df with postal_code and loc_rank (for validation set)
postal_df = training_set[['postal_code', 'loc_rank']].drop_duplicates().reset_index(drop=True)
# merge postal_df with validation_set to get loc_rank for validation_set
validation_set = validation_set.merge(postal_df, on='postal_code', how='left')

In [7]:
# create trainknn_df and valknn_df with selected features
selected_numeric_features = ['resale_price', 'floor_area_sqm', 'max_floor', 'height', 'index', 'mrt_status','age_at_sale']
distance_features = ['mrt_distance', 'mall_distance', 'primary_school_distance', 'secondary_school_distance', 'hawker_distance']
selected_numeric_features += distance_features
selected_categorical_features = ['postal_code']
trainknn_df = training_set[selected_numeric_features + selected_categorical_features].copy()
valknn_df = validation_set[selected_numeric_features + selected_categorical_features].copy()
# change mrt_status to binary feature: 1 if 'open', 0 otherwise
trainknn_df['mrt_status'] = training_set['mrt_status'].apply(lambda x: 1 if x == 'open' else 0)
valknn_df['mrt_status'] = validation_set['mrt_status'].apply(lambda x: 1 if x == 'open' else 0)
for feature in distance_features:
    trainknn_df[feature] = 1/(trainknn_df[feature] + 1) # add 1 to avoid division by zero
    valknn_df[feature] = 1/(valknn_df[feature] + 1) # add 1 to avoid division by zero

# separate features and target variable
Xknn_train = trainknn_df.drop(columns=['resale_price'])
yknn_train = trainknn_df['resale_price']
Xknn_val = valknn_df.drop(columns=['resale_price'])
yknn_val = valknn_df['resale_price']
# align X_val to have same columns as X_train
Xknn_val = Xknn_val.reindex(columns=Xknn_train.columns, fill_value=0)

In [8]:
# create KNN class based on postal code
class KNNPostalCode:
    def __init__(self, n_neighbors=3):
        self.n_neighbors = n_neighbors
        self.models = {}
        self.scaler = StandardScaler()

    def fit(self, X, y):
        # Scale numeric features
        numeric_features = X.select_dtypes(include=[np.number]).columns
        X[numeric_features] = self.scaler.fit_transform(X[numeric_features])

        # Train a KNN model for each postal code
        for postal_code in X['postal_code'].unique():
            mask = X['postal_code'] == postal_code
            X_subset = X[mask].drop(columns=['postal_code'])
            y_subset = y[mask]
            if len(X_subset) >= self.n_neighbors:  # Ensure enough samples to train
                model = KNeighborsRegressor(n_neighbors=self.n_neighbors)
                model.fit(X_subset, y_subset)
                self.models[postal_code] = model

    def predict(self, X):
        # Scale numeric features
        numeric_features = X.select_dtypes(include=[np.number]).columns
        X[numeric_features] = self.scaler.transform(X[numeric_features])

        predictions = []
        for _, row in X.iterrows():
            postal_code = row['postal_code']
            if postal_code in self.models:
                model = self.models[postal_code]
                # drop postal_code column for prediction
                pred = model.predict([row.drop(labels=['postal_code'])])[0]
            else:
                pred = np.nan  # or some default value or strategy
            predictions.append(pred)
        return np.array(predictions)

# Train and evaluate KNNPostalCode model
knn_postal_model = KNNPostalCode(n_neighbors=5)
knn_postal_model.fit(Xknn_train, yknn_train)
print("KNN postal code model trained.")
# check if any postal codes in validation set are not in training set
missing_postal_codes = set(Xknn_val['postal_code'].unique()) - set(Xknn_train['postal_code'].unique())
print(f'Missing postal codes in validation set: {missing_postal_codes}')
# for missing postal codes, fill in with the nearest postal code in training set (based on string similarity)
from difflib import get_close_matches
for postal_code in missing_postal_codes:
    closest_match = get_close_matches(postal_code, Xknn_train['postal_code'].unique(), n=1)
    if closest_match:
        # Avoid SettingWithCopyWarning by using .loc on a copy and then assigning back
        idx = Xknn_val.index[Xknn_val['postal_code'] == postal_code]
        Xknn_val.loc[idx, 'postal_code'] = closest_match[0]

# predict and evaluate
y_pred_knn_postal = knn_postal_model.predict(Xknn_val)
# Handle NaN predictions (e.g., by using the mean of y_train)
y_pred_knn_postal = np.where(np.isnan(y_pred_knn_postal), yknn_train.mean(), y_pred_knn_postal)
knn_postal_rmse = np.sqrt(mean_squared_error(yknn_val, y_pred_knn_postal))
print(f'KNN Postal Code RMSE: {knn_postal_rmse}')


KNN postal code model trained.
Missing postal codes in validation set: {'530157', '640424', '164026', '570162', '400410', '360082', '652464', '650355', '822406', '730164', '760779', '682672', '141057', '350123', '550119', '530604', '570311', '510420', '470111', '730678', '400328', '650124', '550304', '670222', '670424', '600326', '650356', '150010', '560448', '760285', '650375', '530612', '320060', '560722', '760848', '650371', '550244'}




KNN Postal Code RMSE: 75006.79178469494


