In [None]:
# Setup imports
from typing import Union
import config
import pandas as pd
import numpy as np
import pathlib
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from IPython.display import display

Matplotlib is building the font cache; this may take a moment.


In [2]:
# Helper function to load data
def load_data_splits(dataset_prefix: str, base_path: pathlib.Path) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Loads the train/test data splits for a given dataset prefix.

    Four CSV files are read from ``base_path``:
    ``X_<prefix>_train.csv``, ``X_<prefix>_test.csv``,
    ``y_<prefix>_train.csv`` and ``y_<prefix>_test.csv``.

    Parameters:
        dataset_prefix (str): The prefix embedded in the file names
            (e.g., 'original', 'removed', 'capped').
        base_path (pathlib.Path): Directory that contains the CSV files.
            A plain string path is accepted as well.

    Returns:
        tuple: (X_train, X_test, y_train, y_test). The feature sets are
        DataFrames; single-column target files are returned as Series.
    """
    base_path = pathlib.Path(base_path)

    X_train = pd.read_csv(base_path / f'X_{dataset_prefix}_train.csv')
    X_test = pd.read_csv(base_path / f'X_{dataset_prefix}_test.csv')

    # Squeeze only the column axis: a bare squeeze() would also collapse the
    # row axis and hand back a scalar when a target file holds a single row.
    y_train = pd.read_csv(base_path / f'y_{dataset_prefix}_train.csv').squeeze("columns")
    y_test = pd.read_csv(base_path / f'y_{dataset_prefix}_test.csv').squeeze("columns")
    return X_train, X_test, y_train, y_test

# Load the three dataset variants: original, outliers removed, and outliers capped
(X_original_train, X_original_test,
 y_original_train, y_original_test) = load_data_splits('original', config.PROCESSED_DATA_DIR)
(X_removed_train, X_removed_test,
 y_removed_train, y_removed_test) = load_data_splits('removed', config.PROCESSED_DATA_DIR)
(X_capped_train, X_capped_test,
 y_capped_train, y_capped_test) = load_data_splits('capped', config.PROCESSED_DATA_DIR)

In [3]:
# Load the names of the numerical features that need to be scaled.
# ndmin=1 keeps the result one-dimensional even when the file lists a single
# feature; without it np.loadtxt returns a 0-d array that cannot be iterated.
numerical_features_to_scale = np.loadtxt(
    config.PROCESSED_DATA_DIR / 'numerical_features_original.txt',
    dtype=str,
    ndmin=1,
)

print("Numerical features to be scaled:")
for i, feature in enumerate(numerical_features_to_scale, 1):
    print(f"{i}. {feature}")

# Verify these features exist in each of the three training sets
print("\nVerifying features in datasets:")
print("Original dataset features:", all(feature in X_original_train.columns for feature in numerical_features_to_scale))
print("Removed dataset features:", all(feature in X_removed_train.columns for feature in numerical_features_to_scale))
print("Capped dataset features:", all(feature in X_capped_train.columns for feature in numerical_features_to_scale))


Numerical features to be scaled:
1. BHK_NO.
2. SQUARE_FT
3. LONGITUDE
4. LATITUDE
5. PRICE_PER_SQFT
6. LAT_LONG_INTERACTION

Verifying features in datasets:
Original dataset features: True
Removed dataset features: True
Capped dataset features: True


In [4]:
def scale_features(X_train: pd.DataFrame, X_test: pd.DataFrame, numerical_features: list) -> tuple[pd.DataFrame, pd.DataFrame, StandardScaler]:
    """
    Standardizes the selected numerical columns of a train/test pair.

    The StandardScaler is fitted on the training columns only and then
    applied to both sets, so no test-set statistics enter the transform.
    The input frames are left untouched; scaled copies are returned.

    Parameters:
        X_train (pd.DataFrame): Training features
        X_test (pd.DataFrame): Testing features
        numerical_features (list): Names of the columns to scale

    Returns:
        tuple: (scaled training copy, scaled testing copy, fitted scaler)
    """
    # Work on copies so the caller's frames are not modified
    train_out, test_out = X_train.copy(), X_test.copy()

    fitted_scaler = StandardScaler()

    # Learn mean/variance from the training columns, then reuse them for the test columns
    train_out[numerical_features] = fitted_scaler.fit_transform(X_train[numerical_features])
    test_out[numerical_features] = fitted_scaler.transform(X_test[numerical_features])

    return train_out, test_out, fitted_scaler

# Sanity-check the scaler on the original split by printing the first rows
# of the numerical columns before and after the transform
print("Testing scaling function on original dataset:")
print("\nBefore scaling (first 3 rows):")
print(X_original_train[numerical_features_to_scale].head(3))

X_train_scaled, X_test_scaled, _ = scale_features(
    X_train=X_original_train,
    X_test=X_original_test,
    numerical_features=numerical_features_to_scale,
)

print("\nAfter scaling (first 3 rows):")
print(X_train_scaled[numerical_features_to_scale].head(3))


Testing scaling function on original dataset:

Before scaling (first 3 rows):
   BHK_NO.    SQUARE_FT  LONGITUDE   LATITUDE  PRICE_PER_SQFT  \
0        2   709.113608  22.486964  88.313191         0.04005   
1        3  1800.370665  27.400000  82.960000         0.03777   
2        6  3280.182232  24.690280  78.418890         0.10975   

   LAT_LONG_INTERACTION  
0           1985.895547  
1           2273.104000  
2           1936.184351  

After scaling (first 3 rows):
    BHK_NO.  SQUARE_FT  LONGITUDE  LATITUDE  PRICE_PER_SQFT  \
0 -0.444198  -0.007245   0.192632  1.088281       -0.307132   
1  0.684258  -0.006587   0.981127  0.580803       -0.326953   
2  4.069628  -0.005695   0.546243  0.150311        0.298797   

   LAT_LONG_INTERACTION  
0              0.622449  
1              1.117382  
2              0.536784  


In [5]:
def train_evaluate_model(
    model: Union[LinearRegression, KNeighborsRegressor, keras.Model],
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    dataset_name: str,
    model_name: str,
    epochs: int = 50,
    batch_size: int = 32
) -> tuple[dict, Union[LinearRegression, KNeighborsRegressor, keras.Model]]:
    """
    Trains a given model and evaluates its performance on the test set.

    Parameters:
        model: The machine learning model instance to train
        X_train, y_train: The training data and labels
        X_test, y_test: The testing data and labels
        dataset_name: Name of the dataset treatment (e.g., 'Original')
        model_name: Name of the model (e.g., 'Linear Regression')
        epochs: Number of training epochs; only used for Keras models
        batch_size: Mini-batch size; only used for Keras models

    Returns:
        tuple: (results_dict, trained_model_object). The dictionary has the
        keys 'Dataset', 'Model', 'RMSE', 'MAE' and 'R-squared'; RMSE and MAE
        are rounded to 2 decimals and R-squared to 4.
    """
    # Decide the training path from the model type as well as the label, so a
    # Keras model passed under a different display name is still trained with
    # the Keras arguments and has its 2-D predictions flattened.
    is_keras_model = model_name == 'Neural Network' or isinstance(model, keras.Model)

    if is_keras_model:
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
        # flatten() turns the prediction array into a 1-D vector for the metrics
        y_pred = model.predict(X_test).flatten()
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    # Calculate performance metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Store results in a dictionary
    results = {
        'Dataset': dataset_name,
        'Model': model_name,
        'RMSE': round(rmse, 2),
        'MAE': round(mae, 2),
        'R-squared': round(r2, 4)
    }

    return results, model

# Smoke-test train_evaluate_model with a plain linear regression on the
# scaled original dataset
test_model = LinearRegression()
test_results, _ = train_evaluate_model(
    model=test_model,
    X_train=X_train_scaled,
    y_train=y_original_train,
    X_test=X_test_scaled,
    y_test=y_original_test,
    dataset_name='Original',
    model_name='Linear Regression',
)

print("Test model performance:")
for metric, value in test_results.items():
    print(f"{metric}: {value}")


Test model performance:
Dataset: Original
Model: Linear Regression
RMSE: 542.75
MAE: 149.03
R-squared: 0.4594


In [6]:
# Evaluate a 7-neighbour k-NN regressor on the scaled original dataset
knn_model = KNeighborsRegressor(n_neighbors=7)
knn_results, _ = train_evaluate_model(
    model=knn_model,
    X_train=X_train_scaled,
    y_train=y_original_train,
    X_test=X_test_scaled,
    y_test=y_original_test,
    dataset_name='Original',
    model_name='k-NN Regression',
)

print("k-NN model performance:")
for metric, value in knn_results.items():
    print(f"{metric}: {value}")


k-NN model performance:
Dataset: Original
Model: k-NN Regression
RMSE: 522.24
MAE: 46.66
R-squared: 0.4995


In [7]:
# Create and test neural network model
def create_nn_model(input_dim: int) -> "keras.Model":
    """
    Builds and compiles a small feed-forward regression network.

    Architecture: Dense(64, relu) -> Dropout(0.2) -> Dense(32, relu)
    -> Dropout(0.1) -> Dense(1). The model is compiled with the Adam
    optimizer, mean-squared-error loss, and MAE as an extra metric.

    Parameters:
        input_dim (int): Number of input features (columns of the feature matrix)

    Returns:
        keras.Model: The compiled, untrained Sequential model
    """
    model = keras.Sequential([
        # Declare the input shape with an explicit Input layer; passing
        # input_dim to the first Dense layer is deprecated and emits a warning.
        keras.Input(shape=(input_dim,)),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(1)
    ])

    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# Build a network sized to the scaled feature matrix and evaluate it on the
# original dataset
input_dim = X_train_scaled.shape[1]
nn_model = create_nn_model(input_dim)
nn_results, _ = train_evaluate_model(
    model=nn_model,
    X_train=X_train_scaled,
    y_train=y_original_train,
    X_test=X_test_scaled,
    y_test=y_original_test,
    dataset_name='Original',
    model_name='Neural Network',
)

print("Neural Network model performance:")
for metric, value in nn_results.items():
    print(f"{metric}: {value}")

# Put the three result dictionaries side by side, one row per model
all_results = pd.DataFrame([test_results, knn_results, nn_results])
print("\nComparison of all models:")
display(all_results)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-06-15 23:16:48.362647: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Max
2025-06-15 23:16:48.362762: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 64.00 GB
2025-06-15 23:16:48.362770: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 24.00 GB
2025-06-15 23:16:48.362959: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-06-15 23:16:48.362974: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2025-06-15 23:16:49.083420: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plu

[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Neural Network model performance:
Dataset: Original
Model: Neural Network
RMSE: 530.68
MAE: 139.62
R-squared: 0.4831

Comparison of all models:


Unnamed: 0,Dataset,Model,RMSE,MAE,R-squared
0,Original,Linear Regression,542.75,149.03,0.4594
1,Original,k-NN Regression,522.24,46.66,0.4995
2,Original,Neural Network,530.68,139.62,0.4831


In [8]:
# Scale the removed-outliers split (the fitted scaler itself is not kept)
X_removed_train_scaled, X_removed_test_scaled, _ = scale_features(
    X_train=X_removed_train,
    X_test=X_removed_test,
    numerical_features=numerical_features_to_scale,
)

# Scale the capped-outliers split the same way
X_capped_train_scaled, X_capped_test_scaled, _ = scale_features(
    X_train=X_capped_train,
    X_test=X_capped_test,
    numerical_features=numerical_features_to_scale,
)

# Function to test all models on a dataset
def test_all_models(X_train_scaled: pd.DataFrame, X_test_scaled: pd.DataFrame, y_train: pd.Series, y_test: pd.Series, dataset_name: str) -> list[dict]:
    """
    Trains and evaluates all three model types on one dataset.

    The models are run in a fixed order: linear regression, k-NN regression
    (7 neighbours), then the neural network built by create_nn_model.

    Parameters:
        X_train_scaled, X_test_scaled: The scaled training and testing features
        y_train, y_test: The training and testing labels
        dataset_name: Name of the dataset treatment stored in each result

    Returns:
        list[dict]: One results dictionary per model, in the order above
    """
    # Pair each display label with a zero-argument factory so that every
    # model is only constructed right before it is trained.
    model_factories = (
        ('Linear Regression', LinearRegression),
        ('k-NN Regression', lambda: KNeighborsRegressor(n_neighbors=7)),
        ('Neural Network', lambda: create_nn_model(X_train_scaled.shape[1])),
    )

    collected = []
    for label, build in model_factories:
        metrics, _ = train_evaluate_model(
            build(), X_train_scaled, y_train, X_test_scaled, y_test,
            dataset_name, label
        )
        collected.append(metrics)

    return collected

# Run the three models on the removed-outliers dataset
removed_results = test_all_models(
    X_train_scaled=X_removed_train_scaled,
    X_test_scaled=X_removed_test_scaled,
    y_train=y_removed_train,
    y_test=y_removed_test,
    dataset_name='Removed Outliers',
)

# Run the three models on the capped-outliers dataset
capped_results = test_all_models(
    X_train_scaled=X_capped_train_scaled,
    X_test_scaled=X_capped_test_scaled,
    y_train=y_capped_train,
    y_test=y_capped_test,
    dataset_name='Capped Outliers',
)

# One row per (dataset, model): the earlier original-dataset runs come first,
# followed by the removed and capped runs
all_results = pd.DataFrame(
    [test_results, knn_results, nn_results, *removed_results, *capped_results]
)

# Display comprehensive comparison
print("Comprehensive comparison of all models across datasets:")
display(all_results)

# Reshape to models-by-dataset so each metric can be compared across treatments
pivot_results = all_results.pivot(
    index='Model',
    columns='Dataset',
    values=['RMSE', 'MAE', 'R-squared'],
)
print("\nPivot table of results:")
display(pivot_results)



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 855us/step
Comprehensive comparison of all models across datasets:


Unnamed: 0,Dataset,Model,RMSE,MAE,R-squared
0,Original,Linear Regression,542.75,149.03,0.4594
1,Original,k-NN Regression,522.24,46.66,0.4995
2,Original,Neural Network,530.68,139.62,0.4831
3,Removed Outliers,Linear Regression,30.16,22.39,0.4437
4,Removed Outliers,k-NN Regression,12.87,8.14,0.8987
5,Removed Outliers,Neural Network,31.75,20.12,0.3837
6,Capped Outliers,Linear Regression,46.71,33.13,0.5948
7,Capped Outliers,k-NN Regression,22.18,11.93,0.9086
8,Capped Outliers,Neural Network,76.18,52.47,-0.078



Pivot table of results:


Unnamed: 0_level_0,RMSE,RMSE,RMSE,MAE,MAE,MAE,R-squared,R-squared,R-squared
Dataset,Capped Outliers,Original,Removed Outliers,Capped Outliers,Original,Removed Outliers,Capped Outliers,Original,Removed Outliers
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Linear Regression,46.71,542.75,30.16,33.13,149.03,22.39,0.5948,0.4594,0.4437
Neural Network,76.18,530.68,31.75,52.47,139.62,20.12,-0.078,0.4831,0.3837
k-NN Regression,22.18,522.24,12.87,11.93,46.66,8.14,0.9086,0.4995,0.8987


In [9]:
# Build the export table from a copy of the results, ordered by dataset and
# then by model name
summary_df = all_results.copy().sort_values(['Dataset', 'Model'])

# Round each metric column to its display precision
for column, digits in (('RMSE', 2), ('MAE', 2), ('R-squared', 4)):
    summary_df[column] = summary_df[column].round(digits)

# Write the table to the results directory without the row index
summary_df.to_csv(config.RESULTS_DIR / 'model_performance_summary.csv', index=False)

print("\nPreview of the summary:")
display(summary_df)



Preview of the summary:


Unnamed: 0,Dataset,Model,RMSE,MAE,R-squared
6,Capped Outliers,Linear Regression,46.71,33.13,0.5948
8,Capped Outliers,Neural Network,76.18,52.47,-0.078
7,Capped Outliers,k-NN Regression,22.18,11.93,0.9086
0,Original,Linear Regression,542.75,149.03,0.4594
2,Original,Neural Network,530.68,139.62,0.4831
1,Original,k-NN Regression,522.24,46.66,0.4995
3,Removed Outliers,Linear Regression,30.16,22.39,0.4437
5,Removed Outliers,Neural Network,31.75,20.12,0.3837
4,Removed Outliers,k-NN Regression,12.87,8.14,0.8987


In [10]:
# Create a summary of the best performing model for each dataset.
# Rows are collected as plain dictionaries and turned into a DataFrame once,
# instead of growing a DataFrame with pd.concat on every iteration.
best_model_rows = []

for dataset in summary_df['Dataset'].unique():
    dataset_results = summary_df[summary_df['Dataset'] == dataset]

    # Lower is better for RMSE and MAE; higher is better for R-squared
    best_rmse = dataset_results.loc[dataset_results['RMSE'].idxmin()]
    best_mae = dataset_results.loc[dataset_results['MAE'].idxmin()]
    best_r2 = dataset_results.loc[dataset_results['R-squared'].idxmax()]

    best_model_rows.append({
        'Dataset': dataset,
        'Best RMSE Model': f"{best_rmse['Model']} ({best_rmse['RMSE']:.2f})",
        'Best MAE Model': f"{best_mae['Model']} ({best_mae['MAE']:.2f})",
        'Best R² Model': f"{best_r2['Model']} ({best_r2['R-squared']:.4f})"
    })

best_models = pd.DataFrame(best_model_rows)

# Save best models summary to CSV
best_models.to_csv(config.RESULTS_DIR / 'best_models_summary.csv', index=False)

print("\nBest performing models for each dataset:")
display(best_models)



Best performing models for each dataset:


Unnamed: 0,Dataset,Best RMSE Model,Best MAE Model,Best R² Model
0,Capped Outliers,k-NN Regression (22.18),k-NN Regression (11.93),k-NN Regression (0.9086)
1,Original,k-NN Regression (522.24),k-NN Regression (46.66),k-NN Regression (0.4995)
2,Removed Outliers,k-NN Regression (12.87),k-NN Regression (8.14),k-NN Regression (0.8987)
