In [None]:
# Setup imports
from typing import Union
import config
import pandas as pd
import numpy as np
import pathlib
import joblib
import json
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from IPython.display import display

Matplotlib is building the font cache; this may take a moment.


In [None]:
# Helper function to load data
def load_data_splits(dataset_prefix: str, base_path: pathlib.Path) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Loads the train/test data splits for a given dataset prefix.

    Four CSV files are read from ``base_path``:
    ``X_<prefix>_train.csv``, ``X_<prefix>_test.csv``,
    ``y_<prefix>_train.csv`` and ``y_<prefix>_test.csv``.

    Parameters:
        dataset_prefix (str): The prefix used in the file names
            (e.g., 'original', 'removed', 'capped').
        base_path (pathlib.Path): Directory containing the CSV files. A plain
            string path is accepted as well.

    Returns:
        tuple: (X_train, X_test, y_train, y_test). The feature sets are
        DataFrames. A single-column target file is returned as a Series,
        regardless of how many rows it has; a target file with several
        columns stays a DataFrame.
    """
    base_path = pathlib.Path(base_path)

    def _read(kind: str, split: str) -> pd.DataFrame:
        # Single place that knows the '<kind>_<prefix>_<split>.csv' naming scheme.
        return pd.read_csv(base_path / f'{kind}_{dataset_prefix}_{split}.csv')

    X_train = _read('X', 'train')
    X_test = _read('X', 'test')
    # squeeze('columns') collapses only the column axis. A bare squeeze()
    # would also collapse the row axis and turn a one-row target file into a
    # scalar instead of a Series.
    y_train = _read('y', 'train').squeeze('columns')
    y_test = _read('y', 'test').squeeze('columns')
    return X_train, X_test, y_train, y_test

# Load the data splits for each outlier treatment: original, removed, and capped
X_original_train, X_original_test, y_original_train, y_original_test = load_data_splits('original', config.PROCESSED_DATA_DIR)
X_removed_train, X_removed_test, y_removed_train, y_removed_test = load_data_splits('removed', config.PROCESSED_DATA_DIR)
X_capped_train, X_capped_test, y_capped_train, y_capped_test = load_data_splits('capped', config.PROCESSED_DATA_DIR)

# Load the list of numerical features.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding.
with open(config.PROCESSED_DATA_DIR / 'numerical_features_to_scale.json', 'r', encoding='utf-8') as f:
    numerical_features_to_scale = json.load(f)

In [None]:
def scale_features(X_train: pd.DataFrame, X_test: pd.DataFrame, numerical_features: list) -> tuple[pd.DataFrame, pd.DataFrame, StandardScaler]:
    """
    Standardises the selected numerical columns of a train/test pair.

    The scaler is fitted on the training rows only and then applied to
    both splits, so no information from the test set reaches the fit.
    The input frames are left untouched; scaled copies are returned.

    Parameters:
        X_train (pd.DataFrame): Training features.
        X_test (pd.DataFrame): Testing features.
        numerical_features (list): Names of the columns to scale.

    Returns:
        tuple: (X_train_scaled, X_test_scaled, scaler_object), where
        scaler_object is the StandardScaler fitted on X_train.
    """
    # Fit once on the training columns...
    fitted_scaler = StandardScaler().fit(X_train[numerical_features])

    # ...then write the transformed values into fresh copies of each split.
    train_out = X_train.copy()
    test_out = X_test.copy()
    train_out[numerical_features] = fitted_scaler.transform(X_train[numerical_features])
    test_out[numerical_features] = fitted_scaler.transform(X_test[numerical_features])

    return train_out, test_out, fitted_scaler


In [None]:
def train_evaluate_model(
    model: Union[LinearRegression, KNeighborsRegressor, keras.Model],
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    dataset_name: str,
    model_name: str,
    epochs: int = 50,
    batch_size: int = 32
) -> tuple[dict, Union[LinearRegression, KNeighborsRegressor, keras.Model]]:
    """
    Trains a given model and evaluates its performance on the test split.

    The training path is chosen from the type of ``model``: Keras models are
    fitted with ``epochs``/``batch_size`` and silent logging, and their
    predictions are flattened to one dimension; any other model is fitted and
    queried through the plain ``fit(X, y)`` / ``predict(X)`` interface.

    Parameters:
        model: The machine learning model instance to train (fitted in place)
        X_train, y_train: The training data and labels
        X_test, y_test: The testing data and labels
        dataset_name: Name of the dataset treatment (e.g., 'Original')
        model_name: Name of the model (e.g., 'Linear Regression'); used only
            as a label in the results
        epochs: Number of training epochs; only used for Keras models
        batch_size: Mini-batch size; only used for Keras models

    Returns:
        tuple: (results_dict, trained_model_object). results_dict has the keys
        'Dataset', 'Model', 'RMSE', 'MAE' (rounded to 2 decimals) and
        'R-squared' (rounded to 4 decimals). trained_model_object is the same
        object that was passed in as ``model``.
    """
    # Dispatch on the model's type rather than its display name, so a Keras
    # model is trained correctly whatever label the caller gives it.
    if isinstance(model, keras.Model):
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
        # verbose=0 keeps the per-batch progress bar out of the cell output,
        # matching the silent fit above. Keras returns shape (n_samples, 1).
        y_pred = model.predict(X_test, verbose=0).flatten()
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    # Calculate performance metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Store results in a dictionary (one row of the final comparison table)
    results = {
        'Dataset': dataset_name,
        'Model': model_name,
        'RMSE': round(rmse, 2),
        'MAE': round(mae, 2),
        'R-squared': round(r2, 4)
    }

    return results, model


In [None]:
# Define Models and Execute Training Pipeline
def create_nn_model(input_shape: int) -> keras.Model:
    """
    Builds and compiles a small feed-forward network for regression.

    Architecture, in order: an input of ``input_shape`` features, a 128-unit
    ReLU layer, dropout with rate 0.2, a 64-unit ReLU layer, and a single
    linear output unit for the continuous target. The model is compiled with
    the Adam optimizer and mean squared error loss.

    Parameters:
        input_shape (int): Number of features per sample; sets the width of
                           the Input layer.

    Returns:
        keras.Model: A compiled, untrained Keras model instance.
    """
    network_layers = [
        layers.Input(shape=(input_shape,)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(64, activation='relu'),
        layers.Dense(1),  # single linear unit: the regression output
    ]
    regressor = keras.Sequential(network_layers)
    regressor.compile(loss='mean_squared_error', optimizer='adam')
    return regressor

# Define the models to be trained.
# The scikit-learn entries are templates: each experiment trains a fresh clone,
# so the model stored for one dataset is never refit on another dataset.
models_to_train = {
    'Linear Regression': LinearRegression(),
    'k-NN Regression': KNeighborsRegressor(n_neighbors=7),
    'Neural Network': None  # Placeholder, will be created in the loop
}

# Prepare for the experiment loop
datasets = {
    'Original': (X_original_train, X_original_test, y_original_train, y_original_test),
    'Removed Outliers': (X_removed_train, X_removed_test, y_removed_train, y_removed_test),
    'Capped Outliers': (X_capped_train, X_capped_test, y_capped_train, y_capped_test)
}

all_results = []
trained_artifacts = {} # To store trained models and scalers

# Seed applied before every neural-network build so each run starts from the
# same initial weights and shuffling order.
# NOTE(review): bit-exact repeatability on GPU backends is not guaranteed.
RANDOM_SEED = 42

print("Starting model training pipeline for all 9 experiments...")

# --- Main Experiment Loop ---
for d_name, (X_train, X_test, y_train, y_test) in datasets.items():
    print(f"\n--- Processing Dataset: {d_name} ---")

    # Step 1: Scale the features for this dataset
    X_train_s, X_test_s, scaler = scale_features(X_train, X_test, numerical_features_to_scale)
    trained_artifacts[f'{d_name}_scaler'] = scaler

    for m_name, model_instance in models_to_train.items():
        print(f"  -> Training Model: {m_name}...")

        # Step 2: Train and evaluate the model
        # Every run gets its own model object. Reusing one estimator across
        # datasets would make all stored artifacts point at a single object
        # holding only the fit from the last dataset.
        if m_name == 'Neural Network':
            keras.utils.set_random_seed(RANDOM_SEED)
            current_model = create_nn_model(X_train_s.shape[1])
        else:
            # clone() gives an unfitted estimator with the same hyper-parameters
            current_model = clone(model_instance)

        results, trained_model = train_evaluate_model(
            current_model, X_train_s, y_train, X_test_s, y_test, d_name, m_name
        )

        # Step 3: Store results and artifacts
        all_results.append(results)
        trained_artifacts[f'{d_name}_{m_name}'] = trained_model

print("\nAll model training experiments completed.")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-06-15 23:16:48.362647: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Max
2025-06-15 23:16:48.362762: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 64.00 GB
2025-06-15 23:16:48.362770: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 24.00 GB
2025-06-15 23:16:48.362959: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-06-15 23:16:48.362974: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2025-06-15 23:16:49.083420: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plu

[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Neural Network model performance:
Dataset: Original
Model: Neural Network
RMSE: 530.68
MAE: 139.62
R-squared: 0.4831

Comparison of all models:


Unnamed: 0,Dataset,Model,RMSE,MAE,R-squared
0,Original,Linear Regression,542.75,149.03,0.4594
1,Original,k-NN Regression,522.24,46.66,0.4995
2,Original,Neural Network,530.68,139.62,0.4831


In [None]:
# Compile, Display, and Save Results
# Compile results into a DataFrame (one row per dataset/model experiment)
results_df = pd.DataFrame(all_results)

print("\nFinal Model Performance Comparison")
display(results_df.sort_values(by=['Dataset', 'RMSE']))

# Create a pivot table for easier comparison: models as rows, one column per
# (metric, dataset) pair
pivot_results = results_df.pivot(index='Model', columns='Dataset', values=['RMSE', 'MAE', 'R-squared'])
print("\nPivot Table of Results")
display(pivot_results)

# Make sure the output folders exist so a fresh checkout does not fail on the
# first write.
for output_dir in (config.RESULTS_DIR, config.MODELS_DIR):
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

# Save the results tables to the results folder
results_df.to_csv(config.RESULTS_DIR / 'model_performance_summary.csv', index=False)
pivot_results.to_csv(config.RESULTS_DIR / 'model_performance_pivot.csv')

# Save all trained models and scalers
print("\nSaving all trained models and scalers...")
for name, artifact in trained_artifacts.items():
    # Choose the format from the artifact's type, not from its name:
    # Keras models use the native .keras format; everything else (scalers and
    # scikit-learn estimators) is serialised with joblib.
    if isinstance(artifact, keras.Model):
        artifact.save(config.MODELS_DIR / f'{name}.keras')
    else:
        joblib.dump(artifact, config.MODELS_DIR / f'{name}.joblib')

print("\nAll artifacts saved successfully to the 'models/' directory.")