In [2]:
import os

# Module-level state shared with install_requirements().
requirements_installed = False  # set to True after the first successful install
max_retries = 3  # retries allowed after a failed attempt
retries = 0  # retries used so far; persists across calls


def install_requirements():
    """Install the packages listed in requirements.txt, retrying on failure.

    Does nothing (apart from printing a notice) once an install has succeeded
    in this session. On failure the install is retried until the shared
    ``retries`` counter reaches ``max_retries``.

    Raises:
        SystemExit: with code 1 when the install fails and no retries remain.
    """
    # Both names are rebound in this function, so both must be declared
    # global. Without `retries` here, the failure path raised
    # UnboundLocalError instead of retrying.
    global requirements_installed, retries

    if requirements_installed:
        print("Requirements already installed.")
        return

    # A loop instead of recursion: same attempt/print sequence, flat stack.
    while True:
        print("Installing requirements...")
        install_status = os.system("pip install -r requirements.txt")
        if install_status == 0:
            print("Requirements installed successfully.")
            requirements_installed = True
            return

        print("Failed to install requirements.")
        if retries >= max_retries:
            # Equivalent to exit(1) in a script, without depending on the
            # interactive-only `exit` helper being defined.
            raise SystemExit(1)
        print("Retrying...")
        retries += 1

In [3]:
# Run the install before the later cells import third-party packages (dotenv, torch, sklearn, pandas).
install_requirements()

Installing requirements...
Requirements installed successfully.


In [4]:
from dotenv import load_dotenv
import os


def setup_env(variables_to_check=None):
    """Load variables via load_dotenv() and verify that required ones are set.

    Args:
        variables_to_check: Optional name, or iterable of names, of
            environment variables that must be set. Defaults to None, which
            validates nothing (the same as the previous hard-coded empty list).

    Raises:
        SystemExit: with code 1 at the first variable that is not set.
    """
    def check_env(env_var):
        """Print the status of one variable; abort if it is not set."""
        if os.getenv(env_var) is None:
            print(f"Please set the {env_var} environment variable.")
            # Equivalent to exit(1) in a script, without depending on the
            # interactive-only `exit` helper being defined.
            raise SystemExit(1)
        print(f"{env_var} is set.")

    load_dotenv()

    # A bare string would otherwise be iterated character by character.
    if isinstance(variables_to_check, str):
        variables_to_check = [variables_to_check]

    for var in variables_to_check or ():
        check_env(var)

In [5]:
# Called with no arguments: this loads the .env file via load_dotenv() and validates no variables.
setup_env()

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
from copy import deepcopy

# Seed PyTorch's global RNG once, when this cell runs, for reproducibility.
# NOTE(review): nothing visible in this notebook reseeds before each experiment,
# so results presumably depend on the order the experiment cells run in -- confirm.
random_seed = 42

torch.manual_seed(random_seed)

class HousingPriceDataset:
    """Load a housing CSV, split it, and preprocess it into float32 tensors.

    Numeric feature columns are standardised and object-dtype columns are
    one-hot encoded. The preprocessor is fitted on the training split only and
    then applied to the test split.

    Attributes set by the constructor:
        data: the raw DataFrame read from ``csv_path``.
        X, y: unprocessed features and the target divided by ``scale_factor``.
        preprocessor: the fitted ColumnTransformer, kept so new rows can be
            transformed the same way before prediction.
        X_train, X_test: float32 feature tensors.
        y_train, y_test: float32 target tensors reshaped to (n, 1).
    """
    def __init__(self, csv_path, target_column, scale_factor=1_000_000,
                 test_size=0.2, random_state=42):
        """Read, split and preprocess the dataset.

        Args:
            csv_path: Path of the CSV file to load.
            target_column: Name of the column to predict.
            scale_factor: Divisor applied to the target column.
            test_size: Passed to train_test_split (default 0.2, as before).
            random_state: Passed to train_test_split (default 42, as before).
        """
        # Load the dataset
        self.data = pd.read_csv(csv_path)

        # Separate features and target
        self.X = self.data.drop(columns=[target_column])
        self.y = self.data[target_column] / scale_factor  # Scale target

        # Identify categorical and numeric columns. Columns that are neither
        # object nor numeric dtype are not passed to any transformer.
        categorical_cols = self.X.select_dtypes(include=['object']).columns
        numeric_cols = self.X.select_dtypes(include=['number']).columns

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numeric_cols),
                # 'ignore' encodes a category seen only in the test split as
                # all zeros instead of raising during transform().
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
            ],
            # Always return a dense array; torch.tensor() cannot consume the
            # scipy sparse matrix that a mostly one-hot output would yield.
            sparse_threshold=0,
        )

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=random_state
        )

        # Fit on the training split only, then reuse the fitted transform on
        # the test split.
        self.X_train = torch.tensor(self.preprocessor.fit_transform(X_train), dtype=torch.float32)
        self.X_test = torch.tensor(self.preprocessor.transform(X_test), dtype=torch.float32)
        self.y_train = torch.tensor(y_train.values, dtype=torch.float32).reshape(-1, 1)
        self.y_test = torch.tensor(y_test.values, dtype=torch.float32).reshape(-1, 1)

    def get_input_size(self):
        """Return the number of feature columns after preprocessing."""
        return self.X_train.shape[1]

    def get_train_data(self):
        """Return the ``(X_train, y_train)`` tensors."""
        return self.X_train, self.y_train

    def get_test_data(self):
        """Return the ``(X_test, y_test)`` tensors."""
        return self.X_test, self.y_test

class HousingPriceModel(nn.Module):
    """Fully connected regression network for housing prices.

    The network is built from ``training_config["layers"]``: each hidden layer
    is Linear -> activation -> Dropout, followed by a single-output Linear
    layer. The training tensors are taken from the dataset at construction
    time, and ``train_model`` runs full-batch Adam on them.

    Recognised config keys: "epochs", "learning_rate", "dropout_rate",
    "l2_lambda" (passed to Adam as weight_decay), "layers", "activation"
    ("ReLU", "Tanh" or "Sigmoid").
    """
    def __init__(self, dataset: "HousingPriceDataset", training_config=None):
        """Build the network.

        Args:
            dataset: Object providing ``get_input_size()`` and
                ``get_train_data()``.
            training_config: Dict with the keys listed on the class. When
                None, a default with hidden layers [64, 32] is used.
        """
        # Initialise nn.Module before attaching any attributes to it.
        super().__init__()

        if training_config is None:
            training_config = {
                "epochs": 500,
                "learning_rate": 0.01,
                "dropout_rate": 0.5,
                "l2_lambda": 0.01,
                "layers": [64, 32],
                "activation": "ReLU"
            }
        # Copy so later edits to the caller's dict cannot change this model,
        # matching what set_training_config() does.
        self.training_config = deepcopy(training_config)
        config = self.training_config

        input_size = dataset.get_input_size()
        self.X_train, self.y_train = dataset.get_train_data()

        activations = {"ReLU": nn.ReLU, "Tanh": nn.Tanh, "Sigmoid": nn.Sigmoid}
        activation_cls = activations.get(config["activation"])
        if activation_cls is None:
            # As before, an unrecognised name builds hidden layers without an
            # activation; it is now reported instead of passing silently.
            import warnings
            warnings.warn(
                f"Unknown activation {config['activation']!r}; "
                "hidden layers will have no activation function."
            )

        # Build the model dynamically based on config
        layers = []
        previous_size = input_size
        for size in config["layers"]:
            layers.append(nn.Linear(previous_size, size))
            if activation_cls is not None:
                layers.append(activation_cls())
            layers.append(nn.Dropout(config["dropout_rate"]))
            previous_size = size
        layers.append(nn.Linear(previous_size, 1))  # Output layer
        self.model = nn.Sequential(*layers)
        # None until train_model() has run; afterwards a state_dict snapshot.
        self.cached_model = None

    def forward(self, x):
        """Run ``x`` through the network."""
        return self.model(x)

    def set_training_config(self, config):
        """Replace the stored config with a deep copy of ``config``.

        The network itself is not rebuilt, so only the keys read by
        ``train_model`` take effect after construction.
        """
        self.training_config = deepcopy(config)

    def get_training_config(self):
        """Return a deep copy of the stored config."""
        return deepcopy(self.training_config)

    def train_model(self):
        """Train on the stored training tensors with full-batch Adam and MSE loss.

        Prints the loss every 50 epochs and stores a snapshot of the trained
        weights in ``cached_model``.
        """
        training_config = self.get_training_config()
        epochs = training_config["epochs"]
        learning_rate = training_config["learning_rate"]
        l2_lambda = training_config["l2_lambda"]
        criterion = nn.MSELoss()
        optimizer = optim.Adam(self.parameters(), lr=learning_rate, weight_decay=l2_lambda)

        # The predict methods switch the module to eval mode; switch back so
        # dropout is active whenever training runs again.
        self.train()

        for epoch in range(epochs):
            # Forward pass
            outputs = self(self.X_train)
            loss = criterion(outputs, self.y_train)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Print loss every 50 epochs
            if (epoch + 1) % 50 == 0:
                print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}')

        # state_dict() returns references to the live tensors; copy them so
        # the cache is a real snapshot of the trained weights.
        self.cached_model = deepcopy(self.state_dict())

    def _assert_model_state(self):
        """Train the model first if it has not been trained yet."""
        if self.cached_model is None:
            print("Model is not trained. Training the model now...")
            self.train_model()

    def predict_price(self, X):
        """Return the prediction for ``X`` as a Python float.

        ``.item()` requires the output to hold exactly one element, so ``X``
        must describe a single sample.
        """
        self._assert_model_state()
        self.eval()
        with torch.no_grad():
            prediction = self(X)
        return prediction.item()

    def batch_predict(self, X, y=None):
        """Predict for a batch and, when ``y`` is given, compute metrics.

        Returns:
            ``(predictions, metrics)`` where ``metrics`` has the keys "loss"
            (MSE), "rms", "mape" (percent) and "mae". All values are None when
            ``y`` is None.
        """
        self._assert_model_state()
        self.eval()
        metrics = {
            "loss": None,
            "rms": None,
            "mape": None,
            "mae": None
        }
        with torch.no_grad():
            predictions = self(X)
            if y is not None:
                loss = nn.MSELoss()(predictions, y).item()
                rms = torch.sqrt(torch.tensor(loss)).item()
                # Divides by y element-wise, so a zero target makes this non-finite.
                mape = torch.mean(torch.abs((predictions - y) / y) * 100).item()

                metrics["loss"] = loss
                metrics["rms"] = rms
                metrics["mape"] = mape
                metrics["mae"] = nn.L1Loss()(predictions, y).item()
        return predictions, metrics

In [7]:
from IPython.display import clear_output


def run_experiment(
    csv_path,
    target_column,
    training_config=None,
    num_samples=5,
):
    """Train a model on a CSV dataset and print test-set metrics.

    Builds a HousingPriceDataset and a HousingPriceModel, trains the model,
    evaluates it on the test split, clears the cell output (which removes the
    training log), and prints the metrics followed by a few sample predictions.

    Args:
        csv_path: Path of the CSV file to load.
        target_column: Name of the column to predict.
        training_config: Config dict for HousingPriceModel. When None, a
            default with 500 epochs and hidden layers [128, 64, 32] is used.
        num_samples: Maximum number of sample predictions to print.

    Returns:
        None. Results are printed only, so a call at the end of a cell does
        not produce an extra Out[] value.
    """
    if training_config is None:
        # Built per call so the default is never shared between calls.
        training_config = {
            "epochs": 500,
            "learning_rate": 0.01,
            "dropout_rate": 0.5,
            "l2_lambda": 0.01,
            "layers": [128, 64, 32],
            "activation": "ReLU",
        }

    dataset = HousingPriceDataset(csv_path, target_column)
    model = HousingPriceModel(dataset, training_config)
    model.train_model()
    predictions, metrics = model.batch_predict(dataset.X_test, dataset.y_test)
    clear_output()

    print("METRICS:")
    print(f"MSE: {metrics['loss']:.4f}")
    print(f"RMSE: {metrics['rms']:.4f}")
    print(f"MAPE: {metrics['mape']:.4f}%")
    print(f"MAE: {metrics['mae']:.4f}")

    # Print sample predictions, capped at the number of test rows so a small
    # test split does not raise IndexError.
    print("Sample predictions vs actual values:")
    for i in range(min(num_samples, len(predictions))):
        print(
            f"Predicted: {predictions[i].item():.2f}, Actual: {dataset.y_test[i].item():.2f}"
        )

In [8]:
# Experiment 1: baseline -- 1000 epochs, two hidden layers (128 -> 64), ReLU activation.

csv_path = "data/housing_prices/housing_prices.csv"
target_column = "price"

training_config = {
    "epochs": 1000,
    "learning_rate": 0.01,
    "dropout_rate": 0.5,
    "l2_lambda": 0.01,
    "layers": [128, 64],
    "activation": "ReLU",
}

run_experiment(csv_path, target_column, training_config=training_config)

METRICS:
MSE: 1.9478
RMSE: 1.3956
MAPE: 21.4607%
MAE: 1.0152
Sample predictions vs actual values:
Predicted: 4.94, Actual: 4.06
Predicted: 7.71, Actual: 6.65
Predicted: 3.39, Actual: 3.71
Predicted: 4.44, Actual: 6.44
Predicted: 3.49, Actual: 2.80


In [9]:
# Experiment 2: 1000 epochs, 3 hidden layers (128 -> 64 -> 32), ReLU activation.
# In the recorded run this scored slightly worse than Experiment 1
# (MSE 2.13 vs 1.95), so it is not the best configuration on these outputs.

csv_path = "data/housing_prices/housing_prices.csv"
target_column = "price"

training_config = {
    "epochs": 1000,
    "learning_rate": 0.01,
    "dropout_rate": 0.5,
    "l2_lambda": 0.01,
    "layers": [128, 64, 32],
    "activation": "ReLU",
}

run_experiment(csv_path, target_column, training_config=training_config)

METRICS:
MSE: 2.1347
RMSE: 1.4611
MAPE: 22.1002%
MAE: 1.0446
Sample predictions vs actual values:
Predicted: 5.19, Actual: 4.06
Predicted: 7.17, Actual: 6.65
Predicted: 3.29, Actual: 3.71
Predicted: 4.39, Actual: 6.44
Predicted: 3.39, Actual: 2.80


In [10]:
# Experiment 3: 5x the epochs (5000), five hidden layers, and a higher
# learning rate (0.05 instead of 0.01).
# In the recorded run every sample prediction is 4.65, i.e. the model
# collapsed to a constant output.

csv_path = "data/housing_prices/housing_prices.csv"
target_column = "price"

training_config = {
    "epochs": 5000,
    "learning_rate": 0.05,
    "dropout_rate": 0.5,
    "l2_lambda": 0.01,
    "layers": [128, 64, 32, 64, 128],
    "activation": "ReLU"
}

run_experiment(csv_path, target_column, training_config=training_config)

METRICS:
MSE: 5.1850
RMSE: 2.2771
MAPE: 39.3645%
MAE: 1.7478
Sample predictions vs actual values:
Predicted: 4.65, Actual: 4.06
Predicted: 4.65, Actual: 6.65
Predicted: 4.65, Actual: 3.71
Predicted: 4.65, Actual: 6.44
Predicted: 4.65, Actual: 2.80


In [13]:
# Experiment 4: 5000 epochs, 3 hidden layers, ReLU activation, synthetic data with 1000 samples.
# NOTE(review): an earlier header for this cell said 1000 epochs; the config
# below uses 5000 -- confirm which was intended.
# Note: Adding synthetic data doesn't seem to help much. The recorded metrics
# (MSE 2.23, MAPE 24.2%) are no better than Experiments 1-2, although the test
# split differs because the dataset differs.

csv_path = "data/housing_prices/augmented_housing_prices_large.csv"
target_column = "price"

training_config = {
    "epochs": 5000,
    "learning_rate": 0.01,
    "dropout_rate": 0.5,
    "l2_lambda": 0.01,
    "layers": [128, 64, 32],
    "activation": "ReLU",
}

run_experiment(csv_path, target_column, training_config=training_config)

METRICS:
MSE: 2.2343
RMSE: 1.4947
MAPE: 24.1896%
MAE: 1.0865
Sample predictions vs actual values:
Predicted: 4.74, Actual: 5.19
Predicted: 3.73, Actual: 4.53
Predicted: 3.61, Actual: 2.03
Predicted: 3.65, Actual: 4.41
Predicted: 3.60, Actual: 2.48
