In [0]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

from tqdm import tqdm
import matplotlib.pyplot as plt


# Seed the NumPy and PyTorch global random number generators so that steps
# drawing from them (layer weight initialisation, torch.randn_like sampling)
# start from the same state on every Restart & Run All.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Check for GPU: use CUDA when PyTorch reports it as available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


In [0]:

# Read the age-prediction table. The file is parsed as tab-separated text; the
# first row supplies the column names and the first column becomes the index.
data = pd.read_csv(
    "../data/age_pred_dataset.csv",
    sep="\t",
    index_col=0,
    header=0,
)


In [0]:
# Target: the "age" column as a 1-D NumPy array (flatten() hands back a copy).
y = data["age"].to_numpy().flatten()
# Features: every column except "age", as a 2-D NumPy array.
X = data.drop(columns=["age"]).to_numpy()
y.shape, X.shape

In [0]:
from sklearn.model_selection import train_test_split


# X and y are expected to exist already as NumPy arrays.
# Hold out 20% of the rows for testing; the split is fixed by random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Feature standardisation is currently switched off:
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# Copy each split into a float32 tensor living on the selected device.
X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor = (
    torch.tensor(split, dtype=torch.float32).to(device)
    for split in (X_train, X_test, y_train, y_test)
)

# Pair features with targets so they are batched together.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Mini-batch iterators: the training batches are reshuffled, the test batches
# keep their original order.
batch_size = 64
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)


In [0]:
# Define models

class ShallowAutoencoder(nn.Module):
    """One-layer autoencoder with a linear regression head on the latent code.

    The encoder is Linear(input_dim -> latent_dim) + ReLU, the decoder is
    Linear(latent_dim -> input_dim) + ReLU, and the regression head is a single
    Linear(latent_dim -> 1) applied to the encoder output.

    Args:
        input_dim: width of the input (and of the reconstruction).
        latent_dim: width of the latent code.
    """

    def __init__(self, input_dim, latent_dim):
        super().__init__()
        # Sub-modules are created in the order encoder, decoder, head.
        self.encoder = nn.Sequential(nn.Linear(input_dim, latent_dim), nn.ReLU())
        self.decoder = nn.Sequential(nn.Linear(latent_dim, input_dim), nn.ReLU())
        self.regression_head = nn.Linear(latent_dim, 1)

    def forward(self, x):
        """Return ``(latent_code, reconstruction, regression_output)`` for ``x``."""
        code = self.encoder(x)
        return code, self.decoder(code), self.regression_head(code)

class DeepAutoencoder(nn.Module):
    """Four-layer autoencoder with a linear regression head on the latent code.

    Encoder widths: input_dim -> input_dim//2 -> input_dim//4 -> input_dim//16
    -> latent_dim. Decoder widths: latent_dim -> 2*latent_dim -> 4*latent_dim
    -> 16*latent_dim -> input_dim. Every Linear layer, including the last one
    of each stack, is followed by a ReLU.

    Args:
        input_dim: width of the input (and of the reconstruction).
        latent_dim: width of the latent code.
    """

    def __init__(self, input_dim, latent_dim):
        super().__init__()
        encoder_widths = [input_dim, input_dim // 2, input_dim // 4, input_dim // 16, latent_dim]
        decoder_widths = [latent_dim, latent_dim * 2, latent_dim * 4, latent_dim * 16, input_dim]

        # Sub-modules are created in the order encoder, decoder, head.
        self.encoder = self._relu_stack(encoder_widths)
        self.decoder = self._relu_stack(decoder_widths)
        self.regression_head = nn.Linear(latent_dim, 1)

    @staticmethod
    def _relu_stack(widths):
        """Build Linear+ReLU pairs connecting each consecutive pair of widths."""
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return ``(latent_code, reconstruction, regression_output)`` for ``x``."""
        code = self.encoder(x)
        rebuilt = self.decoder(code)
        prediction = self.regression_head(code)
        return code, rebuilt, prediction
    

class VariationalAutoencoder(nn.Module):
    """Autoencoder with a Gaussian latent layer and a linear regression head.

    The encoder maps input_dim -> input_dim//4 -> input_dim//16 -> 50 (ReLU
    after each layer); two linear layers then produce the latent mean and
    log-variance. The decoder maps latent_dim -> 4*latent_dim -> 16*latent_dim
    -> input_dim (ReLU after each layer), and the regression head is a single
    Linear(latent_dim -> 1) applied to the latent vector.

    In training mode the latent vector is sampled with the reparameterisation
    trick. In eval mode the latent mean is used directly, so validation losses
    and predictions are deterministic for a given input.

    Note: ``forward`` does not return the mean / log-variance, so a caller
    cannot build a KL-divergence term from its outputs alone.

    Args:
        input_dim: width of the input (and of the reconstruction).
        latent_dim: width of the latent vector.
    """

    def __init__(self, input_dim, latent_dim):
        super(VariationalAutoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, input_dim//4),
            nn.ReLU(),
            nn.Linear(input_dim//4, input_dim//16),
            nn.ReLU(),
            nn.Linear(input_dim//16, 50),
            nn.ReLU()
        )
        # Two parallel projections of the 50-wide encoder output.
        self.mu = nn.Linear(50, latent_dim)
        self.logvar = nn.Linear(50, latent_dim)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, latent_dim * 4),
            nn.ReLU(),
            nn.Linear(latent_dim * 4, latent_dim * 16),
            nn.ReLU(),
            nn.Linear(latent_dim * 16, input_dim),
            nn.ReLU()
        )
        self.regression_head = nn.Linear(latent_dim, 1)

    def reparameterize(self, mu, logvar):
        """Draw ``mu + eps * std`` with ``eps ~ N(0, I)`` and ``std = exp(logvar / 2)``."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        """Return ``(z, reconstruction, regression_output)`` for ``x``.

        ``z`` is a random sample around the latent mean while the module is in
        training mode, and the latent mean itself in eval mode.
        """
        h = self.encoder(x)
        mu, logvar = self.mu(h), self.logvar(h)
        # Only inject sampling noise while training; at inference time random
        # noise would make early stopping and reported predictions vary from
        # call to call for the same input.
        z = self.reparameterize(mu, logvar) if self.training else mu
        decoded = self.decoder(z)
        regression_output = self.regression_head(z)
        return z, decoded, regression_output




In [0]:

# Training function
def train_model(model, name, train_loader, test_loader, criterion_ae, criterion_reg, optimizer, lambda_ae=1.0, lambda_reg=1.0, num_epochs=50, patience=10):
    """Train ``model`` on a weighted reconstruction + regression loss with early stopping.

    Args:
        model: module whose forward returns ``(latent, reconstructed, regression_output)``.
        name: label for the model; not used inside this function.
        train_loader: iterable of ``(X_batch, y_batch)`` used for optimisation.
        test_loader: iterable of ``(X_batch, y_batch)`` used to compute the
            per-epoch validation loss that drives early stopping.
        criterion_ae: loss called as ``criterion_ae(reconstructed, X_batch)``.
        criterion_reg: loss called as ``criterion_reg(prediction, y_batch)``.
        optimizer: optimizer already bound to ``model``'s parameters.
        lambda_ae: weight of the reconstruction loss.
        lambda_reg: weight of the regression loss.
        num_epochs: maximum number of epochs.
        patience: stop after this many consecutive epochs without a new best
            validation loss.

    Returns:
        ``(best_model, train_losses, val_losses)`` where ``best_model`` is an
        independent copy of the state dict taken at the epoch with the lowest
        validation loss (``None`` if no epoch ran), and the two lists hold the
        sample-weighted mean loss of every completed epoch.
    """
    import copy

    best_loss = float('inf')
    best_model = None
    early_stopping_counter = 0
    train_losses = []
    val_losses = []

    def _combined_loss(X_batch, y_batch):
        # Weighted sum of the reconstruction and regression losses for one batch.
        _, reconstructed, regression_output = model(X_batch)
        loss_ae = criterion_ae(reconstructed, X_batch)
        # Reshape to the target's shape rather than a bare squeeze(): squeeze()
        # collapses a batch of one sample to a 0-d tensor that no longer
        # matches the target.
        loss_reg = criterion_reg(regression_output.reshape(y_batch.shape), y_batch)
        return lambda_ae * loss_ae + lambda_reg * loss_reg

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            loss = _combined_loss(X_batch, y_batch)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch value is a per-sample mean.
            train_loss += loss.item() * X_batch.size(0)
        train_loss /= len(train_loader.dataset)
        train_losses.append(train_loss)

        # Validation loss
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                loss = _combined_loss(X_batch, y_batch)
                val_loss += loss.item() * X_batch.size(0)
        val_loss /= len(test_loader.dataset)
        val_losses.append(val_loss)

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

        # Early stopping
        if val_loss < best_loss:
            best_loss = val_loss
            # state_dict() returns references to the live parameter tensors, so
            # a deep copy is required; otherwise later optimizer steps would
            # silently overwrite the "best" weights.
            best_model = copy.deepcopy(model.state_dict())
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= patience:
                print("Early stopping")
                break

    return best_model, train_losses, val_losses

In [0]:

# Per-model training curves (neural models only) and test-set predictions.
dict_history = dict()
dict_y_pred = dict()

# Models and their display names are paired by position, so both lists must
# stay the same length and in the same order.
models = [

    VariationalAutoencoder(X.shape[1], 16),
    ShallowAutoencoder(X.shape[1], 16),
    DeepAutoencoder(X.shape[1], 16),
    # Tree-based baselines; a fixed random_state keeps their fits repeatable.
    XGBRegressor(random_state=42),
    RandomForestRegressor(random_state=42),

]

model_names = [

    "VariationalAutoencoder",
    "ShallowAutoencoder",
    "DeepAutoencoder",
    # "LinearRegressor",
    "XGBoostRegressor",
    "RandomForestRegressor",

]

# zip() stops at the shorter sequence without complaint, which would silently
# skip models; fail loudly instead if the two lists drift apart.
assert len(models) == len(model_names), "models and model_names must have the same length"


for model, name in zip(models, model_names):

    print(f"Training {name}")
    if isinstance(model, nn.Module):
        # Neural models: joint reconstruction + regression training.
        model.to(device)
        criterion_ae = nn.MSELoss()
        criterion_reg = nn.MSELoss()  # todo choose metric
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        # NOTE(review): test_loader is used both for early stopping here and
        # for the final evaluation below, so the reported test performance is
        # optimistically biased; consider a separate validation split.
        best_model, train_losses, val_losses = train_model(model, name, train_loader, test_loader, criterion_ae, criterion_reg, optimizer, num_epochs=50, lambda_ae=1, lambda_reg=1)

        # Save the best model
        torch.save(best_model, f"{name}_best_model.pth")

        # Restore the best weights and predict on the held-out set
        model.load_state_dict(best_model)
        model.eval()
        with torch.no_grad():
            _, reconstructed, regression_output = model(X_test_tensor)

        # Log losses
        dict_history[name] = {
                "train_losses": np.array(train_losses),
                "val_losses": np.array(val_losses)
            }

        # log predicted values
        dict_y_pred[name] = regression_output.cpu().numpy()
    else:
        # scikit-learn style estimators: fit on the raw NumPy training split.
        model.fit(X_train, y_train)
        dict_y_pred[name] = model.predict(X_test)


In [0]:

# Training vs. validation loss per epoch, one stacked panel (3 rows x 1 column)
# for each entry of dict_history.
fig = plt.figure(figsize=(5, 12))
for i, (model_name, history) in enumerate(dict_history.items()):
    ax = fig.add_subplot(3, 1, i + 1)
    ax.plot(history['train_losses'], label=f'{model_name} Train Loss')
    ax.plot(history['val_losses'], label=f'{model_name} Validation Loss')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.set_title(model_name)
    ax.legend()
    ax.grid(True)
    fig.tight_layout()



In [0]:
# Predicted vs. true values on the test split, one panel per model in a
# 3 x 2 grid, with the identity line drawn for reference.
fig = plt.figure(figsize=(12, 12))

for i, (model_name, y_pred) in enumerate(dict_y_pred.items()):

    # Drop singleton dimensions so the predictions line up with y_test.
    y_pred = y_pred.squeeze()

    ax = fig.add_subplot(3, 2, i + 1)
    ax.scatter(y_test, y_pred, alpha=0.5)

    # Dashed red diagonal spanning the range of the true values.
    lo, hi = y_test.min(), y_test.max()
    ax.plot([lo, hi], [lo, hi], 'r--', lw=2)

    ax.set_xlabel('True Values')
    ax.set_ylabel('Predicted Values')
    ax.set_title(model_name)

    # R^2 is computed only over samples whose prediction lies in [0, 100];
    # predictions outside that interval are excluded from the score.
    mask = (y_pred >= 0) & (y_pred <= 100)
    ss_res = np.sum((y_test[mask] - y_pred[mask]) ** 2)
    ss_tot = np.sum((y_test[mask] - np.mean(y_test[mask])) ** 2)
    r2 = 1 - (ss_res / ss_tot)
    ax.text(0.05, 0.95, f'R^2: {r2:.2f}', transform=ax.transAxes, fontsize=12, verticalalignment='top')

    fig.tight_layout()