# data loading

In [0]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device


In [0]:
# Read the three tab-separated tables; the first column of each becomes the row index.
# The age table is read without a header row, the two abundance tables with one.
df_age = pd.read_csv("../data/age.csv", sep='\t', header=None, index_col=0)
df_abundance_log = pd.read_csv("../data/processed_log_abundance.csv", sep='\t', header=0, index_col=0)
df_abundance = pd.read_csv("../data/processed_abundance.csv", sep='\t', header=0, index_col=0)

# Regression target: all values of the age table as a 1-D array.
y = df_age.to_numpy().flatten()

# Feature matrices, with rows selected in the order of the age table's index.
X_log = df_abundance_log.loc[df_age.index].to_numpy()
X_abundance = df_abundance.loc[df_age.index].to_numpy()


y.shape, X_log.shape, X_abundance.shape

In [0]:
# Presence/absence encoding: 1 where the abundance is strictly positive, 0 elsewhere.
X_presence = np.greater(X_abundance, 0).astype(int)
# Look-up table from data-type name to the matching feature matrix.
dct_X = dict(abundance=X_abundance, log=X_log, presence=X_presence)
# Release the DataFrame references; the arrays extracted from them are kept.
del df_abundance_log, df_abundance, df_age

In [0]:
# Bin each age by floor-dividing by 10, then fold bin 9 into bin 8.
y_class = np.where(y // 10 == 9, 8, y // 10)

In [0]:
# Histogram of the age bins that are later used to stratify the split.
sns.histplot(data=y_class)

In [0]:

# Split row positions (not the data itself) once, so that every feature
# representation is later indexed with exactly the same train/test rows.
# The split is stratified on the age bins and uses a fixed seed.
train_idx, test_idx = train_test_split(
    range(X_abundance.shape[0]),
    stratify=y_class,
    test_size=0.2,
    random_state=42,
)
test_idx[:10]

[4436, 2292, 4448, 4903, 2378, 842, 2625, 3097, 4898, 1911]

# model and training functions

In [0]:
# data type specific parameters
def get_params(X_type):
    """Return the preprocessing and autoencoder settings for one data representation.

    Parameters
    ----------
    X_type : str
        Name of the feature representation: 'abundance', 'log' or 'presence'.

    Returns
    -------
    tuple
        ``(scale_data, decoder_activation, ae_loss_function)`` where
        ``scale_data`` is a bool, ``decoder_activation`` is an ``nn.Module``
        or ``None`` (no activation on the decoder output), and
        ``ae_loss_function`` is the reconstruction loss module.

    Raises
    ------
    ValueError
        If ``X_type`` is not one of the three supported names.
    """
    if X_type in ('abundance', 'log'):
        scale_data = True
        decoder_activation = None
        # reduction='sum' adds up the squared error of every element (zeros
        # included) instead of dividing by the number of elements, so the loss
        # is not shrunk by the size of the mostly-zero feature matrix.
        ae_loss_function = nn.MSELoss(reduction='sum')

    elif X_type == 'presence':
        scale_data = False  # no need to scale as the data is already 0/1
        # Sigmoid keeps the decoder output inside [0, 1], which BCELoss requires;
        # the output is a value in that range, not a hard 0/1 decision.
        decoder_activation = nn.Sigmoid()
        ae_loss_function = nn.BCELoss(reduction='sum')  # use binary loss                    # todo try focal ?

    else:
        raise ValueError(
            f"Invalid data type {X_type!r}; expected 'abundance', 'log' or 'presence'"
        )

    return scale_data, decoder_activation, ae_loss_function


In [0]:
def get_data(x_type, y, scale_data):
    """Slice one feature representation and the targets into train/test parts.

    The rows are taken from the notebook-level ``train_idx`` / ``test_idx``
    and the feature matrix from ``dct_X[x_type]``.

    Parameters
    ----------
    x_type : str
        Key into ``dct_X``.
    y : array
        Targets, indexed with the same row positions as the features.
    scale_data : bool
        When true, a ``MinMaxScaler`` is fitted on the training rows only and
        then applied to both the training and the test rows.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    features = dct_X[x_type]
    X_train, X_test = features[train_idx], features[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    if not scale_data:
        return X_train, X_test, y_train, y_test

    # Fit on the training rows only, then reuse the fitted ranges for the test rows.
    min_max = MinMaxScaler()
    X_train = min_max.fit_transform(X_train)
    X_test = min_max.transform(X_test)
    return X_train, X_test, y_train, y_test


def get_dataloader(X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor, batch_size=64):
    """Wrap the train and test tensors into two ``DataLoader`` objects.

    The training loader shuffles its samples; the test loader keeps them in
    their original order. Both use ``batch_size``.

    Returns
    -------
    tuple
        ``(train_loader, test_loader)``.
    """
    def _loader(features, targets, shuffle):
        # Pair features with targets, then batch them.
        paired = TensorDataset(features, targets)
        return DataLoader(paired, batch_size=batch_size, shuffle=shuffle)

    train_loader = _loader(X_train_tensor, y_train_tensor, True)
    test_loader = _loader(X_test_tensor, y_test_tensor, False)
    return train_loader, test_loader


In [0]:
# Define models

class ShallowAutoencoder(nn.Module):
    """Single-layer autoencoder with an extra regression head on the latent code.

    Parameters
    ----------
    input_dim : int
        Number of input features (also the size of the reconstruction).
    latent_dim : int
        Size of the latent code.
    decoder_activation : nn.Module or None
        Applied to the decoder output when not ``None``.
    """

    def __init__(self, input_dim, latent_dim, decoder_activation):
        super().__init__()
        # One linear layer + LeakyReLU down to the latent code.
        self.encoder = nn.Sequential(nn.Linear(input_dim, latent_dim), nn.LeakyReLU())
        # One linear layer back up to the input size.
        self.decoder = nn.Sequential(nn.Linear(latent_dim, input_dim))
        self.decoder_activation = decoder_activation
        # Scalar prediction from the latent code; the ReLU keeps it non-negative.
        self.regression_head = nn.Sequential(nn.Linear(latent_dim, 1), nn.ReLU())

    def forward(self, x):
        """Return ``(latent code, reconstruction, regression output)`` for ``x``."""
        latent = self.encoder(x)
        reconstruction = self.decoder(latent)
        if self.decoder_activation is not None:
            reconstruction = self.decoder_activation(reconstruction)
        prediction = self.regression_head(latent)
        return latent, reconstruction, prediction

class DeepAutoencoder(nn.Module):
    """Three-layer autoencoder with an extra regression head on the latent code.

    The encoder narrows ``input_dim -> input_dim//2 -> input_dim//4 ->
    latent_dim`` and the decoder mirrors those widths back to ``input_dim``.

    Parameters
    ----------
    input_dim : int
        Number of input features (also the size of the reconstruction).
    latent_dim : int
        Size of the latent code.
    decoder_activation : nn.Module or None
        Applied to the decoder output when not ``None``.
    """

    def __init__(self, input_dim, latent_dim, decoder_activation):
        super().__init__()
        half = input_dim // 2
        quarter = input_dim // 4
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, half),
            nn.LeakyReLU(),
            nn.Linear(half, quarter),
            nn.LeakyReLU(),
            nn.Linear(quarter, latent_dim),
            nn.LeakyReLU(),
        )
        # No activation after the last decoder layer; decoder_activation covers that.
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, quarter),
            nn.LeakyReLU(),
            nn.Linear(quarter, half),
            nn.LeakyReLU(),
            nn.Linear(half, input_dim),
        )
        self.decoder_activation = decoder_activation
        # Scalar prediction from the latent code; the ReLU keeps it non-negative.
        self.regression_head = nn.Sequential(nn.Linear(latent_dim, 1), nn.ReLU())

    def forward(self, x):
        """Return ``(latent code, reconstruction, regression output)`` for ``x``."""
        latent = self.encoder(x)
        reconstruction = self.decoder(latent)
        if self.decoder_activation is not None:
            reconstruction = self.decoder_activation(reconstruction)
        prediction = self.regression_head(latent)
        return latent, reconstruction, prediction
    

class VariationalAutoencoder(nn.Module):
    """Variational autoencoder with an extra regression head on the latent sample.

    The encoder narrows ``input_dim -> input_dim//2 -> input_dim//4``; two
    separate linear layers then produce the mean (``mu``) and log-variance
    (``logvar``) of the latent distribution. The decoder maps a latent vector
    back through ``input_dim//4 -> input_dim//2 -> input_dim``.

    Parameters
    ----------
    input_dim : int
        Number of input features (also the size of the reconstruction).
    latent_dim : int
        Size of the latent vector.
    decoder_activation : nn.Module or None
        Applied to the decoder output when not ``None``.
    """

    def __init__(self, input_dim, latent_dim, decoder_activation):
        super(VariationalAutoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, input_dim//2),
            nn.LeakyReLU(),
            nn.Linear(input_dim//2, input_dim//4),
            nn.LeakyReLU(),
        )
        self.mu = nn.Linear(input_dim//4, latent_dim)
        self.logvar = nn.Linear(input_dim//4, latent_dim)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, input_dim//4),
            nn.LeakyReLU(),
            nn.Linear(input_dim//4, input_dim//2),
            nn.LeakyReLU(),
            nn.Linear(input_dim//2, input_dim),
        )
        self.decoder_activation = decoder_activation
        # Scalar prediction from the latent vector; the ReLU keeps it non-negative.
        self.regression_head = nn.Sequential(
            nn.Linear(latent_dim, 1),
            nn.ReLU()
        )

    def reparameterize(self, mu, logvar):
        """Return the latent vector for the given mean and log-variance.

        In training mode this draws ``mu + eps * std`` with ``eps ~ N(0, I)``
        so gradients can flow through the sampling step. In evaluation mode
        (``model.eval()``) it returns ``mu`` itself, so that embeddings,
        reconstructions and predictions are deterministic and repeatable.
        """
        if not self.training:
            return mu
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        """Return ``(z, reconstruction, regression output, mu, logvar)`` for ``x``."""
        h = self.encoder(x)
        mu, logvar = self.mu(h), self.logvar(h)
        z = self.reparameterize(mu, logvar)
        decoded = self.decoder(z)
        if self.decoder_activation is not None:
            decoded = self.decoder_activation(decoded)
        regression_output = self.regression_head(z)
        return z, decoded, regression_output, mu, logvar





In [0]:
# Training function

def _run_epoch(model, model_name, loader, ae_loss_function, reg_loss_function, lambda_ae, lambda_reg, optimizer=None):
    """Run one pass over ``loader``; weights are updated only when ``optimizer`` is given.

    Returns ``(mean reconstruction loss per batch, mean regression loss per
    batch, R^2 computed over every sample of the pass)``.
    """
    is_training = optimizer is not None
    total_ae = 0.0
    total_reg = 0.0
    targets, predictions = [], []

    for X_batch, y_batch in loader:
        if is_training:
            optimizer.zero_grad()

        if model_name == 'VariationalAutoencoder':
            latent, reconstructed, regression_output, mu, logvar = model(X_batch)
            # KL divergence between N(mu, exp(logvar)) and the standard normal prior.
            loss_kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        else:
            latent, reconstructed, regression_output = model(X_batch)
            loss_kl = 0.0

        loss_ae = ae_loss_function(reconstructed, X_batch) + loss_kl  # Reconstruction loss (+ KL for the VAE)
        # squeeze(-1) drops only the trailing size-1 axis; a bare squeeze() would
        # also drop the batch axis when a batch holds a single sample.
        y_hat = regression_output.squeeze(-1)
        loss_reg = reg_loss_function(y_hat, y_batch)  # Regression loss

        if is_training:
            (lambda_ae * loss_ae + lambda_reg * loss_reg).backward()
            optimizer.step()

        total_ae += loss_ae.item()
        total_reg += loss_reg.item()
        targets.append(y_batch.detach().cpu())
        predictions.append(y_hat.detach().cpu())

    n_batches = len(loader)
    # R^2 over the whole pass: averaging per-batch R^2 is not the dataset R^2
    # and is undefined for a batch with a single sample.
    r2 = r2_score(torch.cat(targets).numpy(), torch.cat(predictions).numpy())
    return total_ae / n_batches, total_reg / n_batches, r2


def train_model(model, model_name, train_loader, test_loader, ae_loss_function, optimizer, reg_loss_function, lambda_ae, lambda_reg, num_epochs=50, patience=10):
    """Train ``model`` on a weighted sum of reconstruction and regression loss.

    Each epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free pass over ``test_loader``. Progress is printed every fifth
    epoch.

    Parameters
    ----------
    model : nn.Module
        Its forward pass must return ``(latent, reconstruction, regression
        output)``, or those three followed by ``(mu, logvar)`` when
        ``model_name == 'VariationalAutoencoder'``.
    model_name : str
        Used in the progress message and to select the VAE code path.
    train_loader, test_loader : iterable of (X_batch, y_batch)
        Training and validation batches.
    ae_loss_function, reg_loss_function : callable
        Reconstruction and regression losses.
    optimizer : torch optimizer
        Optimizer over ``model``'s parameters.
    lambda_ae, lambda_reg : float
        Weights of the reconstruction and regression losses in the total loss.
    num_epochs : int
        Number of epochs to run.
    patience : int
        Accepted for interface compatibility; early stopping is currently
        disabled, so this value is ignored.

    Returns
    -------
    tuple
        ``(best_model, lst_train_loss_ae, lst_train_loss_reg, lst_train_r2,
        lst_val_loss_ae, lst_val_loss_reg, lst_val_r2)``. ``best_model`` is
        always ``None`` while early stopping is disabled; each list holds one
        value per epoch.
    """
    best_model = None  # no checkpoint is tracked while early stopping is disabled
    lst_train_loss_ae = []
    lst_train_loss_reg = []
    lst_train_r2 = []

    lst_val_loss_ae = []
    lst_val_loss_reg = []
    lst_val_r2 = []

    for epoch in range(num_epochs):
        model.train()
        epoch_ae, epoch_reg, epoch_r2 = _run_epoch(
            model, model_name, train_loader, ae_loss_function, reg_loss_function,
            lambda_ae, lambda_reg, optimizer=optimizer,
        )
        train_loss = lambda_ae * epoch_ae + lambda_reg * epoch_reg
        lst_train_loss_ae.append(epoch_ae)
        lst_train_loss_reg.append(epoch_reg)
        lst_train_r2.append(epoch_r2)

        # Validation loss
        model.eval()
        with torch.no_grad():
            epoch_ae, epoch_reg, epoch_r2 = _run_epoch(
                model, model_name, test_loader, ae_loss_function, reg_loss_function,
                lambda_ae, lambda_reg,
            )
        val_loss = lambda_ae * epoch_ae + lambda_reg * epoch_reg
        lst_val_loss_ae.append(epoch_ae)
        lst_val_loss_reg.append(epoch_reg)
        lst_val_r2.append(epoch_r2)

        if epoch % 5 == 0:
            print(f'{model_name} Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

    return best_model, lst_train_loss_ae, lst_train_loss_reg, lst_train_r2, lst_val_loss_ae, lst_val_loss_reg, lst_val_r2

# Run the experiment

In [0]:
# parameters to define before each experiment

x_type = "log"               # feature representation: 'abundance' or 'presence' or 'log'
latent_dim = 100             # size of the latent code of every model

# Loss weights: the regression weight is whatever the reconstruction weight leaves out of 1.
lambda_ae = 1
lambda_reg = 1 - lambda_ae

num_epochs = 50
patience = 10                # forwarded to train_model

In [0]:
# Settings and train/test arrays for the selected feature representation.
scale_data, decoder_activation, ae_loss_function = get_params(x_type)
X_train, X_test, y_train, y_test = get_data(x_type, y, scale_data)

# float32 tensors of every split, moved to the selected device.
X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor = (
    torch.tensor(split, dtype=torch.float32).to(device)
    for split in (X_train, X_test, y_train, y_test)
)
train_loader, test_loader = get_dataloader(
    X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor, batch_size=64
)
# Number of input features, i.e. columns of the training matrix.
input_dim = X_train.shape[1]

In [0]:



# Models to compare; model_names must list them in the same order because
# train_model uses the name to select the VAE code path.
models = [
    VariationalAutoencoder(input_dim, latent_dim, decoder_activation),
    ShallowAutoencoder(input_dim, latent_dim, decoder_activation),
    DeepAutoencoder(input_dim, latent_dim, decoder_activation),
]

model_names = [
    "VariationalAutoencoder",
    "ShallowAutoencoder",
    "DeepAutoencoder",
]


dct_history = dict()   # per-model training curves
dct_y_pred = dict()    # per-model test-set predictions (regression head and XGBoost)

# One row for the original test data plus one row per model; each row shows
# the full distribution (left) and the strictly positive values only (right).
n_rows = len(models) + 1
plt.figure(figsize=(15, 15))
i = 1
plt.subplot(n_rows, 2, i)
sns.histplot(X_test.flatten())
plt.title('Test Set - Original Distribution')
i += 1
plt.subplot(n_rows, 2, i)
sns.histplot(X_test[X_test > 0].flatten())
plt.title('Test Set - Original Distribution > 0')
i += 1

for model, model_name in zip(models, model_names):

    print(f"Training {model_name}")

    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # training
    best_model, lst_train_loss_ae, lst_train_loss_reg, lst_train_r2, lst_val_loss_ae, lst_val_loss_reg, lst_val_r2 = train_model(model, model_name, train_loader, test_loader, ae_loss_function, optimizer=optimizer, reg_loss_function=nn.MSELoss(), num_epochs=num_epochs, patience=patience, lambda_ae=lambda_ae, lambda_reg=lambda_reg)

    # # Save the best model
    # torch.save(best_model, f"model/{model_name}_best_model.pth")

    ## Save latent representations
    # model.load_state_dict(best_model)
    model.eval()
    with torch.no_grad():
        # results = (latent, reconstruction, regression output, ...) on the test set
        results = model(X_test_tensor)
        # np.save(f"model/{model_name}_latent_representations.npy", results[0].cpu().detach().numpy())
        X_train_latent = model(X_train_tensor)[0].cpu().detach().numpy()
        X_val_latent = results[0].cpu().detach().numpy()

    # results[1] is the reconstruction of the test set (not the latent code),
    # so these panels are directly comparable with the first row.
    reconstructed_flat = results[1].cpu().numpy().flatten()
    plt.subplot(n_rows, 2, i)
    sns.histplot(reconstructed_flat)
    plt.title(f'{model_name} - Reconstruction Distribution')
    i += 1
    plt.subplot(n_rows, 2, i)
    sns.histplot(reconstructed_flat[reconstructed_flat > 0])
    plt.title(f'{model_name} - Reconstruction Distribution > 0')
    i += 1

    # Log losses
    dct_history[model_name] = {
            "train_loss_ae": np.array(lst_train_loss_ae),
            "train_loss_reg": np.array(lst_train_loss_reg),
            "train_r2": np.array(lst_train_r2),
            "val_loss_ae": np.array(lst_val_loss_ae),
            "val_loss_reg": np.array(lst_val_loss_reg),
            "val_r2": np.array(lst_val_r2)
        }

    # log predicted values of the model's own regression head
    dct_y_pred[model_name] = results[2].cpu().detach().numpy()

    print("prediction using embedding by", model_name)

    # Train XGBoost model on latent features
    xgb_model = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=6)
    xgb_model.fit(X_train_latent, y_train)  # Use the latent features as input for regression

    # Predict on the validation set
    dct_y_pred[model_name + '_xgb'] = xgb_model.predict(X_val_latent)

plt.tight_layout();

In [0]:
# Distribution of the strictly positive entries of the test features.
sns.histplot(X_test[X_test > 0]);

In [0]:
# `results` still holds the outputs of the last model processed by the training loop;
# element 1 is its reconstruction of the test set.
sns.histplot(results[1].cpu().numpy().reshape(-1));

In [0]:
# One row per trained model: reconstruction loss on the left, regression loss
# on the right, each with the training (solid) and validation (dashed) curves.
plt.figure(figsize=(10, 10))
n_rows = len(dct_history)
for i, (model_name, history) in enumerate(dct_history.items()):
    plt.subplot(n_rows, 2, 2 * i + 1)
    plt.plot(history['train_loss_ae'], '-', label='Train', color='blue', alpha=0.5)
    plt.plot(history['val_loss_ae'], '--', label='Validation', color='red', alpha=0.5)
    plt.title(f'{model_name} - Reconstruction Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Reconstruction Loss')
    plt.grid(True)

    plt.subplot(n_rows, 2, 2 * i + 2)
    plt.plot(history['train_loss_reg'], '-', label='Train', color='blue', alpha=0.5)
    plt.plot(history['val_loss_reg'], '--', label='Validation', color='red', alpha=0.5)
    plt.title(f'{model_name} - Regression Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Regression Loss')
    # Both panels share the same two labels, so one legend per row (placed
    # outside the right-hand panel) is enough.
    plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
    plt.grid(True)

# Lay out once, after every panel exists.
plt.tight_layout()

In [0]:
# Predicted vs. true target on the test set, one panel per entry of dct_y_pred.
plt.figure(figsize=(10, 12))

for i, (model_name, y_pred) in enumerate(dct_y_pred.items()):

    # Drop size-1 axes so the predictions line up with y_test.
    y_pred = y_pred.squeeze()

    ax = plt.subplot(3, 2, i + 1)

    ax.scatter(y_test, y_pred, alpha=0.5, s=6)
    # Dashed diagonal marks where prediction equals truth.
    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)

    ax.set_xlabel('True Values')
    ax.set_ylabel('Predicted Values')
    ax.set_title(f'{model_name} lambdaAE={lambda_ae}, lambdaReg={lambda_reg}')

    # Annotate the panel with the test-set scores.
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    ax.text(0.05, 0.95, f'R^2: {r2:.2f}\nMSE: {mse:.2f}', transform=ax.transAxes, fontsize=10, verticalalignment='top')

    plt.tight_layout()

In [0]:
# lambda_ae = 1
# lambda_reg = 0
# num_epochs = 100
# patience = 20

# # Dictionary to store the embeddings
# dct_embedding = {}
# # dct_loss_ae_emb = {}
# dct_xgb_mse = {}
# dct_xgb_r2 = {}


# # Train each model with the specified lambda values and save the embeddings
# for model, model_name in zip(models, model_names):
    
#     print(f"Training {model_name}")

#     model.to(device)
#     ae_loss_function = nn.MSELoss()
#     reg_loss_function = nn.MSELoss()  
#     optimizer = optim.Adam(model.parameters(), lr=0.001)
#     best_model,lst_train_loss_ae, _, _, lst_val_loss_ae, _, _ = train_model(model, model_name, train_loader, test_loader, ae_loss_function, optimizer, reg_loss_function, num_epochs=num_epochs, patience=patience, lambda_ae=lambda_ae, lambda_reg=lambda_reg)

#     dct_train_loss_ae_emb[model_name] = lst_train_loss_ae
#     dct_val_loss_ae_emb[model_name] = lst_val_loss_ae

#     # Save latent representations
#     model.load_state_dict(best_model)
#     model.eval()
#     with torch.no_grad():
#         X_train_latent = model(X_train_tensor)[0].cpu().detach().numpy()
#         X_val_latent = model(X_test_tensor)[0].cpu().detach().numpy()
    
#     print("prediction using embedding by", model_name)

#     # Train XGBoost model on latent features
#     xgb_model = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=6)
#     xgb_model.fit(X_train_latent, y_train)  # Use the latent features as input for regression

#     # Predict on the validation set
#     y_pred = xgb_model.predict(X_val_latent)

#     # Evaluate performance
#     from sklearn.metrics import mean_squared_error, r2_score
#     mse = mean_squared_error(y_test, y_pred)
#     r2 = r2_score(y_test, y_pred)

#     print(f"XGBoost MSE: {mse:.4f}")
#     print(f"XGBoost R²: {r2:.4f}")
#     dct_xgb_mse[f'xgb_{model_name}'] = mse
#     dct_xgb_r2[f'xgb_{model_name}'] = r2



