In [1]:
cd /home/tijmen/tudelft/thesis/metaengineering

/home/tijmen/tudelft/thesis/metaengineering


In [2]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_absolute_error
from src.utils.utils import get_generator, get_project_root
from src.pipeline.config import DataLoaderConfig, TaskLoaderConfig
from src.pipeline.taskloader import TaskLoader, TaskFrame
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from src.settings.tier import Tier
from src.settings.strategy import Strategy
from src.orchestrator.orchestrator import Orchestrator
from src.pipeline.dataloader import DataLoader
from src.orchestrator.trainer import Trainer

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import DataLoader as TorchDataLoader
from torch.utils.data import TensorDataset
from torch.utils.tensorboard import SummaryWriter

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
DataLoader.DATA_FOLDER = f'{get_project_root()}/data/training/'

tier = Tier.TIER0
strategy = Strategy.ALL

dl_config = DataLoaderConfig(
    additional_filters=["is_precursor", ],
    additional_transforms=["log_fold_change_protein", ]
)

tl_config = TaskLoaderConfig(
    data_throttle=1,
    tier=tier,
)

dl = DataLoader()
dl.prepare_dataloader(dl_config)

tl = TaskLoader()
tl.prepare_taskloader(tl_config)

gen = get_generator(dl, tl, strategy, tier)
tf = next(gen)

trainer = Trainer()


In [68]:
split_kwargs = dict(
    stratify='metabolite_id',
    shuffle=True
)

x_scaler = StandardScaler()
y_scaler = MinMaxScaler()

X_train, X_test, y_train, y_test = trainer.do_train_test_split(tf, strategy, **split_kwargs)
metabolite_id = y_test.index.get_level_values('metabolite_id')

X_train = X_train.drop(['KO_ORF', 'metabolite_id'], axis=1)
X_test = X_test.drop(['KO_ORF', 'metabolite_id'], axis=1)

X_train = x_scaler.fit_transform(X_train)
X_test = x_scaler.transform(X_test)

y_train = y_scaler.fit_transform(y_train.values.reshape((-1, 1)))
y_test = y_scaler.transform(y_test.values.reshape((-1, 1)))


X_train, X_test, y_train, y_test = map(
    torch.from_numpy, (
        X_train.astype(np.float32), X_test.astype(np.float32), 
        y_train.astype(np.float32), y_test.astype(np.float32)
    )
)

train_ds = TensorDataset(X_train, y_train)
test_ds = TensorDataset(X_test, y_test)

In [69]:
def get_data(train_ds, test_ds, bs):
    return (
        TorchDataLoader(train_ds, bs, shuffle=True),
        TorchDataLoader(test_ds)
    )

def loss_batch(model, loss_func, xb, yb, opt=None):
    loss = loss_func(model(xb), yb)
    if opt is not None:
        loss.backward(),
        opt.step(),
        opt.zero_grad()
    return loss.item(), len(xb)

def fit(epochs, model, loss_func, opt, train_dl, test_dl, writer: SummaryWriter):
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for i, (xb, yb) in enumerate(train_dl, 0):
            loss, _ = loss_batch(model, loss_func, xb, yb, opt)
            running_loss += loss
            
            if i % 10 == 9:
                avg_loss = running_loss / 10
                writer.add_scalars('Training loss', {'Training': avg_loss}, epoch * len(train_dl) + i)
        model.eval()

In [73]:
n_features = X_train.shape[1]
n_outputs = 1
num_nodes_hidden = [64, 32]
lr = 0.001
        
model = nn.Sequential(
    nn.Linear(in_features=n_features, out_features=num_nodes_hidden[0]),
    nn.ReLU(),
    nn.Linear(in_features=num_nodes_hidden[0], out_features=num_nodes_hidden[1]),
    nn.ReLU(),
    nn.Linear(in_features=num_nodes_hidden[1], out_features=n_outputs),
)

opt = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
loss_func = F.mse_loss

In [74]:
train_dl, test_dl = get_data(train_ds, test_ds, 128)
writer = SummaryWriter(log_dir=f"{get_project_root()}/runners/runs")

fit(100, model, loss_func, opt, train_dl, test_dl, writer)
writer.flush()

In [8]:
train_dl, test_dl = get_data(train_ds, test_ds, 128)
dataiter = iter(train_dl)

samples, result = next(dataiter)
writer.add_graph(model, samples)
writer.flush()

In [88]:
from scipy.stats import pearsonr

with torch.no_grad():
    model.eval()
    predictions = [model(xb).cpu().numpy().squeeze().item() for xb, _ in test_dl]
    print(f"Overall mae: {mean_absolute_error(y_test, predictions):.2f}")
    for metabolite in metabolite_id.unique():
        metabolite_predictions = []
        target = []
        for (xb, yb), is_current_metabolite in zip(test_dl, metabolite_id == metabolite):
            if is_current_metabolite:
                metabolite_predictions.append(model(xb).cpu().numpy().squeeze().item())
                target.append(yb.cpu().numpy().squeeze().item())
        print(
            f"==================================== \n"
            f"{metabolite} mae: {mean_absolute_error(target, metabolite_predictions):.2f}, \n"
            f"r2-score: {pearsonr(target, metabolite_predictions)[0]:.2f}, \n"
            f"min prediction: {min(metabolite_predictions):.2f}, max prediction: {max(metabolite_predictions):.2f} \n"
            f"mean log change: {np.mean(target):.2f}, mean prediction {np.mean(metabolite_predictions):.2f} \n"
            f"Inverse scaled: {pearsonr(y_scaler.inverse_transform(np.asarray(target).reshape((-1, 1))).squeeze(), y_scaler.inverse_transform(np.asarray(metabolite_predictions).reshape((-1, 1))).squeeze())[0]:.2f}"
        )

Overall mae: 0.11
pyr mae: 0.09, 
r2-score: 0.82, 
min prediction: 0.36, max prediction: 0.86 
mean log change: 0.70, mean prediction 0.66 
Inverse scaled: 0.82
r5p mae: 0.09, 
r2-score: 0.78, 
min prediction: 0.36, max prediction: 0.83 
mean log change: 0.74, mean prediction 0.69 
Inverse scaled: 0.78
pep mae: 0.10, 
r2-score: 0.72, 
min prediction: 0.36, max prediction: 0.83 
mean log change: 0.70, mean prediction 0.68 
Inverse scaled: 0.72
3pg;2pg mae: 0.05, 
r2-score: 0.83, 
min prediction: 0.44, max prediction: 0.86 
mean log change: 0.69, mean prediction 0.69 
Inverse scaled: 0.83
dhap mae: 0.09, 
r2-score: 0.90, 
min prediction: 0.36, max prediction: 0.81 
mean log change: 0.66, mean prediction 0.63 
Inverse scaled: 0.90
f6p mae: 0.13, 
r2-score: 0.85, 
min prediction: 0.36, max prediction: 0.86 
mean log change: 0.66, mean prediction 0.64 
Inverse scaled: 0.85
g6p;f6p;g6p-B mae: 0.18, 
r2-score: 0.24, 
min prediction: 0.36, max prediction: 0.83 
mean log change: 0.53, mean pred