In [1]:
cd /home/tvangraft/tudelft/thesis/metaengineering

/home/tvangraft/tudelft/thesis/metaengineering


In [3]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from src.utils.utils import get_generator, get_project_root
from src.utils.test_result_store import TestResultStore

from src.pipeline.config import DataLoaderConfig, TaskLoaderConfig
from src.pipeline.taskloader import TaskLoader, TaskFrame
from src.pipeline.dataloader import DataLoader

from src.settings.tier import Tier
from src.settings.strategy import Strategy

from src.orchestrator.orchestrator import SklearnOrchestrator
from src.orchestrator.trainer import Trainer

from scipy.stats import pearsonr

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import DataLoader as TorchDataLoader
from torch.utils.data import TensorDataset
from torch.utils.tensorboard import SummaryWriter

In [4]:
# Point the project DataLoader at the training data before any instance is built.
DataLoader.DATA_FOLDER = f'{get_project_root()}/data/training/'

tier, strategy = Tier.TIER0, Strategy.ALL

dl_config = DataLoaderConfig(
    additional_filters=["is_precursor"],
    additional_transforms=["log_fold_change_protein"],
)
tl_config = TaskLoaderConfig(data_throttle=1, tier=tier)

dl = DataLoader()
dl.prepare_dataloader(dl_config)

tl = TaskLoader()
tl.prepare_taskloader(tl_config)

# Only the first task frame produced by the generator is used in this notebook.
gen = get_generator(dl, tl, strategy, tier)
tf = next(gen)

trainer = Trainer()
split_kwargs = {'stratify': 'metabolite_id', 'shuffle': True}

x_scaler = StandardScaler()
y_scaler = MinMaxScaler()

X_train, X_test, y_train, y_test = trainer.do_train_test_split(tf, strategy, **split_kwargs)
metabolite_id = y_test.index.get_level_values('metabolite_id')

# Fit both scalers on the training split only and convert straight to float32
# tensors. X_test / y_test are intentionally left as raw pandas objects:
# get_test_data() filters them by metabolite and applies the fitted scalers
# at evaluation time.
X_train = torch.from_numpy(
    x_scaler.fit_transform(
        X_train.drop(columns=['KO_ORF', 'metabolite_id'])
    ).astype(np.float32)
)
y_train = torch.from_numpy(
    y_scaler.fit_transform(y_train.values.reshape((-1, 1))).astype(np.float32)
)

train_ds = TensorDataset(X_train, y_train)

# Number of distinct KO_ORF values in the held-out split.
len(X_test['KO_ORF'].unique())


In [38]:
# Inspect which metabolites are present in the held-out split.
#
# Do NOT rebuild the loaders or call do_train_test_split again in this cell:
# a second shuffled split rebinds X_train / X_test / y_train / y_test, which
#   * replaces the X_train tensor with an unscaled DataFrame, so the later
#     `n_features = X_train.shape[1]` no longer matches `train_ds`, and
#   * makes the evaluation cell score the model on a different split than the
#     one held out when the model and scalers were fitted, while the global
#     `metabolite_id` still describes the first split.
# NOTE(review): no seed is passed to the split in this notebook; whether
# Trainer fixes one internally is not visible here - confirm.
X_test['metabolite_id'].unique()

array(['pyr', 'g6p;f6p;g6p-B', '3pg;2pg', 'dhap', 'akg', 'oaa', 'r5p',
       'pep', 'e4p', 'f6p', 'accoa', 'g6p;g6p-B'], dtype=object)

90

In [33]:
def get_test_data(
    X_test: pd.DataFrame,
    y_test: pd.Series,
    x_scaler: StandardScaler,
    y_scaler: MinMaxScaler,
    metabolite_id: str = ""
):
    """Build a torch data loader over the (optionally filtered) test split.

    Args:
        X_test: Test features; must contain the 'KO_ORF' and 'metabolite_id'
            columns, which are dropped before scaling.
        y_test: Test targets whose index has a 'metabolite_id' level.
        x_scaler: Already-fitted scaler applied to the feature columns.
        y_scaler: Already-fitted scaler applied to the targets.
        metabolite_id: When non-empty, only rows for this metabolite are kept;
            the empty string selects the whole test split.

    Returns:
        A loader with default settings (one sample per batch, no shuffling)
        yielding float32 ``(features, target)`` tensor pairs.
    """
    if len(metabolite_id) > 0:
        features = X_test[X_test['metabolite_id'] == metabolite_id].copy()
        targets = y_test.xs(metabolite_id, level="metabolite_id").copy()
    else:
        features = X_test.copy()
        targets = y_test.copy()

    # Identifier columns are not model inputs.
    features = x_scaler.transform(features.drop(['KO_ORF', 'metabolite_id'], axis=1))
    # Scalers expect a 2-D column vector for the single target.
    targets = y_scaler.transform(targets.values.reshape((-1, 1)))

    feature_tensor = torch.from_numpy(features.astype(np.float32))
    target_tensor = torch.from_numpy(targets.astype(np.float32))
    return TorchDataLoader(TensorDataset(feature_tensor, target_tensor))

In [9]:
def get_data(train_ds, bs):
    """Wrap ``train_ds`` in a shuffling loader that yields batches of ``bs`` samples."""
    loader = TorchDataLoader(dataset=train_ds, batch_size=bs, shuffle=True)
    return loader

def loss_batch(model, loss_func, xb, yb, opt=None):
    """Compute the loss for one batch and, if an optimiser is given, take a step.

    Args:
        model: Callable mapping ``xb`` to predictions.
        loss_func: Callable ``(predictions, yb) -> scalar loss tensor``.
        xb: Batch of inputs.
        yb: Batch of targets.
        opt: Optional optimiser. When provided, gradients are back-propagated,
            one optimisation step is taken and the gradients are cleared.
            When ``None`` the batch is only evaluated.

    Returns:
        Tuple ``(loss_value, batch_size)`` with the loss as a Python float.
    """
    loss = loss_func(model(xb), yb)
    if opt is not None:
        # Plain statements: the previous trailing commas built throw-away tuples.
        loss.backward()
        opt.step()
        opt.zero_grad()
    return loss.item(), len(xb)

def fit(epochs, model, loss_func, opt, train_dl, writer: SummaryWriter, log_every: int = 10):
    """Train ``model`` for ``epochs`` passes over ``train_dl`` and log the loss.

    Args:
        epochs: Number of passes over ``train_dl``.
        model: Module to optimise; switched to train mode at the start of each
            epoch and to eval mode at the end of it.
        loss_func: Loss callable forwarded to ``loss_batch``.
        opt: Optimiser forwarded to ``loss_batch``.
        train_dl: Sized iterable of ``(xb, yb)`` batches.
        writer: Object exposing ``add_scalars(tag, values, step)``.
        log_every: Number of batches averaged per logged point (default 10,
            the previously hard-coded window).

    Every ``log_every`` batches the mean loss of that window is written under
    the 'Training loss' tag, using the global batch index as the step. Batches
    left over at the end of an epoch (fewer than ``log_every``) are not logged.
    """
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for i, (xb, yb) in enumerate(train_dl):
            loss, _ = loss_batch(model, loss_func, xb, yb, opt)
            running_loss += loss

            if i % log_every == log_every - 1:
                avg_loss = running_loss / log_every
                writer.add_scalars('Training loss', {'Training': avg_loss}, epoch * len(train_dl) + i)
                # Start a fresh window; without this reset the logged value is
                # the cumulative epoch loss divided by the window size.
                running_loss = 0.0
        model.eval()

In [10]:
# Hyper-parameters of the fully connected regressor.
n_features = X_train.shape[1]
n_outputs = 1
num_nodes_hidden = [64, 32]
lr = 0.001

# Layer widths from input to output; consecutive pairs define one Linear layer.
_widths = [n_features, *num_nodes_hidden, n_outputs]
_stack = [
    module
    for fan_in, fan_out in zip(_widths, _widths[1:])
    for module in (nn.Linear(in_features=fan_in, out_features=fan_out), nn.ReLU())
]
# The output layer is linear, so the trailing ReLU is left out.
model = nn.Sequential(*_stack[:-1])

loss_func = F.mse_loss
opt = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

In [11]:
# Train for 100 epochs on shuffled mini-batches of 128 samples, streaming the
# running loss to TensorBoard.
writer = SummaryWriter(log_dir=f"{get_project_root()}/runners/runs")
train_dl = get_data(train_ds, bs=128)

fit(
    epochs=100,
    model=model,
    loss_func=loss_func,
    opt=opt,
    train_dl=train_dl,
    writer=writer,
)
writer.flush()

In [12]:
# Pull a single training batch and use it as the example input for writing
# the model graph to TensorBoard.
train_dl = get_data(train_ds, 128)
dataiter = iter(train_dl)
samples, result = next(dataiter)

writer.add_graph(model, input_to_model=samples)
writer.flush()

In [52]:
# Label under which this model's results are recorded.
architecture = 'sgd'

# Store that collects the evaluation results for this run.
testResultStore = TestResultStore(
    runner='sgd',
    strategy=Strategy.ALL,
    experiment_path=f"{get_project_root()}/data/results/{Tier.TIER0}",
)

def get_predict_fn(model, y_scaler: MinMaxScaler):
    """Create a prediction function bound to ``model`` and ``y_scaler``.

    Args:
        model: Callable mapping a batch of inputs to one prediction per sample.
        y_scaler: Fitted scaler used to map predictions back to target units.

    Returns:
        ``predict_fn(test_dl)``, which runs the model over every ``(xb, _)``
        batch of ``test_dl`` and returns the inverse-scaled predictions as a
        1-D float array with one entry per sample.
    """
    def predict_fn(test_dl: TorchDataLoader):
        # Gradients are disabled here so the function also works when it is
        # called outside an enclosing torch.no_grad() block.
        with torch.no_grad():
            chunks = [model(xb).cpu().numpy().reshape(-1) for xb, _ in test_dl]
        # Flattening per batch supports any batch size, not just 1.
        predictions = (
            np.concatenate(chunks).astype(np.float64)
            if chunks else np.empty(0, dtype=np.float64)
        )
        # reshape(-1) rather than squeeze(): a single prediction stays a
        # length-1 array instead of collapsing to a 0-d array.
        return y_scaler.inverse_transform(predictions.reshape((-1, 1))).reshape(-1)
    return predict_fn

def get_gt(test_dl: TorchDataLoader, y_scaler: MinMaxScaler):
    """Collect the ground-truth targets of ``test_dl`` in original units.

    Args:
        test_dl: Iterable of ``(xb, yb)`` batches with scaled targets.
        y_scaler: Fitted scaler used to undo the target scaling.

    Returns:
        ``pd.Series`` with one inverse-scaled target per sample, in loader order.
    """
    # Flatten each batch so loaders with any batch size are supported
    # (the former ``.item()`` call only worked for batches of one sample).
    chunks = [yb.detach().cpu().numpy().reshape(-1) for _, yb in test_dl]
    scaled = (
        np.concatenate(chunks).astype(np.float64)
        if chunks else np.empty(0, dtype=np.float64)
    )
    # reshape(-1) keeps a single target as a length-1 array rather than 0-d.
    restored = y_scaler.inverse_transform(scaled.reshape((-1, 1))).reshape(-1)
    return pd.Series(restored)

# Evaluate the trained model on the whole test split and per metabolite, then
# persist the collected results.
with torch.no_grad():
    model.eval()
    predict_fn = get_predict_fn(model, y_scaler)

    test_dl = get_test_data(X_test, y_test, x_scaler, y_scaler)
    testResultStore.update_results(
        key='all',
        predict_fn=predict_fn,
        architecture=architecture,
        X_test=test_dl,
        y_test=get_gt(test_dl, y_scaler),
    )

    # Take the metabolites from the y_test that is actually being evaluated
    # rather than from the separately computed global `metabolite_id`, which
    # goes stale if the split is re-run and could then name metabolites that
    # are absent from this y_test.
    for metabolite in y_test.index.get_level_values('metabolite_id').unique():
        test_dl = get_test_data(X_test, y_test, x_scaler, y_scaler, metabolite)
        testResultStore.update_results(
            key=metabolite,
            predict_fn=predict_fn,
            architecture=architecture,
            X_test=test_dl,
            y_test=get_gt(test_dl, y_scaler),
        )

testResultStore.to_file()


# with torch.no_grad():
#     model.eval()
#     predictions = [model(xb).cpu().numpy().squeeze().item() for xb, _ in test_dl]
#     print(f"Overall mae: {mean_absolute_error(y_test, predictions):.2f}")
#     for metabolite in metabolite_id.unique():
#         metabolite_predictions = []
#         target = []
#         for (xb, yb), is_current_metabolite in zip(test_dl, metabolite_id == metabolite):
#             if is_current_metabolite:
#                 metabolite_predictions.append(model(xb).cpu().numpy().squeeze().item())
#                 target.append(yb.cpu().numpy().squeeze().item())
#         print(
#             f"==================================== \n"
#             f"{metabolite} mae: {mean_absolute_error(target, metabolite_predictions):.2f}, \n"
#             f"r2-score: {pearsonr(target, metabolite_predictions)[0]:.2f}, \n"
#             f"min prediction: {min(metabolite_predictions):.2f}, max prediction: {max(metabolite_predictions):.2f} \n"
#             f"mean log change: {np.mean(target):.2f}, mean prediction {np.mean(metabolite_predictions):.2f} \n"
#             f"Inverse scaled: {pearsonr(y_scaler.inverse_transform(np.asarray(target).reshape((-1, 1))).squeeze(), y_scaler.inverse_transform(np.asarray(metabolite_predictions).reshape((-1, 1))).squeeze())[0]:.2f}"
#         )