# Training of the Baseline Model

## Initialization steps

### local dlomix package

In [None]:
# Make this repository's `src` directory importable so that `import dlomix`
# resolves to the local checkout.
import os
import sys

# The path is resolved relative to the current working directory: <parent of cwd>/src.
module_path = f"{os.path.abspath('..')}/src"
if module_path not in sys.path:
    # Guarded so that re-running the cell does not add duplicate entries.
    sys.path.append(module_path)

### weights and biases

In [None]:
# Initialize Weights & Biases: start a run in the project named below and
# expose the config object that the following cells store hyperparameters on.

import wandb
# Older import location, kept for reference:
# from wandb.keras import WandbCallback
from wandb.integration.keras import WandbCallback

project_name = 'baseline model'
wandb.init(project=project_name)

# Short alias; the hyperparameter cell assigns attributes on this object.
config = wandb.config

### hyperparameters

In [None]:
# Hyperparameters, collected in one place and then stored on the wandb config
# object attribute by attribute (same names, values and order as before).
hyperparameters = {
    "seq_length": 30,
    "batch_size": 2,
    # "val_ratio": 0.2,
    "learning_rate": 1.0e-4,
    "epochs": 2,
}

for _name, _value in hyperparameters.items():
    setattr(config, _name, _value)

### dataset

*Here, I currently import the "small" dataset that Joel had in his script.*

**TODO:** Find baseline dataset and load that instead.

In [None]:
# Load the dataset from its three pre-split parquet files.
# PTMS_ALPHABET was previously imported via `from misc import PTMS_ALPHABET`.
from dlomix.constants import PTMS_ALPHABET
from dlomix.data import FragmentIonIntensityDataset

# Common prefix of the split files; "_<split>.parquet" is appended per split.
# NOTE: the variable keeps its original spelling ("datset") so that any other
# reference to it keeps working.
datset_base_path = "/cmnfs/data/proteomics/Prosit_PTMs/Transformer_Train/clean"
dataset_train_path, dataset_val_path, dataset_test_path = (
    f"{datset_base_path}_{split}.parquet" for split in ("train", "val", "test")
)

dataset = FragmentIonIntensityDataset(
    # data files
    data_format="parquet",
    data_source=dataset_train_path,
    val_data_source=dataset_val_path,
    test_data_source=dataset_test_path,
    # Open question from the author: why would val_ratio be needed when the
    # splits already exist?
    # val_ratio=config.val_ratio,
    # batching and sequence encoding
    batch_size=config.batch_size,
    max_seq_len=config.seq_length,
    encoding_scheme="naive-mods",
    alphabet=PTMS_ALPHABET,
    # No additional model features are requested for now. Alternative kept for reference:
    # model_features=["precursor_charge_onehot", "collision_energy_aligned_normed","method_nbr"]
    model_features=[],
)


### tensorflow setup

In [None]:
# Training prerequisites: optimizer, loss/metric functions and early stopping.
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

from dlomix.losses import masked_pearson_correlation_distance, masked_spectral_distance

# Adam with the learning rate stored on the wandb config.
optimizer = tf.keras.optimizers.Adam(learning_rate=config.learning_rate)

# NOTE(review): patience=20 is larger than config.epochs (set to 2 in the
# hyperparameter cell), so with the current settings training ends before
# this callback can stop it early.
early_stopping = EarlyStopping(
    restore_best_weights=True,
    patience=20,
    min_delta=0.001,
    monitor="val_loss",
)

### model setup

In [None]:
# Build and compile the intensity prediction model.
from dlomix.models import PrositIntensityPredictor

# Currently only referenced by the commented-out `input_keys` argument below.
input_mapping = {"SEQUENCE_KEY": "modified_sequence"}

model = PrositIntensityPredictor(
    alphabet=PTMS_ALPHABET,
    seq_length=config.seq_length,
    with_termini=False,
    use_prosit_ptm_features=False,
    # input_keys=input_mapping
)

# Spectral distance is the training loss; Pearson correlation distance is
# tracked as an additional metric.
model.compile(
    loss=masked_spectral_distance,
    metrics=[masked_pearson_correlation_distance],
    optimizer=optimizer,
)


## Training

In [None]:
# Fit on the training tensors, passing the validation tensors as validation
# data; metrics are logged through the wandb callback and early stopping is
# attached. `model.fit` stays the last expression so the notebook still shows
# its return value.
training_callbacks = [WandbCallback(), early_stopping]

model.fit(
    dataset.tensor_train_data,
    validation_data=dataset.tensor_val_data,
    epochs=config.epochs,
    callbacks=training_callbacks,
)