In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import os
import tensorflow as tf
from dlomix.interface.oktoberfest_interface import load_keras_model, process_dataset
from dlomix.refinement_transfer_learning.automatic_rl_tl import AutomaticRlTlTraining, AutomaticRlTlTrainingConfig

In [5]:
# Runtime configuration: GPU selection, Hugging Face cache locations and
# CPU thread counts for this notebook session.
# NOTE(review): this cell runs after the import cell, so any library that
# reads these variables at import time will not see the values set here —
# confirm whether that applies (e.g. to the Hugging Face cache paths) and,
# if so, move these assignments above the imports.
num_proc = 16

# Device selection and cache directories.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": '3',
        'HF_HOME': '/cmnfs/proj/bmpc_dlomix/datasets',
        'HF_DATASETS_CACHE': '/cmnfs/proj/bmpc_dlomix/datasets/hf_cache',
    }
)

# Export the same thread count to every thread-related environment variable.
os.environ.update(
    {
        name: str(num_proc)
        for name in (
            "OMP_NUM_THREADS",
            "TF_NUM_INTRAOP_THREADS",
            "TF_NUM_INTEROP_THREADS",
        )
    }
)

# Apply the same limit through TensorFlow's threading configuration API.
tf.config.threading.set_intra_op_parallelism_threads(num_proc)
tf.config.threading.set_inter_op_parallelism_threads(num_proc)

In [6]:
# Ion types handed to `process_dataset` further down.
ion_types = ['y', 'b']

# Active input: the small dataset (Kmod_Formyl) and its modification token.
modifications = ['K[UNIMOD:122]']
parquet_path = '/cmnfs/data/proteomics/Prosit_PTMs/21PTMs/Kmod_Formyl.parquet'

# Alternative input: the large single-PTM dataset. Uncomment both lines to use it.
# parquet_path = '/cmnfs/data/proteomics/Prosit_PTMs/TUM_mod_monomethyl.parquet'
# modifications = ['K[UNIMOD:34]', 'R[UNIMOD:34]']

In [7]:
# Load the baseline model through dlomix's `load_keras_model` helper and print
# its layer summary. In the recorded run the loaded model is named
# "prosit_intensity_predictor".
# `model` is reused by the dataset-processing and training-config cells below.
model = load_keras_model('baseline')
model.summary()

2024-07-28 18:00:30.329769: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7505 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080, pci bus id: 0000:82:00.0, compute capability: 6.1


Model: "prosit_intensity_predictor"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  464       
                                                                 
 sequential (Sequential)     (None, 30, 512)           1996800   
                                                                 
 sequential_1 (Sequential)   multiple                  4608      
                                                                 
 sequential_2 (Sequential)   (None, 29, 512)           1576806   
                                                                 
 encoder_att (AttentionLaye  multiple                  542       
 r)                                                              
                                                                 
 sequential_3 (Sequential)   multiple                  0         
                                        

In [8]:
# Build the dataset from the selected parquet file for the loaded model, using
# the modifications and ion types chosen earlier in the notebook.
# The recorded run processed two subsets of 7169 and 1793 examples and warned
# that the dataset contains tokens the loaded model does not support.
dataset = process_dataset(
    parquet_file_path=parquet_path,
    model=model,
    modifications=modifications,
    ion_types=ion_types,
    label_column='intensities_raw',  # column holding the training labels
    val_ratio=0.2  # presumably the fraction held out for validation — confirm
)


            There are new tokens in the dataset, which are not supported by the loaded model.
            Either load a different model or transfer learning needs to be done.
            


Mapping SequenceParsingProcessor: 100%|██████████| 7169/7169 [00:00<00:00, 8894.50 examples/s] 
Mapping SequenceParsingProcessor: 100%|██████████| 1793/1793 [00:00<00:00, 10351.83 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 7169/7169 [00:00<00:00, 17416.12 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 1793/1793 [00:00<00:00, 11261.07 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 7169/7169 [00:00<00:00, 17009.59 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 1793/1793 [00:00<00:00, 14568.72 examples/s]
Filter: 100%|██████████| 7169/7169 [00:00<00:00, 271417.30 examples/s]
Filter: 100%|██████████| 1793/1793 [00:00<00:00, 131525.89 examples/s]
Casting the dataset: 100%|██████████| 7169/7169 [00:01<00:00, 4578.22 examples/s]
Casting the dataset: 100%|██████████| 1793/1793 [00:00<00:00, 4798.55 examples/s]


In [12]:
# Bundle the processed dataset and the baseline model into the configuration
# object consumed by the automatic refinement/transfer-learning trainer.
config = AutomaticRlTlTrainingConfig(
    dataset=dataset,
    baseline_model=model,
    use_wandb=False  # presumably turns off Weights & Biases logging — confirm
)

In [13]:
# Create the trainer from the configuration. In the recorded run, construction
# printed checks for the embedding layer (model and dataset modifications
# match) and the regressor layer (matching ion types).
trainer = AutomaticRlTlTraining(config)

[embedding layer]  model and dataset modifications match
[regressor layer]  matching ion types


In [None]:
# Start the training run and keep its result as `new_model`.
# NOTE(review): presumably the returned object is the refined Keras model —
# confirm against AutomaticRlTlTraining.train.
new_model = trainer.train()