In [7]:
import sys

# The shared-script directories have to be on sys.path BEFORE the project-local
# helper modules are imported; otherwise the imports below only work when the
# kernel already carries the path from an earlier run of this cell.
# NOTE(review): `get_updated_alphabet` presumably lives in one of these
# directories -- confirm.
for _shared_dir in ['../bmpc_shared_scripts/oktoberfest_interface', '../bmpc_shared_scripts/prepare_dataset']:
    # Guard against duplicates so that re-running the cell keeps sys.path stable.
    if _shared_dir not in sys.path:
        sys.path.append(_shared_dir)

from dlomix.data.fragment_ion_intensity import FragmentIonIntensityDataset
from dlomix.constants import PTMS_ALPHABET
import tensorflow as tf
from pyarrow import parquet as pq
from dlomix.losses import masked_spectral_distance
from tqdm import tqdm

# Project-local helpers (resolved through the sys.path entries added above).
from get_updated_alphabet import get_modification
from oktoberfest_interface import process_dataset, download_model_from_github, load_keras_model

In [3]:
# Reload imported modules before each cell execution so that edits to the
# shared helper scripts are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
# Load the baseline model; the file name is the run id plus the '.keras' suffix
# inside MODEL_DIR (MODEL_DIR already ends with a path separator).
MODEL_DIR = '/cmnfs/proj/bmpc_dlomix/models/baseline_models/noptm_baseline_full_bs1024_unmod_extended/'
RUN_NAME = '7ef3360f-2349-46c0-a905-01187d4899e2'
model = tf.keras.models.load_model(f'{MODEL_DIR}{RUN_NAME}.keras')

2024-07-23 08:22:29.155748: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:274] failed call to cuInit: CUDA_ERROR_NOT_INITIALIZED: initialization error
2024-07-23 08:22:29.155796: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:129] retrieving CUDA diagnostic information for host: minotaur.exbio.wzw.tum.de
2024-07-23 08:22:29.155806: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:136] hostname: minotaur.exbio.wzw.tum.de
2024-07-23 08:22:29.155965: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:159] libcuda reported version is: 550.90.7
2024-07-23 08:22:29.156000: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:163] kernel reported version is: 550.90.7
2024-07-23 08:22:29.156007: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:241] kernel version seems to match DSO: 550.90.7


In [5]:
# Parquet file that is inspected and loaded by the following cells.
small_parquet_path = '/cmnfs/proj/bmpc_dlomix/datasets/parquet/noptm_baseline_small_train.parquet'
# Ion types that the ion-type check further down is run against.
ion_types = ['y', 'b']

In [6]:
# Without an 'intensities_raw' column there are no labels, so the dataset can
# only be used for inference. Only the parquet schema is read here.
col_names = pq.read_schema(small_parquet_path).names
inference_only = 'intensities_raw' not in col_names

In [7]:
# Gather every token that occurs in the 'modified_sequence' column,
# streaming the parquet file batch by batch.
file = pq.ParquetFile(small_parquet_path)
dataset_tokens = set()
for record_batch in tqdm(file.iter_batches()):
    for sequence in record_batch['modified_sequence']:
        # get_modification yields the tokens of a single sequence string
        dataset_tokens.update(get_modification(str(sequence)))

1it [00:00, 11.00it/s]


In [8]:
# Compare the model alphabet with the tokens found in the dataset.
# Tokens unknown to the model need their own index, which in turn requires
# a new (larger) embedding layer.
model_tokens = set(model.alphabet.keys())
difference = dataset_tokens - model_tokens
if not difference:
    print('No tokens unknown to the model appear in the dataset!')
    new_alphabet = model.alphabet
else:
    print(f'These tokens appear in the dataset, but are not known to the model {difference}')
    print('A new embedding layer is necessary.')
    old_alphabet = model.alphabet
    # dict.update() works in place and returns None, so assigning its result
    # would leave new_alphabet empty. Extend a copy instead; this also keeps
    # the alphabet stored on the loaded model unchanged.
    new_alphabet = dict(old_alphabet)
    # Sorting the unknown tokens makes the index assignment reproducible
    # (iteration order of a set of strings can differ between kernel sessions).
    # NOTE(review): start index assumes the existing indices are 1..len(alphabet) -- confirm.
    new_alphabet.update(
        {token: index for index, token in enumerate(sorted(difference), start=len(old_alphabet) + 1)}
    )

No tokens unknown to the model appear in the dataset!


In [9]:
# Check the requested ion types: anything other than b and y ions means the
# output layer of the model has to change.
number_of_ions = len(ion_types)
if any(ion_type in ('c', 'z', 'a', 'x') for ion_type in ion_types):
    # number_of_ions is already an int; calling len() on it raises a TypeError.
    if number_of_ions == 2:
        print('New ion types detected, but only 2 ion types present. -> reinitialize the output layer')
    elif number_of_ions > 2 and 'y' in ion_types and 'b' in ion_types:
        print('New Ion types in addition to y and b ions detected -> new output layer, but can keep trained weights for y and b ions')
    else:
        # Remaining combinations (a single new ion type, or more than two
        # types without both y and b) previously produced no message at all.
        print('New ion types detected. -> new output layer is necessary')
else:
    print('No new ion types detected. Output layer can stay the same.')

No new ion types detected. Output layer can stay the same.


In [13]:
# Build the dataset from the small parquet file, using the alphabet and the
# inference_only flag determined in the cells above.
# NOTE(review): ion_types is hard-coded to four ion types here, whereas the
# ion-type check above ran on the `ion_types` variable (['y', 'b']) -- confirm
# whether the variable should be passed instead.
ds = FragmentIonIntensityDataset(
    data_source=small_parquet_path,
    data_format='parquet',
    inference_only=inference_only,
    alphabet=new_alphabet,
    encoding_scheme='naive-mods',
    model_features=["precursor_charge_onehot", "collision_energy_aligned_normed", "method_nbr"],
    ion_types=['y', 'b', 'z', 'c']
)

Mapping SequenceParsingProcessor: 100%|██████████| 6400/6400 [00:00<00:00, 10424.86 examples/s]
Mapping SequenceParsingProcessor: 100%|██████████| 1600/1600 [00:00<00:00, 9912.83 examples/s] 
Mapping SequenceEncodingProcessor: 100%|██████████| 6400/6400 [00:00<00:00, 15803.83 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 1600/1600 [00:00<00:00, 10502.73 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 6400/6400 [00:00<00:00, 17052.17 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 1600/1600 [00:00<00:00, 12661.72 examples/s]
Filter: 100%|██████████| 6400/6400 [00:00<00:00, 326997.42 examples/s]
Filter: 100%|██████████| 1600/1600 [00:00<00:00, 71847.19 examples/s]
Casting the dataset: 100%|██████████| 6390/6390 [00:01<00:00, 4419.79 examples/s]
Casting the dataset: 100%|██████████| 1598/1598 [00:00<00:00, 4469.37 examples/s]


In [11]:
# Show the splits and columns of the processed dataset.
# NOTE(review): the recorded output comes from an earlier execution (In [11])
# than the dataset construction (In [13]); its row counts differ from the
# casting log of that cell -- re-run to refresh.
ds.hf_dataset

DatasetDict({
    train: Dataset({
        features: ['modified_sequence', 'intensities_raw', 'precursor_charge_onehot', 'collision_energy_aligned_normed', 'method_nbr', '_parsed_sequence', '_n_term_mods', '_c_term_mods'],
        num_rows: 6391
    })
    val: Dataset({
        features: ['modified_sequence', 'intensities_raw', 'precursor_charge_onehot', 'collision_energy_aligned_normed', 'method_nbr', '_parsed_sequence', '_n_term_mods', '_c_term_mods'],
        num_rows: 1597
    })
})

In [14]:
# Confirm which ion types the dataset object was configured with.
ds.ion_types

['y', 'b', 'z', 'c']

### Test downloading the model from GitHub

In [10]:
# Obtain the path of the default model; the recorded output shows that a
# cached copy is reused here.
model_path = download_model_from_github()

Using cached model: /cmnfs/home/f.kapitza/.dlomix/models/prosit_baseline_model.keras


In [11]:
# Load the downloaded model and print its layers.
# Note: this rebinds `model`, replacing the baseline model loaded from MODEL_DIR.
model = tf.keras.models.load_model(model_path)
model.summary()

Model: "prosit_intensity_predictor_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     multiple                  464       
                                                                 
 sequential_5 (Sequential)   (None, 30, 512)           1996800   
                                                                 
 sequential_6 (Sequential)   multiple                  4608      
                                                                 
 sequential_7 (Sequential)   (None, 29, 512)           1576806   
                                                                 
 encoder_att (AttentionLaye  multiple                  542       
 r)                                                              
                                                                 
 sequential_8 (Sequential)   multiple                  0         
                                      

### Test the `process_dataset` function on different use cases

In [5]:
# different dataset paths
etd_dataset = '/cmnfs/proj/bmpc_dlomix/datasets/parquet/new_ion_types_ETD_support_edited.parquet'
inference_only_ds = 'test_inference_only.parquet'
single_ptm = '/cmnfs/data/proteomics/Prosit_PTMs/21PTMs/Kmod_Formyl.parquet'

In [6]:
# Path of the baseline model file.
# Note: this rebinds `model_path` (previously the downloaded model) and is the
# same file as MODEL_DIR + RUN_NAME + '.keras' from the first section.
model_path = '/cmnfs/proj/bmpc_dlomix/models/baseline_models/noptm_baseline_full_bs1024_unmod_extended/7ef3360f-2349-46c0-a905-01187d4899e2.keras'

In [8]:
# Load the baseline model through the shared helper; it is passed to
# process_dataset in the test cases below.
baseline_model = load_keras_model(model_path)

In [15]:
# Use case 1: dataset with new ion types (z and c instead of y and b),
# processed together with the baseline model.
# Note: `ds` and `model` are rebound by every process_dataset call below.
ds, model = process_dataset(etd_dataset, baseline_model, ion_types=['z', 'c'])

                Number of ions is the same as the loaded model supports, but the ion types are different.
                The model probably needs to be refined to achieve a better performance on these new ion types.
                


Start processing the dataset...


Mapping SequenceParsingProcessor: 100%|██████████| 6302/6302 [00:00<00:00, 10629.34 examples/s]
Mapping SequenceParsingProcessor: 100%|██████████| 1576/1576 [00:00<00:00, 10483.45 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 6302/6302 [00:00<00:00, 18676.26 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 1576/1576 [00:00<00:00, 12869.19 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 6302/6302 [00:00<00:00, 19678.77 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 1576/1576 [00:00<00:00, 13256.93 examples/s]
Filter: 100%|██████████| 6302/6302 [00:00<00:00, 225591.05 examples/s]
Filter: 100%|██████████| 1576/1576 [00:00<00:00, 110007.21 examples/s]
Casting the dataset: 100%|██████████| 6299/6299 [00:01<00:00, 4537.23 examples/s]
Casting the dataset: 100%|██████████| 1576/1576 [00:00<00:00, 3819.53 examples/s]

The available data splits are: train, val





In [12]:
# Use case 2: inference-only dataset without passing a model; the recorded
# output shows the cached default model being used in that case.
ds, model = process_dataset(inference_only_ds, ion_types=['y', 'b'])

Using cached model: /cmnfs/home/f.kapitza/.dlomix/models/prosit_baseline_model.keras
Start processing the dataset...


Generating train split: 7878 examples [00:00, 252442.68 examples/s]
                This is a inference only dataset! You can only make predictions with this dataset! Attempting to
                train a model with this dataset will result in an error!
                
Mapping SequenceParsingProcessor: 100%|██████████| 7878/7878 [00:00<00:00, 53786.87 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 7878/7878 [00:00<00:00, 52111.78 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 7878/7878 [00:00<00:00, 51173.66 examples/s]
Filter: 100%|██████████| 7878/7878 [00:00<00:00, 489507.38 examples/s]
Casting the dataset: 100%|██████████| 7875/7875 [00:01<00:00, 6364.76 examples/s]

The available data splits are: inference





In [14]:
# Use case 3: dataset containing a new modification, processed together with
# the baseline model; the modification is announced via `modifications`.
ds, model = process_dataset(single_ptm, baseline_model, modifications=['K[UNIMOD:122]'])

Start processing the dataset...


Mapping SequenceParsingProcessor: 100%|██████████| 7169/7169 [00:00<00:00, 10365.77 examples/s]
Mapping SequenceParsingProcessor: 100%|██████████| 1793/1793 [00:00<00:00, 10143.96 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 7169/7169 [00:00<00:00, 17843.58 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 1793/1793 [00:00<00:00, 12396.58 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 7169/7169 [00:00<00:00, 17675.83 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 1793/1793 [00:00<00:00, 13033.24 examples/s]
Filter: 100%|██████████| 7169/7169 [00:00<00:00, 242667.79 examples/s]
Filter: 100%|██████████| 1793/1793 [00:00<00:00, 188051.99 examples/s]
Casting the dataset: 100%|██████████| 7169/7169 [00:01<00:00, 4856.73 examples/s]
Casting the dataset: 100%|██████████| 1793/1793 [00:00<00:00, 4664.25 examples/s]


The available data splits are: train, val
