In [1]:
from dlomix.data.fragment_ion_intensity import FragmentIonIntensityDataset
from dlomix.constants import PTMS_ALPHABET
import tensorflow as tf
from pyarrow import parquet as pq
from dlomix.losses import masked_spectral_distance
from tqdm import tqdm
import sys
sys.path.extend(['../bmpc_shared_scripts/oktoberfest_interface', '../bmpc_shared_scripts/prepare_dataset'])
from get_updated_alphabet import get_modification
from dlomix.interface.oktoberfest_interface import process_dataset, download_model_from_github, load_keras_model

  from .autonotebook import tqdm as notebook_tqdm



Avaliable feature extractors are (use the key of the following dict and pass it to features_to_extract in the Dataset Class):
{
   "atom_count": "Atom count of PTM.",
   "delta_mass": "Delta mass of PTM.",
   "mod_gain": "Gain of atoms due to PTM.",
   "mod_loss": "Loss of atoms due to PTM.",
   "red_smiles": "Reduced SMILES representation of PTM."
}.
When writing your own feature extractor, you can either
    (1) use the FeatureExtractor class or
    (2) write a function that can be mapped to the Hugging Face dataset.
In both cases, you can access the parsed sequence information from the dataset using the following keys, which all provide python lists:
    - _parsed_sequence: parsed sequence
    - _n_term_mods: N-terminal modifications
    - _c_term_mods: C-terminal modifications



2024-07-28 18:05:53.449658: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-28 18:05:53.449710: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-28 18:05:53.451260: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-28 18:05:53.460252: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Enable IPython's autoreload extension in mode 2, which reloads imported modules
# before executing each cell so that code edits are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
# Load the baseline model; its file is named after the run id and lives in MODEL_DIR.
MODEL_DIR = '/cmnfs/proj/bmpc_dlomix/models/baseline_models/noptm_baseline_full_bs1024_unmod_extended/'
RUN_NAME = '7ef3360f-2349-46c0-a905-01187d4899e2'
model = tf.keras.models.load_model(f'{MODEL_DIR}{RUN_NAME}.keras')

2024-07-28 18:05:58.860918: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7505 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080, pci bus id: 0000:02:00.0, compute capability: 6.1
2024-07-28 18:05:58.861532: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 7505 MB memory:  -> device: 1, name: NVIDIA GeForce GTX 1080, pci bus id: 0000:03:00.0, compute capability: 6.1
2024-07-28 18:05:58.862098: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 136 MB memory:  -> device: 2, name: NVIDIA GeForce GTX 1080, pci bus id: 0000:81:00.0, compute capability: 6.1
2024-07-28 18:05:58.862699: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 136 MB memory:  -> device: 3, name: NVIDIA GeForce GTX 1080, pci 

In [4]:
# Parquet file inspected by the schema, token and dataset cells below.
small_parquet_path = '/cmnfs/proj/bmpc_dlomix/datasets/parquet/noptm_baseline_small_train.parquet'
# Ion types fed into the output-layer check further down.
ion_types = ['y', 'b']

In [5]:
# The dataset is treated as inference-only exactly when the parquet schema
# has no 'intensities_raw' column.
col_names = pq.read_schema(small_parquet_path).names
inference_only = 'intensities_raw' not in col_names

In [6]:
# Collect all tokens present in the dataset.
# Only the 'modified_sequence' column is requested from the parquet file, so the
# remaining columns are not materialised while streaming the batches.
file = pq.ParquetFile(small_parquet_path)
dataset_tokens = set()
for batch in tqdm(file.iter_batches(columns=['modified_sequence'])):
    for cur_seq in batch['modified_sequence']:
        # cur_seq is a pyarrow scalar; get_modification is handed its string form.
        cur_mods = get_modification(str(cur_seq))
        dataset_tokens.update(cur_mods)

0it [00:00, ?it/s]

1it [00:00, 11.54it/s]


In [7]:
# Compare the model alphabet with the tokens found in the dataset.
# If the dataset contains tokens the model does not know, the alphabet is extended
# and the model needs a new embedding layer.
old_alphabet = model.alphabet
model_tokens = set(old_alphabet.keys())
difference = dataset_tokens - model_tokens
if not difference:
    print('No tokens unknown to the model appear in the dataset!')
    new_alphabet = old_alphabet
else:
    print(f'These tokens appear in the dataset, but are not known to the model {difference}')
    print('A new embedding layer is necessary.')
    # dict.update() works in place and returns None, so the extended alphabet is
    # built on a copy; this also leaves model.alphabet itself unmodified.
    new_alphabet = dict(old_alphabet)
    # Sorting the unknown tokens makes the index given to each one reproducible
    # across runs (set iteration order is not).
    new_alphabet.update(
        {token: index for index, token in enumerate(sorted(difference), start=len(old_alphabet) + 1)}
    )

No tokens unknown to the model appear in the dataset!


In [8]:
# Check the requested ion types: anything other than b and y ions means the
# output layer of the model has to change.
number_of_ions = len(ion_types)
has_new_ion_types = any(ion_type in ('c', 'z', 'a', 'x') for ion_type in ion_types)
if not has_new_ion_types:
    print('No new ion types detected. Output layer can stay the same.')
elif number_of_ions == 2:
    # number_of_ions is already an int; it must be compared directly, not via len().
    print('New ion types detected, but only 2 ion types present. -> reinitialize the output layer')
elif number_of_ions > 2 and 'y' in ion_types and 'b' in ion_types:
    print('New Ion types in addition to y and b ions detected -> new output layer, but can keep trained weights for y and b ions')
else:
    # Remaining combinations (a single new ion type, or more than two ion types
    # without both y and b) used to produce no message at all.
    print(f'New ion types detected ({ion_types}) -> a new output layer is necessary.')

No new ion types detected. Output layer can stay the same.


In [9]:
# Build the fragment ion intensity dataset from the small parquet file, using the
# alphabet and the inference_only flag determined in the cells above.
# NOTE(review): ion_types is hard-coded to four ion types here and does not use the
# `ion_types` variable (['y', 'b']) defined earlier — confirm this is intended.
ds = FragmentIonIntensityDataset(
    data_source=small_parquet_path,
    data_format='parquet',
    inference_only=inference_only,
    alphabet=new_alphabet,
    encoding_scheme='naive-mods',
    model_features=["precursor_charge_onehot", "collision_energy_aligned_normed", "method_nbr"],
    ion_types=['y', 'b', 'z', 'c']
)

Mapping SequenceParsingProcessor: 100%|██████████| 6400/6400 [00:00<00:00, 9306.93 examples/s] 
Mapping SequenceParsingProcessor: 100%|██████████| 1600/1600 [00:00<00:00, 9894.40 examples/s] 
Mapping SequenceEncodingProcessor: 100%|██████████| 6400/6400 [00:00<00:00, 17163.32 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 1600/1600 [00:00<00:00, 11316.78 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 6400/6400 [00:00<00:00, 18463.89 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 1600/1600 [00:00<00:00, 12405.90 examples/s]
Filter: 100%|██████████| 6400/6400 [00:00<00:00, 229183.24 examples/s]
Filter: 100%|██████████| 1600/1600 [00:00<00:00, 107800.21 examples/s]
Casting the dataset: 100%|██████████| 6391/6391 [00:01<00:00, 4534.39 examples/s]
Casting the dataset: 100%|██████████| 1597/1597 [00:00<00:00, 4795.56 examples/s]


In [10]:
# Display the underlying Hugging Face DatasetDict (splits, columns and row counts).
ds.hf_dataset

DatasetDict({
    train: Dataset({
        features: ['modified_sequence', 'intensities_raw', 'precursor_charge_onehot', 'collision_energy_aligned_normed', 'method_nbr', '_parsed_sequence', '_n_term_mods', '_c_term_mods'],
        num_rows: 6391
    })
    val: Dataset({
        features: ['modified_sequence', 'intensities_raw', 'precursor_charge_onehot', 'collision_energy_aligned_normed', 'method_nbr', '_parsed_sequence', '_n_term_mods', '_c_term_mods'],
        num_rows: 1597
    })
})

In [11]:
# Display the ion types stored on the dataset object.
ds.ion_types

['y', 'b', 'z', 'c']

### Test downloading a model from GitHub

In [12]:
# Called without arguments, so the helper's defaults decide which model is fetched.
# The returned value is passed to tf.keras.models.load_model in the next cell.
model_path = download_model_from_github()

In [13]:
# Load the downloaded model and print its layer overview.
# NOTE(review): this rebinds `model`, replacing the baseline model loaded near the
# top of the notebook; re-running earlier cells afterwards would use this model.
model = tf.keras.models.load_model(model_path)
model.summary()

Model: "prosit_intensity_predictor_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     multiple                  464       
                                                                 
 sequential_5 (Sequential)   (None, 30, 512)           1996800   
                                                                 
 sequential_6 (Sequential)   multiple                  4608      
                                                                 
 sequential_7 (Sequential)   (None, 29, 512)           1576806   
                                                                 
 encoder_att (AttentionLaye  multiple                  542       
 r)                                                              
                                                                 
 sequential_8 (Sequential)   multiple                  0         
                                      

### Test the `process_dataset` function on different use cases

In [14]:
# Dataset paths for the process_dataset use cases exercised in the cells below.
# Used with ion_types=['z', 'c'] (new ion types).
etd_dataset = '/cmnfs/proj/bmpc_dlomix/datasets/parquet/new_ion_types_ETD_support_edited.parquet'
# Used for the inference-only case, without a model.
inference_only_ds = '/cmnfs/proj/bmpc_dlomix/datasets/parquet/test_inference_only.parquet'
# Used with modifications=['K[UNIMOD:122]'] (new modification).
single_ptm = '/cmnfs/data/proteomics/Prosit_PTMs/21PTMs/Kmod_Formyl.parquet'
# Path prefix without extension; used for the already-split (train/val/test) case.
split_dataset = '/cmnfs/proj/bmpc_dlomix/datasets/parquet/noptm_baseline_small'

In [15]:
# Path of the baseline model file, assembled from the directory and run id
# defined where the baseline model is first loaded.
model_path = f'{MODEL_DIR}{RUN_NAME}.keras'

In [16]:
# Load the baseline model through the interface helper; the result is passed to
# process_dataset in the use cases below.
baseline_model = load_keras_model(model_path)

In [17]:
# Use case: dataset containing new ion types (z and c) together with a loaded model.
# The captured output shows the helper's notice that the number of ions matches the
# model but the ion types differ.
ds = process_dataset(etd_dataset, baseline_model, ion_types=['z', 'c'])


                Number of ions is the same as the loaded model supports, but the ion types are different.
                The model probably needs to be refined to achieve a better performance on these new ion types.
                


Mapping SequenceParsingProcessor: 100%|██████████| 6302/6302 [00:00<00:00, 8520.55 examples/s] 
Mapping SequenceParsingProcessor: 100%|██████████| 1576/1576 [00:00<00:00, 9532.14 examples/s] 
Mapping SequenceEncodingProcessor: 100%|██████████| 6302/6302 [00:00<00:00, 18415.41 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 1576/1576 [00:00<00:00, 12194.79 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 6302/6302 [00:00<00:00, 18189.61 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 1576/1576 [00:00<00:00, 13307.80 examples/s]
Filter: 100%|██████████| 6302/6302 [00:00<00:00, 227136.05 examples/s]
Filter: 100%|██████████| 1576/1576 [00:00<00:00, 58903.62 examples/s]
Casting the dataset: 100%|██████████| 6299/6299 [00:01<00:00, 4072.55 examples/s]
Casting the dataset: 100%|██████████| 1576/1576 [00:00<00:00, 4183.27 examples/s]


In [18]:
# Use case: inference-only dataset with no model passed in.
# The captured output shows the helper's notice that this dataset can only be used
# for predictions.
ds = process_dataset(inference_only_ds, ion_types=['y', 'b'])

Generating train split: 7878 examples [00:00, 460089.77 examples/s]
                This is a inference only dataset! You can only make predictions with this dataset! Attempting to
                train a model with this dataset will result in an error!
                
Mapping SequenceParsingProcessor: 100%|██████████| 7878/7878 [00:00<00:00, 46523.98 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 7878/7878 [00:00<00:00, 41791.38 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 7878/7878 [00:00<00:00, 43249.36 examples/s]
Filter: 100%|██████████| 7878/7878 [00:00<00:00, 375840.02 examples/s]
Casting the dataset: 100%|██████████| 7875/7875 [00:01<00:00, 5664.17 examples/s]


In [19]:
# Use case: dataset containing a new modification, declared via `modifications`.
# The captured output shows the helper's notice that the dataset has tokens the
# loaded model does not support.
ds = process_dataset(single_ptm, baseline_model, modifications=['K[UNIMOD:122]'])


            There are new tokens in the dataset, which are not supported by the loaded model.
            Either load a different model or transfer learning needs to be done.
            


Mapping SequenceParsingProcessor: 100%|██████████| 7169/7169 [00:00<00:00, 12323.67 examples/s]
Mapping SequenceParsingProcessor: 100%|██████████| 1793/1793 [00:00<00:00, 10589.28 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 7169/7169 [00:00<00:00, 16867.73 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 1793/1793 [00:00<00:00, 11372.83 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 7169/7169 [00:00<00:00, 18929.60 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 1793/1793 [00:00<00:00, 13862.03 examples/s]
Filter: 100%|██████████| 7169/7169 [00:00<00:00, 237221.14 examples/s]
Filter: 100%|██████████| 1793/1793 [00:00<00:00, 119564.80 examples/s]
Casting the dataset: 100%|██████████| 7169/7169 [00:01<00:00, 4589.33 examples/s]
Casting the dataset: 100%|██████████| 1793/1793 [00:00<00:00, 4708.55 examples/s]


In [20]:
# Use case: dataset that is already split. `split_dataset` is a path prefix; the
# captured output shows it being resolved to *_train/_val/_test.parquet files.
ds = process_dataset(split_dataset, baseline_model)

                Multiple data sources or a single non-train data source provided {'train': '/cmnfs/proj/bmpc_dlomix/datasets/parquet/noptm_baseline_small_train.parquet', 'val': '/cmnfs/proj/bmpc_dlomix/datasets/parquet/noptm_baseline_small_val.parquet', 'test': '/cmnfs/proj/bmpc_dlomix/datasets/parquet/noptm_baseline_small_test.parquet'}, please ensure that the data sources are already split into train, val and test sets
                since no splitting will happen. If not, please provide only one data_source and set the val_ratio to split the data into train and val sets."
                
Mapping SequenceParsingProcessor: 100%|██████████| 8000/8000 [00:00<00:00, 21471.73 examples/s]
Mapping SequenceParsingProcessor: 100%|██████████| 4000/4000 [00:00<00:00, 19059.69 examples/s]
Mapping SequenceParsingProcessor: 100%|██████████| 2000/2000 [00:00<00:00, 15881.56 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 8000/8000 [00:00<00:00, 16473.45 examples/s]
Mapping Sequenc

In [21]:
# Use case: same new-modification dataset as above, with an explicit test_ratio.
# The captured output shows three splits being processed instead of two.
ds = process_dataset(single_ptm, baseline_model, modifications=['K[UNIMOD:122]'], test_ratio=0.1)


            There are new tokens in the dataset, which are not supported by the loaded model.
            Either load a different model or transfer learning needs to be done.
            


Mapping SequenceParsingProcessor: 100%|██████████| 6272/6272 [00:00<00:00, 10330.99 examples/s]
Mapping SequenceParsingProcessor: 100%|██████████| 1793/1793 [00:00<00:00, 4215.72 examples/s]
Mapping SequenceParsingProcessor: 100%|██████████| 897/897 [00:00<00:00, 13546.43 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 6272/6272 [00:00<00:00, 17262.62 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 1793/1793 [00:00<00:00, 14091.51 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 897/897 [00:00<00:00, 11015.21 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 6272/6272 [00:00<00:00, 16149.35 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 1793/1793 [00:00<00:00, 12923.23 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 897/897 [00:00<00:00, 11724.15 examples/s]
Filter: 100%|██████████| 6272/6272 [00:00<00:00, 361038.03 examples/s]
Filter: 100%|██████████| 1793/1793 [00:00<00:00, 168660.14 examples/s