In [1]:
import sys
sys.path.append('../bmpc_shared_scripts/prepare_dataset')

In [2]:
from dlomix.data.fragment_ion_intensity import FragmentIonIntensityDataset
from dlomix.constants import PTMS_ALPHABET
import tensorflow as tf
from pyarrow import parquet as pq
from dlomix.losses import masked_spectral_distance
from get_updated_alphabet import get_modification
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm



Avaliable feature extractors are (use the key of the following dict and pass it to features_to_extract in the Dataset Class):
{
   "atom_count": "Atom count of PTM.",
   "delta_mass": "Delta mass of PTM.",
   "mod_gain": "Gain of atoms due to PTM.",
   "mod_loss": "Loss of atoms due to PTM.",
   "red_smiles": "Reduced SMILES representation of PTM."
}.
When writing your own feature extractor, you can either
    (1) use the FeatureExtractor class or
    (2) write a function that can be mapped to the Hugging Face dataset.
In both cases, you can access the parsed sequence information from the dataset using the following keys, which all provide python lists:
    - _parsed_sequence: parsed sequence
    - _n_term_mods: N-terminal modifications
    - _c_term_mods: C-terminal modifications



2024-07-19 11:54:46.398514: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-19 11:54:46.398571: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-19 11:54:46.400101: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-19 11:54:46.408113: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
# Load the saved Keras model that the dataset will be checked against.
# The model file is named after the run id and lives directly inside MODEL_DIR
# (MODEL_DIR already ends with a path separator, so plain concatenation is enough).
MODEL_DIR = '/cmnfs/proj/bmpc_dlomix/models/baseline_models/noptm_baseline_full_bs1024_unmod_extended/'
RUN_NAME = '7ef3360f-2349-46c0-a905-01187d4899e2'
model = tf.keras.models.load_model(f'{MODEL_DIR}{RUN_NAME}.keras')

2024-07-19 11:54:51.427349: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:274] failed call to cuInit: CUDA_ERROR_NOT_INITIALIZED: initialization error
2024-07-19 11:54:51.427426: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:129] retrieving CUDA diagnostic information for host: minotaur.exbio.wzw.tum.de
2024-07-19 11:54:51.427435: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:136] hostname: minotaur.exbio.wzw.tum.de
2024-07-19 11:54:51.427582: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:159] libcuda reported version is: 550.90.7
2024-07-19 11:54:51.427622: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:163] kernel reported version is: 550.90.7
2024-07-19 11:54:51.427629: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:241] kernel version seems to match DSO: 550.90.7


In [5]:
# Inputs for the compatibility checks in the following cells:
# the parquet file to inspect and the fragment ion types of interest.
small_parquet_path = '/cmnfs/proj/bmpc_dlomix/datasets/parquet/noptm_baseline_small_train.parquet'
ion_types = ['y', 'b']  # checked against c/z/a/x further down to decide on the output layer

In [6]:
# Read the column names from the parquet schema and flag the dataset as
# inference-only when it has no 'intensities_raw' column.
col_names = pq.read_schema(small_parquet_path).names
inference_only = 'intensities_raw' not in col_names

In [7]:
# Collect every distinct token that occurs in the 'modified_sequence' column.
# The file is streamed batch by batch; each sequence is converted to str and
# split into tokens by get_modification, and the set keeps one copy of each.
parquet_file = pq.ParquetFile(small_parquet_path)
dataset_tokens = {
    token
    for record_batch in tqdm(parquet_file.iter_batches())
    for sequence in record_batch['modified_sequence']
    for token in get_modification(str(sequence))
}

1it [00:00, 11.00it/s]


In [8]:
# Compare the model alphabet with the tokens found in the dataset.
# If the dataset contains tokens the model does not know, the alphabet has to be
# extended, which in turn means a new embedding layer is needed.
model_tokens = set(model.alphabet.keys())
difference = dataset_tokens - model_tokens
if not difference:
    print('No tokens unknown to the model appear in the dataset!')
    new_alphabet = model.alphabet
else:
    print(f'These tokens appear in the dataset, but are not known to the model {difference}')
    print('A new embedding layer is necessary.')
    old_alphabet = model.alphabet
    # dict.update() works in place and returns None, so assigning its result
    # left new_alphabet as None. Build the extended alphabet on a copy instead,
    # which also keeps the alphabet stored on the loaded model untouched.
    new_alphabet = dict(old_alphabet)
    # Sort the unknown tokens so the assigned indices are reproducible across
    # runs (set iteration order is not stable).
    # New indices continue at len(old_alphabet) + 1 -- assumes the existing
    # indices are 1..len(old_alphabet); TODO confirm against the model alphabet.
    new_alphabet.update(
        {token: index for index, token in enumerate(sorted(difference), start=len(old_alphabet) + 1)}
    )

No tokens unknown to the model appear in the dataset!


In [9]:
# Check the requested ion types: if they contain anything other than b and y ions
# (c, z, a or x), a new output layer is necessary.
number_of_ions = len(ion_types)
if any(ion_type in ('c', 'z', 'a', 'x') for ion_type in ion_types):
    # number_of_ions is already an int; the previous len(number_of_ions) raised a
    # TypeError as soon as a new ion type was present.
    if number_of_ions == 2:
        print('New ion types detected, but only 2 ion types present. -> reinitialize the output layer')
    elif number_of_ions > 2 and 'y' in ion_types and 'b' in ion_types:
        print('New Ion types in addition to y and b ions detected -> new output layer, but can keep trained weights for y and b ions')
    # NOTE(review): other combinations (a single new ion type, or more than two
    # ion types without both y and b) still produce no message.
else:
    print('No new ion types detected. Output layer can stay the same.')

No new ion types detected. Output layer can stay the same.


In [13]:
# Build the dataset with the alphabet chosen in the alphabet-comparison cell.
# NOTE: four ion types are passed here as a literal list; this is independent of
# the `ion_types` variable used by the ion type check.
dataset_kwargs = dict(
    data_source=small_parquet_path,
    data_format='parquet',
    inference_only=inference_only,
    alphabet=new_alphabet,
    encoding_scheme='naive-mods',
    model_features=["precursor_charge_onehot", "collision_energy_aligned_normed", "method_nbr"],
    ion_types=['y', 'b', 'z', 'c'],
)
ds = FragmentIonIntensityDataset(**dataset_kwargs)

Mapping SequenceParsingProcessor: 100%|██████████| 6400/6400 [00:00<00:00, 10424.86 examples/s]
Mapping SequenceParsingProcessor: 100%|██████████| 1600/1600 [00:00<00:00, 9912.83 examples/s] 
Mapping SequenceEncodingProcessor: 100%|██████████| 6400/6400 [00:00<00:00, 15803.83 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 1600/1600 [00:00<00:00, 10502.73 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 6400/6400 [00:00<00:00, 17052.17 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 1600/1600 [00:00<00:00, 12661.72 examples/s]
Filter: 100%|██████████| 6400/6400 [00:00<00:00, 326997.42 examples/s]
Filter: 100%|██████████| 1600/1600 [00:00<00:00, 71847.19 examples/s]
Casting the dataset: 100%|██████████| 6390/6390 [00:01<00:00, 4419.79 examples/s]
Casting the dataset: 100%|██████████| 1598/1598 [00:00<00:00, 4469.37 examples/s]


In [11]:
# Show the processed dataset to inspect its splits, columns and row counts.
ds.hf_dataset

DatasetDict({
    train: Dataset({
        features: ['modified_sequence', 'intensities_raw', 'precursor_charge_onehot', 'collision_energy_aligned_normed', 'method_nbr', '_parsed_sequence', '_n_term_mods', '_c_term_mods'],
        num_rows: 6391
    })
    val: Dataset({
        features: ['modified_sequence', 'intensities_raw', 'precursor_charge_onehot', 'collision_energy_aligned_normed', 'method_nbr', '_parsed_sequence', '_n_term_mods', '_c_term_mods'],
        num_rows: 1597
    })
})

In [14]:
# Show which ion types the dataset object was configured with.
ds.ion_types

['y', 'b', 'z', 'c']

### Test the `process_dataset` function on different use cases

In [55]:
import sys
sys.path.append('../bmpc_shared_scripts/oktoberfest_interface')
from oktoberfest_interface import process_dataset

In [44]:
# different dataset paths
etd_dataset = '/cmnfs/proj/bmpc_dlomix/datasets/parquet/new_ion_types_ETD_support_edited.parquet'
inference_only_ds = 'test_inference_only.parquet'
single_ptm = '/cmnfs/data/proteomics/Prosit_PTMs/21PTMs/Kmod_Formyl.parquet'

In [45]:
# model path
model_path = '/cmnfs/proj/bmpc_dlomix/models/baseline_models/noptm_baseline_full_bs1024_unmod_extended/7ef3360f-2349-46c0-a905-01187d4899e2.keras'

In [56]:
# Use case 1: ETD dataset with z and c ions, model given as a path to a saved .keras file.
# Rebinds `ds` and `model` from the earlier cells.
ds, model = process_dataset(etd_dataset, model_path, ion_types=['z', 'c'])

                Number of ions is the same as the loaded model supports, but the ion types are different.
                The model probably needs to be refined to achieve a better performance on these new ion types.
                
  """


Start processing the dataset...


Mapping SequenceParsingProcessor: 100%|██████████| 6302/6302 [00:00<00:00, 10704.92 examples/s]
Mapping SequenceParsingProcessor: 100%|██████████| 1576/1576 [00:00<00:00, 10473.91 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 6302/6302 [00:00<00:00, 18409.15 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 1576/1576 [00:00<00:00, 12151.23 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 6302/6302 [00:00<00:00, 19121.32 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 1576/1576 [00:00<00:00, 13252.84 examples/s]
Filter: 100%|██████████| 6302/6302 [00:00<00:00, 354984.54 examples/s]
Filter: 100%|██████████| 1576/1576 [00:00<00:00, 155574.93 examples/s]
Casting the dataset: 100%|██████████| 6300/6300 [00:01<00:00, 4535.82 examples/s]
Casting the dataset: 100%|██████████| 1575/1575 [00:00<00:00, 2930.95 examples/s]

The available data splits are: train, val





dict_keys(['train', 'val'])

In [66]:
# Use case 2: inference-only parquet file with y and b ions.
# The model is given as the string 'unmod_ext' rather than a file path
# (presumably a model alias resolved inside process_dataset -- TODO confirm).
ds, model = process_dataset(inference_only_ds, 'unmod_ext', ion_types=['y', 'b'])

Start processing the dataset...


Generating train split: 7878 examples [00:00, 190705.20 examples/s]
                This is a inference only dataset! You can only make predictions with this dataset! Attempting to
                train a model with this dataset will result in an error!
                
Mapping SequenceParsingProcessor: 100%|██████████| 7878/7878 [00:00<00:00, 61882.63 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 7878/7878 [00:00<00:00, 52750.79 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 7878/7878 [00:00<00:00, 48444.92 examples/s]
Filter: 100%|██████████| 7878/7878 [00:00<00:00, 484888.50 examples/s]
Casting the dataset: 100%|██████████| 7875/7875 [00:01<00:00, 6177.81 examples/s]

The available data splits are: inference





In [72]:
# Use case 3: single-PTM dataset, passing the modification 'K[UNIMOD:1]' explicitly.
# ion_types is not passed in this call.
ds, model = process_dataset(single_ptm, 'unmod_ext', modifications=['K[UNIMOD:1]'])

            There are new tokens in the dataset, which are not supported by the loaded model.
            Either load a different model or transfer learning needs to be done.
            


Start processing the dataset...


Mapping SequenceParsingProcessor: 100%|██████████| 6839/6839 [00:01<00:00, 6166.51 examples/s]
Mapping SequenceParsingProcessor: 100%|██████████| 1710/1710 [00:00<00:00, 7753.03 examples/s] 
Mapping SequenceEncodingProcessor: 100%|██████████| 6839/6839 [00:00<00:00, 17776.25 examples/s]
Mapping SequenceEncodingProcessor: 100%|██████████| 1710/1710 [00:00<00:00, 10453.15 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 6839/6839 [00:00<00:00, 15557.11 examples/s]
Mapping SequencePaddingProcessor: 100%|██████████| 1710/1710 [00:01<00:00, 1686.71 examples/s]
Filter: 100%|██████████| 6839/6839 [00:00<00:00, 136080.63 examples/s]
Filter: 100%|██████████| 1710/1710 [00:02<00:00, 676.75 examples/s]
Casting the dataset: 100%|██████████| 6839/6839 [00:02<00:00, 3092.24 examples/s]
Casting the dataset: 100%|██████████| 1710/1710 [00:00<00:00, 3025.42 examples/s]

The available data splits are: train, val



