### Check for unseen modifications that might explain negative values in the sequences

The invalid rows are exactly the sequences that contain the large negative values.

In [1]:
import pandas as pd
import re
# Load the parquet file whose raw `modified_sequence` strings are scanned for
# UNIMOD tags below (judging by the file name: the small no-PTM baseline test set).
df = pd.read_parquet('/cmnfs/proj//bmpc_dlomix/datasets/parquet/noptm_baseline_small_test.parquet')
# Alternative input kept for reference (path suggests the unmodified Prosit intensity test set):
# df = pd.read_parquet('/cmnfs/data/proteomics/Prosit_unmod/intensity/no_aug_test.parquet')

In [2]:
# Show which columns the loaded dataframe provides.
df.columns

Index(['raw_file', 'scan_number', 'method_nbr', 'precursor_charge_onehot',
       'collision_energy_aligned_normed', 'intensities_raw', 'package',
       'modified_sequence', 'sub'],
      dtype='object')

In [3]:
# Pattern for one UNIMOD tag, optionally preceded by a single word character
# (the residue letter the tag is attached to).
p = r'\w?\[UNIMOD:\d{1,4}\]'
# For every sequence string, collect the list of all tags it contains.
test = df['modified_sequence'].apply(re.compile(p).findall)

In [4]:
# Distinct modification tags found anywhere in the dataset.
# `test` holds one list of tags per row, so flatten it straight into a set
# instead of calling `Series.apply` purely for its side effects.
mods = {tag for row_tags in test for tag in row_tags}
print(mods)

{'[UNIMOD:737]', '[UNIMOD:1]', 'M[UNIMOD:35]', 'C[UNIMOD:4]', 'K[UNIMOD:737]'}


In [5]:
# Index labels of all rows whose extracted tags include a bare [UNIMOD:1]
# (no residue prefix) or a K[UNIMOD:737] modification.
invalid_rows = [
    label
    for label, row_tags in test.items()
    if any(tag in ('[UNIMOD:1]', 'K[UNIMOD:737]') for tag in row_tags)
]
len(invalid_rows)

53

The \[UNIMOD:1\] and K\[UNIMOD:737\] modifications do not appear in the naive mods alphabet, which causes the large negative values.

In [7]:
# Load the small no-PTM dataset processed with the naive alphabet and check
# whether its entries correspond to the negative values.
from dlomix.data import load_processed_dataset
no_ptm_naive = load_processed_dataset('/cmnfs/proj/prosit_astral/bmpc_dlomix_group/datasets/processed/noptm_baseline_small_bs1024_naivemods')

In [8]:
# One sequence per batch -- presumably so that the batch counter used when
# iterating the test data lines up with a single row of `df` (TODO confirm).
no_ptm_naive.batch_size = 1

In [13]:
# Smallest value representable as int64 (-9223372036854775808); this is the
# large negative value being searched for in the encoded sequences.
INT64_MIN = -(2 ** 63)

# Map batch position -> encoded sequence array for every test batch that
# contains INT64_MIN. With batch_size == 1 each batch should hold a single
# sequence, so the position is presumably also the row position in `df`.
invalid_seqs = {}
for i, (batch, _) in enumerate(no_ptm_naive.tensor_test_data):
    # Convert the tensor once and reuse the array for both the check and the store.
    encoded = batch['modified_sequence'].numpy()
    if encoded.min() == INT64_MIN:
        invalid_seqs[i] = encoded

In [10]:
# Print each affected raw sequence string next to its integer encoding
# (first and only entry of the stored batch array).
for row_label, encoded_batch in invalid_seqs.items():
    raw_sequence = df.loc[row_label, 'modified_sequence']
    print(raw_sequence, list(encoded_batch[0]))

[UNIMOD:1]-PHFTVVPVDGPR-[] [-9223372036854775808, 13, 7, 5, 17, 18, 18, 13, 18, 3, 6, 13, 15, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[UNIMOD:737]-APFTGITDFSVTK[UNIMOD:737]-[] [21, 1, 13, 5, 17, 6, 8, 17, 3, 5, 16, 18, 17, -9223372036854775808, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[UNIMOD:737]-NFWIGLTYK[UNIMOD:737]-[] [21, 12, 5, 19, 8, 6, 10, 17, 20, -9223372036854775808, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[UNIMOD:1]-AHAGGGSGGSGAGGPAGR-[] [-9223372036854775808, 1, 7, 1, 6, 6, 6, 16, 6, 6, 16, 6, 1, 6, 6, 13, 1, 6, 15, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[UNIMOD:1]-DNLSDTLKK-[] [-9223372036854775808, 3, 12, 10, 16, 3, 17, 10, 9, 9, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[UNIMOD:1]-AASLVGKKIVFVTGNAK-[] [-9223372036854775808, 1, 1, 16, 10, 18, 6, 9, 9, 8, 18, 5, 18, 17, 6, 12, 1, 9, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[UNIMOD:737]-VTHVEDLNAK[UNIMOD:737]-[] [21, 18, 17, 7, 18, 4, 3, 10, 12, 1, -9223372036854775808, 22,

In [11]:
# Compare how many rows were flagged via the regex scan with how many encoded
# sequences contain the large negative value.
len(invalid_rows), len(invalid_seqs)

(53, 53)

In [12]:
# Both detection routes must flag exactly the same set of row indices.
assert set(invalid_seqs) == set(invalid_rows)