# Synthetic "dummy" datasets for refinement learning
### Idea 1: Use the baseline data and swap the positions of the b and y ions.
### Idea 2: Duplicate the intensities to simulate more ion types.

In [1]:
import pandas as pd
import pyarrow.parquet as pq
import numpy as np

Ion arrangement in the `intensities_raw` tensor:
`[y1_1, y1_2, y1_3, b1_1, b1_2, b1_3, y2_1, y2_2, y2_3, b2_1, b2_2, b2_3, ...]`

After swapping each y group with the b group that follows it:
`[b1_1, b1_2, b1_3, y1_1, y1_2, y1_3, b2_1, b2_2, b2_3, y2_1, y2_2, y2_3, ...]`

In [2]:
def swap_intensities(arr, group_size=3):
    """Exchange every adjacent pair of ``group_size``-long groups in ``arr``.

    With the default ``group_size=3`` a vector laid out as
    ``[y1_1, y1_2, y1_3, b1_1, b1_2, b1_3, y2_1, ...]`` becomes
    ``[b1_1, b1_2, b1_3, y1_1, y1_2, y1_3, b2_1, ...]``.

    Parameters
    ----------
    arr : array-like
        Intensity values. Any array-like is accepted (ndarray, list, tuple);
        it is flattened in C order before the groups are formed.
    group_size : int, default 3
        Number of consecutive entries that make up one group.

    Returns
    -------
    numpy.ndarray
        A new 1-D array of the same length with each pair of neighbouring
        groups swapped. The input is never modified.

    Raises
    ------
    ValueError
        If ``group_size`` is smaller than 1, or if the number of elements is
        not a multiple of ``2 * group_size`` (an unpaired group would remain).
    """
    if group_size < 1:
        raise ValueError(f"group_size must be >= 1, got {group_size}")

    # np.asarray lets plain lists/tuples through, not only ndarrays.
    flat = np.asarray(arr).reshape(-1)

    pair_len = 2 * group_size
    if flat.size % pair_len != 0:
        raise ValueError(
            f"cannot swap groups: length {flat.size} is not a multiple of "
            f"{pair_len} (2 * group_size)"
        )

    # Shape (n_pairs, 2, group_size): axis 1 indexes the two groups of a pair.
    pairs = flat.reshape(flat.size // pair_len, 2, group_size)

    # Reversing axis 1 swaps the two groups; flatten() always returns a copy.
    return pairs[:, ::-1].flatten()

In [6]:
# Path prefixes for the small baseline dataset and its swapped-ion copy;
# the split name and the '.parquet' suffix are appended per split below.
inpath = '/cmnfs/proj/bmpc_dlomix/datasets/parquet/noptm_baseline_small'
outpath = '/cmnfs/proj/bmpc_dlomix/datasets/parquet/refinement_learning_toy_dataset/noptm_baseline_small_bs1024_swapped_ions'
dataset_types = ['train', 'val', 'test']

# For every split: read it, swap the ion groups of each intensity vector, write it out.
for data_type in dataset_types:
    source_file = f'{inpath}_{data_type}.parquet'
    target_file = f'{outpath}_{data_type}.parquet'

    dataset = pd.read_parquet(source_file)
    swapped_column = dataset['intensities_raw'].apply(swap_intensities)
    dataset['intensities_raw'] = swapped_column
    dataset.to_parquet(target_file, index=False)

In [7]:
# Path prefixes for the full dataset and its swapped-ion copy;
# the split name and the '.parquet' suffix are appended per split below.
inpath = '/cmnfs/data/proteomics/Prosit_unmod/intensity/no_aug'
outpath = '/cmnfs/proj/bmpc_dlomix/datasets/parquet/refinement_learning_toy_dataset/noptm_baseline_full_swapped_ions'
dataset_types = ['train', 'val', 'test']

# For every split: read it, swap the ion groups of each intensity vector, write it out.
for data_type in dataset_types:
    source_file = f'{inpath}_{data_type}.parquet'
    target_file = f'{outpath}_{data_type}.parquet'

    dataset = pd.read_parquet(source_file)
    swapped_column = dataset['intensities_raw'].apply(swap_intensities)
    dataset['intensities_raw'] = swapped_column
    dataset.to_parquet(target_file, index=False)

Check that the swapped dataset was saved correctly:

In [9]:
# Read the untouched training split first, then its swapped-ion counterpart.
original, swapped = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "/cmnfs/proj/bmpc_dlomix/datasets/parquet/noptm_baseline_small_train.parquet",
        "/cmnfs/proj/bmpc_dlomix/datasets/parquet/refinement_learning_toy_dataset/noptm_baseline_small_bs1024_swapped_ions_train.parquet",
    )
)

In [10]:
# Display the untouched training split as the reference for the comparison.
original

Unnamed: 0,raw_file,scan_number,method_nbr,precursor_charge_onehot,collision_energy_aligned_normed,intensities_raw,package,modified_sequence,sub
0,01640c_BA2-Thermo_SRM_Pool_9_01_01-2xIT_2xHCD-...,29024,2,"[0, 1, 0, 0, 0, 0]",0.223670,"[0.2857142857142857, 0.0, -1.0, 0.0, 0.0, -1.0...",Thermo_SRM_Pool_9,[]-TGQFDSQEYTEYAVK-[],0.0
1,03036a_BB12-TUM_HLA2_120_01_01-DDA-1h-R1,24650,2,"[0, 0, 1, 0, 0, 0]",0.312461,"[0.66, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.0, 0.0...",TUM_HLA2_122_01,[]-AHPKLVFSQEGRY-[],0.0
2,02445d_BH9-TUM_HLA_93_01_01-2xIT_2xHCD-1h-R4,10628,2,"[0, 1, 0, 0, 0, 0]",0.216905,"[0.0, 0.0, -1.0, 0.0, 0.0, -1.0, 0.25, 0.0, -1...",TUM_HLA_93_01,[]-NEEPKVATA-[],0.0
3,02445d_BF9-TUM_HLA_69_01_01-2xIT_2xHCD-1h-R4,49207,2,"[0, 1, 0, 0, 0, 0]",0.231191,"[0.04, 0.0, -1.0, 0.0, 0.0, -1.0, 0.04, 0.0, -...",TUM_HLA_69_01,[]-WPEAWRQQL-[],0.0
4,01812a_GG1-TUM_second_pool_91_01_01-3xHCD-1h-R1,51757,2,"[0, 0, 1, 0, 0, 0]",0.273261,"[0.02, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",TUM_first_pool_105,[]-SDLEANVEALTQEIDFLRR-[],0.0
...,...,...,...,...,...,...,...,...,...
7995,01640c_BD2-Thermo_SRM_Pool_12_01_01-3xHCD-1h-R2,55210,2,"[0, 1, 0, 0, 0, 0]",0.372297,"[0.51, 0.0, -1.0, 0.0, 0.0, -1.0, 0.5, 0.0, -1...",Thermo_SRM_Pool_12,[]-EELSGSLLQSVQEALEER-[],0.0
7996,02097a_BF7-TUM_isoform_67_01_01-3xHCD-1h-R3,32437,2,"[0, 0, 0, 1, 0, 0]",0.266420,"[0.22, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0,...",TUM_isoform_67_01,[]-PLLGSLRHMAPIR-[],0.0
7997,02445b_BG5-TUM_HLA_77_01_01-3xHCD-1h-R4,22717,2,"[0, 0, 1, 0, 0, 0]",0.283213,"[0.01, 0.0, 0.0, 0.03, 0.0, 0.0, 0.24, 0.0, 0....",TUM_HLA_77_01,[]-RPASPPVVKL-[],0.0
7998,01717a_BF2-TUM_second_pool_26_01_01-DDA-1h-R1,12543,2,"[0, 0, 1, 0, 0, 0]",0.325728,"[0.67, 0.0, 0.0, 0.0, 0.0, 0.0, 0.32, 0.0, 0.0...",TUM_second_pool_26,[]-EIPEVKDEEK-[],0.0


In [11]:
# Display the swapped split; in `intensities_raw` each pair of neighbouring
# three-value groups should appear exchanged relative to `original`.
swapped

Unnamed: 0,raw_file,scan_number,method_nbr,precursor_charge_onehot,collision_energy_aligned_normed,intensities_raw,package,modified_sequence,sub
0,01640c_BA2-Thermo_SRM_Pool_9_01_01-2xIT_2xHCD-...,29024,2,"[0, 1, 0, 0, 0, 0]",0.223670,"[0.0, 0.0, -1.0, 0.2857142857142857, 0.0, -1.0...",Thermo_SRM_Pool_9,[]-TGQFDSQEYTEYAVK-[],0.0
1,03036a_BB12-TUM_HLA2_120_01_01-DDA-1h-R1,24650,2,"[0, 0, 1, 0, 0, 0]",0.312461,"[0.0, 0.0, 0.0, 0.66, 0.0, 0.0, 1.0, 0.0, 0.0,...",TUM_HLA2_122_01,[]-AHPKLVFSQEGRY-[],0.0
2,02445d_BH9-TUM_HLA_93_01_01-2xIT_2xHCD-1h-R4,10628,2,"[0, 1, 0, 0, 0, 0]",0.216905,"[0.0, 0.0, -1.0, 0.0, 0.0, -1.0, 0.97, 0.0, -1...",TUM_HLA_93_01,[]-NEEPKVATA-[],0.0
3,02445d_BF9-TUM_HLA_69_01_01-2xIT_2xHCD-1h-R4,49207,2,"[0, 1, 0, 0, 0, 0]",0.231191,"[0.0, 0.0, -1.0, 0.04, 0.0, -1.0, 0.12, 0.0, -...",TUM_HLA_69_01,[]-WPEAWRQQL-[],0.0
4,01812a_GG1-TUM_second_pool_91_01_01-3xHCD-1h-R1,51757,2,"[0, 0, 1, 0, 0, 0]",0.273261,"[0.0, 0.0, 0.0, 0.02, 0.0, 0.0, 0.09, 0.0, 0.0...",TUM_first_pool_105,[]-SDLEANVEALTQEIDFLRR-[],0.0
...,...,...,...,...,...,...,...,...,...
7995,01640c_BD2-Thermo_SRM_Pool_12_01_01-3xHCD-1h-R2,55210,2,"[0, 1, 0, 0, 0, 0]",0.372297,"[0.0, 0.0, -1.0, 0.51, 0.0, -1.0, 0.6, 0.0, -1...",Thermo_SRM_Pool_12,[]-EELSGSLLQSVQEALEER-[],0.0
7996,02097a_BF7-TUM_isoform_67_01_01-3xHCD-1h-R3,32437,2,"[0, 0, 0, 1, 0, 0]",0.266420,"[0.0, 0.0, 0.0, 0.22, 0.0, 0.0, 0.2, 0.0, 0.0,...",TUM_isoform_67_01,[]-PLLGSLRHMAPIR-[],0.0
7997,02445b_BG5-TUM_HLA_77_01_01-3xHCD-1h-R4,22717,2,"[0, 0, 1, 0, 0, 0]",0.283213,"[0.03, 0.0, 0.0, 0.01, 0.0, 0.0, 0.05, 0.0, 0....",TUM_HLA_77_01,[]-RPASPPVVKL-[],0.0
7998,01717a_BF2-TUM_second_pool_26_01_01-DDA-1h-R1,12543,2,"[0, 0, 1, 0, 0, 0]",0.325728,"[0.0, 0.0, 0.0, 0.67, 0.0, 0.0, 0.07, 0.0, 0.0...",TUM_second_pool_26,[]-EIPEVKDEEK-[],0.0


### Idea 2: Duplicate the intensities to simulate more ion types.

In [2]:
def duplicate_intensities(arr):
    """Return ``arr`` followed by a second copy of itself.

    The two copies are joined along the first axis, so a 1-D vector of
    length ``n`` yields a new array of length ``2 * n``.
    """
    doubled = np.concatenate([arr, arr], axis=0)
    return doubled


In [3]:
# Path prefixes for the small baseline dataset and its duplicated-ion copy;
# the split name and the '.parquet' suffix are appended per split below.
inpath = '/cmnfs/proj/bmpc_dlomix/datasets/parquet/noptm_baseline_small'
outpath = '/cmnfs/proj/bmpc_dlomix/datasets/parquet/transfer_learning_toy_dataset/noptm_baseline_small_bs1024_duplicated_ions'
dataset_types = ['train', 'val', 'test']

# For every split: read it, append a copy of each intensity vector to itself, write it out.
for data_type in dataset_types:
    source_file = f'{inpath}_{data_type}.parquet'
    target_file = f'{outpath}_{data_type}.parquet'

    dataset = pd.read_parquet(source_file)
    duplicated_column = dataset['intensities_raw'].apply(duplicate_intensities)
    dataset['intensities_raw'] = duplicated_column
    dataset.to_parquet(target_file, index=False)

In [4]:
# Read the untouched training split first, then its duplicated-ion counterpart.
original, duplicated = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "/cmnfs/proj/bmpc_dlomix/datasets/parquet/noptm_baseline_small_train.parquet",
        "/cmnfs/proj/bmpc_dlomix/datasets/parquet/transfer_learning_toy_dataset/noptm_baseline_small_bs1024_duplicated_ions_train.parquet",
    )
)

In [7]:
# Length of the intensity vector in the row labelled 0 after duplication;
# it should be twice the length of the corresponding vector in `original`.
len(duplicated["intensities_raw"][0])

348

In [5]:
# Display the untouched training split as the reference for the comparison.
original

Unnamed: 0,raw_file,scan_number,method_nbr,precursor_charge_onehot,collision_energy_aligned_normed,intensities_raw,package,modified_sequence,sub
0,01640c_BA2-Thermo_SRM_Pool_9_01_01-2xIT_2xHCD-...,29024,2,"[0, 1, 0, 0, 0, 0]",0.223670,"[0.2857142857142857, 0.0, -1.0, 0.0, 0.0, -1.0...",Thermo_SRM_Pool_9,[]-TGQFDSQEYTEYAVK-[],0.0
1,03036a_BB12-TUM_HLA2_120_01_01-DDA-1h-R1,24650,2,"[0, 0, 1, 0, 0, 0]",0.312461,"[0.66, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.0, 0.0...",TUM_HLA2_122_01,[]-AHPKLVFSQEGRY-[],0.0
2,02445d_BH9-TUM_HLA_93_01_01-2xIT_2xHCD-1h-R4,10628,2,"[0, 1, 0, 0, 0, 0]",0.216905,"[0.0, 0.0, -1.0, 0.0, 0.0, -1.0, 0.25, 0.0, -1...",TUM_HLA_93_01,[]-NEEPKVATA-[],0.0
3,02445d_BF9-TUM_HLA_69_01_01-2xIT_2xHCD-1h-R4,49207,2,"[0, 1, 0, 0, 0, 0]",0.231191,"[0.04, 0.0, -1.0, 0.0, 0.0, -1.0, 0.04, 0.0, -...",TUM_HLA_69_01,[]-WPEAWRQQL-[],0.0
4,01812a_GG1-TUM_second_pool_91_01_01-3xHCD-1h-R1,51757,2,"[0, 0, 1, 0, 0, 0]",0.273261,"[0.02, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",TUM_first_pool_105,[]-SDLEANVEALTQEIDFLRR-[],0.0
...,...,...,...,...,...,...,...,...,...
7995,01640c_BD2-Thermo_SRM_Pool_12_01_01-3xHCD-1h-R2,55210,2,"[0, 1, 0, 0, 0, 0]",0.372297,"[0.51, 0.0, -1.0, 0.0, 0.0, -1.0, 0.5, 0.0, -1...",Thermo_SRM_Pool_12,[]-EELSGSLLQSVQEALEER-[],0.0
7996,02097a_BF7-TUM_isoform_67_01_01-3xHCD-1h-R3,32437,2,"[0, 0, 0, 1, 0, 0]",0.266420,"[0.22, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0,...",TUM_isoform_67_01,[]-PLLGSLRHMAPIR-[],0.0
7997,02445b_BG5-TUM_HLA_77_01_01-3xHCD-1h-R4,22717,2,"[0, 0, 1, 0, 0, 0]",0.283213,"[0.01, 0.0, 0.0, 0.03, 0.0, 0.0, 0.24, 0.0, 0....",TUM_HLA_77_01,[]-RPASPPVVKL-[],0.0
7998,01717a_BF2-TUM_second_pool_26_01_01-DDA-1h-R1,12543,2,"[0, 0, 1, 0, 0, 0]",0.325728,"[0.67, 0.0, 0.0, 0.0, 0.0, 0.0, 0.32, 0.0, 0.0...",TUM_second_pool_26,[]-EIPEVKDEEK-[],0.0


In [6]:
# Display the duplicated split; the truncated preview of `intensities_raw`
# only shows the leading values, which match those in `original`.
duplicated

Unnamed: 0,raw_file,scan_number,method_nbr,precursor_charge_onehot,collision_energy_aligned_normed,intensities_raw,package,modified_sequence,sub
0,01640c_BA2-Thermo_SRM_Pool_9_01_01-2xIT_2xHCD-...,29024,2,"[0, 1, 0, 0, 0, 0]",0.223670,"[0.2857142857142857, 0.0, -1.0, 0.0, 0.0, -1.0...",Thermo_SRM_Pool_9,[]-TGQFDSQEYTEYAVK-[],0.0
1,03036a_BB12-TUM_HLA2_120_01_01-DDA-1h-R1,24650,2,"[0, 0, 1, 0, 0, 0]",0.312461,"[0.66, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.0, 0.0...",TUM_HLA2_122_01,[]-AHPKLVFSQEGRY-[],0.0
2,02445d_BH9-TUM_HLA_93_01_01-2xIT_2xHCD-1h-R4,10628,2,"[0, 1, 0, 0, 0, 0]",0.216905,"[0.0, 0.0, -1.0, 0.0, 0.0, -1.0, 0.25, 0.0, -1...",TUM_HLA_93_01,[]-NEEPKVATA-[],0.0
3,02445d_BF9-TUM_HLA_69_01_01-2xIT_2xHCD-1h-R4,49207,2,"[0, 1, 0, 0, 0, 0]",0.231191,"[0.04, 0.0, -1.0, 0.0, 0.0, -1.0, 0.04, 0.0, -...",TUM_HLA_69_01,[]-WPEAWRQQL-[],0.0
4,01812a_GG1-TUM_second_pool_91_01_01-3xHCD-1h-R1,51757,2,"[0, 0, 1, 0, 0, 0]",0.273261,"[0.02, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",TUM_first_pool_105,[]-SDLEANVEALTQEIDFLRR-[],0.0
...,...,...,...,...,...,...,...,...,...
7995,01640c_BD2-Thermo_SRM_Pool_12_01_01-3xHCD-1h-R2,55210,2,"[0, 1, 0, 0, 0, 0]",0.372297,"[0.51, 0.0, -1.0, 0.0, 0.0, -1.0, 0.5, 0.0, -1...",Thermo_SRM_Pool_12,[]-EELSGSLLQSVQEALEER-[],0.0
7996,02097a_BF7-TUM_isoform_67_01_01-3xHCD-1h-R3,32437,2,"[0, 0, 0, 1, 0, 0]",0.266420,"[0.22, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0,...",TUM_isoform_67_01,[]-PLLGSLRHMAPIR-[],0.0
7997,02445b_BG5-TUM_HLA_77_01_01-3xHCD-1h-R4,22717,2,"[0, 0, 1, 0, 0, 0]",0.283213,"[0.01, 0.0, 0.0, 0.03, 0.0, 0.0, 0.24, 0.0, 0....",TUM_HLA_77_01,[]-RPASPPVVKL-[],0.0
7998,01717a_BF2-TUM_second_pool_26_01_01-DDA-1h-R1,12543,2,"[0, 0, 1, 0, 0, 0]",0.325728,"[0.67, 0.0, 0.0, 0.0, 0.0, 0.0, 0.32, 0.0, 0.0...",TUM_second_pool_26,[]-EIPEVKDEEK-[],0.0
