In [1]:
import pandas as pd
import os
from pathlib import Path

## Small ETD (single folder) dataset creation and inspection

In [2]:
etd_dir = '/cmnfs/data/proteomics/ProteomeTools/ETD/parquet'
data_frames = list()
for file_name in os.listdir(etd_dir):
    data_frames.append(pd.read_parquet(Path(etd_dir) / file_name))

In [3]:
for df in data_frames:
    print(len(df))

2588
2083
2365
2554
2257
2885
2569
2351
2287
2749
140
40
20


In [4]:
all_seqs = pd.concat(data_frames, axis=0)

In [5]:
all_seqs.reset_index(inplace=True)
all_seqs.drop('index', inplace=True, axis=1)

In [3]:
def count_non_zero(intensities):
    intensities = intensities[intensities != 0]
    intensities = intensities[intensities != -1]
    return len(intensities)

In [7]:
all_seqs.loc[:, 'intensities_raw'].apply(count_non_zero).describe()

count    24888.000000
mean         3.915622
std          2.416116
min          0.000000
25%          2.000000
50%          3.000000
75%          5.000000
max         16.000000
Name: intensities_raw, dtype: float64

In [58]:
total_seqs = len(all_seqs)
train_df = all_seqs.sample(frac=0.7)
all_seqs.drop(train_df.index, axis=0, inplace=True)
val_df = all_seqs.sample(frac=2/3)
test_df = all_seqs.drop(val_df.index, axis=0)
print(f'Train set has {len(train_df) / total_seqs:.2%} of all sequences')
print(f'Validation set has {len(val_df) / total_seqs:.2%} of all sequences')
print(f'Test set has {len(test_df) / total_seqs:.2%} of all sequences')

Train set has 70.00% of all sequences
Validation set has 20.00% of all sequences
Test set has 10.00% of all sequences


In [61]:
train_df.to_parquet(Path(etd_dir) / 'etd_data_train.parquet')
val_df.to_parquet(Path(etd_dir) / 'etd_data_val.parquet')
test_df.to_parquet(Path(etd_dir) / 'etd_data_test.parquet')

## Larger ETD dataset inspection

In [2]:
parquet_path = './out2/data/dlomix/refinement_dataset/processed_dataset.parquet'
ds = pd.read_parquet(parquet_path)
len(ds)

61493

In [4]:
ds.loc[:, 'intensities_raw'].apply(count_non_zero).describe()

count    61493.000000
mean         6.338705
std          3.593216
min          0.000000
25%          4.000000
50%          6.000000
75%          7.000000
max         24.000000
Name: intensities_raw, dtype: float64

In [1]:
import pandas as pd

In [2]:
msms = pd.read_table('/cmnfs/data/proteomics/ProteomeTools/ETD/multiple_msms/msms.txt')

  msms = pd.read_table('/cmnfs/data/proteomics/ProteomeTools/ETD/multiple_msms/msms.txt')


In [3]:
len(msms)

61495

In [None]:
msms

In [2]:
from dlomix.interface import process_dataset, load_keras_model
import os

In [None]:
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [4]:
model = load_keras_model('/cmnfs/home/f.kapitza/dlomix/finn_notebooks/refinement_ETD_second_pool_improve_out/data/dlomix/refined.keras')

2024-08-28 10:17:15.242409: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4093 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080, pci bus id: 0000:02:00.0, compute capability: 6.1
2024-08-28 10:17:15.243178: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 7401 MB memory:  -> device: 1, name: NVIDIA GeForce GTX 1080, pci bus id: 0000:03:00.0, compute capability: 6.1
2024-08-28 10:17:15.243691: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 7401 MB memory:  -> device: 2, name: NVIDIA GeForce GTX 1080, pci bus id: 0000:81:00.0, compute capability: 6.1
2024-08-28 10:17:15.244205: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 7401 MB memory:  -> device: 3, name: NVIDIA GeForce GTX 1080, pc

In [None]:
ds = process_dataset(
    '/cmnfs/home/f.kapitza/dlomix/finn_notebooks/refinement_ETD_second_pool_improve_out/data/dlomix/refinement_dataset/processed_dataset.parquet',
    model,
    ion_types=['z_r', 'c']
)

In [None]:
ds

In [None]:
ds['train']