# ICEBERG Training on MassSpecGym Dataset
This notebook preprocesses the MassSpecGym.csv dataset, prepares it for ICEBERG model training, runs the full ICEBERG pipeline, and saves the resulting model checkpoints.

## 1. Import Required Libraries
Import Python libraries for data processing, model training, and file operations.

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import yaml
import shutil
from pathlib import Path
from tqdm.cli import tqdm
import os
from pathlib import Path
import h5py
from matchms import Spectrum
from matchms.exporting import save_as_mgf


In [5]:
# Input and output paths
RAW_DATA_PATH = Path('MassSpecGym.csv')
PROCESSED_DATA_DIR = Path('data/spec_datasets/massspecgym')
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# 1. Read CSV
df = pd.read_csv(RAW_DATA_PATH, nrows=1000)
df

Unnamed: 0,identifier,mzs,intensities,smiles,inchikey,formula,precursor_formula,parent_mass,precursor_mz,adduct,instrument_type,collision_energy,fold,simulation_challenge
0,MassSpecGymID0000001,"91.0542,125.0233,154.0499,155.0577,185.0961,20...","0.24524524524524524,1.0,0.08008008008008008,0....",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,30.0,train,True
1,MassSpecGymID0000002,"91.0542,125.0233,155.0577,185.0961,229.0859,24...","0.0990990990990991,0.28128128128128127,0.04004...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,20.0,train,True
2,MassSpecGymID0000003,"69.0343,91.0542,125.0233,127.039,153.0699,154....","0.03403403403403404,0.31431431431431434,1.0,0....",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,40.0,train,True
3,MassSpecGymID0000004,"69.0343,91.0542,110.06,111.0441,112.0393,120.0...","0.17917917917917917,0.47347347347347346,0.0380...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,55.0,train,True
4,MassSpecGymID0000005,"91.0542,125.0233,185.0961,229.0859,246.1125,28...","0.07807807807807808,0.1841841841841842,0.03503...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,10.0,train,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,MassSpecGymID0001241,"95.0865,249.1485,267.1591","0.10710710710710711,0.12612612612612611,1.0",CC1=C[C@]23[C@](CC1)([C@]4(C[C@@H]([C@H]([C@]4...,GFLMBFRNOPTZDK,C15H22O4,C15H23O4,266.151224,267.1585,[M+H]+,Orbitrap,20.0,train,True
996,MassSpecGymID0001242,"67.0554,69.0709,79.0553,81.0709,83.0502,91.055...","0.11774286,0.0374477,0.19451771999999998,0.402...",CC1=C[C@]23[C@](CC1)([C@]4(C[C@@H]([C@H]([C@]4...,GFLMBFRNOPTZDK,C15H22O4,C15H23O4,266.151224,267.1585,[M+H]+,Orbitrap,55.0,train,True
997,MassSpecGymID0001243,"79.0552,81.0709,91.0542,93.0699,95.0865,97.064...","0.06624181,0.315034,0.03697615,0.04299285,1.0,...",CC1=C[C@]23[C@](CC1)([C@]4(C[C@@H]([C@H]([C@]4...,GFLMBFRNOPTZDK,C15H22O4,C15H23O4,266.151224,267.1585,[M+H]+,Orbitrap,40.0,train,True
998,MassSpecGymID0001244,"81.0709,95.0865,97.0648,107.0855,109.0648,123....","0.14877523,0.67714908,0.223023,0.047935,0.1197...",CC1=C[C@]23[C@](CC1)([C@]4(C[C@@H]([C@H]([C@]4...,GFLMBFRNOPTZDK,C15H22O4,C15H23O4,266.151224,267.1585,[M+H]+,Orbitrap,30.0,train,True


2.1. Data Sets. We train our models on the two data sets,
NIST2021 as generated by the National Institute of Standards
and NPLIB1 extracted from the GNPS database19 and
prepared previously by Dü hrkop et al.20 and Goldman et
al.22 For each spectrum in the data set, we first merge all scans
at various collision energies, combine peaks that are within
10−4 m/z tolerance from each other, renormalize the resulting
spectrum by dividing by the maximum observed intensity, and
take the square-root of each intensity. We subset the resulting
spectrum to keep the top 50 peaks with an intensity above
0.003. This normalization process is identical to our previous
study12 and emphasizes (a) removing peaks that are likely
noise and (b) combining various collision energies. We refer
the reader to our previous study12 for exact details on data set
extraction.

In [48]:
from matchms import Spectrum
from matchms.filtering import normalize_intensities, reduce_to_number_of_peaks, \
  select_by_intensity

sps = []
for i, sp in df.iterrows():
  mz = np.array([float(e) for e in sp.mzs.split(',')])
  intensities = np.array([float(e) for e in sp.intensities.split(',')])

  # Merge peaks when closer than 1e-4 to each other
  assert len(mz) == 1 or np.diff(np.sort(mz)).min() > 1e-4, f"Need merging! {mz}"

  # Take square root of intensity
  intensities = intensities ** .5

  sp = Spectrum(
    mz = mz,
    intensities=intensities,
    metadata=dict(
      precursor_mz = float(sp.precursor_mz),
      smiles = sp.smiles
    )
  )
  # renormalize the resulting spectrum by dividing by the maximum observed intens
  sp = normalize_intensities(sp)
  # Subset, select only largest 50 peaks, s.t. each peak has at least 0.003 intensity
  sp = select_by_intensity(sp, 0.003) # since they want sqrt intensity
  sp = reduce_to_number_of_peaks(sp, n_max=50)
  sps.append(sp)

To further normalize the data set, for each spectrum, we
subtract the mass of the adduct ion from each resulting MS2
peak. Concretely, the precursor molecule is ionized with an
adduct ion, for instance, H+. In this case, the mass of each peak
in the spectrum is shifted by the mass of H+ before proceeding
further.

In [None]:
# TODO

2.2. Canonical DAG Construction. We build a custom
reimplementation of the MAGMa algorithm9 to help create
explanatory directed acyclic graphs (DAGs) for each
normalized and adduct-shifted spectrum.
Given an input molecule, , MAGMa iteratively breaks
each molecule by removing atoms. Each time an atom is
removed, multiple fragments may form, from which we keep all
fragments of >2 heavy (non-hydrogen) atoms.

In [None]:
from rdkit import Chem
for sp in sps:
  break
sp.get('smiles')

'CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC'

In [None]:
# dataset=nist20 # nist20, nist23
# max_peaks=50
# ppm_diff=20
# workers=32

# python3 src/ms_pred/magma/run_magma.py  \
# --spectra-dir data/spec_datasets/$dataset/spec_files.hdf5  \
# --output-dir data/spec_datasets/$dataset/magma_outputs  \
# --spec-labels data/spec_datasets/$dataset/labels.tsv \
# --max-peaks $max_peaks \
# --ppm-diff $ppm_diff \
# --workers $workers


In [2]:
#  
# └── nist20
#     ├── labels.tsv
#     ├── mgf_files
#     ├── spec_files.hdf5
#     └── splits

Unnamed: 0,identifier,mzs,intensities,smiles,inchikey,formula,precursor_formula,parent_mass,precursor_mz,adduct,instrument_type,collision_energy,fold,simulation_challenge
0,MassSpecGymID0000001,"91.0542,125.0233,154.0499,155.0577,185.0961,20...","0.24524524524524524,1.0,0.08008008008008008,0....",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,30.0,train,True
1,MassSpecGymID0000002,"91.0542,125.0233,155.0577,185.0961,229.0859,24...","0.0990990990990991,0.28128128128128127,0.04004...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,20.0,train,True
2,MassSpecGymID0000003,"69.0343,91.0542,125.0233,127.039,153.0699,154....","0.03403403403403404,0.31431431431431434,1.0,0....",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,40.0,train,True
3,MassSpecGymID0000004,"69.0343,91.0542,110.06,111.0441,112.0393,120.0...","0.17917917917917917,0.47347347347347346,0.0380...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,55.0,train,True
4,MassSpecGymID0000005,"91.0542,125.0233,185.0961,229.0859,246.1125,28...","0.07807807807807808,0.1841841841841842,0.03503...",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,10.0,train,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,MassSpecGymID0001241,"95.0865,249.1485,267.1591","0.10710710710710711,0.12612612612612611,1.0",CC1=C[C@]23[C@](CC1)([C@]4(C[C@@H]([C@H]([C@]4...,GFLMBFRNOPTZDK,C15H22O4,C15H23O4,266.151224,267.1585,[M+H]+,Orbitrap,20.0,train,True
996,MassSpecGymID0001242,"67.0554,69.0709,79.0553,81.0709,83.0502,91.055...","0.11774286,0.0374477,0.19451771999999998,0.402...",CC1=C[C@]23[C@](CC1)([C@]4(C[C@@H]([C@H]([C@]4...,GFLMBFRNOPTZDK,C15H22O4,C15H23O4,266.151224,267.1585,[M+H]+,Orbitrap,55.0,train,True
997,MassSpecGymID0001243,"79.0552,81.0709,91.0542,93.0699,95.0865,97.064...","0.06624181,0.315034,0.03697615,0.04299285,1.0,...",CC1=C[C@]23[C@](CC1)([C@]4(C[C@@H]([C@H]([C@]4...,GFLMBFRNOPTZDK,C15H22O4,C15H23O4,266.151224,267.1585,[M+H]+,Orbitrap,40.0,train,True
998,MassSpecGymID0001244,"81.0709,95.0865,97.0648,107.0855,109.0648,123....","0.14877523,0.67714908,0.223023,0.047935,0.1197...",CC1=C[C@]23[C@](CC1)([C@]4(C[C@@H]([C@H]([C@]4...,GFLMBFRNOPTZDK,C15H22O4,C15H23O4,266.151224,267.1585,[M+H]+,Orbitrap,30.0,train,True


In [None]:
# Save MassSpecGym dataset as MSP file with all metadata
import pandas as pd
import numpy as np
from matchms import Spectrum

Saved 1000 spectra to data/spec_datasets/massspecgym/spec_files.msp


In [10]:
# 2. Write labels.tsv
labels_path = PROCESSED_DATA_DIR / 'labels.tsv'
df_labels = df[['identifier', 'smiles', 'formula', 'inchikey', 'adduct']].copy()
df_labels = df_labels.rename(columns={'identifier': 'spec', 'adduct': 'ionization'})
df_labels.to_csv(labels_path, sep='\t', index=False)
print(f"Wrote {labels_path}")
df_labels

Wrote data/spec_datasets/massspecgym/labels.tsv


Unnamed: 0,spec,smiles,formula,inchikey,ionization
0,MassSpecGymID0000001,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,C16H17NO4,VFMQMACUYWGDOJ,[M+H]+
1,MassSpecGymID0000002,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,C16H17NO4,VFMQMACUYWGDOJ,[M+H]+
2,MassSpecGymID0000003,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,C16H17NO4,VFMQMACUYWGDOJ,[M+H]+
3,MassSpecGymID0000004,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,C16H17NO4,VFMQMACUYWGDOJ,[M+H]+
4,MassSpecGymID0000005,CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,C16H17NO4,VFMQMACUYWGDOJ,[M+H]+
...,...,...,...,...,...
995,MassSpecGymID0001241,CC1=C[C@]23[C@](CC1)([C@]4(C[C@@H]([C@H]([C@]4...,C15H22O4,GFLMBFRNOPTZDK,[M+H]+
996,MassSpecGymID0001242,CC1=C[C@]23[C@](CC1)([C@]4(C[C@@H]([C@H]([C@]4...,C15H22O4,GFLMBFRNOPTZDK,[M+H]+
997,MassSpecGymID0001243,CC1=C[C@]23[C@](CC1)([C@]4(C[C@@H]([C@H]([C@]4...,C15H22O4,GFLMBFRNOPTZDK,[M+H]+
998,MassSpecGymID0001244,CC1=C[C@]23[C@](CC1)([C@]4(C[C@@H]([C@H]([C@]4...,C15H22O4,GFLMBFRNOPTZDK,[M+H]+


In [36]:
import pandas as pd
from matchms import Spectrum
from matchms.exporting import save_as_msp, save_as_mgf

df = pd.read_csv("MassSpecGym.csv").head(1000)
spectra = []
for idx, row in df.iterrows():
    # Replace with your actual m/z and intensity columns
    mzs = np.array([float(x) for x in str(row.get('mzs', '')).split(',') if x])
    intensities = np.array([float(x) for x in str(row.get('intensities', '')).split(',') if x])
    metadata = {
        "title": row['identifier'],
        "smiles": row.get('smiles', ''),
        "formula": row.get('formula', ''),
        "inchikey": row.get('inchikey', ''),
        "ionization": row.get('ionization', ''),
        "precursor_mz": row.get('precursor_mz', '')
    }
    spectrum = Spectrum(mz=mzs, intensities=intensities, metadata=metadata)
    spectra.append(spectrum)

!rm -rf output.mgf
save_as_mgf(spectra, "output.mgf")
!head -n 50 output.mgf

BEGIN IONS
TITLE=MassSpecGymID0000001
SMILES=CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC
FORMULA=C16H17NO4
INCHIKEY=VFMQMACUYWGDOJ
PRECURSOR_MZ=288.1225
91.0542 0.24524524524524524 
125.0233 1.0 
154.0499 0.08008008008008008 
155.0577 0.35535535535535534 
185.0961 0.34934934934934936 
200.107 0.04504504504504504 
229.0859 0.14214214214214213 
246.1125 0.7347347347347347 
END IONS

BEGIN IONS
TITLE=MassSpecGymID0000002
SMILES=CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC
FORMULA=C16H17NO4
INCHIKEY=VFMQMACUYWGDOJ
PRECURSOR_MZ=288.1225
91.0542 0.0990990990990991 
125.0233 0.28128128128128127 
155.0577 0.04004004004004004 
185.0961 0.06506506506506507 
229.0859 0.07407407407407407 
246.1125 1.0 
END IONS

BEGIN IONS
TITLE=MassSpecGymID0000003
SMILES=CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC
FORMULA=C16H17NO4
INCHIKEY=VFMQMACUYWGDOJ
PRECURSOR_MZ=288.1225
69.0343 0.03403403403403404 
91.0542 0.31431431431431434 
125.0233 1.0 
127.039 0.03803803803803804 
153.0699 0.04104104104104104 
1

In [17]:

# 3. Write MGF file
mgf_dir = PROCESSED_DATA_DIR / 'mgf_files'
mgf_dir.mkdir(exist_ok=True)
mgf_path = mgf_dir / 'massspecgym.mgf'
spectra = []
for idx, row in tqdm(df.iterrows()):
    spec_id = str(row['identifier'])
    mzs = np.array([float(x) for x in str(row['mzs']).split(',') if x])
    intensities = np.array([float(x) for x in str(row['intensities']).split(',') if x])
    metadata = {
        "title": spec_id,
        "identifier": spec_id,
        "smiles": row.get('smiles', ''),
        "formula": row.get('formula', ''),
        "inchikey": row.get('inchikey', ''),
        "adduct": row.get('adduct', ''),
        "charge": 1,
        "precursor_mz": row.get('precursor_mz', ''),
        "instrument": row.get('instrument_type', ''),
        "collision_energy": row.get('collision_energy', ''),
        "retention_time": row.get('retention_time', ''),
    }
    spectrum = Spectrum(mz=mzs, intensities=intensities, metadata=metadata)
    spectra.append(spectrum)
save_as_mgf(spectra, str(mgf_path))
print(f"Saved {len(spectra)} spectra to {mgf_path}")

spectra[:3]

1000it [00:00, 7608.36it/s]

Saved 1000 spectra to data/spec_datasets/massspecgym/mgf_files/massspecgym.mgf





[Spectrum(precursor m/z=288.12, 8 fragments between 91.1 and 246.1),
 Spectrum(precursor m/z=288.12, 6 fragments between 91.1 and 246.1),
 Spectrum(precursor m/z=288.12, 12 fragments between 69.0 and 246.1)]

In [18]:

# 4. Write spec_files.hdf5
spec_hdf5_path = PROCESSED_DATA_DIR / 'spec_files.hdf5'
dt = h5py.special_dtype(vlen=bytes)
with h5py.File(spec_hdf5_path, 'w') as h5f:
    for idx, row in df.iterrows():
        spec_id = str(row['identifier'])
        mzs = [float(x) for x in str(row['mzs']).split(',') if x]
        intensities = [float(x) for x in str(row['intensities']).split(',') if x]
        parentmass = row.get('precursor_mz', None)
        if parentmass is None or pd.isna(parentmass):
            parentmass = row.get('parent_mass', None)
        if parentmass is None or pd.isna(parentmass):
            parentmass = max(mzs) if mzs else 0.0
        adduct = row.get('adduct', '')
        ms_text = f"#COMPOUND {spec_id}\n#PARENTMASS {parentmass}\n#IONIZATION {adduct}\n"
        for mz, inten in zip(mzs, intensities):
            ms_text += f"{mz}\t{inten}\n"
        h5f.create_dataset(f"{spec_id}.ms", (1,), dtype=dt)[0] = ms_text.encode('utf-8')
print(f"Generated {spec_hdf5_path} with {len(df)} spectra")


Generated data/spec_datasets/massspecgym/spec_files.hdf5 with 1000 spectra


In [20]:
# 5. Write splits
splits_dir = PROCESSED_DATA_DIR / 'splits'
splits_dir.mkdir(exist_ok=True)
train, val, test = np.split(df.sample(frac=1, random_state=42), [int(.8*len(df)), int(.9*len(df))])
train['identifier'].to_csv(splits_dir / 'train.txt', index=False, header=False)
val['identifier'].to_csv(splits_dir / 'val.txt', index=False, header=False)
test['identifier'].to_csv(splits_dir / 'test.txt', index=False, header=False)
print(f"Wrote splits to {splits_dir}")

Wrote splits to data/spec_datasets/massspecgym/splits


## 2. Set Up Paths and Configuration
Define paths for raw and processed data, configuration files, and output directories. Load YAML config for ICEBERG.

In [2]:
# Define paths
RAW_DATA_PATH = Path('MassSpecGym.csv')
PROCESSED_DATA_DIR = Path('data/spec_datasets/massspecgym')
CONFIG_PATH = Path('configs/iceberg/iceberg_elucidation.yaml')
GEN_CKPT_PATH = Path('checkpoints/iceberg_gen_massspecgym.pt')
INTEN_CKPT_PATH = Path('checkpoints/iceberg_inten_massspecgym.pt')

# Create output directories if not exist
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
os.makedirs('checkpoints', exist_ok=True)

# Load ICEBERG config
def load_yaml_config(config_path):
    with open(config_path, 'r') as f:
        return yaml.safe_load(f)

iceberg_config = load_yaml_config(CONFIG_PATH)


## 3. Preprocess MassSpecGym Dataset
Convert MassSpecGym.csv to the required format (labels.tsv, mgf_files, spec_files.hdf5, splits).

In [3]:
!head MassSpecGym.csv

identifier,mzs,intensities,smiles,inchikey,formula,precursor_formula,parent_mass,precursor_mz,adduct,instrument_type,collision_energy,fold,simulation_challenge
MassSpecGymID0000001,"91.0542,125.0233,154.0499,155.0577,185.0961,200.107,229.0859,246.1125","0.24524524524524524,1.0,0.08008008008008008,0.35535535535535534,0.34934934934934936,0.04504504504504504,0.14214214214214213,0.7347347347347347",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,30.0,train,True
MassSpecGymID0000002,"91.0542,125.0233,155.0577,185.0961,229.0859,246.1125","0.0990990990990991,0.28128128128128127,0.04004004004004004,0.06506506506506507,0.07407407407407407,1.0",CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC,VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,20.0,train,True
MassSpecGymID0000003,"69.0343,91.0542,125.0233,127.039,153.0699,154.0499,155.0577,170.0726,185.0961,200.107,229.0859,246.1125","0.03403403403403404,0.31431

## 4. Assign Subformulae with SCARF
Run assign_subformulae.py to generate subformulae files for ICEBERG preprocessing.

In [22]:
# Assign subformulae using SCARF script
import subprocess
import os
subformula_script = 'data_scripts/forms/01_assign_subformulae.py'
subformulae_dir = PROCESSED_DATA_DIR / 'subformulae'
os.makedirs(subformulae_dir, exist_ok=True)
labels_file = PROCESSED_DATA_DIR / 'labels.tsv'
workers = 32

# Assign all subformulae
cmd_all = [
    'python', subformula_script,
    '--debug',
    '--data-dir', str(PROCESSED_DATA_DIR),
    '--labels-file', str(labels_file),
    '--use-all',
    '--output-dir', str(subformulae_dir / 'no_subform.hdf5'),
    '--num-workers', str(workers)
]
subprocess.run(cmd_all)
# from data_scripts.forms



Processing ms raw data
{'MassSpecGymID0000001': None, 'MassSpecGymID0000002': None, 'MassSpecGymID0000003': None, 'MassSpecGymID0000004': None, 'MassSpecGymID0000005': None, 'MassSpecGymID0000006': None, 'MassSpecGymID0000007': None, 'MassSpecGymID0000008': None, 'MassSpecGymID0000009': None, 'MassSpecGymID0000010': None, 'MassSpecGymID0000011': None, 'MassSpecGymID0000012': None, 'MassSpecGymID0000013': None, 'MassSpecGymID0000014': None, 'MassSpecGymID0000015': None, 'MassSpecGymID0000016': None, 'MassSpecGymID0000017': None, 'MassSpecGymID0000018': None, 'MassSpecGymID0000019': None, 'MassSpecGymID0000020': None, 'MassSpecGymID0000021': None, 'MassSpecGymID0000022': None, 'MassSpecGymID0000023': None, 'MassSpecGymID0000026': None, 'MassSpecGymID0000027': None, 'MassSpecGymID0000032': None, 'MassSpecGymID0000033': None, 'MassSpecGymID0000035': None, 'MassSpecGymID0000036': None, 'MassSpecGymID0000037': None, 'MassSpecGymID0000038': None, 'MassSpecGymID0000039': None, 'MassSpecGymID00

100%|██████████| 1000/1000 [00:00<00:00, 7358.32it/s]
Traceback (most recent call last):
  File "data_scripts/forms/01_assign_subformulae.py", line 366, in <module>
    main()
  File "data_scripts/forms/01_assign_subformulae.py", line 330, in main
    for colli_eng, spec in input_specs_dict[spec_name].items():
AttributeError: 'NoneType' object has no attribute 'items'


CompletedProcess(args=['python', 'data_scripts/forms/01_assign_subformulae.py', '--debug', '--data-dir', 'data/spec_datasets/massspecgym', '--labels-file', 'data/spec_datasets/massspecgym/labels.tsv', '--use-all', '--output-dir', 'data/spec_datasets/massspecgym/subformulae/no_subform.hdf5', '--num-workers', '32'], returncode=1)

In [23]:
cmd_all

['python',
 'data_scripts/forms/01_assign_subformulae.py',
 '--debug',
 '--data-dir',
 'data/spec_datasets/massspecgym',
 '--labels-file',
 'data/spec_datasets/massspecgym/labels.tsv',
 '--use-all',
 '--output-dir',
 'data/spec_datasets/massspecgym/subformulae/no_subform.hdf5',
 '--num-workers',
 '32']

In [22]:

# Assign magma subformulae
cmd_magma = [
    'python', subformula_script,
    '--debug',
    '--data-dir', str(PROCESSED_DATA_DIR),
    '--labels-file', str(labels_file),
    '--use-magma',
    '--mass-diff-thresh', '20',
    '--output-dir', str(subformulae_dir / 'magma_subform_50.hdf5'),
    '--num-workers', str(workers)
]
subprocess.run(cmd_magma)

Processing ms raw data


  0%|          | 0/5000 [00:00<?, ?it/s]Traceback (most recent call last):
  File "data_scripts/forms/01_assign_subformulae.py", line 366, in <module>
    main()
  File "data_scripts/forms/01_assign_subformulae.py", line 320, in main
    input_specs = [proc_spec_full(i) for i in tqdm(spec_fn_lst)]
  File "data_scripts/forms/01_assign_subformulae.py", line 320, in <listcomp>
    input_specs = [proc_spec_full(i) for i in tqdm(spec_fn_lst)]
  File "data_scripts/forms/01_assign_subformulae.py", line 276, in process_spec_file
    spec_lines = ms_h5.read_str(f"{spec_name}.ms").split('\n')
  File "/home/tornikeo/Documents/personal/projects/ms-pred/src/ms_pred/common/misc_utils.py", line 59, in read_str
    str_obj = grp[name][0]
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
  File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
  File "/home/tornikeo/micromamba/envs/ms-gen/lib/python3.8/site-packages/h5py/_hl/dataset.py", line 817, in __getitem__
   

CompletedProcess(args=['python', 'data_scripts/forms/01_assign_subformulae.py', '--debug', '--data-dir', 'data/spec_datasets/massspecgym', '--labels-file', 'data/spec_datasets/massspecgym/labels.tsv', '--use-magma', '--mass-diff-thresh', '20', '--output-dir', 'data/spec_datasets/massspecgym/subformulae/magma_subform_50.hdf5', '--num-workers', '32'], returncode=1)

## 5. Annotate Substructures with MAGMa for ICEBERG
Run MAGMa annotation script to label substructures and create breakage dataset for ICEBERG training.

In [None]:
# Annotate substructures using MAGMa script
magma_script = 'data_scripts/dag/run_magma.sh'
cmd = ['bash', magma_script]
subprocess.run(cmd)


## 6. Prepare PubChem-SMILES Mapping (Optional)
Download and place pubchem_formulae_inchikey.hdf5 if contrastive finetuning is required.

In [None]:
# Download PubChem-SMILES mapping if needed
import urllib.request
pubchem_hdf5_url = 'https://zenodo.org/records/15529765/files/pubchem_formulae_inchikey.hdf5'
pubchem_hdf5_path = Path('data/pubchem/pubchem_formulae_inchikey.hdf5')
os.makedirs(pubchem_hdf5_path.parent, exist_ok=True)
if not pubchem_hdf5_path.exists():
    urllib.request.urlretrieve(pubchem_hdf5_url, pubchem_hdf5_path)


## 7. Train ICEBERG Fragment Generator
Train the ICEBERG fragment generator using train_gen.py, saving the checkpoint after training.

In [None]:
# Train ICEBERG fragment generator
train_gen_script = 'src/ms_pred/dag_pred/train_gen.py'
gen_ckpt_out = GEN_CKPT_PATH
cmd = [
    'python', train_gen_script,
    '--config', str(CONFIG_PATH),
    '--data', str(PROCESSED_DATA_DIR),
    '--output', str(gen_ckpt_out)
]
subprocess.run(cmd)


## 8. Sweep Fragment Generation Threshold
Run sweep_gen_thresh.py to optimize the number of fragments generated by the model.

In [None]:
# Sweep fragment generation threshold
sweep_script = 'run_scripts/iceberg/02_sweep_gen_thresh.py'
cmd = [
    'python', sweep_script,
    '--config', str(CONFIG_PATH),
    '--data', str(PROCESSED_DATA_DIR),
    '--gen_ckpt', str(GEN_CKPT_PATH)
]
subprocess.run(cmd)


## 9. Predict Training Set for Intensity Model
Use the trained fragment generator to predict the training set for the intensity model.

In [None]:
# Predict training set for intensity model
predict_script = 'run_scripts/iceberg/03_run_dag_gen_predict.sh'
cmd = ['bash', predict_script]
subprocess.run(cmd)


## 10. Train ICEBERG Intensity Model
Train the ICEBERG intensity predictor using train_inten.py, optionally with contrastive training, and save the checkpoint.

In [None]:
# Train ICEBERG intensity model
train_inten_script = 'src/ms_pred/dag_pred/train_inten.py'
inten_ckpt_out = INTEN_CKPT_PATH
cmd = [
    'python', train_inten_script,
    '--config', str(CONFIG_PATH),
    '--data', str(PROCESSED_DATA_DIR),
    '--output', str(inten_ckpt_out)
]
subprocess.run(cmd)


## 11. Save Model Checkpoints
Save the trained model checkpoints (gen_ckpt and inten_ckpt) to the specified output directory for later use.

In [None]:
# Confirm checkpoints are saved
assert GEN_CKPT_PATH.exists(), f"Fragment generator checkpoint not found: {GEN_CKPT_PATH}"
assert INTEN_CKPT_PATH.exists(), f"Intensity model checkpoint not found: {INTEN_CKPT_PATH}"
print(f"ICEBERG fragment generator checkpoint saved at: {GEN_CKPT_PATH}")
print(f"ICEBERG intensity model checkpoint saved at: {INTEN_CKPT_PATH}")