In [2]:
import re
from alphabase.smiles.peptide import PeptideSmilesEncoder

# Create a single PeptideSmilesEncoder instance; it is passed to to_smile()
# for every peptide converted later in the notebook.
encoder = PeptideSmilesEncoder()

# Lookup table: UNIMOD accession number (kept as a string, exactly as it
# appears inside a "[UNIMOD:n]" tag) -> modification name.
# Only the modifications this notebook works with are listed; add further
# entries here when more are required.
UNIMOD_ID_TO_NAME = {"35": "Oxidation", "4": "Carbamidomethyl"}

# UNIMOD accession numbers a peptide is allowed to carry; any other accession
# causes the peptide to be rejected by contains_only_allowed_unimods().
allowed_unimods = {"35", "4"}

def contains_only_allowed_unimods(seq, allowed=None):
    """Return True when every ``[UNIMOD:n]`` tag in ``seq`` is permitted.

    Parameters
    ----------
    seq : str
        Peptide string that may contain ``[UNIMOD:<digits>]`` tags.
    allowed : container of str, optional
        UNIMOD accession numbers (as strings) that are acceptable. When
        omitted, the notebook-level ``allowed_unimods`` set is used, so
        existing one-argument calls (e.g. via ``Series.apply``) behave as
        before.

    Returns
    -------
    bool
        ``True`` if ``seq`` carries no tag at all or only permitted ones,
        ``False`` as soon as one tag's accession is not in ``allowed``.
    """
    if allowed is None:
        # Resolved at call time so later edits to the global are picked up.
        allowed = allowed_unimods

    unimod_ids = re.findall(r'\[UNIMOD:(\d+)\]', seq)
    return all(uid in allowed for uid in unimod_ids)

def convert_unimod_string(unimod_str, id_to_name=None):
    """Split a UNIMOD-annotated peptide into sequence, mod names and mod sites.

    The input is scanned for two kinds of tokens: a complete
    ``[UNIMOD:<digits>]`` tag, or a single upper-case letter (one residue).
    Any other character is ignored.

    A tag is attached to the most recent residue seen before it, using that
    residue's 1-based position in the unmodified sequence. A tag that appears
    before any residue is reported at site ``"0"`` with an ``Any_N-term``
    anchor.
    NOTE(review): site 0 / ``Any_N-term`` is intended to follow alphabase's
    N-terminal convention - confirm against the installed alphabase version
    (``Protein_N-term`` may be the right anchor for some data sets).

    Parameters
    ----------
    unimod_str : str
        Peptide string such as ``"AM[UNIMOD:35]C[UNIMOD:4]K"``.
    id_to_name : mapping of str to str, optional
        UNIMOD accession number -> modification name. Defaults to the
        notebook-level ``UNIMOD_ID_TO_NAME``. Accessions missing from the
        mapping fall back to the literal name ``"UNIMOD:<id>"``.

    Returns
    -------
    tuple of (str, str, str)
        ``(sequence, mods, mod_sites)`` where ``sequence`` is the bare residue
        string, ``mods`` is a ``;``-joined list of ``"<name>@<anchor>"`` and
        ``mod_sites`` is the matching ``;``-joined list of positions.
    """
    if id_to_name is None:
        # Fall back to the notebook-level lookup table.
        id_to_name = UNIMOD_ID_TO_NAME

    # The tag is its own alternative and is tried first, so the letters of
    # "UNIMOD" can never be mistaken for residues when a tag does not directly
    # follow an amino acid (leading tag, or several tags in a row).
    token_pattern = re.compile(r'\[UNIMOD:(\d+)\]|([A-Z])')

    residues = []
    mods = []
    mod_sites = []

    for match in token_pattern.finditer(unimod_str):
        unimod_id, aa = match.groups()

        if aa is not None:
            residues.append(aa)
            continue

        mod_name = id_to_name.get(unimod_id, f"UNIMOD:{unimod_id}")
        if residues:
            # len(residues) is the 1-based position of the residue just read.
            mods.append(f"{mod_name}@{residues[-1]}")
            mod_sites.append(str(len(residues)))
        else:
            # No residue yet: the tag sits on the peptide N-terminus.
            mods.append(f"{mod_name}@Any_N-term")
            mod_sites.append("0")

    return ''.join(residues), ';'.join(mods), ';'.join(mod_sites)

def to_smile(sequence, encoder) -> str:
    """Convert a UNIMOD-annotated peptide string to SMILES via ``encoder``.

    ``sequence`` is first split by ``convert_unimod_string`` into the bare
    residue string, the modification names and the modification sites; those
    three values are then handed to ``encoder.peptide_to_smiles`` and its
    result is returned unchanged.
    """
    bare_sequence, mod_names, mod_positions = convert_unimod_string(sequence)
    return encoder.peptide_to_smiles(
        sequence=bare_sequence,
        mods=mod_names,
        mod_site=mod_positions,
    )

ModuleNotFoundError: No module named 'alphabase'

In [2]:
from datasets import load_dataset
# Load the "theGreatHerrLebert/ionmob" dataset and keep its "train" split,
# converted via to_pandas(), for the sampling and filtering cells below.
dataset = load_dataset("theGreatHerrLebert/ionmob")
TRAIN = dataset["train"].to_pandas()

In [9]:
# Work on a random subset of the training split. The seed is fixed so that
# the subset - and every result derived from it - is the same on each
# Restart & Run All; the size is clamped because DataFrame.sample raises
# when asked for more rows than exist (sampling without replacement).
SAMPLE_SIZE = 25_000
SAMPLE_SEED = 42
TRAIN_SAMPLE = TRAIN.sample(n=min(SAMPLE_SIZE, len(TRAIN)), random_state=SAMPLE_SEED)

In [10]:
# Keep only the rows whose modified sequence carries nothing but allowed
# UNIMOD tags, then renumber the surviving rows from 0.
only_allowed_mask = TRAIN_SAMPLE["sequence_modified"].apply(contains_only_allowed_unimods)
TRAIN_SAMPLE_F = TRAIN_SAMPLE[only_allowed_mask].reset_index(drop=True)

In [11]:
# Encode every modified sequence as a SMILES string and store the result in
# a new "smile" column of the filtered sample.
smiles_per_row = TRAIN_SAMPLE_F["sequence_modified"].apply(lambda seq: to_smile(seq, encoder))
TRAIN_SAMPLE_F["smile"] = smiles_per_row

In [12]:
# Display the filtered sample, which now includes the "smile" column.
TRAIN_SAMPLE_F

Unnamed: 0,mz,charge,ccs,rt,sequence_modified,inv_ion_mob,ccs_std,inv_ion_mob_std,dataset_origin,smile
0,846.444285,2,431.362152,59.638,GTPAIGFSPIINTTM[UNIMOD:35]R,1.068255,-1.000000,-1.000000,meier_train,[H]N([H])CC(=O)N([H])[C@H](C(=O)N1CCC[C@H]1C(=...
1,1408.174065,2,548.950928,38.928,QVSQGQGSEDVPGEESWLEGLSQIQK,1.363895,-1.000000,-1.000000,meier_test,[H]N(CC(=O)N([H])[C@@H](CO)C(=O)N([H])[C@@H](C...
2,698.840175,2,388.064148,10.251,LTQDKSFNDNSK,0.959381,-1.000000,-1.000000,meier_train,[H]N([H])[C@@H](CC(C)C)C(=O)N([H])[C@H](C(=O)N...
3,742.386960,2,414.712189,52.070,LLANEELDPQDVK,1.025852,-1.000000,-1.000000,meier_train,[H]N([H])[C@@H](CC(C)C)C(=O)N([H])[C@@H](CC(C)...
4,471.210400,2,326.845615,2.870,QPASPSAHM[UNIMOD:35],0.804227,-1.000000,-1.000000,Zepeda et al.,[H]N(C(=O)[C@H](Cc1cnc[nH]1)N([H])C(=O)[C@H](C...
...,...,...,...,...,...,...,...,...,...,...
24774,730.853815,2,387.386414,14.124,ERAEQLSQENEK,0.958117,2.629735,0.006504,meier_train,[H]N([H])[C@@H](CCC(=O)O)C(=O)N([H])[C@@H](CCC...
24775,1121.981310,2,480.983826,23.706,GPLM[UNIMOD:35]C[UNIMOD:4]NSPSNSNANC[UNIMOD:4]...,1.193529,-1.000000,-1.000000,meier_test,[H]N([H])CC(=O)N1CCC[C@H]1C(=O)N([H])[C@@H](CC...
24776,851.121225,3,659.110291,37.977,ASALAMVSGDGFLVSRPEAIHLGPR,1.091174,-1.000000,-1.000000,meier_test,[H]N(CC(=O)N([H])[C@@H](Cc1ccccc1)C(=O)N([H])[...
24777,753.408560,2,402.195068,28.422,ELRYLDLSNNRL,0.995024,-1.000000,-1.000000,meier_test,[H]N([H])[C@@H](CCC(=O)O)C(=O)N([H])[C@@H](CC(...
