## Running MS2LDA

In [None]:
# Select matplotlib's non-interactive "agg" backend for this kernel.
%matplotlib agg
import sys
# Put the directory two levels above this notebook first on the import path so
# the `MS2LDA` import below resolves from there (presumably the repository
# checkout — TODO confirm).
sys.path.insert(0, '../../')
import MS2LDA

In [None]:
# Download datasets if not present
import os
import requests
import zipfile

datasets_dir = "../../datasets"
os.makedirs(datasets_dir, exist_ok=True)

# The suspect-list MGF file acts as the marker for "datasets already fetched".
if not os.path.exists(f"{datasets_dir}/GNPS-SUSPECTLIST.mgf"):
    print("Downloading datasets from Zenodo...")
    archive_path = "datasets.zip"
    try:
        # Stream the archive to disk in chunks instead of holding the entire
        # download in memory, and fail loudly on an HTTP error rather than
        # saving an error page that would later surface as a BadZipFile.
        with requests.get(
            "https://zenodo.org/records/15857387/files/datasets.zip?download=1",
            stream=True,
            timeout=60,  # seconds; avoids hanging forever on a stalled connection
        ) as response:
            response.raise_for_status()
            with open(archive_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        # Extracted two levels up; assumes the archive carries its own
        # top-level `datasets/` folder — TODO confirm against the Zenodo record.
        with zipfile.ZipFile(archive_path, "r") as zip_ref:
            zip_ref.extractall("../..")
    finally:
        # Never leave a partial or corrupt archive behind, even on failure.
        if os.path.exists(archive_path):
            os.remove(archive_path)
    print("Datasets downloaded!")
else:
    print("Datasets already present.")

In [3]:
# Settings handed to MS2LDA.run through its `preprocessing_parameters` argument.
preprocessing_parameters = dict(
    min_mz=0,
    max_mz=1000,
    max_frags=1000,
    min_frags=3,
    min_intensity=0.01,
    max_intensity=1,
)

In [4]:
# Settings handed to MS2LDA.run through its `convergence_parameters` argument.
# The training log in this notebook shows `step_size` being used as the first
# argument of `model.train`.
convergence_parameters = dict(
    step_size=50,
    window_size=10,
    threshold=0.001,
    type="perplexity_history",
)

In [None]:
# Settings handed to MS2LDA.run through its `annotation_parameters` argument.
annotation_parameters = dict(
    # Cluster-selection criterion used after optimization.
    # NOTE(review): the earlier comment described a "most compounds" criterion
    # and listed "best" as the alternative, yet the value is "best" — confirm
    # which criterion is intended.
    criterium="best",
    # Similarity cut-off between spectra and motifs during optimization
    # (0.8 was noted as an alternative value).
    cosine_similarity=0.70,
    # Number of molecules retrieved from the database by Spec2Vec.
    n_mols_retrieved=10,
    # Spec2Vec model, library embeddings and spectra database (positive mode).
    s2v_model_path="../../MS2LDA/Add_On/Spec2Vec/model_positive_mode/150225_Spec2Vec_pos_CleanedLibraries.model",
    s2v_library_embeddings="../../MS2LDA/Add_On/Spec2Vec/model_positive_mode/150225_CleanedLibraries_Spec2Vec_pos_embeddings.npy",
    s2v_library_db="../../MS2LDA/Add_On/Spec2Vec/model_positive_mode/150225_CombLibraries_spectra.db",
)

In [9]:
# Passed to MS2LDA.run as `n_iterations` and `n_motifs`; `n_motifs` is also
# embedded in the output folder name.
n_iterations = 5000
n_motifs = 2000  # 1500 was noted as an alternative value

In [10]:
import random
# Seed Python's global RNG; the same value is also given to the model via the
# "seed" entry below.
random.seed(42)
# Settings handed to MS2LDA.run through its `model_parameters` argument.
model_parameters = dict(
    rm_top=0,
    min_cf=0,
    min_df=3,
    alpha=0.6,  # a higher alpha makes document preferences "smoother" over topics
    eta=0.1,  # a higher eta makes topic preferences "smoother" over words
    seed=42,
)

In [11]:
# Keyword arguments for training; the training log in this notebook shows them
# being expanded into `model.train(..., **train_parameters)`.
train_parameters = dict(parallel=3, workers=0)

In [12]:
# Settings handed to MS2LDA.run through its `dataset_parameters` argument.
# The output folder name embeds the current `n_motifs` value.
dataset_parameters = dict(
    acquisition_type="DDA",
    significant_digits=2,
    charge=1,
    name="DDA-Suspectlist",
    output_folder=f"CaseStudy_Suspectlist_{n_motifs}motifs_output",
)

In [13]:
# Settings handed to MS2LDA.run through its `fingerprint_parameters` argument.
fingerprint_parameters = dict(fp_type="maccs", threshold=0.8)

In [15]:
# Forwarded to MS2LDA.run as `motif_parameter`.
# NOTE(review): what this value controls is not visible in this notebook —
# confirm against the MS2LDA.run documentation.
motif_parameter = 20

In [16]:
from matchms.importing import load_from_mgf
# Read every spectrum from the suspect-list MGF file into a list, then display
# how many were loaded.
dataset = [spectrum for spectrum in load_from_mgf("../../datasets/GNPS-SUSPECTLIST.mgf")]
len(dataset)

87916

In [17]:
# Run the MS2LDA pipeline on the loaded spectra with the parameter sets defined
# above. The captured progress output shows this step took many hours here.
motif_spectra, optimized_motifs, motif_fps = MS2LDA.run(
    dataset,
    n_motifs=n_motifs,
    n_iterations=n_iterations,
    dataset_parameters=dataset_parameters,
    train_parameters=train_parameters,
    model_parameters=model_parameters,
    convergence_parameters=convergence_parameters,
    annotation_parameters=annotation_parameters,
    motif_parameter=motif_parameter,
    preprocessing_parameters=preprocessing_parameters,
    fingerprint_parameters=fingerprint_parameters,
)

Cleaning spectra ... 87070 spectra left


  model.train(convergence_parameters["step_size"], **train_parameters)
 46%|█████████████████████████████████▌                                       | 46/100 [13:19:18<15:38:18, 1042.57s/it]

Model has converged



100%|██████████████████████████████████████████████████████████████████████████████| 2500/2500 [01:11<00:00, 35.05it/s]


m2m folder stored
convergence curve stored
network stored


  plt.legend(loc="best")


---

In [21]:
# Display the `losses.mz` values of the optimized motif at index 637.
# NOTE(review): why index 637 was chosen is not stated in this notebook.
optimized_motifs[637].losses.mz

array([106.1])