## Running MS2LDA

In [1]:
%matplotlib agg
import sys
sys.path.insert(0, '../../')
import MS2LDA

In [2]:
# Download the example datasets from Zenodo unless they are already on disk.
import os
import requests
import zipfile

datasets_dir = "../../datasets"
os.makedirs(datasets_dir, exist_ok=True)

if not os.path.exists(f"{datasets_dir}/Case_Study_Fungal_dataset.mgf"):
    print("Downloading datasets from Zenodo...")
    try:
        # Stream the archive to disk in chunks instead of holding it all in
        # memory; the timeout makes a stalled connection fail instead of hanging.
        with requests.get(
            "https://zenodo.org/records/15857387/files/datasets.zip?download=1",
            stream=True,
            timeout=60,
        ) as response:
            # Fail loudly on HTTP errors rather than saving an error page
            # as "datasets.zip" and crashing later inside zipfile.
            response.raise_for_status()
            with open("datasets.zip", "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        # Extract into the parent of datasets_dir, as the original cell does.
        with zipfile.ZipFile("datasets.zip", "r") as zip_ref:
            zip_ref.extractall("../..")
    finally:
        # Always remove the temporary archive, even if download/extraction failed.
        if os.path.exists("datasets.zip"):
            os.remove("datasets.zip")
    print("Datasets downloaded!")
else:
    print("Datasets already present.")

Datasets already present.


In [3]:
# Spectrum filtering settings handed to MS2LDA.run as `preprocessing_parameters`.
preprocessing_parameters = dict(
    min_mz=0,
    max_mz=2000,
    max_frags=1000,
    min_frags=5,
    min_intensity=0.01,
    max_intensity=1,
)

In [4]:
# Convergence settings handed to MS2LDA.run as `convergence_parameters`.
convergence_parameters = dict(
    step_size=50,
    window_size=10,
    threshold=0.005,
    type="perplexity_history",
)

In [5]:
# Annotation settings handed to MS2LDA.run as `annotation_parameters`.
# The three s2v_* entries are paths to Spec2Vec files, relative to this notebook.
annotation_parameters = dict(
    criterium="biggest",
    cosine_similarity=0.90,
    n_mols_retrieved=5,
    s2v_model_path="../../MS2LDA/Add_On/Spec2Vec/model_positive_mode/150225_Spec2Vec_pos_CleanedLibraries.model",
    s2v_library_embeddings="../../MS2LDA/Add_On/Spec2Vec/model_positive_mode/150225_CleanedLibraries_Spec2Vec_pos_embeddings.npy",
    s2v_library_db="../../MS2LDA/Add_On/Spec2Vec/model_positive_mode/150225_CombLibraries_spectra.db",
)

In [6]:
# Values passed to MS2LDA.run as `n_motifs` and `n_iterations`.
n_motifs, n_iterations = 200, 10000

In [7]:
# Model settings handed to MS2LDA.run as `model_parameters`.
model_parameters = dict(
    rm_top=0,
    min_cf=0,
    min_df=3,
    alpha=0.6,  # higher alpha -> document preferences are "smoother" over topics
    eta=0.01,  # higher eta -> topic preferences are "smoother" over words
    seed=42,
)

In [8]:
# Extra keyword arguments for training, handed to MS2LDA.run as `train_parameters`.
train_parameters = dict(parallel=3, workers=0)

The results of this run will be saved in an output folder called `mytest10`.

In [9]:
# Dataset description and output location, handed to MS2LDA.run as
# `dataset_parameters`.
dataset_parameters = {
    "acquisition_type": "DDA",
    "significant_digits": 3,
    "charge": 1,
    "name": "test",
    # Plain string: the original f-string had no placeholders to interpolate.
    "output_folder": "mytest10",
}

In [10]:
# Fingerprint settings handed to MS2LDA.run as `fingerprint_parameters`.
fingerprint_parameters = dict(fp_type="maccs", threshold=0.8)

In [11]:
# Passed to MS2LDA.run as `motif_parameter`.
# NOTE(review): its exact meaning is not visible in this notebook — confirm in the MS2LDA docs.
motif_parameter = 50

In [12]:
from matchms.importing import load_from_mgf

# Build the path from `datasets_dir` (set in the download cell) so the file
# that is checked/downloaded there and the file loaded here cannot drift apart.
# The loader's result is materialised into a list so it can be measured and reused.
dataset = list(load_from_mgf(f"{datasets_dir}/Case_Study_Fungal_dataset.mgf"))
len(dataset)

18562

In [13]:
# Run MS2LDA on the loaded spectra using the parameter dictionaries defined in
# the cells above. The call returns three values, unpacked here.
motif_spectra, optimized_motifs, motif_fps = MS2LDA.run(
    dataset,
    n_motifs=n_motifs,
    n_iterations=n_iterations,
    dataset_parameters=dataset_parameters,
    train_parameters=train_parameters,
    model_parameters=model_parameters,
    convergence_parameters=convergence_parameters,
    annotation_parameters=annotation_parameters,
    motif_parameter=motif_parameter,
    preprocessing_parameters=preprocessing_parameters,
    fingerprint_parameters=fingerprint_parameters,
)

Cleaning spectra ... 2714 spectra left


  model.train(convergence_parameters["step_size"], **train_parameters)
 12%|█▏        | 23/200 [00:48<06:15,  2.12s/it]

Model has converged



100%|██████████| 200/200 [00:01<00:00, 158.39it/s]


m2m folder stored
convergence curve stored
network stored


  plt.legend(loc="best")


Visualization data saved (gzipped) to: mytest10/ms2lda_viz.json.gz


### Analysis spectra -> Mass2Motif

In [14]:
%matplotlib inline

First, we need to obtain the documents from the trained model; in this case, each document corresponds to one spectrum.

In [None]:
import tomotopy as tp

# Load the trained model from the run's configured output folder instead of
# repeating the folder name, so renaming `output_folder` cannot silently load
# a stale model from an earlier run.
ms2lda = tp.LDAModel.load(f"{dataset_parameters['output_folder']}/ms2lda.bin")
len(ms2lda.docs)  # we have 2714 preprocessed spectra

2714

Each document in the model corresponds to one spectrum. For example, the document at index 7 is the spectrum with scan ID 216; its associated metadata is shown below.

In [25]:
# Look up the spectrum behind document index 7 and display its metadata.
MS2LDA.retrieve_spec4doc(7).metadata

{'scans': '216',
 'charge': 0,
 'collision_energy': '0.0',
 'retention_time': 66.016,
 'ms_level': '2',
 'precursor_mz': 185.0326,
 'retention_index': None,
 'id': 'spec_7'}

Then we print the topics (Mass2Motifs) associated with that spectrum.

In [16]:
# Topics (Mass2Motifs) of document 7, shown as (motif index, probability) pairs.
ms2lda.docs[7].get_topics()

[(2, 0.5857964754104614),
 (120, 0.35888227820396423),
 (126, 0.04184675216674805),
 (99, 0.011281222105026245),
 (179, 0.0001715535472612828),
 (177, 0.00010518742783460766),
 (84, 9.225017856806517e-05),
 (59, 8.504003199050203e-05),
 (65, 7.905204984126613e-05),
 (101, 3.4376051189610735e-05)]

This means that the spectrum with scan ID 216 and a precursor m/z of 185.0326 is associated with Mass2Motifs 2, 120, 126, etc., each with the probability shown.

### Now let's extract, for each spectrum, the Mass2Motifs that have a probability above 0.1

In [18]:
# One entry per document: the (motif, probability) pairs returned by
# get_topics() whose probability is strictly greater than 0.1.
top_motifs = [
    [topic for topic in doc.get_topics() if topic[1] > 0.1]
    for doc in ms2lda.docs
]


### Now let's build a table that has everything: spectra, retention times, Mass2Motifs and probabilities

In [20]:
# Empty column buffers for the summary table (three independent lists).
scans_list, retention_list, motifs_list = [], [], []

More metadata properties can be added as extra columns in the same way.

In [21]:
# Start from empty columns every time this cell runs. Without this reset,
# re-running the cell appends every document again and the resulting table
# contains duplicated rows.
scans_list = []
retention_list = []
motifs_list = []

for i in range(len(ms2lda.docs)):
    # Metadata of the spectrum behind document i.
    metadata = MS2LDA.retrieve_spec4doc(i).metadata
    scans_list.append(metadata['scans'])
    retention_list.append(metadata['retention_time'])
    # Motifs above the probability cut-off, computed for the same document index.
    motifs_list.append(top_motifs[i])

In [22]:
import pandas as pd

# Assemble the per-spectrum table: one row per document, one column per list.
table_columns = {
    'scans': scans_list,
    'retention_time': retention_list,
    'motifs': motifs_list,
}
df = pd.DataFrame(table_columns)

# Preview the first rows.
print(df.head())

  scans  retention_time                                             motifs
0    32          64.419  [(122, 0.7681128978729248), (117, 0.2094958275...
1   112          63.864  [(51, 0.48322176933288574), (24, 0.32932263612...
2   129          68.448                          [(2, 0.8528659343719482)]
3   132          64.418                        [(126, 0.9104835987091064)]
4   138          64.963  [(99, 0.6073269248008728), (1, 0.3637784719467...


In [23]:
# Save the spectrum-to-motif table as a tab-separated file, without the row index.
df.to_csv('test10_spec_motifs.tsv', index=False, sep='\t')


---