## Running MS2LDA

In [16]:
#%pip install requests 
import requests
from matchms.importing import load_from_mgf
from io import StringIO
# Download the example pesticide spectra (MGF) from the matchms test data
# and store them next to the other datasets.
import os  # only needed here, to create the target folder

url = "https://raw.githubusercontent.com/matchms/matchms/master/tests/testdata/pesticides.mgf"
filename = "pesticides.mgf"
target_path = f"../datasets/{filename}"

# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
if response.status_code == 200:
    # Create ../datasets on first use so the write below cannot fail on a missing folder.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    with open(target_path, "wb") as f:
        f.write(response.content)
    print(f"File downloaded and saved as {filename}")
else:
    # Best-effort download: report the HTTP status and let the notebook continue.
    print(f"Failed to download file. Status code: {response.status_code}")

File downloaded and saved as pesticides.mgf


In [4]:
%matplotlib agg
import MS2LDA

In [5]:
# Spectrum filtering settings, passed to MS2LDA.run as `preprocessing_parameters`.
preprocessing_parameters = dict(
    min_mz=0,
    max_mz=1000,
    max_frags=1000,
    min_frags=3,
    min_intensity=0.01,
    max_intensity=1,
)

In [6]:
# Convergence-check settings, passed to MS2LDA.run as `convergence_parameters`.
convergence_parameters = dict(
    step_size=50,
    window_size=10,
    threshold=0.001,
    type="perplexity_history",
)

In [7]:
# Motif annotation settings, passed to MS2LDA.run as `annotation_parameters`.
# NOTE(review): earlier inline notes mentioned other values (0.8, 10 molecules,
# a "most compounds" criterion) than the ones set here; confirm which are intended.
annotation_parameters = dict(
    criterium="best",  # cluster selection criterion applied after optimization
    cosine_similarity=0.70,  # how similar spectra must be to motifs during optimization
    n_mols_retrieved=5,  # molecules retrieved from the database by Spec2Vec
)

In [8]:
# Number of motifs to learn and number of training iterations, both passed to MS2LDA.run.
n_motifs, n_iterations = 12, 2000

In [9]:
# Topic-model settings, passed to MS2LDA.run as `model_parameters`.
model_parameters = dict(
    rm_top=0,
    min_cf=0,
    min_df=3,
    alpha=0.6,  # higher alpha -> document preferences are "smoother" over topics
    eta=0.01,  # higher eta -> topic preferences are "smoother" over words
    seed=42,
)

In [10]:
# Training settings, passed to MS2LDA.run as `train_parameters`.
train_parameters = dict(parallel=3, workers=1)

In [11]:
# Dataset description and output location, passed to MS2LDA.run as `dataset_parameters`.
dataset_parameters = {
    "acquisition_type": "DDA",
    "significant_digits": 3,
    "charge": 1,
    # This label shows up as `ref_motifset` in the screening table further down.
    "name": "Microcystein_12m2m",
    # Plain string: the previous f-string had no placeholders to interpolate.
    "output_folder": "test",
}

In [12]:
# Fingerprint settings, passed to MS2LDA.run as `fingerprint_parameters`.
fingerprint_parameters = dict(fp_type="rdkit", threshold=0.8)

In [13]:
# Passed to MS2LDA.run as `motif_parameter`.
# NOTE(review): presumably the number of top features kept per motif — confirm against the MS2LDA docs.
motif_parameter = 20

In [17]:
# Input spectra: the MGF file saved by the download cell at the top of this notebook.
dataset = "../datasets/pesticides.mgf"

In [18]:
# Run MS2LDA on the dataset with the settings defined in the cells above.
# The call returns three objects, unpacked here for later use.
motif_spectra, optimized_motifs, motif_fps = MS2LDA.run(
    dataset,
    n_motifs=n_motifs,
    n_iterations=n_iterations,
    dataset_parameters=dataset_parameters,
    train_parameters=train_parameters,
    model_parameters=model_parameters,
    convergence_parameters=convergence_parameters,
    annotation_parameters=annotation_parameters,
    motif_parameter=motif_parameter,
    preprocessing_parameters=preprocessing_parameters,
    fingerprint_parameters=fingerprint_parameters,
)

Cleaning spectra ... 75 spectra left


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 65.52it/s]


model did not converge
Loading Spec2Vec model ...
Searches for candidates ...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 40.07it/s]


Visualization data saved to: mc_test\ms2lda_viz.json


---

## Screening for spectra from experiments, motifs or motifDB

In [19]:
# Query string handed to MS2LDA.screen_spectra as `motifDB_query`.
# NOTE(review): looks like MassQL syntax selecting MS2 scans with a product ion at m/z 185.95 — confirm.
motifDB_query = "QUERY scaninfo(MS2DATA) WHERE MS2PROD=185.95"

In [21]:
# Screen the dataset against the motif database of the run.
# The folder is read from dataset_parameters: no standalone `output_folder`
# variable is defined in this notebook, so the old f-string raised NameError
# on a fresh kernel.
# NOTE(review): assumes motifDB.xlsx sits in that output folder — confirm.
screening_output_folder = dataset_parameters["output_folder"]
screening_hits = MS2LDA.screen_spectra(
    motifDB=f"{screening_output_folder}/motifDB.xlsx",
    dataset=dataset,  # same MGF path that was used for the MS2LDA.run call
    motifDB_query=motifDB_query,
    output_folder=screening_output_folder,
)

Loading Spec2Vec model ...
TOTAL QUERIES 1


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 44.45it/s]
10it [00:00, 2459.14it/s]


In [22]:
# Bare last expression so the notebook renders the hits table.
screening_hits

Unnamed: 0,hit_id,screen_type,score,ref_motifset,ref_motif_id,ref_short_annotation,ref_annotation,ref_charge
0,spec_40,spectrum-motif,0.9,Microcystein_12m2m,motif_3,"['c1cc2c(s1)CCOC21CCN(Cc2ccc3c(c2)OCCO3)CC1', ...",,1
1,spec_43,spectrum-motif,0.89,Microcystein_12m2m,motif_3,"['c1cc2c(s1)CCOC21CCN(Cc2ccc3c(c2)OCCO3)CC1', ...",,1
2,spec_47,spectrum-motif,0.98,Microcystein_12m2m,motif_3,"['c1cc2c(s1)CCOC21CCN(Cc2ccc3c(c2)OCCO3)CC1', ...",,1
3,spec_52,spectrum-motif,0.98,Microcystein_12m2m,motif_3,"['c1cc2c(s1)CCOC21CCN(Cc2ccc3c(c2)OCCO3)CC1', ...",,1
4,spec_54,spectrum-motif,0.93,Microcystein_12m2m,motif_3,"['c1cc2c(s1)CCOC21CCN(Cc2ccc3c(c2)OCCO3)CC1', ...",,1
5,spec_42,spectrum-motif,0.7,Microcystein_12m2m,motif_1,['COc1cccc(NC(=O)C(Cc2ccccc2)NC(=O)c2cc3ccccc3...,,1
6,spec_66,spectrum-motif,0.51,Microcystein_12m2m,motif_1,['COc1cccc(NC(=O)C(Cc2ccccc2)NC(=O)c2cc3ccccc3...,,1
7,spec_73,spectrum-motif,0.54,Microcystein_12m2m,motif_1,['COc1cccc(NC(=O)C(Cc2ccccc2)NC(=O)c2cc3ccccc3...,,1
8,spec_74,spectrum-motif,0.53,Microcystein_12m2m,motif_1,['COc1cccc(NC(=O)C(Cc2ccccc2)NC(=O)c2cc3ccccc3...,,1
9,spec_32,spectrum-motif,0.64,Microcystein_12m2m,motif_6,['Cc1ccc(NC(=O)CSc2nc(C)cc(O)n2)cc1'],,1


In [23]:
# Show only the hits whose reference motif is motif_1.
screening_hits[screening_hits["ref_motif_id"] == "motif_1"]

Unnamed: 0,hit_id,screen_type,score,ref_motifset,ref_motif_id,ref_short_annotation,ref_annotation,ref_charge
5,spec_42,spectrum-motif,0.7,Microcystein_12m2m,motif_1,['COc1cccc(NC(=O)C(Cc2ccccc2)NC(=O)c2cc3ccccc3...,,1
6,spec_66,spectrum-motif,0.51,Microcystein_12m2m,motif_1,['COc1cccc(NC(=O)C(Cc2ccccc2)NC(=O)c2cc3ccccc3...,,1
7,spec_73,spectrum-motif,0.54,Microcystein_12m2m,motif_1,['COc1cccc(NC(=O)C(Cc2ccccc2)NC(=O)c2cc3ccccc3...,,1
8,spec_74,spectrum-motif,0.53,Microcystein_12m2m,motif_1,['COc1cccc(NC(=O)C(Cc2ccccc2)NC(=O)c2cc3ccccc3...,,1


---