## Running MS2LDA

In [1]:
%matplotlib agg
import MS2LDA

In [2]:
# Settings forwarded to MS2LDA.run as `preprocessing_parameters`.
preprocessing_parameters = dict(
    min_mz=0,
    max_mz=1000,
    max_frags=1000,
    min_frags=4,
    min_intensity=0.01,
    max_intensity=1,
)

In [3]:
# Settings forwarded to MS2LDA.run as `convergence_parameters`.
convergence_parameters = dict(
    step_size=50,
    window_size=10,
    threshold=0.001,
    type="perplexity_history",
)

In [4]:
# Settings forwarded to MS2LDA.run as `annotation_parameters`.
annotation_parameters = {
    "criterium": "best", # cluster selection criterion after optimization; an earlier note described a "cluster with most compounds" alternative — TODO confirm what "best" selects
    "cosine_similarity": 0.70, # how similar the spectra are compared to motifs in the optimization (an earlier note mentions 0.8)
    "n_mols_retrieved": 25 # number of molecules retrieved from the database by Spec2Vec (an earlier note mentions 10)
}

In [5]:
# Passed to MS2LDA.run as n_motifs / n_iterations.
n_motifs, n_iterations = 400, 5000

In [6]:
import random

# Seed Python's global RNG; the same value is also given to the model via "seed".
random.seed(42)

# Settings forwarded to MS2LDA.run as `model_parameters`.
model_parameters = dict(
    rm_top=0,
    min_cf=0,
    min_df=3,
    alpha=0.1,  # a higher alpha makes the document preferences "smoother" over topics
    eta=0.1,  # a higher eta makes the topic preferences "smoother" over words
    seed=42,
)

In [7]:
# Unpacked as keyword arguments into model.train(...) (see the training log output below).
train_parameters = dict(parallel=3, workers=0)

In [8]:
# Settings forwarded to MS2LDA.run as `dataset_parameters`; the output folder
# name embeds the configured number of motifs.
dataset_parameters = dict(
    acquisition_type="DDA",
    significant_digits=3,
    charge=1,
    name="DDA-Pesticide-Neurotox",
    output_folder=f"DDA-Pesticide_{n_motifs}_c",  #6
)

In [9]:
# Settings forwarded to MS2LDA.run as `fingerprint_parameters`.
fingerprint_parameters = dict(fp_type="rdkit", threshold=0.8)

In [10]:
# Passed to MS2LDA.run as `motif_parameter`; its meaning is not visible in this notebook — TODO document.
motif_parameter = 20

In [11]:
# Single-file input paths. NOTE: `dataset` is reassigned to a list of loaded
# spectra in the following cell, so this string is not what MS2LDA.run receives.
#dataset = "../datasets/240102_bovi_2023_4_pos_her_BWO_LC-OTT3_034.mzML"
dataset = "../datasets/mzmine443_Bovi06_sirius.mgf"  # fixed accidental double slash in the path

In [12]:
from matchms.importing import load_from_mgf

# Alternative input kept for reference (four Bovi MGF files concatenated):
#dataset = list(load_from_mgf("../datasets/mzmine443_Bovi34_sirius.mgf")) + list(load_from_mgf("../datasets/mzmine443_Bovi74_sirius.mgf")) + list(load_from_mgf("../datasets/mzmine443_Bovi12_sirius.mgf")) + list(load_from_mgf("../datasets/mzmine443_Bovi06_sirius.mgf"))

# Materialise everything load_from_mgf yields into a list so it can be counted and reused.
dataset = [*load_from_mgf("../datasets/mzmine443_tomato_spiked_sirius.mgf")]
len(dataset)

9765

In [13]:
#from matchms.importing import load_from_mzml
#dataset = list(load_from_mzml("../datasets/240102_bovi_2023_4_pos_her_BWO_LC-OTT3_006.mzML")) + list(load_from_mzml("../datasets/240102_bovi_2023_4_pos_her_BWO_LC-OTT3_012.mzML")) + list(load_from_mzml("../datasets/240102_bovi_2023_4_pos_her_BWO_LC-OTT3_034.mzML")) + list(load_from_mzml("../datasets/240102_bovi_2023_4_pos_her_BWO_LC-OTT3_074.mzML"))
#len(dataset)

In [14]:
# Call MS2LDA.run with the loaded spectra and the configuration defined in the
# cells above; the three returned objects are unpacked for later cells.
(motif_spectra, optimized_motifs, motif_fps) = MS2LDA.run(
    dataset,
    n_motifs=n_motifs,
    n_iterations=n_iterations,
    dataset_parameters=dataset_parameters,
    train_parameters=train_parameters,
    model_parameters=model_parameters,
    convergence_parameters=convergence_parameters,
    annotation_parameters=annotation_parameters,
    motif_parameter=motif_parameter,
    preprocessing_parameters=preprocessing_parameters,
    fingerprint_parameters=fingerprint_parameters,
)

Cleaning spectra ... 7211 spectra left


  model.train(convergence_parameters["step_size"], **train_parameters)
 52%|██████████████████████████████████████████                                       | 52/100 [09:00<08:18, 10.39s/it]

Model has converged
Loading Spec2Vec model ...





Searches for candidates ...


100%|████████████████████████████████████████████████████████████████████████████████| 399/399 [03:02<00:00,  2.19it/s]


Visualization data saved to: DDA-Pesticide_400_c\ms2lda_viz.json


---

In [20]:
from tomotopy import LDAModel

In [21]:
# Load a saved tomotopy LDA model from this run's output folder
# (assumes MS2LDA.run wrote ms2lda.bin there — TODO confirm).
LDA = LDAModel.load(f"{dataset_parameters['output_folder']}/ms2lda.bin")

In [22]:
# NOTE(review): the saved output of this cell reports k=200, min_df=2 and
# alpha=[0.01], while the configuration cells set n_motifs=400, min_df=3 and
# alpha=0.1 — the output looks stale or from a different model; re-run to confirm.
LDA.summary()

<Basic Info>
| LDAModel (current version: 0.12.7)
| 21155 docs, 11814007 words
| Total Vocabs: 18320, Used Vocabs: 15086
| Entropy of words: 7.70110
| Entropy of term-weighted words: 7.70110
| Removed Vocabs: <NA>
|
<Training Info>
| Iterations: 2000, Burn-in steps: 0
| Optimization Interval: 10
| Log-likelihood per word: -5.07424
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 0 (minimum collection frequency of words)
| min_df: 2 (minimum document frequency of words)
| rm_top: 0 (the number of top words to be removed)
| k: 200 (the number of topics between 1 ~ 32767)
| alpha: [0.01] (hyperparameter of Dirichlet distribution for document-topic, given as a single `float` in case of symmetric prior and as a list with length `k` of `float` in case of asymmetric prior.)
| eta: 0.1 (hyperparameter of Dirichlet distribution for topic-word)
| seed: 42 (random seed)
| trained in version 0.12.7
|
<Parameters>
| alpha (Dirichlet prior on the per-document topic distributions)
|  [0.00398883

In [23]:
# Number of entries in the model's used vocabulary.
len(LDA.used_vocabs)

15086

In [24]:
# Membership check for one fragment token in the model's used vocabulary.
"frag@182.06" in LDA.used_vocabs

True

In [25]:
# Vocabulary tokens of interest mapped to the label printed for each of them.
# The trailing "#n" notes are carried over from the original cell; their meaning
# is not visible here.
_fragment_labels = {
    "frag@182.06": 182.062,  #2
    "frag@72.04": 72.043,  #3
    "frag@125.02": 125.015,  #15
    "frag@70.04": 70.04,
}

# Walk the used vocabulary in model order and print label, frequency and
# weighted frequency for every token of interest.
for token, count, weighted_count in zip(LDA.used_vocabs, LDA.used_vocab_freq, LDA.used_vocab_weighted_freq):
    label = _fragment_labels.get(token)
    if label is None:
        continue
    print(label, count, weighted_count)

72.043 13196 13196.0
125.015 3473 3473.0
182.062 524 524.0
70.04 117 117.0


## Screening for spectra from experiments, motifs or motifDB

In [14]:
# Query string passed to MS2LDA.screen_spectra as `motifDB_query`.
motifDB_query = "QUERY scaninfo(MS2DATA)"

In [16]:
# Screen the library MGF against the given motifDB using the query defined
# above; output_folder is this run's output folder.
screening_hits = MS2LDA.screen_spectra(
    motifDB="DIA-Pesticide-Kidney_1000/motifDB.xlsx",
    dataset="../datasets/WFSR_External_Library_gnps.mgf",
    motifDB_query=motifDB_query,
    output_folder=dataset_parameters["output_folder"],
)

Loading Spec2Vec model ...
TOTAL QUERIES 1


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.95it/s]
1000it [00:00, 4694.84it/s]


In [19]:
# Show only the hits whose reference motif is "motif_1".
screening_hits[screening_hits["ref_motif_id"] == "motif_1"]

Unnamed: 0,hit_id,screen_type,score,ref_motifset,ref_motif_id,ref_short_annotation,ref_annotation,ref_charge


---

## Search by SMILES for annotated motifs

In [15]:
import pandas as pd
import pubchempy as pcb

In [16]:
# Read the pesticide list; the second column is exposed as `names`.
pesticides_names = pd.read_excel(
    "../datasets/Quan_scope_LC_AID_2019.xlsx",
    skiprows=1,
    names=[None, "names"],
)

In [17]:
# Look up every pesticide name with pubchempy and collect the SMILES of the
# first match; names without a match are reported and skipped.
pesticide_smiles = []
for pesticide_name in pesticides_names.names:
    matches = pcb.get_compounds(pesticide_name, "name")
    if not matches:
        print("NOT FOUND ", pesticide_name)
        continue
    print(pesticide_name)
    pesticide_smiles.append(matches[0].canonical_smiles)

2,4-D
Abamectin
Acephate
Acequinocyl
Acetamiprid
Aclonifen
Aldicarb
Ametoctradin
Amidosulfuron
Amisulbrom
Asulam
Azadirachtin
Azamethiphos
Azoxystrobin
Bendiocarb
Bentazone
Bifenazate
Bifenthrin
Bixafen
Boscalid
Brodifacoum
Bromadiolone
Bromoxynil
Bromuconazole
Bupirimate
Buprofezin
Carbaryl
Carbendazim
Carbetamide
Carbofuran
Carfentrazone-ethyl
Chlorantraniliprole
Chlorbromuron
Chloridazon
Chlorpyrifos
Clodinafop-propargyl
Clofentezine
Clomazone
Clothianidin
Cyazofamid
Cybutryne
Cyflufenamide
Cyflumetofen
Cymoxanil
Cyproconazole
Cyprodinil
Cythioate
Desmedipham
Dichlofluanid
Difenoconazole
Diflubenzuron
Diflufenican
Dimethenamid
Dimethoate
Dimethomorph
Dinoterb
Diuron
DNOC
Dodemorph
Dodine
Emamectin
Epoxiconazole
Ethirimol
Ethoprophos
Etoxazole
Famoxadone
Fenamidone
Fenamiphos
Fenhexamid
Fenoxaprop-p-ethyl
Fenoxycarb
Fenpropidin
Fenpropimorph
Fenpyrazamine
Fipronil
Flonicamid
Florasulam
Fluazifop
Fluazinam
Flubendiamide
Flubendiamide
Flucycloxuron
Fludioxonil
Flufenacet
Flufenoxuron
F

In [18]:
# Compounds not found by pubchempy; their SMILES are entered by hand.
Fluroxypyr_meptylester = "CCCCCCC(C)OC(=O)COC1=NC(=C(C(=C1Cl)N)Cl)F"
# MCPP in a pesticide list is mecoprop, 2-(4-chloro-2-methylphenoxy)propanoic acid.
# The previous value, "Clc1cc(ccc1)N2CCNCC2", is 1-(3-chlorophenyl)piperazine
# (the drug "mCPP"), which is a different compound.
MCPP = "CC1=C(C=CC(=C1)Cl)OC(C)C(=O)O"
Naftylacetic_acid_1 = "C1=CC=C2C(=C1)C=CC=C2CC(=O)O"

In [19]:
# Append the hand-entered SMILES to the list built from PubChem (in place).
pesticide_smiles.extend([Fluroxypyr_meptylester, MCPP, Naftylacetic_acid_1])

In [20]:
# Alias, not a copy: structure_query refers to the same list object as pesticide_smiles.
structure_query = pesticide_smiles

In [21]:
# Screen the motif fingerprints returned by MS2LDA.run against the pesticide SMILES.
# fp_type is read from fingerprint_parameters (the dict handed to MS2LDA.run)
# rather than hard-coded, so the query fingerprint type cannot drift from the
# configured one; presumably motif_fps were built with that type — TODO confirm.
# The threshold stays at 0.50, as in the original call, and is deliberately not
# taken from fingerprint_parameters["threshold"].
screening_structure_hits = MS2LDA.screen_structure(
    motif_fps,
    motif_spectra,
    structure_query,
    fp_type=fingerprint_parameters["fp_type"],
    threshold=0.50,
    output_folder=dataset_parameters["output_folder"],
)

In [23]:
# NOTE(review): the saved output below is 600, while this notebook configures
# n_motifs = 400 — possibly stale output from another run; re-run to confirm.
len(motif_spectra)

600