## Running MS2LDA

In [1]:
#%pip install requests
import json
from io import StringIO

import pandas as pd
import requests
from matchms.importing import load_from_mgf

In [2]:
# Ensure MS2LDA can be imported, whether it is installed or only present in the source tree.
import sys
from pathlib import Path
# First try a normal import. If that raises ImportError, prepend the directory
# two levels above the current working directory to sys.path and retry.
# NOTE(review): this relies on the kernel's working directory being two levels
# below the folder that contains the MS2LDA package — confirm.
try:
    import MS2LDA
except ImportError:
    sys.path.insert(0, str(Path.cwd().parent.parent))
    import MS2LDA

In [3]:
# Select matplotlib's "agg" backend before the run; a later cell switches back to inline.
# NOTE(review): presumably this keeps figures created during MS2LDA.run from being rendered in the notebook — confirm.
%matplotlib agg
import MS2LDA

In [4]:
# Spectrum-cleaning settings; handed to MS2LDA.run as `preprocessing_parameters`.
# NOTE(review): units and exact semantics of each bound are defined by MS2LDA, not here.
preprocessing_parameters = dict(
    min_mz=0,
    max_mz=1000,
    max_frags=1000,
    min_frags=3,
    min_intensity=0.01,
    max_intensity=1,
)

In [5]:
# Convergence-check settings; handed to MS2LDA.run as `convergence_parameters`.
# The run log shows "step_size" being passed as the first argument of model.train.
convergence_parameters = dict(
    step_size=50,
    window_size=10,
    threshold=0.001,
    type="perplexity_history",
)

In [6]:
# Annotation settings; handed to MS2LDA.run as `annotation_parameters`.
annotation_parameters = dict(
    # Cluster-selection criterion applied after optimization.
    # NOTE(review): the earlier comment described a "cluster with most compounds"
    # choice and named "best" as the alternative — confirm which option names exist.
    criterium="best",
    # How similar the spectra are compared to motifs in the optimization
    # (0.8 was noted as an alternative value).
    cosine_similarity=0.70,
    # Number of molecules retrieved from the database by Spec2Vec
    # (10 was noted as an alternative value).
    n_mols_retrieved=5,
)

In [7]:
# Number of motifs and number of training iterations, both passed to MS2LDA.run.
n_motifs, n_iterations = 100, 2000

In [8]:
# Model settings; handed to MS2LDA.run as `model_parameters`.
# NOTE(review): rm_top / min_cf / min_df are presumably forwarded to the topic-model constructor — confirm.
model_parameters = dict(
    rm_top=4,
    min_cf=50,
    min_df=10,
    # A higher alpha makes the document preferences "smoother" over topics ...
    alpha=0.6,
    # ... and a higher eta makes the topic preferences "smoother" over words.
    eta=0.01,
    # Fixed seed for the run.
    seed=42,
)

In [9]:
# Training settings; the run log shows these being unpacked as keyword
# arguments into model.train(...).
train_parameters = dict(parallel=3, workers=0)

In [10]:
# Dataset settings; handed to MS2LDA.run as `dataset_parameters`.
# "output_folder" is reused further down to locate the files written by the run.
dataset_parameters = {
    "acquisition_type": "DDA",
    "significant_digits": 3,
    "charge": 1,
    "name": "test",
    # Plain string literal: there is nothing to interpolate, so no f-prefix.
    "output_folder": "mytest10",
}

In [11]:
# Fingerprint settings; handed to MS2LDA.run as `fingerprint_parameters`.
fingerprint_parameters = dict(fp_type="rdkit", threshold=0.8)

In [12]:
# Passed to MS2LDA.run as `motif_parameter`.
# NOTE(review): the meaning of the value 20 is not documented in this notebook — confirm.
motif_parameter = 20

In [13]:
# Relative path to the input .mgf file; resolved against the kernel's working directory.
dataset = "../../datasets/mzmine443_Tomato200_InclusionListA_15000A.mgf"

In [14]:
# Run MS2LDA on the dataset with every parameter group defined above.
# The call returns three objects, unpacked in this order:
# motif_spectra, optimized_motifs, motif_fps.
motif_spectra, optimized_motifs, motif_fps = MS2LDA.run(
    dataset,
    n_motifs=n_motifs,
    n_iterations=n_iterations,
    dataset_parameters=dataset_parameters,
    train_parameters=train_parameters,
    model_parameters=model_parameters,
    convergence_parameters=convergence_parameters,
    annotation_parameters=annotation_parameters,
    motif_parameter=motif_parameter,
    preprocessing_parameters=preprocessing_parameters,
    fingerprint_parameters=fingerprint_parameters,
)

Cleaning spectra ... 7255 spectra left


  model.train(convergence_parameters["step_size"], **train_parameters)
100%|██████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:59<00:00,  1.49s/it]


model did not converge


100%|███████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 132.30it/s]


m2m folder stored
convergence curve stored
network stored


  plt.legend(loc="best")


Visualization data saved (gzipped) to: mytest10/ms2lda_viz.json.gz


In [15]:
# Switch matplotlib to inline rendering for the exploratory cells that follow.
%matplotlib inline

In [None]:
import tomotopy as tp

# Load the trained LDA model from the output folder configured in
# dataset_parameters, so this cell keeps pointing at the current run's model
# when "output_folder" is changed (the screening cell below builds its paths
# the same way).
ms2lda = tp.LDAModel.load(f'{dataset_parameters["output_folder"]}/ms2lda.bin')

# Number of documents held by the loaded model.
len(ms2lda.docs)

In [None]:
# Topic information reported by the loaded model for the document at index 7.
ms2lda.docs[7].get_topics()

In [None]:
# Look up the object MS2LDA associates with index 7.
# NOTE(review): presumably the spectrum behind document 7 of the model — confirm.
MS2LDA.retrieve_spec4doc(7)

In [None]:
# Metadata attached to the object retrieved for index 7.
MS2LDA.retrieve_spec4doc(7).metadata

In [None]:
# Plot the object retrieved for index 7.
MS2LDA.retrieve_spec4doc(7).plot()

In [None]:
# The `mz` values of the peaks of the object retrieved for index 7.
MS2LDA.retrieve_spec4doc(7).peaks.mz

In [None]:
# The `mz` values of the peaks of entry 47 in motif_spectra (first return value of MS2LDA.run).
motif_spectra[47].peaks.mz

---

## Screening for spectra from experiments, motifs or motifDB

In [None]:
# Query string handed to MS2LDA.screen_spectra as `motifDB_query`.
# NOTE(review): looks like MassQL syntax selecting MS2 data with a product ion at 72.04 — confirm.
motifDB_query = "QUERY scaninfo(MS2DATA) WHERE MS2PROD=72.04"

In [None]:
# Screen using the motifDB.json stored in this run's output folder.
# NOTE(review): `motifDB` and `motifs_stored` point at the same file — confirm this is intended.
screening_hits = MS2LDA.screen_spectra(
    motifDB=f'{dataset_parameters["output_folder"]}/motifDB.json',
    motifs_stored=f'{dataset_parameters["output_folder"]}/motifDB.json',
    motifDB_query=motifDB_query,
    output_folder=dataset_parameters["output_folder"],
)

In [None]:
# Display the screening result.
screening_hits

In [None]:
# Keep only the rows whose ref_motif_id equals "motif_1".
screening_hits.loc[screening_hits.ref_motif_id == "motif_1"]

---

In [None]:
from MS2LDA.Add_On.MassQL.MassQL4MotifDB import load_motifDB_excel

In [None]:
# Load a motifDB Excel file; the loader returns two objects, bound to ms1 and ms2 here.
# NOTE(review): this path points into "DDA-Pesticide_100", not the output folder of the run above — confirm the file exists.
ms1, ms2 = load_motifDB_excel("DDA-Pesticide_100/motifDB_optimized.xlsx")

In [None]:
# Display the first of the two objects returned by load_motifDB_excel.
ms1

In [None]:
# Dictionary forms of the two loaded tables: x from ms1, y from ms2.
x, y = ms1.to_dict(), ms2.to_dict()

In [None]:
# Rebuild a DataFrame from the dictionary form of ms1.
pd.DataFrame(x)

In [None]:
# Bundle both dictionaries under the keys "x" and "y".
z = dict(x=x, y=y)

In [None]:
# Rebuild the DataFrame again, this time reading the "x" entry of the bundle z.
pd.DataFrame(z["x"])

In [None]:
# Serialize the bundle z to motifDB_test.json in the working directory.
# The encoding is given explicitly so the file does not depend on the platform default.
with open("motifDB_test.json", "w", encoding="utf-8") as outfile:
    json.dump(z, outfile)

In [None]:
# Read motifDB_test.json back from the working directory into mDB.
# The encoding is given explicitly so decoding does not depend on the platform default.
with open("motifDB_test.json", "r", encoding="utf-8") as infile:
    mDB = json.load(infile)

In [None]:
# Rebuild a DataFrame from the "x" entry of the data read back from JSON.
pd.DataFrame(mDB["x"])