In [1]:
import os

# Step up one directory level; the relative paths used later in this notebook
# (e.g. "test_data", "notebooks") are resolved against this new working directory.
# The starting directory is remembered on the first run so that re-executing this
# cell in the same kernel does not climb one level further each time.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.dirname(NOTEBOOK_START_DIR))
print(os.getcwd())

C:\Users\dietr004\Documents\PhD\computational mass spectrometry\Spec2Struc\Project_SubstructureIdentification\MS2LDA


## Load spectra, Clean spectra and generate a Corpus for LDA

In [2]:
from Preprocessing.load_and_clean import load_mgf
from Preprocessing.load_and_clean import clean_spectra

from Preprocessing.generate_corpus import features_to_words
from Preprocessing.generate_corpus import combine_features

In [3]:
# Path to the input MGF file, relative to the current working directory.
# Built with os.path.join so the separator is correct on every operating system
# (the previous hard-coded "\\" only worked on Windows).
spectra_path = os.path.join("test_data", "pos_ache_inhibitors_pesticides.mgf")

In [4]:
# Load the spectra from the MGF file at spectra_path, then pass them through clean_spectra.
# Per the original author's note, cleaning removes spectra with too few peaks, among other things.
spectra = load_mgf(spectra_path)
cleaned_spectra = clean_spectra(spectra)

In [5]:
# Corpus generation: derive fragment words and loss words from the cleaned spectra
# (per the original author's note, this adds frag@ and loss@ markers, among other things),
# then merge both lists into feature_words, the input used for training below.
fragment_words, loss_words = features_to_words(cleaned_spectra)
feature_words = combine_features(fragment_words, loss_words)

## Topic Modelling

In [6]:
from MS2LDA.modeling import define_model
from MS2LDA.modeling import train_model
from MS2LDA.modeling import extract_motifs
from MS2LDA.modeling import create_motif_spectra

In [7]:
# Set the model parameters. The number of motifs must be given; the remaining
# parameters can follow the tomotopy implementation:
# https://bab2min.github.io/tomotopy/v0.12.6/en/#tomotopy.LDAModel
# NOTE(review): model_parameters is assigned here but not passed to define_model in
# this cell — confirm whether it should be forwarded or can be dropped.
model_parameters = {}
ms2lda = define_model(n_motifs=43)

In [8]:
%%time
# Set the training parameters based on the tomotopy implementation:
# https://bab2min.github.io/tomotopy/v0.12.6/en/#tomotopy.LDAModel.train
# The parallelization scheme (keyword "parallel") has a big impact on the speed.
train_parameters = {"parallel": 4}
# Train the model defined above on feature_words for 300 iterations.
trained_ms2lda = train_model(ms2lda, feature_words, iterations=300, train_parameters=train_parameters)

CPU times: total: 46.9 ms
Wall time: 37 ms


In [9]:
# Retrieve the motifs from the trained LDA model (top_n=20) and convert them
# into matchms spectrum objects (per the original author's note).
motifs = extract_motifs(trained_ms2lda, top_n=20)
motif_spectra = create_motif_spectra(motifs)

## Store and load Motifs/Topics

In [10]:
from MS2LDA.motif_parser import store_m2m_folder
from MS2LDA.motif_parser import load_m2m_folder

In [11]:
# Target folder for the stored motifs, relative to the current working directory.
# Built with os.path.join so the separator is correct on every operating system
# (the previous hard-coded "\\" only worked on Windows).
non_existing_folder_name = os.path.join("notebooks", "ACHE Inhibitors from Pesticides in Positive Mode")

In [12]:
# Store the motif spectra under the folder name defined above; the recorded run returned True.
# NOTE(review): the variable name suggests the folder must not exist yet — confirm
# how store_m2m_folder behaves when this cell is re-run.
store_m2m_folder(motif_spectra, non_existing_folder_name)

True

In [12]:
# Read the motifs back from the same folder they were stored in.
loaded_motif_spectra = load_m2m_folder(non_existing_folder_name)

## Topic Modelling with "fixed" Topics

In [13]:
from MS2LDA.modeling import emulate_fixed_motifs

In [14]:
# Set the model parameters. The number of motifs must be given; the remaining
# parameters can follow the tomotopy implementation:
# https://bab2min.github.io/tomotopy/v0.12.6/en/#tomotopy.LDAModel
# NOTE(review): model_parameters is assigned here but not passed to define_model in
# this cell — confirm whether it should be forwarded or can be dropped.
# This rebinds ms2lda to a new model with 73 motifs.
model_parameters = {}
ms2lda = define_model(n_motifs=73)

In [15]:
# Now include the "fixed" topics/motifs: take the first 5 loaded motif spectra
# and hand them, together with the freshly defined model, to emulate_fixed_motifs.
fixed_motifs = loaded_motif_spectra[:5] # fix the first 5 motifs
fixed_ms2lda = emulate_fixed_motifs(ms2lda, fixed_motifs)

In [16]:
%%time
# Set the training parameters based on the tomotopy implementation:
# https://bab2min.github.io/tomotopy/v0.12.6/en/#tomotopy.LDAModel.train
# The parallelization scheme (keyword "parallel") has a big impact on the speed.
train_parameters = {"parallel": 4}
# Train the model with the fixed motifs on the same feature_words for 300 iterations.
trained_fixed_ms2lda = train_model(fixed_ms2lda, feature_words, iterations=300, train_parameters=train_parameters)

CPU times: total: 62.5 ms
Wall time: 66.3 ms


In [17]:
# Retrieve the motifs from the model trained with fixed motifs (top_n=20) and convert
# them into matchms spectrum objects (per the original author's note).
motifs_fixed = extract_motifs(trained_fixed_ms2lda, top_n=20)
motif_fixed_spectra = create_motif_spectra(motifs_fixed)

## Annotate Topics

In [10]:
from Add_On.annotation import load_s2v_and_library
from Add_On.annotation import calc_embeddings, calc_similarity
from Add_On.annotation import get_library_matches

In [11]:
%%time
# Load the Spec2Vec similarity object and the reference library from the "Add_On" folder.
# This took about 15 s in the recorded run.
s2v_similarity, library = load_s2v_and_library("Add_On")

CPU times: total: 14.5 s
Wall time: 14.7 s


In [12]:
%%time
# Calculate embeddings for the motifs found by the first (non-fixed) model.
motif_embeddings = calc_embeddings(s2v_similarity, motif_spectra)

CPU times: total: 15.6 ms
Wall time: 7.84 ms


In [13]:
%%time
# Calculate the similarity between the found motifs and the GNPS library with Spec2Vec
# (per the original author's note). This was the slowest step in the recorded run (~45 s wall time).
similarity_matrix = calc_similarity(motif_embeddings, library.embeddings)

CPU times: total: 52.6 s
Wall time: 44.7 s


In [14]:
%%time
matching_settings = {
    "similarity_matrix": similarity_matrix,
    "library": library,
    "top_n": 10
}
library_matches = get_library_matches(matching_settings)

CPU times: total: 1.31 s
Wall time: 1.31 s


### Refined Annotations

In [15]:
from Add_On.annotation_refined import mask_spectra

In [16]:
# Produce masked versions of the motif spectra for the refined annotation below.
masked_motif_spectra = mask_spectra(motif_spectra)

---

In [25]:
from Add_On.annotation_refined import hierachical_clustering

In [30]:
# Run the clustering on the first entry only: library_matches[0][1] together with
# the first masked motif spectrum. The call also printed the values recorded below.
# NOTE(review): presumably index [0] selects the first motif and [1] the matched
# library spectra for it — confirm against get_library_matches' return structure.
cluster = hierachical_clustering(s2v_similarity, library_matches[0][1], masked_motif_spectra[0])

-0.5518796992481202
[1 1 1 1 1 1 2 1 1 1]
0.7488721804511278
