In [None]:
import os
# NOTE: this cell is not idempotent - every re-run moves one more directory up.
# The relative paths used further down (e.g. "test_data/...") are resolved against
# the working directory printed here, so run this cell exactly once per kernel session.
os.chdir("..")
print(os.getcwd())

/Users/rosinatorres/Documents/PhD/Project/Code/MS2LDA/MS2LDA


## Load spectra, clean spectra, and generate a corpus for LDA

In [None]:
from Preprocessing.load_and_clean import load_mgf
from Preprocessing.load_and_clean import clean_spectra

from Preprocessing.generate_corpus import features_to_words
from Preprocessing.generate_corpus import combine_features

In [None]:
# Path to the input spectra (MGF file), relative to the current working directory.
# Use forward slashes ("/") rather than backslashes so the path also resolves on macOS.
spectra_path = "test_data/pos_ache_inhibitors_pesticides.mgf"

In [None]:
# Cleaning step: load the spectra from the MGF file, then clean them
# (e.g. removing spectra with too few peaks - see clean_spectra for the exact filters).
spectra = load_mgf(spectra_path)
cleaned_spectra = clean_spectra(spectra)

In [None]:
# Corpus generation step: turn the spectrum features into "words"
# (adding the frag@ and loss@ prefixes and so on), then combine the fragment
# and loss words into the corpus that is passed to the LDA model below.
fragment_words, loss_words = features_to_words(cleaned_spectra)
feature_words = combine_features(fragment_words, loss_words)

## Topic Modelling

In [None]:
from MS2LDA.modeling import define_model
from MS2LDA.modeling import train_model
from MS2LDA.modeling import extract_motifs
from MS2LDA.modeling import create_motif_spectra

In [None]:
# Set the model parameters. The number of motifs must be given; the remaining entries are
# based on the tomotopy implementation: https://bab2min.github.io/tomotopy/v0.12.6/en/#tomotopy.LDAModel
# (presumably forwarded to tomotopy's LDAModel - TODO confirm in MS2LDA.modeling.define_model).
model_parameters = {"rm_top":2}
ms2lda = define_model(n_motifs=88, model_parameters=model_parameters)

In [8]:
%%time
# Set the training parameters based on the tomotopy implementation: https://bab2min.github.io/tomotopy/v0.12.6/en/#tomotopy.LDAModel.train
# The parallelization scheme (keyword "parallel") has a big impact on the speed.
# NOTE(review): tomotopy documents `parallel` as a ParallelScheme selector and `workers`
# as the thread count - confirm that 4 is a valid/intended scheme value here.
train_parameters = {"parallel": 4}
trained_ms2lda = train_model(ms2lda, feature_words, iterations=300, train_parameters=train_parameters)

CPU times: total: 46.9 ms
Wall time: 46.1 ms


In [9]:
from tomotopy.coherence import Coherence
import tomotopy as tp
# Compute the 'c_v' coherence score of the trained model and print it.
# `Coherence` is the class imported at the top of this cell (same object as tp.coherence.Coherence).
coherence = Coherence(trained_ms2lda, coherence='c_v')
print(coherence.get_score())

0.7786070104539395


In [10]:
# Retrieve the motifs from the trained LDA model and convert them into matchms spectrum objects.
# top_n=20 presumably limits each motif to its 20 highest-ranked features - TODO confirm in extract_motifs.
motifs = extract_motifs(trained_ms2lda, top_n=20)
motif_spectra = create_motif_spectra(motifs)

In [18]:
import tmplot as tmp
# Get tmplot's phi table for the trained model (words as rows, topics as columns,
# judging from the rendered output) and preview the first rows.
phi = tmp.get_phi(trained_ms2lda)
phi.head()

topics,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
frag@124.98,0.01685,0.021007,0.012316,0.014546,0.019415,0.011233,0.017846,0.012457,0.01808,0.014473,...,0.023953,0.02232,0.019515,0.023381,0.008098,0.021113,0.014036,0.020395,0.01748,0.016088
frag@96.95,0.01685,0.018008,0.00924,0.015585,0.014309,0.015314,0.010502,0.010382,0.006033,0.01158,...,0.01246,0.014207,0.015409,0.010171,0.016186,0.023123,0.011031,0.009717,0.016452,0.009054
frag@77.04,0.014745,0.013008,0.015393,0.018699,0.013287,0.010213,0.014699,0.015568,0.017076,0.01158,...,0.014376,0.015221,0.007196,0.010171,0.015175,0.012069,0.014036,0.010688,0.014397,0.009054
frag@142.99,0.013693,0.009009,0.008214,0.004164,0.014309,0.011233,0.009453,0.010382,0.010049,0.01158,...,0.015333,0.009137,0.008223,0.01322,0.016186,0.014079,0.011031,0.011659,0.010286,0.011064
frag@114.96,0.009483,0.011009,0.008214,0.013508,0.010223,0.012253,0.005256,0.010382,0.007037,0.008687,...,0.010545,0.008123,0.012329,0.01322,0.01012,0.010059,0.008025,0.010688,0.01748,0.007044


In [23]:
# Compute per-topic plotting coordinates with tmplot (columns x, y, topic, size, label
# in the rendered output) and preview the first rows.
topics_coords = tmp.prepare_coords(trained_ms2lda)
topics_coords.head()

Unnamed: 0,x,y,topic,size,label
0,-7.425251,-2.958406,0,0.242954,0
1,-7.437016,-2.644304,1,0.256221,1
2,-7.901285,-3.866524,2,0.25259,2
3,-3.911962,-0.30519,3,0.240273,3
4,6.259974,2.674344,4,0.258721,4


In [24]:
# Scatter plot of the topics from the coordinates table; the 'size' and 'label'
# columns are passed as the marker-size and label columns.
tmp.plot_scatter_topics(topics_coords, size_col='size', label_col='label')

In [25]:
# Term probabilities for topic 0, computed from phi and plotted with tmplot.
# NOTE(review): lambda_=1 presumably ranks terms by their in-topic probability only
# (LDAvis-style relevance weighting) - confirm against the tmplot documentation.
terms_probs = tmp.calc_terms_probs_ratio(phi, topic=0, lambda_=1)
tmp.plot_terms(terms_probs)

In [26]:
# Render tmplot's report for the trained model, using the corpus as the documents.
# height/width are passed straight to tmplot (presumably plot sizes in pixels - TODO confirm).
tmp.report(trained_ms2lda, docs=feature_words, height=400, width=250)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='ticks', font_scale=1.2)
def plot_categories_by_topics_heatmap(labels,
                                      topic_distributions,
                                      topic_keys,
                                      output_path=None,
                                      target_labels=None,
                                      color_map = sns.cm.rocket_r,
                                      dim=None):
    """Plot a heatmap of topic probabilities per category, z-scored per topic.

    Every (label, distribution) pair contributes one record per topic. Records
    that share the same category and topic are averaged (``pivot_table``
    default aggregation), and each topic column is then standardised as
    ``(value - column mean) / column std`` before plotting.

    Args:
        labels: Iterable of category labels, paired positionally with
            ``topic_distributions``.
        topic_distributions: Iterable of sequences of topic probabilities;
            the position inside a sequence is the topic index.
        topic_keys: Indexable by topic index; the first five entries of
            ``topic_keys[i]`` are joined with spaces into the column label.
        output_path: If given, the figure is also saved there via
            ``plt.savefig``.
        target_labels: Optional collection of labels to keep. When empty or
            ``None``, all labels are plotted.
        color_map: Colormap handed to ``sns.heatmap``.
        dim: Optional figure size handed to ``plt.figure(figsize=...)``.

    Raises:
        ValueError: If no (label, distribution) pair is selected, so there is
            nothing to plot.

    Note:
        A topic column with fewer than two categories or with zero variance
        ends up as NaN after standardisation.
    """
    # Imported locally so the function does not rely on a `pd` name having
    # been defined by some other cell (it would otherwise raise NameError).
    import pandas as pd

    # Combine the labels and distributions into a list of flat records.
    dicts_to_plot = []
    for _label, _distribution in zip(labels, topic_distributions):
        if target_labels and _label not in target_labels:
            continue
        for _topic_index, _probability in enumerate(_distribution):
            topic_name = ('Topic ' + str(_topic_index).zfill(2) + ': '
                          + ' '.join(topic_keys[_topic_index][:5]))
            dicts_to_plot.append({'Probability': float(_probability),
                                  'Category': _label,
                                  'Topic': topic_name})

    # Fail early with a clear message instead of an opaque KeyError from pivot_table.
    if not dicts_to_plot:
        raise ValueError("No data to plot: labels/topic_distributions are empty "
                         "or no label matches target_labels.")

    # Create a dataframe, format it for the heatmap function, and normalize the columns.
    df_to_plot = pd.DataFrame(dicts_to_plot)
    df_wide = df_to_plot.pivot_table(index='Category',
                                     columns='Topic',
                                     values='Probability')
    df_norm_col = (df_wide - df_wide.mean()) / df_wide.std()

    # Show the final plot.
    if dim:
        plt.figure(figsize=dim)
    sns.set(style='ticks', font_scale=1.2)
    ax = sns.heatmap(df_norm_col, cmap=color_map)
    ax.xaxis.tick_top()
    ax.xaxis.set_label_position('top')
    plt.xticks(rotation=30, ha='left')
    plt.tight_layout()
    if output_path:
        plt.savefig(output_path)
    plt.show()

In [11]:
# Quick way of saving the LDA output (a list) to disk with pickle
import pickle

folder_path = "test_data"
# Serialise the motifs list into <folder_path>/motifs.pkl
motifs_pickle_path = os.path.join(folder_path, 'motifs.pkl')
with open(motifs_pickle_path, 'wb') as f:
    pickle.dump(motifs, f)

In [12]:
folder_path = "test_data"
# Serialise the motif spectra into <folder_path>/motifs_spectra.pkl
motif_spectra_pickle_path = os.path.join(folder_path, 'motifs_spectra.pkl')
with open(motif_spectra_pickle_path, 'wb') as f:
    pickle.dump(motif_spectra, f)

In [13]:
folder_path = "test_data"
# Serialise the corpus (feature words) into <folder_path>/corpus.pkl
corpus_pickle_path = os.path.join(folder_path, 'corpus.pkl')
with open(corpus_pickle_path, 'wb') as f:
    pickle.dump(feature_words, f)

## Store and load Motifs/Topics

In [12]:
from MS2LDA.motif_parser import store_m2m_folder
from MS2LDA.motif_parser import load_m2m_folder

In [13]:
# Build the path with os.path.join so the separator matches the current OS.
# The previous hard-coded "\\" is only a path separator on Windows; on macOS/Linux
# it became part of the folder name instead of pointing into "notebooks/".
non_existing_folder_name = os.path.join("notebooks", "ACHE Inhibitors from Pesticides in Positive Mode")

In [14]:
# Run this once to write the motif spectra to disk. It is left commented out,
# presumably because the target folder must not exist yet (see the variable name)
# and it was already created by an earlier run - TODO confirm in store_m2m_folder.
#store_m2m_folder(motif_spectra, non_existing_folder_name)

True

In [15]:
# Load the motif spectra back from the folder; this presumably requires that the
# (commented-out) store_m2m_folder call has been run at least once - TODO confirm.
loaded_motif_spectra = load_m2m_folder(non_existing_folder_name)

## Topic Modelling with "fixed" Topics

In [16]:
from MS2LDA.modeling import emulate_fixed_motifs

In [17]:
# Set the model parameters. The number of motifs must be given; the rest can be based on the tomotopy implementation: https://bab2min.github.io/tomotopy/v0.12.6/en/#tomotopy.LDAModel
# The dictionary is passed explicitly (as in the first model definition above) so that
# entries added here actually reach the model instead of being silently ignored.
model_parameters = {}
ms2lda = define_model(n_motifs=73, model_parameters=model_parameters)

In [18]:
#! Now include the "fixed" topics/motifs: the first 20 loaded motif spectra are
#  handed to emulate_fixed_motifs together with the freshly defined model.
fixed_motifs = loaded_motif_spectra[:20] # fix the first 20 motifs
fixed_ms2lda = emulate_fixed_motifs(ms2lda, fixed_motifs)

In [19]:
%%time
# Set the training parameters based on the tomotopy implementation: https://bab2min.github.io/tomotopy/v0.12.6/en/#tomotopy.LDAModel.train
# The parallelization scheme (keyword "parallel") has a big impact on the speed.
# NOTE(review): tomotopy documents `parallel` as a ParallelScheme selector and `workers`
# as the thread count - confirm that 4 is a valid/intended scheme value here.
train_parameters = {"parallel": 4}
trained_fixed_ms2lda = train_model(fixed_ms2lda, feature_words, iterations=300, train_parameters=train_parameters)

CPU times: total: 125 ms
Wall time: 122 ms


In [20]:
# Retrieve the motifs from the LDA model trained with fixed motifs and convert them
# into matchms spectrum objects (same steps as for the unconstrained model above).
motifs_fixed = extract_motifs(trained_fixed_ms2lda, top_n=20)
motif_fixed_spectra = create_motif_spectra(motifs_fixed)

## Annotate Topics

In [18]:
from Add_On.Spec2Vec.annotation import load_s2v_and_library
from Add_On.Spec2Vec.annotation import calc_embeddings, calc_similarity
from Add_On.Spec2Vec.annotation import get_library_matches

ModuleNotFoundError: No module named 'Add_On'

In [23]:
%%time
# Build the path with os.path.join: the previous literal "Add_On\Spec2Vec" contained the
# invalid escape sequence "\S" (a warning in current Python, slated to become an error)
# and only worked as a path on Windows.
s2v_similarity, library = load_s2v_and_library(os.path.join("Add_On", "Spec2Vec"))

CPU times: total: 18.5 s
Wall time: 18.5 s


In [24]:
%%time
# Calculate Spec2Vec embeddings for the found motif spectra, using the loaded similarity object.
motif_embeddings = calc_embeddings(s2v_similarity, motif_spectra)

CPU times: total: 31.2 ms
Wall time: 31 ms


In [25]:
%%time
# Calculate the similarity between the found motifs and the GNPS library with Spec2Vec.
# This is by far the slowest step of the annotation (about 2 minutes in the recorded run).
similarity_matrix = calc_similarity(motif_embeddings, library.embeddings)

CPU times: total: 2min 31s
Wall time: 2min 15s


In [26]:
%%time
# Bundle the inputs of the library search into one settings dictionary:
# the motif-vs-library similarity matrix, the library itself, and top_n=10.
matching_settings = dict(
    similarity_matrix=similarity_matrix,
    library=library,
    top_n=10,
)
library_matches = get_library_matches(matching_settings)

CPU times: total: 6.64 s
Wall time: 6.64 s


### Refined Annotations

In [27]:
from Add_On.Spec2Vec.annotation_refined import mask_spectra

In [28]:
# Apply mask_spectra to the motif spectra found above; the masked spectra are the input
# of the clustering below (see Add_On.Spec2Vec.annotation_refined for what exactly is masked).
masked_motif_spectra = mask_spectra(motif_spectra)

---

In [29]:
from Add_On.Spec2Vec.annotation_refined import hierachical_clustering

In [30]:
# Demo on a single motif only: index 6 is hard-coded for both the library matches and the
# masked motif spectra. library_matches[6][1] is presumably the set of matched library
# spectra for that motif - TODO confirm against get_library_matches.
cluster = hierachical_clustering(s2v_similarity, library_matches[6][1], masked_motif_spectra[6])

0.05714285714285714
[2 1 1 1 1 1 1 1 1 1]
0.1894736842105263
[2 2 2 2 2 1 2 1 2]
0.33684210526315783
[2 1 2 2 2 1 2]
0.5924812030075187
[2 1 1 2 2]
0.7338345864661654


**Note:** The hierarchical clustering is not ready to use yet, since it currently runs on only one subcluster and only one motif at a time.