In [1]:
import os

# The imports and relative paths used further down (e.g. "Preprocessing",
# "Add_On/...", "test_data/...") are resolved against the current working
# directory, so the notebook has to run from the directory that contains them.
# A bare os.chdir("..") climbs one more level every time this cell is re-run;
# only move up when the expected package directory is not visible yet, which
# makes the cell safe to execute repeatedly.
if not os.path.isdir("Preprocessing"):
    os.chdir("..")
print(os.getcwd())

/Users/rosinatorres/Documents/PhD/WP1/Project/Code/MS2LDA/MS2LDA


In [2]:
# Select matplotlib's Qt backend so figures open in separate interactive windows
# instead of being rendered inline in the notebook.
%matplotlib qt

In [3]:
from Preprocessing.load_and_clean import load_mgf
from Preprocessing.load_and_clean import clean_spectra

from Preprocessing.generate_corpus import features_to_words
from Preprocessing.generate_corpus import combine_features

from MS2LDA.modeling import define_model
from MS2LDA.modeling import train_model
from MS2LDA.modeling import extract_motifs
from MS2LDA.modeling import create_motif_spectra

from Add_On.Spec2Vec.annotation import load_s2v_and_library
from Add_On.Spec2Vec.annotation import calc_embeddings, calc_similarity
from Add_On.Spec2Vec.annotation import get_library_matches

from Add_On.Spec2Vec.annotation_refined import mask_spectra
from Add_On.Spec2Vec.annotation_refined import refine_annotation

from Visualisation.visualisation import create_interactive_motif_network


In [4]:
def generate_motifs(mgf_path, 
                    n_motifs = 50,
                    model_parameters = {
                        "rm_top": 0,
                        "min_cf": 0,
                        "min_df": 0,
                        "alpha": 0.1,
                        "eta": 0.1,
                        "seed": 42,
                    },
                    train_parameters = {
                        "parallel": 3
                    }, 
                    motif_parameter = 20,
                    charge=1,
                    motifset_name="unknown",
                    iterations=100):
    
    """generates the motif spectra based on a given mgf file
    
    ARGS:
        mgf_path (str): path to the mgf file
        n_motifs (int): number of motifs, forwarded to define_model
        model_parameters (dict): model parameters that can be set for a tomotopy LDA model
        train_parameters (dict): train parameters that can be set for a tomotopy training of an LDA model
        motif_parameter (int): number of top n most important features per motif
        charge: forwarded unchanged to create_motif_spectra
        motifset_name (str): forwarded unchanged to create_motif_spectra
        iterations (int): number of training iterations forwarded to train_model (default 100)
        
    RETURNS:
        motif_spectra (list): list of matchms spectrum objects (no precursor ion) 
    """
    # The dict defaults in the signature are created once and shared by every
    # call. Hand shallow copies to the helpers so that nothing they do to the
    # dicts can leak into later calls or into the caller's own objects.
    # None is passed through untouched.
    if model_parameters is not None:
        model_parameters = dict(model_parameters)
    if train_parameters is not None:
        train_parameters = dict(train_parameters)

    # Preprocessing
    loaded_spectra = load_mgf(mgf_path)
    cleaned_spectra = clean_spectra(loaded_spectra)

    # Corpus Generation
    fragment_words, loss_words = features_to_words(cleaned_spectra)
    feature_words = combine_features(fragment_words, loss_words)

    # Modeling
    ms2lda = define_model(n_motifs=n_motifs, model_parameters=model_parameters)
    trained_ms2lda = train_model(ms2lda, feature_words, iterations=iterations, train_parameters=train_parameters)

    # Motif Generation
    motifs = extract_motifs(trained_ms2lda, top_n=motif_parameter)
    motif_spectra = create_motif_spectra(motifs, charge, motifset_name)

    return motif_spectra

In [5]:
def annotate_motifs(motif_spectra, 
                    top_n_matches = 5,
                    unique_mols = True,
                    path_model = "Add_On/Spec2Vec/model_positive_mode/020724_Spec2Vec_pos_CleanedLibraries.model",
                    path_library = "Add_On/Spec2Vec/model_positive_mode/positive_s2v_library.pkl"):
    """annotates motifs with Spec2Vec
    
    ARGS:
        motif_spectra (list): motif spectra to annotate (e.g. the output of generate_motifs)
        top_n_matches (int): top n compounds retrieved from the database 
        unique_mols (boolean): True if only unique compounds should be retrieved, False if duplicates are allowed
        path_model (str): path to the Spec2Vec model
        path_library (str): path to the pkl library file, which contains embeddings, spectra and smiles
        
    RETURNS:
        optimized_motif_spectra (list): list of matchms motif spectra
        optimized_clusters (list): list of lists of spectra from clustered compounds
        smiles_clusters (list): list of lists of SMILES for clustered compounds
        clusters_similarity: fourth value returned by refine_annotation
            (presumably the similarity scores of the clusters -- confirm against refine_annotation)
    """

    s2v_similarity, library = load_s2v_and_library(path_model, path_library)
    print("Model loaded ...")

    motif_embeddings = calc_embeddings(s2v_similarity, motif_spectra)
    similarity_matrix = calc_similarity(motif_embeddings, library.embeddings)

    # Forward the caller's choices; these two entries used to be hard-coded to
    # 5 and True, which silently ignored top_n_matches and unique_mols.
    matching_settings = {
                        "similarity_matrix": similarity_matrix,
                        "library": library,
                        "top_n": top_n_matches,
                        "unique_mols": unique_mols,
                    }

    library_matches = get_library_matches(matching_settings)

    masked_motif_spectra = mask_spectra(motif_spectra)
    optimized_motif_spectra, optimized_clusters, smiles_clusters, clusters_similarity = refine_annotation(s2v_similarity, library_matches, masked_motif_spectra, motif_spectra)

    return optimized_motif_spectra, optimized_clusters, smiles_clusters, clusters_similarity

In [6]:
# MGF file the motifs are generated from. The path is relative, so it is
# resolved against the current working directory.
mgf_path_train = "test_data/GNPS-NIH-NATURALPRODUCTSLIBRARY.mgf"

In [7]:
# Generate 62 motifs from the training file; every other setting uses the
# defaults declared in generate_motifs.
motif_spectra = generate_motifs(mgf_path_train, n_motifs=62)

  model.train(iterations, **train_parameters)


In [8]:
# Annotate the generated motifs using the default Spec2Vec model/library paths
# and the default matching arguments of annotate_motifs.
optimized_motif_spectra, optimized_clusters, smiles_clusters, clusters_similarity = annotate_motifs(motif_spectra)

Model loaded ...
One compound cluster!
Only one cluster:  0.662809917355372
Similarity Match:  0.9954545454545455
Similarity Match:  0.7600000000000001
Similarity Match:  0.7394636015325671
Similarity Match:  0.8095238095238095
Similarity Match:  0.9038626609442062
Similarity Match:  0.990990990990991
Similarity Match:  0.8141762452107281
Similarity Match:  0.9789272030651341
Similarity Match:  0.9974025974025974
Similarity Match:  0.9966942148760333
Similarity Match:  1.0
Similarity Match:  0.8074534161490683
Similarity Match:  0.8197424892703864
Similarity Match:  0.9948051948051948
One compound cluster!
Similarity Match:  0.7298850574712644
Only one cluster:  0.24935064935064935
Similarity Match:  0.8414414414414414
Similarity Match:  0.9891891891891891
Similarity Match:  0.7099099099099099
One compound cluster!
Only one cluster:  0.38888888888888895
One compound cluster!
Similarity Match:  0.9917184265010353
Similarity Match:  0.7322314049586778
Similarity Match:  0.814592274678111

In [9]:
# Build the interactive motif network from the annotation results.
# NOTE(review): the meaning of the positional argument `2` is not visible in
# this notebook -- confirm against create_interactive_motif_network and
# consider passing it through a named constant.
create_interactive_motif_network(optimized_motif_spectra, 2, clusters_similarity, smiles_clusters)

<networkx.classes.graph.Graph at 0x308d35090>

Node motif_39 clicked!
Node motif_40 clicked!
