In [1]:
from rdkit.Chem import Draw
from rdkit import Chem

In [2]:
from matchms import Spectrum, Fragments
import numpy as np

In [3]:
import os
# Switch the working directory to the sibling "programming_scripts" folder and show
# where we ended up. Presumably needed so that the local Spec2Vec_annotation module
# and the relative "MotifDB" paths used further down resolve -- TODO confirm.
os.chdir('../programming_scripts/.')
print(os.getcwd())

C:\Users\dietr004\Documents\PhD\computational mass spectrometry\Spec2Struc\Project_SubstructureIdentification\scripts\programming_scripts


In [4]:
from Spec2Vec_annotation import load_model_and_data
from Spec2Vec_annotation import calc_similarity
from Spec2Vec_annotation import retrieve_top_hits
# Load the Spec2Vec model and the reference table once. The names defined here are
# module-level state that spec2vec_predictions() reads directly.
s2v_model, embeddings_smiles_DB = load_model_and_data()
# Pull the columns out as separate sequences (presumably a pandas DataFrame with
# "smiles", "embeddings" and "spectra" columns -- TODO confirm against load_model_and_data).
smiles = embeddings_smiles_DB.smiles.to_numpy()
embeddings_DB = embeddings_smiles_DB.embeddings.to_list()
spectra_DB = embeddings_smiles_DB.spectra.to_list()

In [5]:
# Example motif files (Windows-style relative paths). The second assignment
# overrides the first, so only the Rhamnaceae motif path remains in `file`.
file = r"MotifDB\Urine derived Mass2Motifs\urine_mass2motif_133.m2m"
file = r"MotifDB\Rhamnaceae Plant Mass2Motifs\rhamn_motif_3.m2m"

### Parse Mass2Motif (m2m) files

In [6]:
def parse_m2m(file):
    """Parse a Mass2Motif (.m2m) file into fragments, losses, name and short annotation.

    Lines are recognised by their prefix; every other line is ignored:
        fragment_<mz>,<importance>
        loss_<mz>,<importance>
        #NAME <identifier>
        #SHORT_ANNOTATION <free text>

    ARGS:
        file (str): path to m2m file

    RETURNS:
        fragments_mz (list): mz values of fragments (in file order)
        fragments_importance (list): LDA importance of fragments
        losses_mz (list): mz values of losses (in file order)
        losses_importance (list): LDA importance of losses
        name (str or None): unique identifier of m2m (second whitespace-separated token
            of the #NAME line); None if the line is missing or carries no value
        annotation (str or None): short manual annotation (everything after the first
            space of the #SHORT_ANNOTATION line); None if the line is missing or has no value

    RAISES:
        ValueError: if a fragment/loss line does not hold exactly "<mz>,<importance>"
            with numeric values
    """

    fragments_mz = []
    fragments_importance = []

    losses_mz = []
    losses_importance = []

    name = None
    annotation = None

    with open(file, "r") as motif_file:
        # Iterate the handle directly; no need to hold all lines in memory.
        for raw_line in motif_file:
            line = raw_line.rstrip("\r\n")
            if line.startswith("fragment_"):
                frag_mz, frag_importance = line.split("_")[1].split(",")
                fragments_mz.append(float(frag_mz))
                fragments_importance.append(float(frag_importance))
            elif line.startswith("loss_"):
                loss_mz, loss_importance = line.split("_")[1].split(",")
                losses_mz.append(float(loss_mz))
                losses_importance.append(float(loss_importance))
            elif line.startswith("#NAME"):
                # A bare "#NAME" header must not abort parsing of the whole motif.
                tokens = line.split()
                if len(tokens) > 1:
                    name = tokens[1]
            elif line.startswith("#SHORT_ANNOTATION"):
                # partition() never raises; a header without a value keeps annotation as None.
                _, separator, value = line.partition(" ")
                if separator:
                    annotation = value

    return fragments_mz, fragments_importance, losses_mz, losses_importance, name, annotation

### Sort fragments (or losses) by ascending m/z and keep their importance/intensity values aligned

In [7]:
def sort_dependent_lists(loss_or_frag, intensities):
    """Sort mz values in ascending order and reorder the intensities accordingly.

    The two inputs are paired element-wise (extra elements of the longer input are
    dropped by zip), sorted by the mz value only, and split again. The sort is stable,
    so entries with equal mz keep their original relative order.

    ARGS:
        loss_or_frag (list): list of mz values of either fragments or losses
        intensities (list): list of LDA importance values for either fragments or losses

    RETURNS:
        sorted_loss_or_frag (as np.array): sorted mz values for fragments or losses
        sorted_intensities (as np.array): LDA importance values in the order of the sorted mz values
        Both arrays are empty float arrays when there is nothing to sort.
    """

    paired = sorted(zip(loss_or_frag, intensities), key=lambda pair: pair[0])

    # zip(*[]) yields nothing to unpack, so the empty case (e.g. a motif without
    # losses) has to be answered explicitly.
    if not paired:
        return np.array([], dtype=float), np.array([], dtype=float)

    sorted_loss_or_frag, sorted_intensities = zip(*paired)

    return np.array(sorted_loss_or_frag), np.array(sorted_intensities)

### Build a Motif spectrum

In [8]:
def build_spectrum(fragments_mz, fragments_importance, losses_mz, losses_importance, name, annotation):
    """builds a matchms spectrum object for the m2m

    Fragments become the peaks of the spectrum and losses are attached as a
    separate Fragments object; both are sorted by ascending mz first. The largest
    fragment mz is stored as "precursor_mz" in the metadata.

    ARGS:
        fragments_mz (list): mz values of fragments (must not be empty)
        fragments_importance (list): LDA importance of fragments
        losses_mz (list): mz values of losses (may be empty)
        losses_importance (list): LDA importance of losses
        name (str): unique identifier of m2m
        annotation (str): short manual annotation

    RETURNS:
        motif_spectrum (matchms.spectrum.object): built spectrum based on given m2m frag, loss, name (as metadata) and annotation (as metadata)

    RAISES:
        ValueError: if fragments_mz is empty
    """

    sorted_fragments_mz, sorted_fragments_importance = sort_dependent_lists(fragments_mz, fragments_importance)

    # The losses must be sorted from their own lists; feeding the fragment lists here
    # would make the spectrum's losses a plain copy of its peaks.
    if len(losses_mz) > 0:
        sorted_losses_mz, sorted_losses_importance = sort_dependent_lists(losses_mz, losses_importance)
    else:
        # Motif without losses: attach an empty (float) loss list instead of failing.
        sorted_losses_mz = np.array([], dtype=float)
        sorted_losses_importance = np.array([], dtype=float)

    motif_spectrum = Spectrum(
        mz=sorted_fragments_mz,
        intensities=sorted_fragments_importance,
        metadata={
            "id": name,
            "precursor_mz": max(fragments_mz),
            "annotation": annotation
        }
    )
    # NOTE(review): assigning .losses relies on the installed matchms version
    # exposing a setter for this attribute -- confirm when upgrading matchms.
    motif_spectrum.losses = Fragments(
        mz=sorted_losses_mz,
        intensities=sorted_losses_importance
    )

    return motif_spectrum

### Spec2Vec calculation

In [84]:
def spec2vec_predictions(motif_spectrum):
    """Query the reference library for the molecules most similar to a motif spectrum.

    Relies on the module-level names s2v_model, embeddings_DB, smiles and spectra_DB.

    ARGS:
        motif_spectrum (matchms.spectrum.object): spectrum based on given m2m frag, loss, name (as metadata) and annotation (as metadata)

    RETURNS:
        top_smiles (list): list of SMILES (strings) as returned by retrieve_top_hits
        top_scores (list): list of S2V scores for each SMILES
        annotation (str): short description of manual annotation, read from the spectrum metadata
    """

    # Similarity of the single query spectrum against all library embeddings.
    similarity_matrix = calc_similarity(s2v_model, [motif_spectrum], embeddings_DB)

    # Index 0 selects the only query; the matching library spectra are not needed here.
    hit_scores, hit_smiles, _hit_spectra = retrieve_top_hits(similarity_matrix, 0, smiles, spectra_DB)

    return hit_smiles, hit_scores, motif_spectrum.get("annotation")

### Generate Spec2Vec Hit images and store them

In [96]:
def generate_png(top_smiles, top_scores, annotation, m2m_path):
    """Draws the Spec2Vec hits as a molecule grid, writes the manual annotation on top and stores the image.

    ARGS:
        top_smiles (list): list of SMILES (strings), one grid cell each (4 per row, 300x300 px)
        top_scores (list): list of S2V scores for each SMILES; shown rounded to 3 decimals as legends
        annotation (str or None): short description of manual annotation; no text is drawn when it is None or empty
        m2m_path (str): path of the image file to write; missing parent folders are created

    RETURNS:
        None
    """
    # Imported locally so the function does not depend on these names having leaked
    # into the kernel from some other cell or star-import.
    import os
    from PIL import ImageDraw, ImageFont

    img = Draw.MolsToGridImage(
        [Chem.MolFromSmiles(smi) for smi in top_smiles],
        molsPerRow=4,
        subImgSize=(300, 300),
        legends=[str(round(score, 3)) for score in top_scores],
        returnPNG=False,
    )

    if annotation:
        try:
            font = ImageFont.truetype(r"C:\WINDOWS\FONTS\ARIAL.TTF", 24)
        except OSError:
            # Arial is only available at this location on Windows; fall back to
            # Pillow's built-in font elsewhere instead of aborting the whole run.
            font = ImageFont.load_default()
        font_color = (0, 0, 0)

        draw = ImageDraw.Draw(img)

        # The text starts at the horizontal middle of the image, 10 px below the top edge.
        text_x = img.width // 2
        text_y = 10

        draw.text((text_x, text_y), str(annotation), fill=font_color, font=font)

    # img.save() does not create folders, so make sure the target folder exists.
    out_dir = os.path.dirname(m2m_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    img.save(m2m_path)

---

### Run against the entire Motif_DB

In [94]:
# Collect the path of every motif file below ./MotifDB/<library>/.
# Paths are built with os.path.join so the cell also works outside Windows; on
# Windows the resulting strings are the same ".\MotifDB\<library>\<file>" as before.
m2m_paths = []

MotifDB_folder = sorted(os.listdir('MotifDB'))  # sorted for a reproducible order
for library_name in MotifDB_folder:
    library_dir = os.path.join(".", "MotifDB", library_name)
    if not os.path.isdir(library_dir):
        # Stray files directly inside MotifDB are not motif libraries.
        continue
    mass2motif_folder = sorted(os.listdir(library_dir))
    for mass2motif_name in mass2motif_folder:
        m2m_path = os.path.join(library_dir, mass2motif_name)
        m2m_paths.append(m2m_path)

        

In [98]:
# Sanity check: number of motif files collected in the cell above.
len(m2m_paths)

996

In [100]:
# Annotate every motif: parse it, build its spectrum, query Spec2Vec and store the
# hit image under the mirrored "MotifDB_annotations_s2v" folder tree.
for m2m_path in m2m_paths:
    print(m2m_path)
    fragments_mz, fragments_importance, losses_mz, losses_importance, name, annotation = parse_m2m(m2m_path)
    # Motifs without fragments cannot be turned into a spectrum and are skipped.
    if fragments_mz:
        motif_spectrum = build_spectrum(fragments_mz, fragments_importance, losses_mz, losses_importance, name, annotation)
        top_smiles, top_scores, annotation = spec2vec_predictions(motif_spectrum)

        # Only the first "MotifDB" (the root folder) is replaced; splitext drops the
        # extension whatever its length instead of blindly cutting 4 characters.
        annotated_path = m2m_path.replace("MotifDB", "MotifDB_annotations_s2v", 1)
        new_m2m_path = os.path.splitext(annotated_path)[0] + "_SimMols.png"

        # Saving fails if the mirrored library folder does not exist yet.
        os.makedirs(os.path.dirname(new_m2m_path), exist_ok=True)

        generate_png(top_smiles, top_scores, annotation, new_m2m_path)

.\MotifDB\Euphorbia Plant Mass2Motifs\euphorbia_motif_0.m2m
.\MotifDB\Euphorbia Plant Mass2Motifs\euphorbia_motif_119.m2m
.\MotifDB\Euphorbia Plant Mass2Motifs\euphorbia_motif_126.m2m
.\MotifDB\Euphorbia Plant Mass2Motifs\euphorbia_motif_132.m2m
.\MotifDB\Euphorbia Plant Mass2Motifs\euphorbia_motif_135.m2m
.\MotifDB\Euphorbia Plant Mass2Motifs\euphorbia_motif_14.m2m
.\MotifDB\Euphorbia Plant Mass2Motifs\euphorbia_motif_149.m2m
.\MotifDB\Euphorbia Plant Mass2Motifs\euphorbia_motif_158.m2m
.\MotifDB\Euphorbia Plant Mass2Motifs\euphorbia_motif_162.m2m
.\MotifDB\Euphorbia Plant Mass2Motifs\euphorbia_motif_173.m2m
.\MotifDB\Euphorbia Plant Mass2Motifs\euphorbia_motif_179.m2m
.\MotifDB\Euphorbia Plant Mass2Motifs\euphorbia_motif_18.m2m
.\MotifDB\Euphorbia Plant Mass2Motifs\euphorbia_motif_180.m2m
.\MotifDB\Euphorbia Plant Mass2Motifs\euphorbia_motif_182.m2m
.\MotifDB\Euphorbia Plant Mass2Motifs\euphorbia_motif_192.m2m
.\MotifDB\Euphorbia Plant Mass2Motifs\euphorbia_motif_199.m2m
.\MotifDB\Eu

---

### Fun to play around with or for presentations

In [17]:
from molecule_slide_generator import * 
from rdkit.Chem import AllChem

Failed to find the pandas get_adjustment() function to patch
Failed to patch pandas - PandasTools will have limited functionality


In [35]:
# Build one molecule plus one coloured score label per hit.
# top_smiles / top_scores are the values left by the MotifDB loop cell above when
# the notebook is run top to bottom (i.e. the hits of the last annotated motif).
all_props = []
mols = []
for smi, score in zip(top_smiles, top_scores):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # RDKit could not parse this SMILES; there is nothing to draw for it.
        continue
    AllChem.Compute2DCoords(mol)
    # Colour code: green above 0.80, orange above 0.60 (0.80 included), red otherwise.
    if score > 0.80:
        color = '#3A662F'
    elif score > 0.60:
        color = '#e8860e'
    else:
        color = '#b52009'

    props = [TextProperty("S2V score", "S2V score: "+str(round(score,3)), color=color)]
    mols.append(mol)
    all_props.append(props)

In [38]:
# Slide layout: 2 molecules per row and 5 rows on an 800x600 slide.
# NOTE(review): number_of_properties=2, but the cell above builds only one
# TextProperty per molecule -- confirm this is intended.
sg = SlideGenerator(mols_per_row=2, rows=5, font_size=18, font='comicbd', number_of_properties=2, slide_width=800, slide_height=600)

In [39]:
# Render the molecules with their score labels; presumably the slide is written to
# 'example_slide.png' in the current working directory -- TODO confirm.
png = sg.generate_slide(mols, all_props, 'example_slide.png')