## Running MS2LDA

In [1]:
%matplotlib agg
import sys, os
sys.path.append('..') 
import tomotopy as tp
from matchms.importing import load_from_mgf
from MS2LDA.Preprocessing.load_and_clean import clean_spectra
import MS2LDA

In [2]:
# Spectrum-cleaning settings, handed to MS2LDA.run() as `preprocessing_parameters`.
# NOTE(review): presumably m/z bounds, fragment-count bounds and relative
# intensity bounds applied per spectrum — confirm against MS2LDA's cleaning code.
preprocessing_parameters = dict(
    min_mz=0,
    max_mz=2000,
    max_frags=1000,
    min_frags=5,
    min_intensity=0.03,
    max_intensity=1,
)

In [3]:
# Convergence settings, handed to MS2LDA.run() as `convergence_parameters`.
# The training log shows "step_size" being passed as the first argument of
# model.train(); the remaining keys are consumed inside MS2LDA.
# NOTE(review): presumably training stops once the tracked "type" metric changes
# by less than "threshold" over "window_size" checks — confirm in MS2LDA.
convergence_parameters = dict(
    step_size=50,
    window_size=10,
    threshold=0.005,
    type="perplexity_history",
)

In [4]:
# Folder that holds the Spec2Vec model, the library embeddings and the library
# spectra database. Set the MS2LDA_S2V_DIR environment variable to point at your
# own copy instead of editing the notebook; the fallback is the location used
# for the recorded run.
S2V_DIR = os.environ.get(
    "MS2LDA_S2V_DIR",
    "/Users/rtlortega/Documents/PhD/Spec2Vec/model_positive_mode/positive_train_data",
)

# Motif-annotation settings, handed to MS2LDA.run() as `annotation_parameters`.
annotation_parameters = {
    "criterium": "biggest",
    "cosine_similarity": 0.90,
    "n_mols_retrieved": 5,
    "s2v_model_path": os.path.join(S2V_DIR, "150225_Spec2Vec_pos_CleanedLibraries.model"),
    "s2v_library_embeddings": os.path.join(S2V_DIR, "150225_CleanedLibraries_Spec2Vec_pos_embeddings.npy"),
    "s2v_library_db": os.path.join(S2V_DIR, "150225_CombLibraries_spectra.db"),
}

In [5]:
# Topic-model hyperparameters, handed to MS2LDA.run() as `model_parameters`.
# NOTE(review): the key names match tomotopy's LDAModel constructor arguments;
# presumably they are forwarded to it unchanged — confirm in MS2LDA.
model_parameters = dict(
    rm_top=0,
    min_cf=0,
    min_df=3,
    # Larger alpha -> each document's preference is spread more smoothly over topics.
    alpha=0.6,
    # Larger eta -> each topic's preference is spread more smoothly over words.
    eta=0.01,
    seed=42,
)

In [6]:
# Keyword arguments unpacked into model.train() (see the training log of the run cell).
# NOTE(review): in tomotopy, workers=0 should mean "use all available cores" and
# parallel=3 should select ParallelScheme.PARTITION — confirm against the
# installed tomotopy version.
train_parameters = dict(parallel=3, workers=0)

In [7]:
# Fingerprint settings, handed to MS2LDA.run() as `fingerprint_parameters`.
# NOTE(review): presumably "maccs" selects MACCS-keys fingerprints and
# "threshold" is a cut-off applied when aggregating them — confirm in MS2LDA.
fingerprint_parameters = dict(fp_type="maccs", threshold=0.8)

In [8]:
# Run-level knobs passed to MS2LDA.run().
n_motifs = 100          # number of motifs (topics) to learn
n_iterations = 10_000   # upper bound on training iterations; the recorded run converged earlier
motif_parameter = 50    # NOTE(review): meaning not visible in this notebook — see MS2LDA.run

In [9]:
# Dataset description, handed to MS2LDA.run() as `dataset_parameters`.
# The output folder name embeds the motif count, so runs with a different
# `n_motifs` are written to different folders.
dataset_parameters = dict(
    acquisition_type="DDA",
    significant_digits=3,
    charge=1,
    name="test",
    output_folder=f"CaseStudy_cannabis_{n_motifs}_081025",
)

In [10]:
from matchms.importing import load_from_mgf

# Input spectra (MGF export). Set the MS2LDA_MGF_PATH environment variable to
# run the notebook on another machine or file; the fallback is the file used
# for the recorded run.
MGF_PATH = os.environ.get(
    "MS2LDA_MGF_PATH",
    '/Users/rtlortega/Documents/PhD/Willy_Project/Results_Oct/Positive_files/Preprocessed_Files_071025/Pos_noqc_071025_iimn_gnps.mgf',
)

# load_from_mgf() is consumed once into a list so the spectra can be indexed
# and counted.
dataset = list(load_from_mgf(MGF_PATH))
len(dataset)

5680

In [11]:
# Sanity check: show the metadata dictionary of the second loaded spectrum.
second_spectrum = dataset[1]
second_spectrum.metadata

{'feature_id': '6',
 'feature_full_id': 'row6_mz114.987575_rt1.5556_id',
 'featurelist_feature_id': 'Aligned feature list subtracted peak gaps dup corr PEARSON r greq 0.85 dp greq 5:6',
 'charge': 1,
 'feature_ms1_height': '2.338E6',
 'spectype': 'SAME_ENERGY',
 'merged_across_n_samples': '24',
 'collision_energy': '[53.333332]',
 'fragmentation_method': 'HCD',
 'isolation_window': '1.5',
 'ionmode': 'positive',
 'source_scan_usi': '["mzspec:DATASET_ID_PLACEHOLDER:pos_UVBR1-6:678,753,832","mzspec:DATASET_ID_PLACEHOLDER:pos_NOUVR2-24:686,768","mzspec:DATASET_ID_PLACEHOLDER:pos_BLUER2-21:680,760","mzspec:DATASET_ID_PLACEHOLDER:pos_UVBR1-4:681,757,836","mzspec:DATASET_ID_PLACEHOLDER:pos_UVBR1-5:681,754","mzspec:DATASET_ID_PLACEHOLDER:pos_NOUVR1-12:689,769","mzspec:DATASET_ID_PLACEHOLDER:pos_UVAR1-1:684,759,860","mzspec:DATASET_ID_PLACEHOLDER:pos_BLUER2-20:687,766","mzspec:DATASET_ID_PLACEHOLDER:pos_BLUER1-9:683,765","mzspec:DATASET_ID_PLACEHOLDER:pos_NOUVR2-22:684,760","mzspec:DATASET_ID_

In [12]:
# Run the MS2LDA pipeline on the loaded spectra. According to the recorded log
# this cleans the spectra, trains the model until convergence and stores the
# results (m2m folder, convergence curve, network, visualization data).
run_settings = dict(
    n_motifs=n_motifs,
    n_iterations=n_iterations,
    dataset_parameters=dataset_parameters,
    train_parameters=train_parameters,
    model_parameters=model_parameters,
    convergence_parameters=convergence_parameters,
    annotation_parameters=annotation_parameters,
    motif_parameter=motif_parameter,
    preprocessing_parameters=preprocessing_parameters,
    fingerprint_parameters=fingerprint_parameters,
)
motif_spectra, optimized_motifs, motif_fps = MS2LDA.run(dataset, **run_settings)

Cleaning spectra ... 5373 spectra left


  model.train(convergence_parameters["step_size"], **train_parameters)
  8%|▊         | 17/200 [00:35<06:24,  2.10s/it]

Model has converged



100%|██████████| 100/100 [00:00<00:00, 130.70it/s]


m2m folder stored
convergence curve stored
network stored
Visualization data saved (gzipped) to: CaseStudy_cannabis_100_081025/ms2lda_viz.json.gz


In [13]:
%matplotlib inline

In [20]:
import tomotopy as tp

# Reload the trained model from the output folder of the run above. The folder
# is taken from `dataset_parameters` so this cell follows any change to
# `n_motifs` or to the folder name instead of pointing at a stale literal.
ms2lda_model_path = os.path.join(dataset_parameters["output_folder"], "ms2lda.bin")
ms2lda = tp.LDAModel.load(ms2lda_model_path)

# Number of documents in the model, i.e. the spectra left after cleaning
# (5373 in the recorded run, matching the "Cleaning spectra" log line).
len(ms2lda.docs)

5373

In [21]:
# For every document keep only the (motif_id, probability) pairs returned by
# get_topics() whose probability exceeds 0.02. The outer list is aligned with
# ms2lda.docs, so top_motifs[k] belongs to document k.
top_motifs = [
    [
        topic_entry
        for topic_entry in ms2lda.docs[doc_idx].get_topics()
        if topic_entry[1] > 0.02
    ]
    for doc_idx in range(len(ms2lda.docs))
]


In [22]:
# Per-document columns of the export table; filled in the next cell.
scans_list, retention_list, motifs_list = [], [], []

In [23]:
# Start from empty lists every time this cell runs. Without this, executing the
# cell a second time appends a duplicate copy of every row to the lists.
scans_list, retention_list, motifs_list = [], [], []

# Collect, per document, the scan id and retention time of the spectrum it was
# built from, together with the motifs selected for it in `top_motifs`.
# NOTE(review): assumes MS2LDA.retrieve_spec4doc accepts a bare document index
# and returns the matching spectrum — confirm against the installed MS2LDA.
for i in range(len(ms2lda.docs)):
    metadata = MS2LDA.retrieve_spec4doc(i).metadata
    scans_list.append(metadata['scans'])
    retention_list.append(metadata['retention_time'])
    motifs_list.append(top_motifs[i])

In [24]:
import pandas as pd

# One row per document: scan id, retention time, and the list of
# (motif_id, probability) pairs kept for it.
table_columns = dict(
    scans=scans_list,
    retention_time=retention_list,
    motifs=motifs_list,
)
df = pd.DataFrame(table_columns)

# Preview the first rows.
preview = df.head()
print(preview)

  scans  retention_time                                             motifs
0     1           86.21  [(73, 0.9637612700462341), (93, 0.032619006931...
1     6           93.34  [(17, 0.8685916662216187), (93, 0.097018413245...
2     7           93.60                         [(73, 0.9986182451248169)]
3     9           93.69  [(93, 0.8709614276885986), (68, 0.127664536237...
4    10           93.70  [(73, 0.9678475260734558), (85, 0.031130068004...


In [25]:
# Export the per-spectrum motif table as a tab-separated file. The motif count
# in the file name is taken from `n_motifs` so the name cannot disagree with
# the model that produced the table (identical to the old name for n_motifs=100).
df.to_csv(f'Willy_220825_spec_{n_motifs}_motifs.tsv', index=False, sep='\t')


---