This example notebook shows how to use the cosine similarity kernel for some common tasks. In this case we consider [current home-page example](https://matchms.readthedocs.io/en/latest/#example) of pesticide similarity evaluation.

In [20]:
%load_ext autoreload
%autoreload 2
from nbutils import chdir_to_root
chdir_to_root()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
from matchms.importing import load_from_mgf
from matchms.filtering import default_filters
from matchms.filtering import normalize_intensities
from matchms import calculate_scores
from matchms.similarity import CosineGreedy
from cudams.similarity import CudaCosineGreedy
from numba import cuda
assert cuda.is_available()

In [22]:
file = list(load_from_mgf('data/pesticides.mgf'))
# Apply filters to clean and enhance each spectrum
spectrums = []

for spectrum in file:
    # Apply default filter to standardize ion mode, correct charge and more.
    # Default filter is fully explained at https://matchms.readthedocs.io/en/latest/api/matchms.filtering.html .
    spectrum = default_filters(spectrum)
    # Scale peak intensities to maximum of 1
    spectrum = normalize_intensities(spectrum)
    spectrums.append(spectrum)

In [42]:
scores = calculate_scores(references=spectrums,
                          queries=spectrums,
                          similarity_function=CosineGreedy(),
                          is_symmetric=True)

In [43]:
print(f"Size of matrix of computed similarities: {scores.scores.shape}")

Size of matrix of computed similarities: (76, 76, 2)


In [44]:
scores.scores['CosineGreedy_matches']

(array([ 0,  0,  0, ..., 75, 75, 75]),
 array([ 0,  1,  2, ..., 73, 74, 75]),
 array([53,  2,  2, ...,  3,  2, 70]))

In [45]:
scores

<76x76x2 stacked sparse array containing scores for ('CosineGreedy_score', 'CosineGreedy_matches') with 5050 stored elements in COOrdinate format>

In [46]:
from matchms.filtering import reduce_to_number_of_peaks
import numpy as np

MAX_PEAKS = 1024

def process_spectrum(spectrum: np.ndarray) -> np.ndarray:
    # spectrum = select_by_mz(spectrum, mz_from=10.0, mz_to=1000.0)
    # spectrum = normalize_intensities(spectrum)
    # spectrum = select_by_relative_intensity(spectrum, intensity_from=0.001)
    # spectrum = reduce_to_number_of_peaks(spectrum, n_max=1000)
    spectrum = reduce_to_number_of_peaks(spectrum, n_max=MAX_PEAKS)
    # spectrum = require_minimum_number_of_peaks(spectrum, n_required=5)
    return spectrum

f_specs = [process_spectrum(s) for s in spectrums]

In [54]:
from cudams.similarity import CudaCosineGreedy
# Calculate Cosine similarity scores between all spectrums
# For other similarity score methods see https://matchms.readthedocs.io/en/latest/api/matchms.similarity.html .
# Because references and queries are here the same spectra, we can set is_symmetric=True
scores_cu = calculate_scores(references=f_specs,
                          queries=f_specs,
                          similarity_function=CudaCosineGreedy(batch_size=256))

256 76 76


Batch all references: 1it [00:00, 342.25it/s]
Batch all queries: 1it [00:00, 547.85it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

(2, 76, 247) (2, 76, 247) [[ 53  56  30  66  37  38  43  77  53  34  30  47  71  86  82  81  55  82
   37  59  34  26  59  41  51  30  46  42  21  21  66  41  45  19  84  36
   65  58  42  55  17  47  38  16  37 100  51  70  67  68  55 123  56 159
   34  61  69  90  94  71 103  92  66  43  50  86 247  65  59 103  93 126
   85  57  54  70   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   

100%|██████████| 1/1 [00:01<00:00,  1.06s/it]


In [55]:
scores_cu

<76x76x3 stacked sparse array containing scores for ('CudaCosineGreedy_score', 'CudaCosineGreedy_matches', 'CudaCosineGreedy_overflow') with 5050 stored elements in COOrdinate format>

In [56]:
scores

<76x76x2 stacked sparse array containing scores for ('CosineGreedy_score', 'CosineGreedy_matches') with 5050 stored elements in COOrdinate format>

In [63]:

# This computed all-vs-all similarity scores, the array of which can be accessed as scores.scores
print(f"Size of matrix of computed similarities: {scores.scores.shape}")

# Matchms allows to get the best matches for any query using scores_by_query
query = spectrums[15]  # just an example
best_matches = scores.scores_by_query(query, 'CosineGreedy_score', sort=True)

# Print the calculated scores for each spectrum pair
for (reference, (score, matches)) in best_matches[:10]:
    # Ignore scores between same spectrum
    if reference is not query:
        print(f"Reference scan id: {reference.metadata['scans']}")
        print(f"Query scan id: {query.metadata['scans']}")
        print(f"Score: {score:.4f}")
        print(f"Number of matching peaks: {matches}")
        print("----------------------------")


Size of matrix of computed similarities: (76, 76, 2)
Reference scan id: 613
Query scan id: 2161
Score: 0.8646
Number of matching peaks: 14
----------------------------
Reference scan id: 603
Query scan id: 2161
Score: 0.8237
Number of matching peaks: 14
----------------------------
Reference scan id: 2160
Query scan id: 2161
Score: 0.8015
Number of matching peaks: 25
----------------------------
Reference scan id: 2362
Query scan id: 2161
Score: 0.2923
Number of matching peaks: 7
----------------------------
Reference scan id: 2598
Query scan id: 2161
Score: 0.2231
Number of matching peaks: 5
----------------------------
Reference scan id: 2594
Query scan id: 2161
Score: 0.1761
Number of matching peaks: 3
----------------------------
Reference scan id: 1944
Query scan id: 2161
Score: 0.1396
Number of matching peaks: 5
----------------------------
Reference scan id: 1772
Query scan id: 2161
Score: 0.1037
Number of matching peaks: 4
----------------------------
Reference scan id: 2284
Qu

In [73]:

# This computed all-vs-all similarity scores, the array of which can be accessed as scores.scores
print(f"Size of matrix of computed similarities: {scores_cu.scores.shape}")

# Matchms allows to get the best matches for any query using scores_by_query
query = spectrums[15]  # just an example
best_matches_cu = scores_cu.scores_by_query(query, 'CudaCosineGreedy_score', sort=True)

Size of matrix of computed similarities: (76, 76, 3)


In [74]:

# Print the calculated scores_cu for each spectrum pair
for (reference, (score, matches, overflow)) in best_matches_cu[:10]:
    # Ignore scores_cu between same spectrum
    if reference is not query:
        print(f"Reference scan id: {reference.metadata['scans']}")
        print(f"Query scan id: {query.metadata['scans']}")
        print(f"Score: {score:.4f}")
        print(f"Number of matching peaks: {matches}")
        print(f"Did GPU overflow at this pair: {overflow}")
        print("----------------------------")


Reference scan id: 2161
Query scan id: 2161
Score: 1.0000
Number of matching peaks: 0
Did GPU overflow at this pair: 0
----------------------------
Reference scan id: 613
Query scan id: 2161
Score: 0.8646
Number of matching peaks: 0
Did GPU overflow at this pair: 0
----------------------------
Reference scan id: 603
Query scan id: 2161
Score: 0.8237
Number of matching peaks: 0
Did GPU overflow at this pair: 0
----------------------------
Reference scan id: 2160
Query scan id: 2161
Score: 0.8015
Number of matching peaks: 0
Did GPU overflow at this pair: 0
----------------------------
Reference scan id: 2362
Query scan id: 2161
Score: 0.2923
Number of matching peaks: 0
Did GPU overflow at this pair: 0
----------------------------
Reference scan id: 2598
Query scan id: 2161
Score: 0.2231
Number of matching peaks: 0
Did GPU overflow at this pair: 0
----------------------------
Reference scan id: 2594
Query scan id: 2161
Score: 0.1761
Number of matching peaks: 0
Did GPU overflow at this pai

In [68]:
best_matches[0]

(Spectrum(precursor m/z=526.98, 81 fragments between 70.1 and 554.9), (1., 81))