In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes.
%load_ext autoreload
%autoreload 2
# Presumably switches the working directory to the repository root so that the
# relative data paths used later resolve - verify against nbutils.
from nbutils import chdir_to_root
chdir_to_root()
# Display the current working directory as the cell output.
%pwd

'/home/tornikeo/Documents/work/scalexa/pangeaai/optimize-cosine'

In [2]:
from cudams.utils import argbatch, mkdir
from cudams.data import get_ref_spectra_from_df
from cudams.kernel import compile
from cudams.utils import name2idx
from cudams.cosine import similarity
import math
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from cudams.data import spectra_peaks_to_tensor
from cudams.processor import Config
from numba import cuda
from itertools import product
from time import perf_counter
from multiprocessing.pool import ThreadPool
from multiprocessing import shared_memory
import numpy as np
import json
import matplotlib.pyplot as plt
import matplotlib
from typing import Tuple
from matchms.typing import SpectrumType
from matchms.similarity.BaseSimilarity import BaseSimilarity
from matchms.similarity.spectrum_similarity_functions import (collect_peak_pairs,
                                            score_best_matches)
from matchms.similarity import CosineGreedy as OriginalCosineGreedy

# Stop right away if numba reports that no usable CUDA device is available.
assert cuda.is_available()

In [3]:
## Similarity parameters
tolerance: float = 0.1
shift: float = 0
mz_power: float = 0
int_power: float = 1

## Number of pairs per batch. Has to be a power of 2.
# The best value is hardware specific: an RTX2070 works best at around 1024 * 2,
# while a Colab T4 GPU might work best at 1024 * 4.
BATCH_SIZE = 512

# Maximum number of peaks kept per spectrum.
MAX_PEAKS = 1024

# MATCH_LIMIT is the maximum number of mz-mz pairs considered for each RQ pair
# before sorting and filtering.
# E.g. a value of 256 usually causes around ~0.003% of RQ pairs to "overflow".
# An overflown RQ score is always less than or equal to the perfectly accurate score;
# at 256 the mean absolute difference over all overflown pairs is on the order of ~1e-3.
# Small values of MATCH_LIMIT (e.g. 128, 64) speed processing up dramatically.
MATCH_LIMIT = 2 * 1024

## GPU-specific constants
THREADS_PER_BLOCK = (32, 32)
# One grid dimension per thread-block dimension, rounded up so the whole batch is covered.
BLOCKS_PER_GRID_X, BLOCKS_PER_GRID_Y = (
    math.ceil(BATCH_SIZE / threads) for threads in THREADS_PER_BLOCK
)
BLOCKS_PER_GRID = (BLOCKS_PER_GRID_X, BLOCKS_PER_GRID_Y)

# Greedy cosine is numerically unstable: approximately equal mz-mz values do not
# necessarily give approximately equal scores or numbers of matches.
# float64 keeps that deviation as small as possible, whereas float32 gives a
# significant speedup. This notebook uses float32.
dtype = 'float32'

# Data paths (the same CSV currently serves as both reference and query set)
reference_csv_file = Path("data/input/test_set_cosine.csv")
query_csv_file = Path("data/input/test_set_cosine.csv")

In [4]:
from cudams.processor import CudaCosineGreedy, CpuCosineGreedy
from collections import defaultdict
from matchms import calculate_scores
from matchms.similarity import CosineGreedy
from tqdm import tqdm
from matchms.filtering import normalize_intensities, select_by_mz, select_by_relative_intensity, reduce_to_number_of_peaks, \
    require_minimum_number_of_peaks
from cudams.utils import mute_stdout

def process_spectrum(spectrum: "SpectrumType") -> "SpectrumType":
    """Cap a spectrum at ``MAX_PEAKS`` peaks before it is scored.

    Passed as ``spectrum_processor`` to ``get_ref_spectra_from_df``.

    Parameters
    ----------
    spectrum
        Spectrum object to filter. It is forwarded unchanged to the matchms
        filter ``reduce_to_number_of_peaks``.

    Returns
    -------
    The result of ``reduce_to_number_of_peaks(spectrum, n_max=MAX_PEAKS)``.
    NOTE(review): matchms filters can return ``None`` for spectra they
    reject - confirm how ``get_ref_spectra_from_df`` treats that case.
    """
    # Optional matchms filters, currently disabled:
    # spectrum = select_by_mz(spectrum, mz_from=10.0, mz_to=1000.0)
    # spectrum = normalize_intensities(spectrum)
    # spectrum = select_by_relative_intensity(spectrum, intensity_from=0.001)
    # spectrum = reduce_to_number_of_peaks(spectrum, n_max=1000)
    # spectrum = require_minimum_number_of_peaks(spectrum, n_required=5)
    return reduce_to_number_of_peaks(spectrum, n_max=MAX_PEAKS)

# Resolve both CSV locations up front.
ref_spectra_df_path = Path(reference_csv_file)
query_spectra_df_path = Path(query_csv_file)

# Reference side: read the CSV and build the spectra, passing BATCH_SIZE * 2 as `limit`.
ref_spectra_df = pd.read_csv(ref_spectra_df_path)
references = get_ref_spectra_from_df(
    ref_spectra_df,
    spectrum_processor=process_spectrum,
    limit=BATCH_SIZE * 2,
)

# Query side: same procedure on the query CSV.
query_spectra_df = pd.read_csv(query_spectra_df_path)
queries = get_ref_spectra_from_df(
    query_spectra_df,
    spectrum_processor=process_spectrum,
    limit=BATCH_SIZE * 2,
)

100%|██████████| 1024/1024 [00:02<00:00, 442.42it/s]
100%|██████████| 1024/1024 [00:00<00:00, 4684.70it/s]


In [5]:
# Import the class explicitly. The module-style form
# `import matchms.similarity.CosineGreedy as ...` only yields the class because
# the package rebinds that attribute, which is easy to misread and fragile.
from matchms.similarity import CosineGreedy as OriginalCosineGreedy


class CosineGreedy(OriginalCosineGreedy):
    """matchms ``CosineGreedy`` with a stable ordering of candidate peak pairs.

    Identical to the parent class except that ``pair`` ranks the candidate
    peak pairs with ``kind='mergesort'`` (a stable sort), so pairs with equal
    ranking values are always processed in the same, reproducible order.
    """

    def __init__(self, tolerance: float = 0.1, mz_power: float = 0, intensity_power: float = 1):
        """Forward the configuration to the matchms implementation.

        Parameters
        ----------
        tolerance
            Passed on to the parent class; used in ``pair`` as the matching
            tolerance handed to ``collect_peak_pairs``.
        mz_power
            Passed on to the parent class and to the matchms scoring helpers.
        intensity_power
            Passed on to the parent class and to the matchms scoring helpers.
        """
        super().__init__(tolerance=tolerance,
                         mz_power=mz_power,
                         intensity_power=intensity_power)

    def pair(self, reference: SpectrumType, query: SpectrumType) -> Tuple[float, int]:
        """Calculate cosine score between two spectra.

        Parameters
        ----------
        reference
            Single reference spectrum.
        query
            Single query spectrum.

        Returns
        -------
        Score
            NumPy array of dtype ``self.score_datatype`` holding the cosine
            score and the number of matched peaks. It is ``(0.0, 0)`` when
            ``collect_peak_pairs`` finds no candidate pairs.
        """
        spec1 = reference.peaks.to_numpy
        spec2 = query.peaks.to_numpy

        # Candidate peak pairs that match within the given tolerance.
        matching_pairs = collect_peak_pairs(spec1, spec2, self.tolerance,
                                            shift=0.0, mz_power=self.mz_power,
                                            intensity_power=self.intensity_power)
        if matching_pairs is None:
            return np.asarray((float(0), 0), dtype=self.score_datatype)

        # The only deviation from the parent implementation: rank by column 2
        # with a stable sort, then reverse to get descending order.
        descending = np.argsort(matching_pairs[:, 2], kind='mergesort')[::-1]
        matching_pairs = matching_pairs[descending, :]

        score = score_best_matches(matching_pairs, spec1, spec2,
                                   self.mz_power, self.intensity_power)
        return np.asarray(score, dtype=self.score_datatype)


# Time the stable-sort CosineGreedy variant on the full reference x query grid.
similarity_measure = CosineGreedy(
    tolerance=tolerance,
    mz_power=0.0,
    intensity_power=1.0,
)

# `duration` first holds the start timestamp, then the elapsed seconds.
duration = perf_counter()
scores = calculate_scores(references, queries, similarity_measure, is_symmetric=False)
duration = perf_counter() - duration

In [None]:
# Throughput report for the run timed in the previous cell.
n_pairs = len(references) * len(queries)
# NOTE(review): reads the private `_scores` attribute of the scores object.
total_nbytes = scores._scores.data.nbytes
print(f"Num of outputs {scores}")
print(f"Pairs processed {n_pairs:.1e}")

# Extrapolate the measured rate to pairs per hour.
pair_per_hr = 3600 * (n_pairs / duration)

print(f"pairs per hr {pair_per_hr:.1e}")
print(f"Full run (100kx1.5mln) est: {100_000 * 1_500_000 / pair_per_hr:.3f}hrs")
print(f"Full run (100kx1.5mln) est GBs: {(total_nbytes/n_pairs)*(100_000*1_500_000)*1e-9:.2f}GB")
# Kept for the mergesort-vs-quicksort comparison in the last cell.
mergesort_pairs_perh = pair_per_hr

Num of outputs StackedSparseArray array of shape (1024, 1024, 2) containing scores for ('CosineGreedy_score', 'CosineGreedy_matches').
Pairs processed 1.0e+06
pairs per hr 5.7e+07
Full run (100kx1.5mln) est: 2625.623hrs
Full run (100kx1.5mln) est GBs: 1591.96GB


In [None]:
# Rebind `CosineGreedy` to the stock matchms class; this shadows the
# stable-sort subclass defined earlier in the notebook.
from matchms.similarity import CosineGreedy

similarity_measure = CosineGreedy(
    tolerance=tolerance,
    mz_power=0.0,
    intensity_power=1.0,
)

# `duration` first holds the start timestamp, then the elapsed seconds.
duration = perf_counter()
scores = calculate_scores(references, queries, similarity_measure, is_symmetric=False)
duration = perf_counter() - duration

In [None]:
# Throughput report for the run timed in the previous cell.
n_pairs = len(references) * len(queries)
# NOTE(review): reads the private `_scores` attribute of the scores object.
total_nbytes = scores._scores.data.nbytes
print(f"Num of outputs {scores}")
print(f"Pairs processed {n_pairs:.1e}")

# Extrapolate the measured rate to pairs per hour.
pair_per_hr = 3600 * (n_pairs / duration)

print(f"pairs per hr {pair_per_hr:.1e}")
print(f"Full run (100kx1.5mln) est: {100_000 * 1_500_000 / pair_per_hr:.3f}hrs")
print(f"Full run (100kx1.5mln) est GBs: {(total_nbytes/n_pairs)*(100_000*1_500_000)*1e-9:.2f}GB")
# Kept for the mergesort-vs-quicksort comparison in the last cell.
quicksort_pairs_per_hr = pair_per_hr

Num of outputs StackedSparseArray array of shape (1024, 1024, 2) containing scores for ('CosineGreedy_score', 'CosineGreedy_matches').
Pairs processed 1.0e+06
pairs per hr 5.8e+07
Full run (100kx1.5mln) est: 2586.891hrs
Full run (100kx1.5mln) est GBs: 1591.96GB


In [None]:
# Throughput of the stable-sort (mergesort) run as a percentage of the stock
# (quicksort) run; 100% means no slowdown. The previous formula,
# 100 * (1 - ratio), produced the slowdown instead, which contradicted the
# printed message.
relative_speed = 100 * (mergesort_pairs_perh / quicksort_pairs_per_hr)
print(f"Mergesort has {relative_speed:.2f}% the processing speed of quicksort")