In [1]:
# All required imports
# import tensorflow as tf
# print("GPU available", tf.test.is_gpu_available())
from joblib import Parallel, delayed
from tqdm import tqdm
import numba
from typing import Tuple, List
from matchms import Spectrum
from matchms.typing import SpectrumType
import numpy as np
import pandas as pd
from pathlib import Path
import json

from matchms import Spectrum

from matchms.filtering import normalize_intensities
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_mz
from matchms.filtering import select_by_relative_intensity
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import add_losses

def process_spectrum(spectrum):
    spectrum = select_by_mz(spectrum, mz_from=10.0, mz_to=1000.0)
    spectrum = normalize_intensities(spectrum)
    spectrum = select_by_relative_intensity(spectrum, intensity_from=0.001)
    spectrum = reduce_to_number_of_peaks(spectrum, n_max=1000)
    spectrum = require_minimum_number_of_peaks(spectrum, n_required=5)
    return spectrum


def get_ref_spectra_from_df(spectra_df, limit=None):
    # This function will take a dataframe with spectra and return a list of matchms spectra
    # Argh, This function is annoyingly slow. Added simple parallelization.
    
    # for index, row in spectra_df.iterrows():
    def fn(index, row):
        pbid = row["pbid"]
        precursor_mz = row["precursor_mz"]
        smiles = row["pb_smiles"]
        inchikey = row["pb_inchikey"]
        mz_array = np.array(json.loads(row["peaks_mz"]))
        intensity_array = np.array(json.loads(row["peaks_intensities"]))
        sp = Spectrum(mz=mz_array, intensities=intensity_array,
                        metadata={'id': pbid, 
                                'precursor_mz': precursor_mz, 
                                'smiles': smiles, 
                                'inchikey': inchikey}) 
        sp = process_spectrum(sp)
        return sp
    if limit is not None:
        spectra_df = spectra_df.head(limit)
    spectra = Parallel(-2)(delayed(fn)(index, row) for index, row in tqdm(spectra_df.iterrows(), total=len(spectra_df)) )
    spectra = [s for s in spectra if s is not None]
    return spectra

In [2]:
ref_spectra_df_path = Path("data/input/example_dataset_tornike.csv")
ref_spectra_df = pd.read_csv(ref_spectra_df_path)
large_references = get_ref_spectra_from_df(ref_spectra_df, limit=2000)

100%|██████████| 2000/2000 [00:02<00:00, 876.74it/s] 


In [32]:
queries = large_references[:10]
references = large_references[10:30]

print(f"Total iterations: {len(queries) * len(references)}")

Total iterations: 200


In [33]:
from numba import cuda

In [34]:
def spectra_peaks_to_tensor(spectra: list, fill: float):
    sp_max_shape = max(len(s.peaks) for s in spectra)
    sp = np.full((len(spectra), sp_max_shape, 2), fill, 'float32')
    # batch = np.zeros(len(spectra),dtype=np.uint64)
    for i, s in enumerate(spectra):
        sp[i, :len(s.peaks)] = s.peaks.to_numpy
        # batch[i] = len(s.peaks)
    return sp

references_batch = spectra_peaks_to_tensor(references, fill=-1e6)
queries_batch = spectra_peaks_to_tensor(queries, fill=-1e6)

In [66]:
from numba.cuda.cudadrv.devicearray import DeviceNDArray
from numba import types
import math

rmz_cu = cuda.to_device(np.ascontiguousarray(references_batch[...,0]))
rint_cu = cuda.to_device(np.ascontiguousarray(references_batch[...,1]))
qmz_cu = cuda.to_device(np.ascontiguousarray(queries_batch[...,0]))
qint_cu = cuda.to_device(np.ascontiguousarray(queries_batch[...,1]))
out_cu = cuda.to_device(
    np.full((len(references_batch), len(queries_batch), 3), 
            fill_value=-1,
            dtype='float32')
)

In [68]:
@cuda.jit
def process(rmz: DeviceNDArray, 
            qmz: DeviceNDArray,
            rint: DeviceNDArray,
            qint: DeviceNDArray,
            out: DeviceNDArray,
            tolerance: float,
            shift: float,
            mz_power: float,
            int_power: float,
            ):
    i,j = cuda.grid(2)
    ti = cuda.threadIdx.x
    tj = cuda.threadIdx.y
    
    lowest_idx = types.int32(0)
    if i < rmz.shape[0] and j < qmz.shape[0]:
        spec1_mz = rmz[i]
        spec2_mz = qmz[j]
        
        for peak1_idx in range(spec1_mz.shape[0]):
            mz = spec1_mz[peak1_idx]
            low_bound = mz - tolerance
            high_bound = mz + tolerance
            
            for peak2_idx in range(lowest_idx, spec2_mz.shape[0]):
                mz2 = spec2_mz[peak2_idx] + shift
                if mz2 > high_bound:
                    break
                if mz2 < low_bound:
                    lowest_idx = peak2_idx
                else:
                    # If is empty (just store first)
                    if out[i, j, 0] < 0:
                        out[i, j, 0] = peak1_idx
                        out[i, j, 1] = peak2_idx
TPB = (32,32)
BPG_x = math.ceil(rmz_cu.shape[0] / TPB[0])
BPG_y = math.ceil(qmz_cu.shape[0] / TPB[1])
BPG = (BPG_x, BPG_y)
tolerance = types.float32(0.1)
shift = types.float32(0.0)
mz_power = types.float32(0.0)
int_power = types.float32(1.0)
process[BPG, TPB](rmz_cu, qmz_cu, rint_cu, 
                  qint_cu, out_cu, tolerance, shift, mz_power, int_power)
out = out_cu.copy_to_host()
print(out.shape)
print(out[...,0])
print(out[...,1])
print(out[...,2])



(20, 10, 3)
[[73. 73. 73. 76. 76. 76. 73. 73. 73. 73.]
 [76. 76. 76. 76. 76. 76. 76. 76. 76. 76.]
 [13. 13. 12. 76. 76. 76. 76. 76. 76. 12.]
 [ 9.  9.  9. 76. 76. 76.  9.  9.  9.  9.]
 [11. 11. 11. 76. 76. 76. 11. 11. 11. 11.]
 [12. 12. 12. 76. 76. 76. 12. 12. 12. 12.]
 [ 8.  8.  8. 76. 76. 76.  8.  8.  8.  8.]
 [ 9.  9.  9.  9.  9.  9.  4.  4.  4.  4.]
 [ 9.  9.  9. 76. 76. 76.  9.  9.  9.  9.]
 [13. 13. 13. 76. 76. 76. 13. 13. 13. 13.]
 [16. 16. 16. 76. 76. 76. 16. 16. 16. 16.]
 [26. 26. 26. 76. 76. 76. 26. 26. 26. 26.]
 [40. 40. 40. 76. 76. 76. 40. 40. 40. 40.]
 [38. 38. 38. 76. 76. 76. 38. 38. 38. 38.]
 [29. 30. 30. 30. 30. 31. 32. 32. 32. 32.]
 [32. 33. 33. 33. 33. 33. 33. 33. 33. 33.]
 [33. 34. 34. 34. 34. 34. 34. 34. 34. 34.]
 [-1. -1. -1. -1. -1. 25. 32. 32. 32. 32.]
 [10. 10. 10. 10. 10. 10. 10. 10. 10. 10.]
 [18. 18. 18. 18. 18. 19. 19. 19. 19. 19.]]
[[10. 11. 12. 70. 70. 70. 41. 50. 60. 68.]
 [10. 11. 12. 15. 20. 29. 41. 50. 60. 68.]
 [11. 12. 12. 70. 70. 70. 70. 70. 70. 68.

In [38]:
import time

def collect_peak_pairs(spec1: np.ndarray, spec2: np.ndarray,
                       tolerance: float, shift: float = 0, mz_power: float = 0.0,
                       intensity_power: float = 1.0):
    # pylint: disable=too-many-arguments
    """Find matching pairs between two spectra.

    Args
    ----
    spec1:
        Spectrum peaks and intensities as numpy array.
    spec2:
        Spectrum peaks and intensities as numpy array.
    tolerance
        Peaks will be considered a match when <= tolerance appart.
    shift
        Shift spectra peaks by shift. The default is 0.
    mz_power:
        The power to raise mz to in the cosine function. The default is 0, in which
        case the peak intensity products will not depend on the m/z ratios.
    intensity_power:
        The power to raise intensity to in the cosine function. The default is 1.

    Returns
    -------
    matching_pairs : numpy array
        Array of found matching peaks.
    """
    matches = find_matches(spec1[:, 0], spec2[:, 0], tolerance, shift)
    # global a
    # a = matches
    # matches_op = find_matches_opt(spec1[:, 0], spec2[:, 0], tolerance, shift)
    # global b
    # b = matches_op
    # assert np.allclose(matches, matches_op)
    
    idx1 = [x[0] for x in matches]
    idx2 = [x[1] for x in matches]
    if len(idx1) == 0:
        return None
    matching_pairs = []
    for i, idx in enumerate(idx1):
        power_prod_spec1 = (spec1[idx, 0] ** mz_power) * (spec1[idx, 1] ** intensity_power)
        power_prod_spec2 = (spec2[idx2[i], 0] ** mz_power) * (spec2[idx2[i], 1] ** intensity_power)
        # print((idx, idx2[i], power_prod_spec1 * power_prod_spec2))
        # raise
        matching_pairs.append([idx, idx2[i], power_prod_spec1 * power_prod_spec2])
    # print(matching_pairs)
    # raise
    return np.array(matching_pairs.copy())


# @numba.njit
def find_matches(spec1_mz: np.ndarray, spec2_mz: np.ndarray,
                 tolerance: float, shift: float = 0) -> List[Tuple[int, int]]:
    """Faster search for matching peaks.
    Makes use of the fact that spec1 and spec2 contain ordered peak m/z (from
    low to high m/z).

    Parameters
    ----------
    spec1_mz:
        Spectrum peak m/z values as numpy array. Peak mz values must be ordered.
    spec2_mz:
        Spectrum peak m/z values as numpy array. Peak mz values must be ordered.
    tolerance
        Peaks will be considered a match when <= tolerance appart.
    shift
        Shift peaks of second spectra by shift. The default is 0.

    Returns
    -------
    matches
        List containing entries of type (idx1, idx2).

    """
    
    lowest_idx = 0
    matches = []
    for peak1_idx in range(spec1_mz.shape[0]):
        mz = spec1_mz[peak1_idx]
        low_bound = mz - tolerance
        high_bound = mz + tolerance
        for peak2_idx in range(lowest_idx, spec2_mz.shape[0]):
            mz2 = spec2_mz[peak2_idx] + shift
            if mz2 > high_bound:
                break
            if mz2 < low_bound:
                lowest_idx = peak2_idx
            else:
                matches.append((peak1_idx, peak2_idx))
                # print((peak1_idx, peak2_idx))
    # print(matches)
    return matches


@numba.njit(fastmath=True)
def score_best_matches(matching_pairs: np.ndarray, spec1: np.ndarray,
                       spec2: np.ndarray, mz_power: float = 0.0,
                       intensity_power: float = 1.0) -> Tuple[float, int]:
    """Calculate cosine-like score by multiplying matches. Does require a sorted
    list of matching peaks (sorted by intensity product)."""
    score = float(0.0)
    used_matches = int(0)
    used1 = set()
    used2 = set()
    for i in range(matching_pairs.shape[0]):
        if not matching_pairs[i, 0] in used1 and not matching_pairs[i, 1] in used2:
            score += matching_pairs[i, 2]
            used1.add(matching_pairs[i, 0])  # Every peak can only be paired once
            used2.add(matching_pairs[i, 1])  # Every peak can only be paired once
            # print(i, matching_pairs[i,0], matching_pairs[i,1], used_matches, score)
            used_matches += 1
    # print(score)
    # raise
    # Normalize score:
    spec1_power = spec1[:, 0] ** mz_power * spec1[:, 1] ** intensity_power
    
    spec2_power = spec2[:, 0] ** mz_power * spec2[:, 1] ** intensity_power

    # print(spec1_power)
    # print(spec2_power)
    # raise
    score_norm = (np.sum(spec1_power ** 2) ** 0.5 * np.sum(spec2_power ** 2) ** 0.5)
    # print(score, score_norm, used_matches)
    score = score/score_norm
    # print(score, "/", score_norm)
    # raise
    return score, used_matches


In [63]:
start_collect_peaks = time.time()
pairs_to_score_list = []
scores = []
grid_outp = np.full((len(references), len(queries), 3), 
                    fill_value=-1, 
                    dtype='float32')
for i,spectrum_1 in tqdm(enumerate(references)):
    for j,spectrum_2 in enumerate(queries):
        spec1 = spectrum_1.peaks.to_numpy
        spec2 = spectrum_2.peaks.to_numpy
        
        matching_pairs = collect_peak_pairs(
                    spectrum_1.peaks.to_numpy, 
                    spectrum_2.peaks.to_numpy, 
                    tolerance=0.1,
                    shift=0.0, 
                    mz_power=0.0,
                    intensity_power=1.0
        )
        if matching_pairs is not None:
            # Store in grid
            grid_outp[i,j,0] = matching_pairs[0,0]
            grid_outp[i,j,1] = matching_pairs[0,1]
            grid_outp[i,j,2] = matching_pairs[0,2]
            matching_pairs = matching_pairs[np.argsort(matching_pairs[:, 2])[::-1], :] 
            pairs_to_score_list.append([ matching_pairs, spectrum_1, spectrum_2]) 
for matching_pairs, spectrum_1, spectrum_2 in tqdm(pairs_to_score_list):
    scores.append(score_best_matches(matching_pairs, spectrum_1.peaks.to_numpy, spectrum_2.peaks.to_numpy,
                                0.0, 1.0))
end_collect_peaks = time.time()
print("Time to collect matching pairs: ", end_collect_peaks - start_collect_peaks)

20it [00:00, 871.57it/s]
100%|██████████| 195/195 [00:00<00:00, 31363.19it/s]

Time to collect matching pairs:  0.033110857009887695





In [64]:
print(grid_outp[...,0])
print(grid_outp[...,1])
print(grid_outp[...,2])

[[49. 49. 47. 47. 25. 20.  7.  1.  1.  0.]
 [54. 54. 52. 52. 30. 26. 10.  1.  1.  0.]
 [ 1.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 2.  2.  2.  2.  1.  0.  0.  0.  0.  0.]
 [ 1.  1.  1.  1.  0.  0.  0.  0.  0.  0.]
 [ 2.  2.  2.  2.  1.  0.  0.  0.  0.  0.]
 [ 2.  2.  2.  2.  1.  0.  0.  0.  0.  0.]
 [ 5.  5.  5.  5.  4.  0.  0.  0.  0.  0.]
 [ 8.  8.  8.  8.  7.  3.  3.  0.  0.  0.]
 [16. 16. 16. 16.  9.  6.  4.  0.  0.  0.]
 [28. 28. 28. 28. 19. 14.  6.  0.  0.  0.]
 [30. 30. 30. 30. 20. 15.  7.  1.  1.  0.]
 [29. 29. 29. 29. 21. 17.  9.  1.  1.  0.]
 [32. 32. 32. 32. 25. 21. 17.  1.  1.  0.]
 [33. 33. 33. 33. 33. 24. 18.  1.  1.  0.]
 [-1. -1. -1. -1. -1. 25. 20.  2.  2.  0.]
 [ 6.  6.  2.  2.  0.  0.  0.  0.  0.  0.]
 [ 8.  8.  6.  6.  0.  0.  0.  0.  0.  0.]]
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  2.  5. 14. 22. 29. 37.]
 [ 1.  1. 

In [47]:
pairs_to_score_list[0]

[array([[7.00000000e+01, 5.00000000e+00, 3.30030030e-02],
        [5.20000000e+01, 1.00000000e+00, 1.84744123e-02],
        [6.90000000e+01, 5.00000000e+00, 5.59559560e-03],
        [7.10000000e+01, 8.00000000e+00, 1.52285980e-03],
        [7.30000000e+01, 1.00000000e+01, 5.38289641e-04],
        [5.00000000e+01, 0.00000000e+00, 1.28455984e-04],
        [4.90000000e+01, 0.00000000e+00, 1.01638275e-04],
        [7.10000000e+01, 7.00000000e+00, 4.72404336e-05]]),
 Spectrum(precursor m/z=285.12, 74 fragments between 50.0 and 155.1),
 Spectrum(precursor m/z=285.12, 14 fragments between 118.1 and 286.1)]

In [40]:
scores

[(0.02605932348386438, 5),
 (0.03442222082648898, 6),
 (0.06452274444206799, 8),
 (0.11209947205892355, 8),
 (0.1623664287234474, 13),
 (0.23840139483322562, 22),
 (0.3848047853392426, 33),
 (0.5657118496972595, 43),
 (0.7429066000116887, 52),
 (0.9162835881551422, 62),
 (0.005728378647648435, 5),
 (0.009132206272256797, 6),
 (0.021709608341931294, 8),
 (0.04213699303551976, 8),
 (0.06706815456959091, 13),
 (0.11193900110163978, 21),
 (0.20943092375678907, 32),
 (0.3547077252764144, 41),
 (0.5405889742079147, 49),
 (0.7662681701490394, 59),
 (0.9839733090233771, 12),
 (0.9972879869804555, 12),
 (0.9955911964844311, 12),
 (0.964456618462334, 13),
 (0.9148682500738545, 13),
 (0.8527410050476982, 13),
 (0.7035701317343943, 11),
 (0.4606751332163862, 11),
 (0.23167265520749364, 10),
 (0.10444423283199249, 9),
 (0.3253888505462898, 3),
 (0.32565130293613376, 4),
 (0.27739187239428803, 4),
 (0.196922979415211, 5),
 (0.14655759650837452, 4),
 (0.15012987464297978, 4),
 (0.18799849781377598, 3

In [None]:
from itertools import product
start_collect_peaks = time.time()
queries_npy = [q.peaks.to_numpy.astype('float32') for q in queries]
references_npy = [q.peaks.to_numpy.astype('float32') for q in references]

def fn(spectrum_1, spectrum_2):
    matching_pairs = collect_peak_pairs(
                spectrum_1, 
                spectrum_2, 
                tolerance=0.1,
                shift=0.0, 
                mz_power=0.0,
                intensity_power=1.0
    ) 
    if matching_pairs is not None:
        return score_best_matches(matching_pairs, spectrum_1, spectrum_2, 0.0, 1.0)
    
total_len = len(references_npy) * len(queries_npy)
scores = Parallel(-1)(delayed(fn)(spectrum_1, spectrum_2)
                      for (spectrum_1, spectrum_2) in 
                      tqdm(product(references_npy,queries_npy), total=total_len))

end_collect_peaks = time.time()
print("Time to collect matching pairs: ", end_collect_peaks - start_collect_peaks)

  0%|          | 0/947000 [00:00<?, ?it/s]

100%|██████████| 947000/947000 [01:37<00:00, 9744.21it/s] 


Time to collect matching pairs:  97.95659852027893


In [None]:
def spectra_peaks_to_tensor(spectra: list, fill: float):
    sp_max_shape = max(len(s.peaks) for s in spectra)
    sp = np.full((len(spectra), sp_max_shape, 2), fill, 'float32')
    batch = np.zeros(len(spectra),dtype=np.uint64)
    for i, s in enumerate(spectra):
        sp[i, :len(s.peaks)] = s.peaks.to_numpy
        batch[i] = len(s.peaks)
    return sp, batch

# queries = large_references[:1000]
# references = large_references[1000:]

references_batch, references_batch_size \
    = spectra_peaks_to_tensor(references, fill=-1e6)
queries_batch, queries_batch_size \
    = spectra_peaks_to_tensor(queries, fill=-1e6)

In [None]:
np.save('data/references_mz.npy', references_batch[...,0])
np.save('data/references_int.npy', references_batch[...,1])
np.save('data/queries_mz.npy', queries_batch[...,0])
np.save('data/queries_int.npy', queries_batch[...,1])
# np.save('data/scores_100x100.npy', np.array(scores))

In [None]:
load_scores_true = np.load('data/scores_100x100.npy')
print(load_scores_true)
print(load_scores_true.shape, load_scores_true.dtype)
print(np.sort(load_scores_true[:,0]))

FileNotFoundError: [Errno 2] No such file or directory: 'data/scores_100x100.npy'

In [None]:
load_scores = np.load('data/results.npy')
print(load_scores)
print(load_scores.shape, load_scores.dtype)
print(np.sort(load_scores[:,0]))

[[0.0000000e+00 5.0000000e+00 2.9956270e-06 1.0000000e+00]
 [0.0000000e+00 6.0000000e+00 7.0604798e-04 5.0000000e+00]
 [0.0000000e+00 7.0000000e+00 2.6819189e-03 1.0000000e+01]
 ...
 [7.6410000e+03 9.9700000e+02 5.7682420e-05 1.0000000e+00]
 [7.6410000e+03 9.9800000e+02 2.6733984e-05 1.0000000e+00]
 [7.6410000e+03 9.9900000e+02 7.2767034e-06 1.0000000e+00]]
(65748867, 4) float32
[    0.     0.     0. ... 91697. 91697. 91697.]


In [None]:
load_scores.nbytes / 1e9

1.051981872

In [None]:
import matplotlib.pyplot as plt
plt.plot(load_scores[:,1] - load_scores_true[:len(load_scores[:,1]),1])

ValueError: operands could not be broadcast together with shapes (1740258,) (22970,) 