In [1]:
# All required imports
# import tensorflow as tf
# print("GPU available", tf.test.is_gpu_available())
from joblib import Parallel, delayed
from tqdm import tqdm
import numba
from typing import Tuple, List
from matchms import Spectrum
from matchms.typing import SpectrumType
import numpy as np
import pandas as pd
from pathlib import Path
import json
import math
np.set_printoptions(precision=3)

from matchms import Spectrum

from matchms.filtering import normalize_intensities
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_mz
from matchms.filtering import select_by_relative_intensity
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import add_losses

def process_spectrum(spectrum):
    spectrum = select_by_mz(spectrum, mz_from=10.0, mz_to=1000.0)
    spectrum = normalize_intensities(spectrum)
    spectrum = select_by_relative_intensity(spectrum, intensity_from=0.001)
    spectrum = reduce_to_number_of_peaks(spectrum, n_max=1000)
    spectrum = require_minimum_number_of_peaks(spectrum, n_required=5)
    return spectrum


def get_ref_spectra_from_df(spectra_df, limit=None):
    # This function will take a dataframe with spectra and return a list of matchms spectra
    # Argh, This function is annoyingly slow. Added simple parallelization.
    
    # for index, row in spectra_df.iterrows():
    def fn(index, row):
        pbid = row["pbid"]
        precursor_mz = row["precursor_mz"]
        smiles = row["pb_smiles"]
        inchikey = row["pb_inchikey"]
        mz_array = np.array(json.loads(row["peaks_mz"]))
        intensity_array = np.array(json.loads(row["peaks_intensities"]))
        sp = Spectrum(mz=mz_array, intensities=intensity_array,
                        metadata={'id': pbid, 
                                'precursor_mz': precursor_mz, 
                                'smiles': smiles, 
                                'inchikey': inchikey}) 
        sp = process_spectrum(sp)
        return sp
    if limit is not None:
        spectra_df = spectra_df.head(limit)
    spectra = Parallel(-2)(delayed(fn)(index, row) for index, row in tqdm(spectra_df.iterrows(), total=len(spectra_df)) )
    spectra = [s for s in spectra if s is not None]
    return spectra



In [2]:
ref_spectra_df_path = Path("data/input/example_dataset_tornike.csv")
ref_spectra_df = pd.read_csv(ref_spectra_df_path)
large_references = get_ref_spectra_from_df(ref_spectra_df, limit=10000)

100%|██████████| 10000/10000 [00:05<00:00, 1809.16it/s]


In [9]:
R = 64 * 4
Q = 64 * 4
references = large_references[Q:Q+R]
queries = large_references[:Q]

print(f"Total iterations: {len(queries) * len(references)}")

Total iterations: 65536


In [10]:
import time

@numba.njit
def find_matches(spec1_mz: np.ndarray, spec2_mz: np.ndarray,
                 tolerance: float, shift: float = 0) -> List[Tuple[int, int]]:
    """Faster search for matching peaks.
    Makes use of the fact that spec1 and spec2 contain ordered peak m/z (from
    low to high m/z).

    Parameters
    ----------
    spec1_mz:
        Spectrum peak m/z values as numpy array. Peak mz values must be ordered.
    spec2_mz:
        Spectrum peak m/z values as numpy array. Peak mz values must be ordered.
    tolerance
        Peaks will be considered a match when <= tolerance appart.
    shift
        Shift peaks of second spectra by shift. The default is 0.

    Returns
    -------
    matches
        List containing entries of type (idx1, idx2).

    """
    
    lowest_idx = 0
    matches = []
    for peak1_idx in range(spec1_mz.shape[0]):
        mz = spec1_mz[peak1_idx]
        low_bound = mz - tolerance
        high_bound = mz + tolerance
        for peak2_idx in range(lowest_idx, spec2_mz.shape[0]):
            mz2 = spec2_mz[peak2_idx] + shift
            if mz2 > high_bound:
                break
            if mz2 < low_bound:
                lowest_idx = peak2_idx
            else:
                matches.append((peak1_idx, peak2_idx))
                # print((peak1_idx, peak2_idx))
    # print(matches)
    return matches


@numba.njit(fastmath=True)
def score_best_matches(matching_pairs: np.ndarray, spec1: np.ndarray,
                       spec2: np.ndarray, mz_power: float = 0.0,
                       intensity_power: float = 1.0) -> Tuple[float, int]:
    """Calculate cosine-like score by multiplying matches. Does require a sorted
    list of matching peaks (sorted by intensity product)."""
    score = float(0.0)
    used_matches = int(0)
    used1 = set()
    used2 = set()
    for i in range(matching_pairs.shape[0]):
        if not matching_pairs[i, 0] in used1 and not matching_pairs[i, 1] in used2:
            score += matching_pairs[i, 2]
            used1.add(matching_pairs[i, 0])  # Every peak can only be paired once
            used2.add(matching_pairs[i, 1])  # Every peak can only be paired once
            # print(i, matching_pairs[i,0], matching_pairs[i,1], used_matches, score)
            used_matches += 1

    # Normalize score:
    spec1_power = spec1[:, 0] ** mz_power * spec1[:, 1] ** intensity_power    
    spec2_power = spec2[:, 0] ** mz_power * spec2[:, 1] ** intensity_power

    # print(spec1_power)
    # print(spec2_power)
    # raise
    score_norm = (np.sum(spec1_power ** 2) ** 0.5 * np.sum(spec2_power ** 2) ** 0.5)
    # print(score, score_norm, used_matches)
    score = score/score_norm
    # print(score, "/", score_norm)
    # raise
    return score, used_matches

@numba.njit
def collect_peak_pairs(spec1: np.ndarray, spec2: np.ndarray,
                       tolerance: float, shift: float = 0, mz_power: float = 0.0,
                       intensity_power: float = 1.0):
    # pylint: disable=too-many-arguments
    """Find matching pairs between two spectra.

    Args
    ----
    spec1:
        Spectrum peaks and intensities as numpy array.
    spec2:
        Spectrum peaks and intensities as numpy array.
    tolerance
        Peaks will be considered a match when <= tolerance appart.
    shift
        Shift spectra peaks by shift. The default is 0.
    mz_power:
        The power to raise mz to in the cosine function. The default is 0, in which
        case the peak intensity products will not depend on the m/z ratios.
    intensity_power:
        The power to raise intensity to in the cosine function. The default is 1.

    Returns
    -------
    matching_pairs : numpy array
        Array of found matching peaks.
    """
    matches = find_matches(spec1[:, 0], spec2[:, 0], tolerance, shift)
    # global a
    # a = matches
    # matches_op = find_matches_opt(spec1[:, 0], spec2[:, 0], tolerance, shift)
    # global b
    # b = matches_op
    # assert np.allclose(matches, matches_op)
    
    idx1 = [x[0] for x in matches]
    idx2 = [x[1] for x in matches]
    if len(idx1) == 0:
        return None
    matching_pairs = []
    for i, idx in enumerate(idx1):
        power_prod_spec1 = (spec1[idx, 0] ** mz_power) * (spec1[idx, 1] ** intensity_power)
        power_prod_spec2 = (spec2[idx2[i], 0] ** mz_power) * (spec2[idx2[i], 1] ** intensity_power)
        # print((idx, idx2[i], power_prod_spec1 * power_prod_spec2))
        matching_pairs.append([idx, idx2[i], power_prod_spec1 * power_prod_spec2])
    return np.array(matching_pairs.copy())


In [11]:
def spectra_peaks_to_tensor(spectra: list, fill: float):
    sp_max_shape = max(len(s.peaks) for s in spectra)
    sp = np.full((len(spectra), sp_max_shape, 2), fill, 'float32')
    # batch = np.zeros(len(spectra),dtype=np.uint64)
    for i, s in enumerate(spectra):
        sp[i, :len(s.peaks)] = s.peaks.to_numpy
        # batch[i] = len(s.peaks)
    return sp

references_batch = spectra_peaks_to_tensor(references, fill=-1e6)
queries_batch = spectra_peaks_to_tensor(queries, fill=-1e6)

start_collect_peaks = time.time()
pairs_to_score_list = []
scores = []
grid_outp = np.full((len(references), len(queries), 3), 
                    fill_value=-1, 
                    dtype='float32')
for i,spectrum_1 in tqdm(enumerate(references),total=len(references)):
    for j,spectrum_2 in enumerate(queries):
        spec1 = spectrum_1.peaks.to_numpy
        spec2 = spectrum_2.peaks.to_numpy
        
        matching_pairs = collect_peak_pairs(
                    spectrum_1.peaks.to_numpy, 
                    spectrum_2.peaks.to_numpy, 
                    tolerance=0.1,
                    shift=0.0, 
                    mz_power=0.0,
                    intensity_power=1.0
        )
        if matching_pairs is not None:
            # Store in grid
            matching_pairs = matching_pairs[np.argsort(matching_pairs[:, 2])[::-1], :] 
            pairs_to_score_list.append([ matching_pairs, spectrum_1, spectrum_2]) 
            # for matching_pairs, spectrum_1, spectrum_2 in tqdm(pairs_to_score_list):
            score = score_best_matches(matching_pairs, spec1, spec2, 0.0, 1.0)
            grid_outp[i,j,0] = score[0]
            grid_outp[i,j,1] = score[1]
            # grid_outp[i,j,3] = scores[]
            # grid_outp[i,j,4] = matching_pairs[0,2]
            scores.append(score)
end_collect_peaks = time.time()
print("Time to collect matching pairs: ", end_collect_peaks - start_collect_peaks)
print(grid_outp[...,1])
print(grid_outp[...,0])
np.save('data/grid_outp.npy', grid_outp)
raise

  0%|          | 0/256 [00:00<?, ?it/s]

100%|██████████| 256/256 [00:05<00:00, 43.63it/s]

Time to collect matching pairs:  5.87218976020813
[[ 2.  2.  3. ... 72. 80. 92.]
 [ 2.  2.  3. ... 65. 72. 79.]
 [-1. -1. -1. ...  8.  7.  6.]
 ...
 [-1. -1. -1. ... 13. 16. 17.]
 [-1. -1. -1. ... 13. 16. 17.]
 [-1. -1. -1. ...  2.  2.  2.]]
[[ 1.251e-04  1.537e-04  2.803e-04 ...  7.860e-01  8.709e-01  9.535e-01]
 [ 1.569e-04  2.013e-04  3.836e-04 ...  6.003e-01  7.067e-01  8.311e-01]
 [-1.000e+00 -1.000e+00 -1.000e+00 ...  6.077e-02  2.185e-02  6.530e-03]
 ...
 [-1.000e+00 -1.000e+00 -1.000e+00 ...  6.530e-03  1.579e-02  3.446e-02]
 [-1.000e+00 -1.000e+00 -1.000e+00 ...  5.267e-03  1.350e-02  3.051e-02]
 [-1.000e+00 -1.000e+00 -1.000e+00 ...  1.003e-02  6.107e-03  3.059e-03]]





RuntimeError: No active exception to reraise

In [None]:
np.zeros((100,5),'float32').nbytes

2000

In [None]:
64000

0.100663296

In [None]:
score

(0.268704144384625, 18)

In [None]:
matching_pairs.shape

(35, 3)

In [None]:
grid_outp.shape

(20, 10, 3)

In [None]:
raise

RuntimeError: No active exception to reraise

In [None]:
16 * 16 * 2

512

In [None]:
from itertools import product
start_collect_peaks = time.time()
queries_npy = [q.peaks.to_numpy.astype('float32') for q in queries]
references_npy = [q.peaks.to_numpy.astype('float32') for q in references]

def fn(spectrum_1, spectrum_2):
    matching_pairs = collect_peak_pairs(
                spectrum_1, 
                spectrum_2, 
                tolerance=0.1,
                shift=0.0, 
                mz_power=0.0,
                intensity_power=1.0
    ) 
    if matching_pairs is not None:
        return score_best_matches(matching_pairs, spectrum_1, spectrum_2, 0.0, 1.0)
    
total_len = len(references_npy) * len(queries_npy)
scores = Parallel(-1)(delayed(fn)(spectrum_1, spectrum_2)
                      for (spectrum_1, spectrum_2) in 
                      tqdm(product(references_npy,queries_npy), total=total_len))

end_collect_peaks = time.time()
print("Time to collect matching pairs: ", end_collect_peaks - start_collect_peaks)

  0%|          | 0/947000 [00:00<?, ?it/s]

100%|██████████| 947000/947000 [01:37<00:00, 9744.21it/s] 


Time to collect matching pairs:  97.95659852027893


In [None]:
def spectra_peaks_to_tensor(spectra: list, fill: float):
    sp_max_shape = max(len(s.peaks) for s in spectra)
    sp = np.full((len(spectra), sp_max_shape, 2), fill, 'float32')
    batch = np.zeros(len(spectra),dtype=np.uint64)
    for i, s in enumerate(spectra):
        sp[i, :len(s.peaks)] = s.peaks.to_numpy
        batch[i] = len(s.peaks)
    return sp, batch

# queries = large_references[:1000]
# references = large_references[1000:]

references_batch, references_batch_size \
    = spectra_peaks_to_tensor(references, fill=-1e6)
queries_batch, queries_batch_size \
    = spectra_peaks_to_tensor(queries, fill=-1e6)

In [None]:
np.save('data/references_mz.npy', references_batch[...,0])
np.save('data/references_int.npy', references_batch[...,1])
np.save('data/queries_mz.npy', queries_batch[...,0])
np.save('data/queries_int.npy', queries_batch[...,1])
# np.save('data/scores_100x100.npy', np.array(scores))

In [None]:
load_scores_true = np.load('data/scores_100x100.npy')
print(load_scores_true)
print(load_scores_true.shape, load_scores_true.dtype)
print(np.sort(load_scores_true[:,0]))

FileNotFoundError: [Errno 2] No such file or directory: 'data/scores_100x100.npy'

In [None]:
load_scores = np.load('data/results.npy')
print(load_scores)
print(load_scores.shape, load_scores.dtype)
print(np.sort(load_scores[:,0]))

[[0.0000000e+00 5.0000000e+00 2.9956270e-06 1.0000000e+00]
 [0.0000000e+00 6.0000000e+00 7.0604798e-04 5.0000000e+00]
 [0.0000000e+00 7.0000000e+00 2.6819189e-03 1.0000000e+01]
 ...
 [7.6410000e+03 9.9700000e+02 5.7682420e-05 1.0000000e+00]
 [7.6410000e+03 9.9800000e+02 2.6733984e-05 1.0000000e+00]
 [7.6410000e+03 9.9900000e+02 7.2767034e-06 1.0000000e+00]]
(65748867, 4) float32
[    0.     0.     0. ... 91697. 91697. 91697.]


In [None]:
load_scores.nbytes / 1e9

1.051981872

In [None]:
import matplotlib.pyplot as plt
plt.plot(load_scores[:,1] - load_scores_true[:len(load_scores[:,1]),1])

ValueError: operands could not be broadcast together with shapes (1740258,) (22970,) 