In [1]:
# All required imports
# import tensorflow as tf
# print("GPU available", tf.test.is_gpu_available())
from joblib import Parallel, delayed
from tqdm import tqdm
import numba
from typing import Tuple, List
from matchms import Spectrum
from matchms.typing import SpectrumType
import numpy as np
import pandas as pd
from pathlib import Path
import json

from matchms import Spectrum

from matchms.filtering import normalize_intensities
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_mz
from matchms.filtering import select_by_relative_intensity
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import add_losses

def process_spectrum(spectrum):
    spectrum = select_by_mz(spectrum, mz_from=10.0, mz_to=1000.0)
    spectrum = normalize_intensities(spectrum)
    spectrum = select_by_relative_intensity(spectrum, intensity_from=0.001)
    spectrum = reduce_to_number_of_peaks(spectrum, n_max=1000)
    spectrum = require_minimum_number_of_peaks(spectrum, n_required=5)
    return spectrum


def get_ref_spectra_from_df(spectra_df):
    # This function will take a dataframe with spectra and return a list of matchms spectra
    # Argh, This function is annoyingly slow. Added simple parallelization.
    
    # for index, row in spectra_df.iterrows():
    def fn(index, row):
        pbid = row["pbid"]
        precursor_mz = row["precursor_mz"]
        smiles = row["pb_smiles"]
        inchikey = row["pb_inchikey"]
        mz_array = np.array(json.loads(row["peaks_mz"]))
        intensity_array = np.array(json.loads(row["peaks_intensities"]))
        sp = Spectrum(mz=mz_array, intensities=intensity_array,
                        metadata={'id': pbid, 
                                'precursor_mz': precursor_mz, 
                                'smiles': smiles, 
                                'inchikey': inchikey}) 
        sp = process_spectrum(sp)
        return sp
    
    spectra = Parallel(-2)(delayed(fn)(index, row) for index, row in tqdm(spectra_df.iterrows(), total=len(spectra_df)))
    spectra = [s for s in spectra if s is not None]
    return spectra


In [2]:
ref_spectra_df_path = Path("data/example_dataset_tornike.csv")
ref_spectra_df = pd.read_csv(ref_spectra_df_path)
large_references = get_ref_spectra_from_df(ref_spectra_df)

100%|██████████| 100001/100001 [00:25<00:00, 3875.62it/s]


In [3]:
queries = large_references[:100]
references = large_references[100:400]

In [19]:
import time

def collect_peak_pairs(spec1: np.ndarray, spec2: np.ndarray,
                       tolerance: float, shift: float = 0, mz_power: float = 0.0,
                       intensity_power: float = 1.0):
    # pylint: disable=too-many-arguments
    """Find matching pairs between two spectra.

    Args
    ----
    spec1:
        Spectrum peaks and intensities as numpy array.
    spec2:
        Spectrum peaks and intensities as numpy array.
    tolerance
        Peaks will be considered a match when <= tolerance appart.
    shift
        Shift spectra peaks by shift. The default is 0.
    mz_power:
        The power to raise mz to in the cosine function. The default is 0, in which
        case the peak intensity products will not depend on the m/z ratios.
    intensity_power:
        The power to raise intensity to in the cosine function. The default is 1.

    Returns
    -------
    matching_pairs : numpy array
        Array of found matching peaks.
    """
    matches = find_matches(spec1[:, 0], spec2[:, 0], tolerance, shift)
    # global a
    # a = matches
    # matches_op = find_matches_opt(spec1[:, 0], spec2[:, 0], tolerance, shift)
    # global b
    # b = matches_op
    # assert np.allclose(matches, matches_op)
    
    idx1 = [x[0] for x in matches]
    idx2 = [x[1] for x in matches]
    if len(idx1) == 0:
        return None
    matching_pairs = []
    for i, idx in enumerate(idx1):
        power_prod_spec1 = (spec1[idx, 0] ** mz_power) * (spec1[idx, 1] ** intensity_power)
        power_prod_spec2 = (spec2[idx2[i], 0] ** mz_power) * (spec2[idx2[i], 1] ** intensity_power)
        # print((idx, idx2[i], power_prod_spec1 * power_prod_spec2))
        # raise
        matching_pairs.append([idx, idx2[i], power_prod_spec1 * power_prod_spec2])
    # print(matching_pairs)
    # raise
    return np.array(matching_pairs.copy())


# @numba.njit
def find_matches(spec1_mz: np.ndarray, spec2_mz: np.ndarray,
                 tolerance: float, shift: float = 0) -> List[Tuple[int, int]]:
    """Faster search for matching peaks.
    Makes use of the fact that spec1 and spec2 contain ordered peak m/z (from
    low to high m/z).

    Parameters
    ----------
    spec1_mz:
        Spectrum peak m/z values as numpy array. Peak mz values must be ordered.
    spec2_mz:
        Spectrum peak m/z values as numpy array. Peak mz values must be ordered.
    tolerance
        Peaks will be considered a match when <= tolerance appart.
    shift
        Shift peaks of second spectra by shift. The default is 0.

    Returns
    -------
    matches
        List containing entries of type (idx1, idx2).

    """
    
    lowest_idx = 0
    matches = []
    for peak1_idx in range(spec1_mz.shape[0]):
        mz = spec1_mz[peak1_idx]
        low_bound = mz - tolerance
        high_bound = mz + tolerance
        for peak2_idx in range(lowest_idx, spec2_mz.shape[0]):
            mz2 = spec2_mz[peak2_idx] + shift
            if mz2 > high_bound:
                break
            if mz2 < low_bound:
                lowest_idx = peak2_idx
            else:
                matches.append((peak1_idx, peak2_idx))
                # print((peak1_idx, peak2_idx))
    # print(matches)
    return matches


@numba.njit(fastmath=True)
def score_best_matches(matching_pairs: np.ndarray, spec1: np.ndarray,
                       spec2: np.ndarray, mz_power: float = 0.0,
                       intensity_power: float = 1.0) -> Tuple[float, int]:
    """Calculate cosine-like score by multiplying matches. Does require a sorted
    list of matching peaks (sorted by intensity product)."""
    score = float(0.0)
    used_matches = int(0)
    used1 = set()
    used2 = set()
    for i in range(matching_pairs.shape[0]):
        if not matching_pairs[i, 0] in used1 and not matching_pairs[i, 1] in used2:
            score += matching_pairs[i, 2]
            used1.add(matching_pairs[i, 0])  # Every peak can only be paired once
            used2.add(matching_pairs[i, 1])  # Every peak can only be paired once
            # print(i, matching_pairs[i,0], matching_pairs[i,1], used_matches, score)
            used_matches += 1
    # print(score)
    # raise
    # Normalize score:
    spec1_power = spec1[:, 0] ** mz_power * spec1[:, 1] ** intensity_power
    
    spec2_power = spec2[:, 0] ** mz_power * spec2[:, 1] ** intensity_power

    # print(spec1_power)
    # print(spec2_power)
    # raise
    score_norm = (np.sum(spec1_power ** 2) ** 0.5 * np.sum(spec2_power ** 2) ** 0.5)
    print(score, score_norm, used_matches)
    score = score/score_norm
    # print(score, "/", score_norm)
    # raise
    return score, used_matches

start_collect_peaks = time.time()
pairs_to_score_list = []

for spectrum_1 in tqdm(references):
    for spectrum_2 in queries:
        spec1 = spectrum_1.peaks.to_numpy
        spec2 = spectrum_2.peaks.to_numpy
        
        matching_pairs = collect_peak_pairs(
                    spectrum_1.peaks.to_numpy, 
                    spectrum_2.peaks.to_numpy, 
                    tolerance=0.1,
                    shift=0.0, 
                    mz_power=0.0,
                    intensity_power=1.0
        ) 
        if matching_pairs is not None:
            pairs_to_score_list.append([ matching_pairs, spectrum_1, spectrum_2])  
scores = []
for matching_pairs, spectrum_1, spectrum_2 in tqdm(pairs_to_score_list):
    scores.append(score_best_matches(matching_pairs, spectrum_1.peaks.to_numpy, spectrum_2.peaks.to_numpy,
                                0.0, 1.0))
end_collect_peaks = time.time()
print("Time to collect matching pairs: ", end_collect_peaks - start_collect_peaks)

  0%|          | 0/300 [00:00<?, ?it/s]

100%|██████████| 300/300 [00:03<00:00, 82.51it/s]
 18%|█▊        | 4184/22970 [00:00<00:02, 7492.00it/s]

0.012345400054709363 1.430493433144009 5
0.02224039184329474 1.4079002042780564 6
0.05907430293156018 1.3858633378015937 7
0.11391423655888122 1.3935591487743797 7
0.16742308875442014 1.444530927560186 10
0.23740204288372452 1.5495554386013486 16
0.42078639931222506 1.9098003195909157 25
0.5449609533457381 1.743347499731398 31
0.7822219688156625 1.7744911952003546 36
1.299224971317664 2.214054378188289 44
1.8376445489533577 2.531426839374535 47
1.7523498626754883 2.188470697940592 51
0.01994667570473376 1.377669813060547 6
0.11768360352344337 1.3854088857982232 2
0.29683653673693716 1.666499992608804 2
0.3914211109006905 1.4595591676062933 3
0.39032991960929897 1.3364206359615751 2
0.391329929629329 1.3267125047549708 3
0.39342608584560534 1.3262595379216764 3
0.4119899051203355 1.3264932144564228 7
0.5018617549481413 1.332969199461391 10
0.8030322033745457 1.4187117667679419 19
1.685021742162583 2.068554213766789 27
1.5359386046707364 1.742807570650829 29
1.4532790235681123 1.67612511

 36%|███▋      | 8379/22970 [00:01<00:01, 13242.72it/s]

2.8293558824089354e-05 1.3286203359790234 2
1.702403103804505e-05 1.2777853098311145 2
1.359718076434793e-05 1.369530082046465 2
2.2033044055066077e-05 1.5829048051092354 3
2.3917811705599492e-05 1.3728949543807376 2
1.0170330490650811e-05 1.0298632211035144 2
5.658310963616268e-05 1.0926430680023147 3
0.00023609395180966753 1.4292719478126032 3
0.000358482606730855 1.6545493773715956 4
9.220531843154466e-05 1.4427359060473885 6
6.225845465084704e-05 1.4499381155768565 6
0.00019076754432109785 1.6240415903996568 7
0.0002881963044125206 1.5031887264524806 7
0.00039562224887550217 1.3895663665227536 6
0.000615788360933506 1.6805233641222213 4
8.29658487316145e-06 1.323608790304179 1
1.772242713183654e-05 1.0406326361229712 1
4.6046046046046044e-05 1.0177821514868695 1
0.0001687773859946032 1.0569024394588435 2
0.0006224103983863743 1.3654444778081043 3
0.0011207900593285978 1.0567772135076787 5
0.0026593197802407013 1.055178400390262 6
0.005541139738336936 1.1722219393295914 8
0.00936350

 54%|█████▍    | 12498/22970 [00:01<00:00, 16666.50it/s]

0.0020380620861101344 1.3456561318586213 1
0.001637920202484767 1.2941692903609558 1
0.0011110990870750628 1.3870904296468354 1
0.0006017549080612142 1.6032010796930543 2
0.0002154727299872445 1.3904984469463741 2
4.328652977301626e-06 1.1066531232240187 1
1.646090534979424e-05 1.4475983157749217 1
2.5927829731633533e-05 1.6757642908441102 2
1.3214415616818016e-05 1.4612349111717373 2
8.885762639516394e-06 1.4685294686565267 2
1.3016019022025027e-05 1.644865327839773 3
1.564126689251814e-05 1.5224628679200558 4
2.2255488721955188e-05 1.407383622769846 2
3.9770501231962694e-05 1.7020713204697244 2
0.00016508620732844957 1.0252361368059972 1
0.00017971845719593466 1.0391219976502655 1
0.00019865430996562127 1.3405803272929215 1
4.13145878611344e-05 1.0539758047427528 1
1.1339667996324653e-05 1.070454222263675 2
7.247096946796647e-05 1.3829524390961743 2
0.00020521442363284205 1.0703273906440696 3
0.000552644235827419 1.0687080772729836 6
0.0020733196660123587 1.1872523683718281 9
0.00490

 63%|██████▎   | 14502/22970 [00:01<00:00, 16774.19it/s]

0.0036570313055798538 1.5671065921365064 7
0.00855000616231847 1.5647356945207436 14
0.024075025776527276 1.738300850065684 24
0.0674482553624696 2.65616585297695 32
0.06097548579610639 2.1640841267938504 30
0.05052712602492381 2.1375831626242654 26
0.03756624011398786 1.9262645171783708 22
0.0019032045058071084 1.5033919446946802 2
0.0019052886720554386 1.504805826585073 2
0.001916541165790415 1.646210004949182 3
0.0009778447115784453 1.6658007164566602 5
0.0003707982256530805 1.5111519122603747 9
0.002505199092986881 1.550811963591618 21
0.008324952680408136 2.2785688656895107 27
0.0065611327042758485 2.0507066602341273 26
0.0036264550837123415 1.9986543077912788 24
0.0033234893552210863 1.770834018812884 22
0.0036312594877159427 2.072312737373465 18
0.0021054079104129157 1.71714892694491 14
0.0018345983621258896 1.555447906837817 3
0.004795016437859281 1.546377381489927 11
0.03364157520884248 2.0409646689237753 23
0.11584324835345855 2.7730258385383615 38
0.07968338919500081 2.48428

 80%|████████  | 18485/22970 [00:01<00:00, 17554.16it/s]

0.4071368679991301 2.1024672606150214 42
0.34482347813278746 3.073463071577058 45
0.22977216475734993 2.575230431671388 44
0.20094471799126457 2.4579971783243546 42
0.18716607668729787 2.3295082043446302 41
0.12397772617462309 2.025795078259902 40
0.07185702208715221 1.679237089656513 34
0.00013488172857542224 1.5306316437857128 1
0.002670695019343668 1.7043154574488202 4
0.0043241473705938165 1.4772796695110095 7
0.008799684970255541 1.4728382877234838 10
0.02578337787236686 1.537122465418699 15
0.07408998137276415 1.9372390457567656 22
0.11481374768161554 1.9689487563310903 24
0.13795441627814 2.2632557805610456 29
0.0934912177442708 1.847654614714571 28
0.07214511448385323 1.8788955041216022 30
0.00020747684621558497 1.8702952249757945 1
9.277656034412792e-05 1.9391312402871455 1
0.0005562897231565899 1.536052839751853 2
0.0012067206345484623 1.5516030168825534 3
0.002447371395419443 1.8140616001017065 8
0.0030630600570540514 1.4988617382708898 9
0.005798991383776168 1.4882027403197

100%|██████████| 22970/22970 [00:01<00:00, 12813.45it/s]

5.7770483195908616e-05 1.6876844959417596 4
0.0001849316784251719 1.3957152804458928 5
0.004058982305628953 1.460398733433578 8
0.03234264204144084 2.1348639577260395 11
0.051656613069525986 1.7887856477785813 14
0.07148291344397449 1.7073540374455753 15
0.08024039504970437 1.6181040698597409 16
0.07832449927404883 1.4071413248172602 16
0.06340842113384657 1.1664180293355373 16
0.0036036036036036037 1.0230509697994807 1
0.031534049565080595 1.0677035232278034 3
0.19356293721148574 1.3456292527255158 6
0.5271947050153256 1.3676551943550386 11
1.0104503917330745 1.5720843493185757 14
1.0175451023596171 1.2834028427926836 15
1.0308227509792074 1.3051031356705003 16
2.416330244158072e-05 1.0669605485340365 1
5.569132696259823e-05 1.0777618862821923 1
0.004473858242627011 1.2600687358071168 3
0.016559273988703417 1.0411271676699225 3
0.05901027604180758 1.033723301079895 6
0.17759835901968032 1.056384317765994 8
0.43214612600588576 1.1633777507896996 12
1.06476516666817 1.6258321466349952 1




In [22]:
len(queries) * len(references)

30000

In [5]:
def spectra_peaks_to_tensor(spectra: list, fill: float):
    sp_max_shape = max(len(s.peaks) for s in spectra)
    sp = np.full((len(spectra), sp_max_shape, 2), fill, 'float32')
    batch = np.zeros(len(spectra),dtype=np.uint64)
    for i, s in enumerate(spectra):
        sp[i, :len(s.peaks)] = s.peaks.to_numpy
        batch[i] = len(s.peaks)
    return sp, batch

# queries = large_references[:1000]
# references = large_references[1000:]

references_batch, references_batch_size \
    = spectra_peaks_to_tensor(references, fill=-1e6)
queries_batch, queries_batch_size \
    = spectra_peaks_to_tensor(queries, fill=-1e6)

In [23]:
np.save('data/references_mz.npy', references_batch[...,0])
np.save('data/references_int.npy', references_batch[...,1])
np.save('data/queries_mz.npy', queries_batch[...,0])
np.save('data/queries_int.npy', queries_batch[...,1])
np.save('data/scores_100x100.npy', np.array(scores))

In [24]:
load_scores_true = np.load('data/scores_100x100.npy')
print(load_scores_true)
print(load_scores_true.shape, load_scores_true.dtype)
print(np.sort(load_scores_true[:,0]))

[[8.63016898e-03 5.00000000e+00]
 [1.57968525e-02 6.00000000e+00]
 [4.26263552e-02 7.00000000e+00]
 ...
 [6.88239339e-03 6.00000000e+00]
 [1.81943264e-02 9.00000000e+00]
 [4.84264270e-02 1.10000000e+01]]
(22970, 2) float64
[6.47754888e-07 7.81780158e-07 8.52989985e-07 ... 9.97076190e-01
 9.97387727e-01 9.97983564e-01]


In [26]:
load_scores = np.load('data/results.npy')
print(load_scores)
print(load_scores.shape, load_scores.dtype)
print(np.sort(load_scores[:,0]))

[[8.63017049e-03 5.00000000e+00]
 [1.57968551e-02 6.00000000e+00]
 [4.26263586e-02 7.00000000e+00]
 ...
 [6.88239373e-03 6.00000000e+00]
 [1.81943253e-02 9.00000000e+00]
 [4.84264232e-02 1.10000000e+01]]
(22970, 2) float64
[6.47754860e-07 7.81780216e-07 8.52989956e-07 ... 9.97076392e-01
 9.97388005e-01 9.97983754e-01]


In [9]:
import matplotlib.pyplot as plt
plt.plot(load_scores[:,1] - load_scores_true[:len(load_scores[:,1]),1])

ValueError: operands could not be broadcast together with shapes (1740258,) (22970,) 