In [1]:
# Reload edited project modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2
# Presumably switches the working directory to the repository root so that the
# relative data paths used below resolve -- confirm in nbutils.
from nbutils import chdir_to_root
chdir_to_root()
# Echo the working directory as a sanity check.
%pwd

'/home/tornikeo/Documents/work/scalexa/pangeaai/optimize-cosine'

In [2]:
from cudams.utils import argbatch, mkdir
from cudams.data import get_ref_spectra_from_df
from cudams.kernel import compile
from cudams.utils import name2idx
from cudams.cosine import similarity
import math
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from cudams.data import spectra_peaks_to_tensor
from cudams.processor import Config
from numba import cuda
from itertools import product
from time import perf_counter
from multiprocessing.pool import ThreadPool
from multiprocessing import shared_memory
import numpy as np
import json
import matplotlib.pyplot as plt
import matplotlib

# The GPU cells below need a working CUDA device; fail here with a readable
# message instead of a bare AssertionError.
assert cuda.is_available(), "numba found no CUDA-capable GPU; this notebook requires one"

In [3]:
## Define constants
tolerance: float = 0.1
shift: float = 0
mz_power: float = 0
int_power: float = 1

## How many pairs per batch. Has to be a power of 2.
# Hardware specific - an RTX2070 works best at around 1024 * 2,
# but a Colab T4 GPU might work best at 1024 * 4.
BATCH_SIZE = 1024

# Maximum number of peaks kept per spectrum during filtering. Because the data
# is stored in matrices, a larger number of peaks increases memory
# requirements. Beyond 1024 the benefit diminishes, as smaller and smaller
# (likely noisy) peaks are taken into consideration when running similarity.
MAX_PEAKS = 1024

# MATCH_LIMIT is the maximum number of mz-mz pairs considered for each RQ pair
# before sorting and filtering.
# E.g. a value of 256 usually causes ~0.003% of RQ pairs to "overflow".
# An overflowed RQ score is always less than or equal to the exact score.
# At 256, the mean absolute difference over all overflowed pairs is on the
# order of ~1e-3.
# Small values of MATCH_LIMIT (e.g. 128, 64) cause a dramatic speedup.
MATCH_LIMIT = 1024

# Greedy cosine is an unstable algorithm: approximately equal mz-mz values do
# not result in approximately equal scores and numbers of matches.
# fp64 minimizes that deviation as much as possible, while float32 gives a
# significant speedup in processing speed.
# NOTE(review): float32 is selected here, i.e. speed over accuracy -- confirm
# that this is the intended trade-off.
dtype = 'float32'

# Data path
reference_csv_file = Path("data/input/example_dataset_tornike.csv")
query_csv_file = Path("data/input/example_dataset_tornike.csv")

In [25]:
from cudams.processor import CudaCosineGreedy, CpuCosineGreedy
from collections import defaultdict
from matchms import calculate_scores
from matchms.similarity import CosineGreedy
from tqdm import tqdm
from matchms.filtering import normalize_intensities, select_by_mz, select_by_relative_intensity, reduce_to_number_of_peaks, \
    require_minimum_number_of_peaks
from cudams.utils import mute_stdout

def process_spectrum(spectrum: "Spectrum") -> "Spectrum | None":
    """Run one spectrum through the matchms filter chain used for all inputs.

    The filters are applied in this order:

    1. ``select_by_mz`` with ``mz_from=10.0`` and ``mz_to=1000.0``
    2. ``normalize_intensities``
    3. ``select_by_relative_intensity`` with ``intensity_from=0.001``
    4. ``reduce_to_number_of_peaks`` with ``n_max=MAX_PEAKS``
    5. ``require_minimum_number_of_peaks`` with ``n_required=5``

    Args:
        spectrum: A matchms ``Spectrum`` object. The matchms filters operate
            on spectrum objects, not on raw NumPy arrays.

    Returns:
        The filtered spectrum. Per the matchms filtering API, a filter such as
        ``require_minimum_number_of_peaks`` returns ``None`` for a spectrum it
        rejects, and later filters pass ``None`` through, so callers must be
        prepared for ``None``.
    """
    spectrum = select_by_mz(spectrum, mz_from=10.0, mz_to=1000.0)
    spectrum = normalize_intensities(spectrum)
    spectrum = select_by_relative_intensity(spectrum, intensity_from=0.001)
    spectrum = reduce_to_number_of_peaks(spectrum, n_max=MAX_PEAKS)
    spectrum = require_minimum_number_of_peaks(spectrum, n_required=5)
    return spectrum

def _load_spectra(csv_file, limit):
    """Read ``csv_file`` and build spectra from its rows.

    Returns a tuple of the ``Path``, the dataframe read from it, and the result
    of ``get_ref_spectra_from_df`` called with ``process_spectrum`` as the
    spectrum processor and the given ``limit``.
    """
    csv_path = Path(csv_file)
    frame = pd.read_csv(csv_path)
    spectra = get_ref_spectra_from_df(
        frame,
        spectrum_processor=process_spectrum,
        limit=limit,
    )
    return csv_path, frame, spectra


# References and queries go through the same loading steps with the same limit.
ref_spectra_df_path, ref_spectra_df, references = _load_spectra(
    reference_csv_file, BATCH_SIZE * 40
)
query_spectra_df_path, query_spectra_df, queries = _load_spectra(
    query_csv_file, BATCH_SIZE * 40
)

100%|██████████| 40960/40960 [00:12<00:00, 3283.30it/s]
100%|██████████| 40960/40960 [00:10<00:00, 4091.55it/s]


In [14]:
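# Debug shortcut (disabled): uncomment to keep only the last BATCH_SIZE
# spectra on each side, i.e. a single batch, for quick experiments.
# references = references[-BATCH_SIZE:]
# queries = queries[-BATCH_SIZE:]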

In [26]:
from cudams.processor import CudaCosineGreedy
from tqdm import tqdm

refs = [r.peaks.to_numpy for r in references]
ques = [q.peaks.to_numpy for q in queries]

rlims = argbatch(refs, BATCH_SIZE)
qlims = argbatch(ques, BATCH_SIZE)
R = len(references)
Q = len(queries)
n_pairs = R * Q

batches_rq = list(product(rlims, qlims))

# Size of the full production run used for the extrapolations printed below.
FULL_RUN_PAIRS = 100_000 * 1_500_000

# Take every scoring parameter from the configuration cell so that editing the
# constants there actually changes this run (the values were previously
# repeated here as literals).
cosine = CudaCosineGreedy(
    tolerance=tolerance,
    mz_power=mz_power,
    intensity_power=int_power,
    shift=shift,
    batch_size=BATCH_SIZE,
    match_limit=MATCH_LIMIT,
)
# Compile before starting the timer so that only the matrix computation is timed.
cosine.compile()

t_start = perf_counter()
ri, qi, out, overflows = cosine.matrix(
    references=references,
    queries=queries,
    array_type="sparse",
    sparse_threshold=.75,
)
# `t` is the elapsed wall-clock time of the matrix call, in seconds.
t = perf_counter() - t_start

sum_nbytes = sum(o.nbytes for o in [ri, qi, out, overflows])
perh = (n_pairs / t) * 3600

print(f"Output size {sum_nbytes / 1e9:.2f}GB")
print(f"Num of output {len(ri)}")
print(f"Pairs processed {n_pairs:.1e}")

print(f"pairs per hr {perh:.1e}")
print(f"Full run (100kx1.5mln) est: {FULL_RUN_PAIRS / perh:.3f}hrs")
print(f"Full run (100kx1.5mln) est GBs: {(sum_nbytes/n_pairs)*FULL_RUN_PAIRS*1e-9:.2f}GB")

Batch all references: 40it [00:00, 61.47it/s]
Batch all queries: 40it [00:00, 54.36it/s]
100%|██████████| 1600/1600 [03:03<00:00,  8.70it/s]


Output size 0.15GB
Num of output 6155331
Pairs processed 1.6e+09
pairs per hr 3.1e+10
Full run (100kx1.5mln) est: 4.804hrs
Full run (100kx1.5mln) est GBs: 14.36GB


In [52]:
from numba import cuda, types

@cuda.jit
def threshold_kernel(
    scores,
    threshold: float,
    out,
):
    """Flag the cells whose score reaches ``threshold``.

    Each thread handles one ``(i, j)`` cell of a 2-D launch grid and stores
    the result of ``scores[i, j, 0] >= threshold`` in ``out[i, j]``.

    Args:
        scores: Array indexed as ``scores[i, j, 0]``; only index 0 of the last
            axis is read.
        threshold: Minimum score for a cell to be flagged.
        out: 2-D array receiving the flag for each cell.
    """
    i, j = cuda.grid(2)
    # A rounded-up grid contains threads that fall outside the arrays. Those
    # threads must not touch memory, so the store is inside the bounds check.
    # The bounds come from the arrays themselves rather than from a global that
    # is frozen when the kernel is compiled.
    if i < out.shape[0] and j < out.shape[1]:
        if i < scores.shape[0] and j < scores.shape[1]:
            out[i, j] = scores[i, j, 0] >= threshold
        else:
            # No score exists for this cell, so it cannot reach the threshold.
            out[i, j] = 0
    
# Binary operator handed to cuda.reduce. Below, the decorated object is called
# on a flattened device array and its result is checked against a NumPy sum.
@cuda.reduce
def sum_kernel(
    a, b,
):
    # Combining two partial results by addition makes the reduction a sum.
    return a + b


# The kernel fills a (BATCH_SIZE, BATCH_SIZE) array, so the grid is sized from
# that array (rounded up so every cell gets a thread), not from R and Q.
THREADS_PER_BLOCK = (32, 32)
BLOCKS_PER_GRID_X = math.ceil(BATCH_SIZE / THREADS_PER_BLOCK[0])
BLOCKS_PER_GRID_Y = math.ceil(BATCH_SIZE / THREADS_PER_BLOCK[1])
BLOCKS_PER_GRID = (BLOCKS_PER_GRID_X, BLOCKS_PER_GRID_Y)


# NOTE(review): `out` has to be a dense array indexable as out[i, j, 0] here.
# The benchmark cell above requests array_type="sparse" -- confirm which `out`
# this cell is meant to run against.
scores_cu = cuda.to_device(out)
threshold = .85
scores_thr_cu = cuda.device_array(
    (BATCH_SIZE, BATCH_SIZE),
    dtype='uint32',
)
# numba's launch syntax is kernel[blocks_per_grid, threads_per_block]; the two
# were previously swapped, which only worked while both were (32, 32).
threshold_kernel[BLOCKS_PER_GRID, THREADS_PER_BLOCK](
    scores_cu,
    threshold,
    scores_thr_cu,
)

# Number of cells at or above the threshold, reduced on the device.
s = int(sum_kernel(
    scores_thr_cu.ravel(),
))

# Device buffers with one slot per selected cell.
r = cuda.device_array(s,'int32')
c = cuda.device_array(s,'int32')
v = cuda.device_array(s,'float32')

# Cross-check the device count against the same computation in NumPy.
assert s == (out[...,0] >= threshold).sum()



In [56]:
# Synthetic score array for the CPU timing below. Seeded so that the benchmark
# input is identical on every run.
out_r = np.random.default_rng(0).uniform(size=(BATCH_SIZE, BATCH_SIZE, 2))

In [60]:
%%timeit
# NumPy (host) version of the thresholding: locate the cells whose value at
# index 0 of the last axis is at or above `threshold`, gather those values and
# sum them.
r,c = np.nonzero((out_r[...,0]>=threshold))
v = out_r[r,c,0]
v.sum()
# assert v.sum() < r.sum() + c.sum()

6.66 ms ± 99.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


0