In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell executes and edits to local source files
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [56]:
import hashlib
import json
import math
from itertools import product
from multiprocessing import shared_memory
from multiprocessing.pool import ThreadPool
from pathlib import Path
from time import perf_counter

import numpy as np
import pandas as pd
from numba import cuda
from tqdm import tqdm

# from cudams import cosine, data, utils
from cudams.data import get_ref_spectra_from_df
from cudams.data import spectra_peaks_to_tensor
from cudams.kernel import compile
from cudams.utils import argbatch, mkdir
from cudams.utils import name2idx

In [59]:
## Define constants
# Scoring parameters; they are forwarded unchanged to the kernel compiler and
# to the CPU reference implementation.
tolerance: float = 0.1
shift: float = 0
mz_power: float = 0
int_power: float = 1

## How many spectra per batch side (a batch is BATCH_SIZE x BATCH_SIZE pairs).
# Has to be a power of 2.
# Hardware specific - An RTX2070 works best at around 1024 * 2
# But Colab T4 GPU might work best at 1024 * 4
BATCH_SIZE = 1024

# MATCH_LIMIT specifies the maximum number of mz-mz pairs we consider for each
# RQ pair, before we sort and filter.
# E.g. a value of 256 usually causes around ~0.003% of RQ pairs to "overflow".
# An overflown RQ score is always less than or equal to the perfectly accurate score.
# The mean absolute difference at 256, over all overflown pairs, is on the order of ~1e-3.
# Small values of MATCH_LIMIT (e.g. 128, 64) cause a dramatic speedup in processing.
MATCH_LIMIT = 256

## GPU-specific constants
# One thread per (reference, query) pair, laid out on a 2D grid that covers
# the whole BATCH_SIZE x BATCH_SIZE batch.
THREADS_PER_BLOCK = (32, 32)
BLOCKS_PER_GRID_X = math.ceil(BATCH_SIZE / THREADS_PER_BLOCK[0])
BLOCKS_PER_GRID_Y = math.ceil(BATCH_SIZE / THREADS_PER_BLOCK[1])
BLOCKS_PER_GRID = (BLOCKS_PER_GRID_X, BLOCKS_PER_GRID_Y)

# Greedy cosine is an unstable algorithm: approximately equal mz values do not
# necessarily produce approximately equal scores or match counts.
# We therefore use fp64 to keep the deviation as small as possible.
# Switching to float32 gives a significant speedup at the cost of accuracy.
dtype = 'float64'

# Data path
reference_csv_file = Path("data/input/example_dataset_tornike.csv")
query_csv_file = Path("data/input/example_dataset_tornike.csv")

# Limits
# We consider only the first LIMIT entries of each CSV
LIMIT = 2048 * 2

# For keeping track of experiments
CONFIG = dict(
    tolerance = tolerance,
    shift = shift,
    mz_power = mz_power,
    int_power = int_power,
    dtype = dtype,
    reference_csv_file = reference_csv_file,
    query_csv_file = query_csv_file,
    BATCH_SIZE = BATCH_SIZE,
    MATCH_LIMIT = MATCH_LIMIT,
    LIMIT = LIMIT,
)

# `default=str` serialises the Path entries; `sort_keys` makes the text canonical.
config_str = json.dumps(CONFIG, sort_keys=True, indent=1, default=str)

# The built-in hash() of a str is salted per interpreter session, so it would
# map the same configuration to a different directory after every kernel
# restart. A content digest is stable across sessions and machines.
# It is kept as a non-negative int, like the previous identifier.
experiment_hash = int(hashlib.sha256(config_str.encode('utf-8')).hexdigest()[:16], 16)
output_dir = mkdir(Path(f'data/experiments/{experiment_hash}'))
(output_dir / 'config.json').write_text(config_str)

272

In [4]:
# We load CSV files using multiple threads
ref_spectra_df_path = Path(reference_csv_file)
ref_spectra_df = pd.read_csv(ref_spectra_df_path)
references = get_ref_spectra_from_df(ref_spectra_df, limit=LIMIT)

query_spectra_df_path = Path(query_csv_file)
query_spectra_df = pd.read_csv(query_spectra_df_path)
queries = get_ref_spectra_from_df(query_spectra_df, limit=LIMIT)

print(f"We have {len(references)} references and {len(queries)} queries")

100%|██████████| 4096/4096 [00:03<00:00, 1239.99it/s]
100%|██████████| 4096/4096 [00:00<00:00, 4826.40it/s]


We have 3994 references and 3994 queries


In [5]:
# Build the GPU kernel, handing it the scoring parameters and the batch/match
# limits defined in the configuration cell.
kernel = compile(
    tolerance=tolerance,
    shift=shift,
    mz_power=mz_power,
    int_power=int_power,
    match_limit=MATCH_LIMIT,
    batch_size=BATCH_SIZE,
)

Found 1 CUDA devices
id 0    b'NVIDIA GeForce RTX 2070 with Max-Q Design'                              [SUPPORTED]
                      Compute Capability: 7.5
                           PCI Device ID: 0
                              PCI Bus ID: 1
                                    UUID: GPU-f6e241c8-f0ad-720e-be22-2713a6b0868d
                                Watchdog: Enabled
             FP32/FP64 Performance Ratio: 32
Summary:
	1/1 devices are supported


In [6]:
def _batch_spectra(spectra, desc):
    """Cut `spectra` into BATCH_SIZE-sized slices and convert each slice.

    Every slice `[start:end]` produced by `argbatch` is passed through
    `spectra_peaks_to_tensor`; the two values it returns are stored together
    with the slice bounds as `[spec, lengths, start, end]`.
    `desc` is the label shown on the progress bar.
    """
    batched = []
    for start, end in tqdm(argbatch(spectra, BATCH_SIZE), desc=desc):
        spec, lengths = spectra_peaks_to_tensor(spectra[start:end], dtype=dtype)
        batched.append([spec, lengths, start, end])
    return batched


batches_r = _batch_spectra(references, "Batch all references")
batches_q = _batch_spectra(queries, "Batch all queries")

# Every (reference batch, query batch) combination is one unit of GPU work,
# and each unit gets a CUDA stream of its own.
batches_rq = list(product(batches_r, batches_q))
TOTAL_BATCHES = len(batches_rq)
streams = [cuda.stream() for _ in range(TOTAL_BATCHES)]

Batch all references: 0it [00:00, ?it/s]

Batch all references: 4it [00:00, 26.60it/s]
Batch all queries: 4it [00:00, 27.28it/s]


In [7]:
# Results are written into this experiment's `output_dir`, because that is the
# directory the error-analysis cells read the GPU scores from.
# Start from a clean slate: drop result files left behind by an earlier run.
for pattern in ('*.score.npy', '*.ovfl.npy'):
    for stale_file in output_dir.glob(pattern):
        stale_file.unlink()


def save_batch(out_name, overflow_name, rstart, rend, qstart, qend):
    """Pool-worker task: write one finished batch to disk and free its shared memory.

    `out_name` / `overflow_name` are the names of the shared-memory blocks that
    hold the score and overflow arrays of the batch. The slice bounds only
    determine the output file names. Everything arrives as an argument so the
    task never depends on loop variables that may have moved on to a later batch.
    """
    stem = f'{rstart}-{rend}.{qstart}-{qend}'
    targets = (
        (out_name, (BATCH_SIZE, BATCH_SIZE, 2), np.float32, 'score'),
        (overflow_name, (BATCH_SIZE, BATCH_SIZE, 1), np.uint8, 'ovfl'),
    )
    for shm_name, shape, np_dtype, suffix in targets:
        shm = shared_memory.SharedMemory(name=shm_name)
        result = np.ndarray(shape=shape, dtype=np_dtype, buffer=shm.buf)
        try:
            np.save(output_dir / f'{stem}.{suffix}.npy', result)
        finally:
            # Drop the array view before closing this handle, then remove the
            # block itself - nobody needs it once it is on disk.
            del result
            try:
                shm.close()
            finally:
                shm.unlink()


# We initialize a pool of 3 workers that will offload results to disk
with ThreadPool(3) as pool:

    def on_stream_done(stream, status, job):
        """Stream callback: hand the finished batch over to the worker pool.

        Runs on a CUDA driver thread, so it must return quickly - the actual
        disk write happens in `save_batch` on a pool worker. `job` is the
        argument tuple captured when the callback was registered.
        """
        pool.apply_async(
            save_batch,
            args=job,
            error_callback=lambda e: print("Thread error", e),
        )

    # We loop over all batches in sequence
    for batch_i in tqdm(range(TOTAL_BATCHES)):

        # Each batch has its own CUDA stream so that the GPU is as busy as possible
        stream = streams[batch_i]

        # Shared memory allows pool workers to read the arrays without copying them
        out_shm = shared_memory.SharedMemory(create=True, size=(BATCH_SIZE * BATCH_SIZE * 2 * 4))
        out = np.ndarray(shape=(BATCH_SIZE, BATCH_SIZE, 2), dtype='float32', buffer=out_shm.buf)

        overflow_shm = shared_memory.SharedMemory(create=True, size=(BATCH_SIZE * BATCH_SIZE * 1 * 1))
        overflow = np.ndarray(shape=(BATCH_SIZE, BATCH_SIZE, 1), dtype='uint8', buffer=overflow_shm.buf)

        # We reserve empty space for the results in GPU RAM
        out_cu = cuda.device_array((BATCH_SIZE, BATCH_SIZE, 2), dtype='float32', stream=stream)
        overflow_cu = cuda.device_array((BATCH_SIZE, BATCH_SIZE, 1), dtype='uint8', stream=stream)

        # We get our batch and lengths (lengths are different for different spectra)
        (rspec, rlen, rstart, rend), (qspec, qlen, qstart, qend) = batches_rq[batch_i]
        # Row 0 holds reference lengths, row 1 query lengths; unused slots stay 0
        lens = np.zeros((2, BATCH_SIZE), 'int32')
        lens[0,:len(rlen)] = rlen
        lens[1,:len(qlen)] = qlen

        # We make sure main resources remain pinned in CPU RAM.
        # NOTE(review): the host buffers are only pinned inside this `with`;
        # this relies on the transfers being complete by the time it exits - confirm.
        with cuda.pinned(rspec, qspec, lens, out, overflow,):

            # We order the stream to copy input data to GPU RAM
            rspec_cu = cuda.to_device(rspec, stream=stream)
            qspec_cu = cuda.to_device(qspec, stream=stream)
            lens_cu = cuda.to_device(lens, stream=stream)

            # We order the stream to execute the kernel (this only schedules it)
            kernel(rspec_cu, qspec_cu,
                    lens_cu,
                    out_cu, overflow_cu,
                    stream=stream)

            # We order a data return
            out_cu.copy_to_host(out, stream=stream)
            overflow_cu.copy_to_host(overflow, stream=stream)

            # Once the stream has finished the work queued above, the callback
            # fires. The batch identity is bound *now* via `arg`, not looked up
            # later, so a callback that runs late still saves the right buffers
            # under the right file names.
            stream.add_callback(
                callback=on_stream_done,
                arg=(out_shm.name, overflow_shm.name, rstart, rend, qstart, qend),
            )

    # Leaving the `with` block terminates the pool and discards queued tasks,
    # so all waiting must happen before that: first for every stream (and
    # therefore every callback) to finish, then for the workers to drain.
    cuda.synchronize()
    pool.close()
    pool.join()

100%|██████████| 16/16 [00:02<00:00,  6.02it/s]


In [27]:
from cudams.cosine import similarity
from cudams.utils import batches


# CPU reference run: score every (reference, query) pair with the CPU
# `similarity` function and store the results batch by batch, using the same
# file naming as the GPU run so the two can be compared.
cpu_output_dir = Path('data/tests/d7/')
# np.save does not create missing directories, so make sure the target exists.
cpu_output_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): `to_numpy` is accessed without being called - presumably it is
# a property on the peaks object; confirm.
refs = [r.peaks.to_numpy for r in references]
ques = [q.peaks.to_numpy for q in queries]

rlims = argbatch(refs, BATCH_SIZE)
qlims = argbatch(ques, BATCH_SIZE)

# Deliberately a separate name: the GPU cell's `batches_rq` holds tensors and
# lengths, while this list only holds slice bounds. Rebinding `batches_rq`
# here would break re-running the GPU cell.
cpu_batches_rq = list(product(rlims, qlims))

for (rstart, rend), (qstart, qend) in tqdm(cpu_batches_rq, total=len(cpu_batches_rq)):
    rspec = refs[rstart:rend]
    qspec = ques[qstart:qend]
    # Pairs that are never scored (padding, or a `None` result) stay at 0
    out_true = np.full((BATCH_SIZE, BATCH_SIZE, 2), fill_value=0, dtype='float32')
    for (i, spec1), (j, spec2) in product(enumerate(rspec), enumerate(qspec)):
        score = similarity(
            spec1,
            spec2,
            tolerance=tolerance,
            shift=shift,
            mz_power=mz_power,
            int_power=int_power,
        )
        if score is not None:
            out_true[i,j,0] = score[0]
            out_true[i,j,1] = score[1]
    np.save(cpu_output_dir / f'{rstart}-{rend}.{qstart}-{qend}.score.npy', out_true)

rm: cannot remove 'data/tests/d7/*': No such file or directory


100%|██████████| 16/16 [01:35<00:00,  5.94s/it]


# Error analysis

In [32]:
import gc; gc.collect()

# Matrix dimensions: the spectrum counts rounded up to a whole number of batches.
R = math.ceil( len(references) / BATCH_SIZE ) * BATCH_SIZE
Q = math.ceil( len(queries) / BATCH_SIZE ) * BATCH_SIZE

# Assemble the full GPU score matrix from the per-batch files.
# Zero-initialised (not np.empty) so that any cell no chunk writes to holds a
# defined value instead of leftover memory when the matrices are compared.
G = np.zeros((R,Q), dtype='float32')
scores = sorted(output_dir.glob('*.score.npy'))
for score in scores:
    rstart, rend, qstart, qend = name2idx(score)
    chunk = np.load(score)
    # Channel 0 holds the score. Crop the chunk to the slice it fills, so a
    # final batch whose bounds span fewer than BATCH_SIZE rows/columns still fits.
    G[rstart:rend, qstart:qend] = chunk[:rend - rstart, :qend - qstart, 0]

In [33]:
# Assemble the full CPU reference score matrix from the per-batch files.
# Zero-initialised (not np.empty) so that any cell no chunk writes to holds a
# defined value instead of leftover memory when the matrices are compared.
C = np.zeros((R,Q), dtype='float32')
scores = sorted(cpu_output_dir.glob('*.score.npy'))
for score in scores:
    rstart, rend, qstart, qend = name2idx(score)
    chunk = np.load(score)
    # Channel 0 holds the score. Crop the chunk to the slice it fills, so a
    # final batch whose bounds span fewer than BATCH_SIZE rows/columns still fits.
    C[rstart:rend, qstart:qend] = chunk[:rend - rstart, :qend - qstart, 0]

# Score-only error analysis

In [38]:
# Element-wise comparison of the GPU scores (G) against the CPU scores (C),
# using np.isclose with its default tolerances; True marks a matching pair.
cl = np.isclose(G, C)
cl

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [39]:
# Fraction of matrix entries where the GPU and CPU scores agree.
cl.mean()

0.9997949004173279

In [18]:
# Mean CPU score over the whole matrix, for a sense of the scale of the scores.
C.mean()

0.0031445846