In [1]:
%load_ext autoreload
%autoreload 2

In [33]:
import math
from pathlib import Path
import pandas as pd
import numpy as np
from numba import cuda
from itertools import product
from tqdm import tqdm
from data import get_ref_spectra_from_df, batches, mkdir, spectra_peaks_to_tensor
from kernel import compile
from cosine import similarity

In [21]:
## Define constants
tolerance: float = 0.1
shift: float = 0
mz_power: float = 0
int_power: float = 1

## How many pairs per batch. Has to be a power of 2.
# Hardware specific - An RTX2070 works best at around 1024 * 2
# But Colab T4 GPU might work best at 1024 * 4
BATCH_SIZE = 1024

# MATCH_LIMIT specifies max how many mz-mz pairs we could consider for each RQ pair, before we sort and filter. 
# E.g. a value of 256 usually causes around ~0.003% of RQ pairs to "overflow".
# The overflown RQ scores will be strictly less than or equal to perfectly accurate score.
# The mean absolute difference at 256, for all overflown pairs is on the order of ~1e-3
# Small values of MATCH_LIMIT (e.g. 128, 64,) cause a dramatic speedup in the processing speed.
MATCH_LIMIT = 256

## GPU-specific constants
THREADS_PER_BLOCK = (32, 32)
BLOCKS_PER_GRID_X = math.ceil(BATCH_SIZE / THREADS_PER_BLOCK[0])
BLOCKS_PER_GRID_Y = math.ceil(BATCH_SIZE / THREADS_PER_BLOCK[1])
BLOCKS_PER_GRID = (BLOCKS_PER_GRID_X, BLOCKS_PER_GRID_Y)

# Since Greedy cosine is an unstable algorithm, because approximate mz-mz values do not
# result in approximately the same scores and number of matches.
# So we need to use fp64 to minimize the deviation as much as possible.
# Using float32 causes a significant speedup in the processing speed.
dtype = 'float64'

# Data path
reference_csv_file = "data/input/example_dataset_tornike.csv"
query_csv_file = "data/input/example_dataset_tornike.csv"
output_dir = 'data/output/'

# Limits
# We consider only first LIMIT number of entries in CSVs
LIMIT = 2048

# For keeping track of experiments
CONFIG = dict(
    tolerance = tolerance,
    shift = shift,
    mz_power = mz_power,
    int_power = int_power,
    match_limit = MATCH_LIMIT,
    batch_size = BATCH_SIZE,
    limit = LIMIT,
)

In [24]:
# We load CSV files using multiple threads
ref_spectra_df_path = Path(reference_csv_file)
ref_spectra_df = pd.read_csv(ref_spectra_df_path)
references = get_ref_spectra_from_df(ref_spectra_df, limit=LIMIT)

query_spectra_df_path = Path(query_csv_file)
query_spectra_df = pd.read_csv(query_spectra_df_path)
queries = get_ref_spectra_from_df(query_spectra_df, limit=LIMIT)

print(f"We have {len(ref_spectra_df)} references and {len(query_spectra_df)} queries")

100%|██████████| 2048/2048 [00:00<00:00, 3000.07it/s]
100%|██████████| 2048/2048 [00:00<00:00, 5882.54it/s]


We have 100001 references and 100001 queries


In [15]:
# Numba Just-in-time compiles our kernel and bakes in our constants for performance.
kernel = compile(tolerance=tolerance, shift=shift, 
                 mz_power=mz_power, int_power=int_power, 
                 match_limit=MATCH_LIMIT, batch_size=BATCH_SIZE)

Found 1 CUDA devices
id 0    b'NVIDIA GeForce RTX 2070 with Max-Q Design'                              [SUPPORTED]
                      Compute Capability: 7.5
                           PCI Device ID: 0
                              PCI Bus ID: 1
                                    UUID: GPU-f6e241c8-f0ad-720e-be22-2713a6b0868d
                                Watchdog: Enabled
             FP32/FP64 Performance Ratio: 32
Summary:
	1/1 devices are supported


In [30]:
output_dir = mkdir(output_dir)

TOTAL_BATCHES = math.ceil( len(references) / BATCH_SIZE ) * math.ceil( len(queries) / BATCH_SIZE)
print("Total batches: ", TOTAL_BATCHES)
print(f"Total pairs considered: {len(references)} * {len(queries)} = {len(references) * len(queries)}")

if len(references) % BATCH_SIZE != 0:
    print(f"Since {len(references)} isn't divisible by BATCH_SIZE, last batch will have {len(references) % BATCH_SIZE} empty ROWS at the end")
if len(queries) % BATCH_SIZE != 0:
    print(f"Since {len(queries)} isn't divisible by BATCH_SIZE, last batch will have {len(queries) % BATCH_SIZE} empty COLUMNS at the end")

Total batches:  4
Total pairs considered: 1993 * 1993 = 3972049
Since 1993 isn't divisible by BATCH_SIZE, last batch will have 969 empty ROWS at the end
Since 1993 isn't divisible by BATCH_SIZE, last batch will have 969 empty COLUMNS at the end


In [32]:
# Load each batch in memory so that we don't have to load any R,Q twice
batches_r = []
for rbatch in tqdm(batches(references, BATCH_SIZE), desc="Batch all references"):
    rspec, rlen = spectra_peaks_to_tensor(rbatch, dtype=dtype)
    batches_r.append([rspec, rlen])

batches_q = list()
for qbatch in tqdm(batches(queries, BATCH_SIZE), desc="Batch all queries"):
    qspec, qlen  = spectra_peaks_to_tensor(qbatch, dtype=dtype)
    batches_q.append([qspec, qlen])

Batch all references: 2it [00:00, 27.38it/s]
Batch all queries: 2it [00:00, 19.86it/s]


## Explanation


Picture is worth a thousand words - so to understand what we are doing here, take a look at this image below:


![alt text](assets/cosine-batch-layout-grid.jpg "Title")


GPUs are fundamentally a large 2D grid of very small CPUs. There are several ways of making our problem "fit" to the enviroment of GPUs, and I have chosen the following layout as shown above.

GPU can processes a single batch at a time - per-batch processing speed is near-instatanous, regardless of batch size, as long as the batch can fit into memory.

So - every batch is a 2D grid of references and queries that will be compared pairwise by different threads. If we zoom into the batch#0, we see:


![alt text](assets/cosine-batch-layout-batch.jpg "Title")


Meaning that a GPU has a separate small CPU (thread) for every pair in the cartesian product of references and queries in that batch. We see that every thread takes in it's own reference and query and returns three values:
score (float), num_matches (int, but casted to float), overflow (bool).

If we further zoom into the first thread, we see this pseudo-code being executed:


![alt text](assets/cosine-batch-layout-thread.jpg "Title")

This code is what is called a CUDA kernel - and it is exactly the same for every single thread in all batches. What changes is the input data (per batch) and which reference and query we work with (per thread).

The algorithm has two parts.

First loop collects all possible mzmz pairs (up to MATCH_LIMIT size), and report an overflow if it happens.

Second loop is essentially a bubble sort. Since "sorted()" isn't available to CUDA threads, we have to manually loop over the matches (nested loop) and, while we have left over scores:
- Get largest score
- Discard all other scores that have same index
- We normalize the score

# Main loop

In [None]:

streams = [cuda.stream() for _ in range(TOTAL_BATCHES)]
