In [1]:
%load_ext autoreload
%autoreload 2
%pwd

'/workspace'

In [2]:
! pip uninstall cudams -q -y
! pip install git+https://github.com/tornikeo/cosine-similarity.git@pre-release-1

[0mCollecting git+https://github.com/tornikeo/cosine-similarity.git@dev
  Cloning https://github.com/tornikeo/cosine-similarity.git (to revision dev) to /tmp/pip-req-build-uh4skuf9
  Running command git clone --filter=blob:none --quiet https://github.com/tornikeo/cosine-similarity.git /tmp/pip-req-build-uh4skuf9
  Running command git checkout -b dev --track origin/dev
  Switched to a new branch 'dev'
  Branch 'dev' set up to track remote branch 'dev' from 'origin'.
  Resolved https://github.com/tornikeo/cosine-similarity.git to commit 8a84557132bd6a6bc1b759681b7bdc7e5a2a058d
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: cudams
  Building wheel for cudams (pyproject.toml) ... [?25ldone
[?25h  Created wheel for cudams: filename=cudams-0.0.1-py3-none-any.whl size=18065 sha256=a3914f4ceaa198a9131d0c10d63d8204a08f87f8c71c9490d5210c

# Load data

In [3]:
from cudams.utils import \
    argbatch, mkdir, get_ref_spectra_from_df
import math
from pathlib import Path
import pandas as pd
from itertools import product
from time import perf_counter
from multiprocessing.pool import ThreadPool
from multiprocessing import shared_memory
import numpy as np
import json
import matplotlib.pyplot as plt
import matplotlib
from tqdm import tqdm
import torch
import numba
from numba import cuda

assert torch.cuda.is_available()
assert cuda.is_available()

In [5]:
from cudams.similarity.kernels import compile_cuda_cosine_greedy_kernel

match_limit = 1024
max_peaks = 1024
batch_size = 2048 * 2 # Works best on rtx4090. Use half for most other less advanced hardware (i.e. T4)
threshold = .75

kernel = compile_cuda_cosine_greedy_kernel(
    tolerance=.1,
    shift=0,
    mz_power=0,
    int_power=1,
    match_limit=match_limit,
    batch_size=batch_size,
)

In [6]:
from cudams.utils import download_cosine_100k_sample
from pathlib import Path

spectra_file = download_cosine_100k_sample('spectra_100k.csv')

In [7]:
! du -h *

58M	data/output
58M	data
4.0K	onstart.sh
4.0K	ports.log
60M	spectra_100k.csv
32K	vastai_sparse_sample.ipynb


In [8]:
from cudams.utils import get_spectra_batches

references, queries, batched_inputs = get_spectra_batches(
    reference_csv_file=spectra_file, # File paths
    query_csv_file=spectra_file,
    batch_size=batch_size, # Batch size for processing - large batch size requires more powerful hardware, and is faster
    max_peaks=max_peaks, # Number of max peaks in each batch. Large number requires much more memory and time, but is more accurate (after 1024, it doesn't matter much)
    padding=None, # Not required for current kernels
    # max_pairs=(batch_size**2) * 256 # we use 256 batches
    max_pairs=None # load ALL of the rows for pairwise comparison!
)

100%|██████████| 100001/100001 [00:28<00:00, 3512.24it/s]


In [15]:
print(f'Number of pairs {len(references) * len(queries):.3e}')

Number of pairs 1.000e+10


In [16]:
device = torch.device('cuda')
host = torch.device('cpu')

! rm -rf data/output
! mkdir -p data/output

with torch.no_grad():
    for batch_i in tqdm(range(len(batched_inputs))):
        (rspec, rlen, rstart, rend), (qspec, qlen, qstart, qend) = batched_inputs[
            batch_i
        ]
        
        lens = torch.zeros(2, batch_size, dtype=torch.int32)
        lens[0, :len(rlen)] = torch.from_numpy(rlen)
        lens[1, :len(qlen)] = torch.from_numpy(qlen)
        
        lens = lens.to(device)
        out = torch.zeros(3, batch_size, batch_size, dtype=torch.float32, device=device)
        
        rspec = torch.from_numpy(rspec).to(device)
        qspec = torch.from_numpy(qspec).to(device)
        
        rspec = cuda.as_cuda_array(rspec)
        qspec = cuda.as_cuda_array(qspec)
        lens = cuda.as_cuda_array(lens)
        out = cuda.as_cuda_array(out)
        
        kernel(rspec, qspec, lens, out)

        out = torch.as_tensor(out, device=device)
        mask = out[0] >= threshold
        row, col = torch.nonzero(mask, as_tuple=True)
        rabs = rstart + row
        qabs = qstart + col
        score, matches, overflow = out[:, mask].to(host)
        np.savez_compressed(
            f'data/output/{rstart}-{rend}-{qstart}-{qend}.npz', 
            rabs=rabs.int().to(host), 
            qabs=qabs.int().to(host), 
            score=score.float(),
            matches=matches.int(),
            overflow=overflow.bool()
        )

100%|██████████| 625/625 [01:51<00:00,  5.63it/s]


In [41]:
math.ceil(100_000 / batch_size)**2

625

In [19]:
from pathlib import Path

! du -hs data/output/

total_size = sum(f.stat().st_size for f in Path('data/output').glob('**/*') if f.is_file())
print(f'Total file size {total_size/1e9:.3f} GB')

117M	data/output/
Total file size 0.121 GB


In [20]:
qabs = []
rabs = []
score = []
matches = []
overflow = []
for file in tqdm(Path('data/output').glob('*.npz')):
    bunch = np.load(file)
    qabs += [bunch['qabs']]
    rabs += [bunch['rabs']]
    score += [bunch['score']]
    matches += [bunch['matches']]
    overflow += [bunch['overflow']]

In [21]:
qabs = np.concatenate(qabs)
rabs = np.concatenate(rabs)
score = np.concatenate(score)
matches = np.concatenate(matches)
overflow = np.concatenate(overflow)

In [23]:
# Suppose we want to query these absolute query IDs, and sort their results
query = np.array([1, 42, 121, 99_999])

In [38]:
from IPython.display import display
for q in query:
    idx = qabs == q
    res = np.stack([rabs[idx], score[idx], matches[idx], overflow[idx]],axis=1)
    res = pd.DataFrame(res, columns='ReferenceID Score Matches Overflow'.split())
    print(f"Similarity for chemical with QueryID={q}")
    display(res)

Similarity for chemical with QueryID=1


Unnamed: 0,ReferenceID,Score,Matches,Overflow
0,0.0,0.990495,14.0,0.0
1,1.0,1.000000,15.0,0.0
2,2.0,0.992786,12.0,0.0
3,3.0,0.954024,12.0,0.0
4,4.0,0.899819,12.0,0.0
...,...,...,...,...
263,86600.0,0.934475,3.0,0.0
264,86601.0,0.933939,2.0,0.0
265,86602.0,0.931802,1.0,0.0
266,86603.0,0.919519,5.0,0.0


Similarity for chemical with QueryID=42


Unnamed: 0,ReferenceID,Score,Matches,Overflow
0,40.0,0.794153,5.0,0.0
1,41.0,0.862276,8.0,0.0
2,42.0,1.0,11.0,0.0
3,43.0,0.800063,9.0,0.0
4,428.0,0.762022,2.0,0.0
5,456.0,0.762229,2.0,0.0
6,21673.0,0.794153,5.0,0.0
7,21674.0,0.862276,8.0,0.0
8,21675.0,1.0,11.0,0.0
9,21676.0,0.800063,9.0,0.0


Similarity for chemical with QueryID=121


Unnamed: 0,ReferenceID,Score,Matches,Overflow
0,116.0,0.883796,4.0,0.0
1,117.0,0.890043,6.0,0.0
2,118.0,0.892549,6.0,0.0
3,119.0,0.901887,6.0,0.0
4,120.0,0.940933,9.0,0.0
...,...,...,...,...
214,83483.0,0.765802,2.0,0.0
215,83484.0,0.755249,2.0,0.0
216,83487.0,0.862562,2.0,0.0
217,83488.0,0.790962,2.0,0.0


Similarity for chemical with QueryID=99999


Unnamed: 0,ReferenceID,Score,Matches,Overflow
0,64.0,0.774653,24.0,0.0
1,65.0,0.960330,25.0,0.0
2,77.0,0.859591,26.0,0.0
3,78.0,0.963014,26.0,0.0
4,451.0,0.825203,20.0,0.0
...,...,...,...,...
1221,99988.0,0.946494,25.0,0.0
1222,99989.0,0.900003,22.0,0.0
1223,99997.0,0.783493,21.0,0.0
1224,99998.0,0.956027,25.0,0.0
