In [1]:
%load_ext autoreload
%autoreload 2
%pwd
%env PYTHONWARNINGS ignore



In [2]:
! pip uninstall cudams -q -y
! pip install git+https://github.com/tornikeo/cosine-similarity.git@dev

[0mCollecting git+https://github.com/tornikeo/cosine-similarity.git@dev
  Cloning https://github.com/tornikeo/cosine-similarity.git (to revision dev) to /tmp/pip-req-build-cg4nlml4
  Running command git clone --filter=blob:none --quiet https://github.com/tornikeo/cosine-similarity.git /tmp/pip-req-build-cg4nlml4
  Running command git checkout -b dev --track origin/dev
  Switched to a new branch 'dev'
  Branch 'dev' set up to track remote branch 'dev' from 'origin'.
  Resolved https://github.com/tornikeo/cosine-similarity.git to commit 21629031dc8c044a9454f521a2d265bd2ec234cf
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: cudams
  Building wheel for cudams (pyproject.toml) ... [?25ldone
[?25h  Created wheel for cudams: filename=cudams-0.0.1-py3-none-any.whl size=18593 sha256=59ce6a95d3c66d4979156c1109cb3a29e32287b7270c6b29ecf917

# Load data

In [3]:
from cudams.utils import \
    argbatch, mkdir, get_ref_spectra_from_df
import math
from pathlib import Path
from time import perf_counter
import numpy as np
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch
import numba
from numba import cuda

assert torch.cuda.is_available()
assert cuda.is_available()

In [4]:
from cudams.similarity.kernels import compile_cuda_cosine_greedy_kernel

match_limit = 1024
max_peaks = 1024
batch_size = 2048 * 4 # Works best on rtx4090. Use half for most other less advanced hardware (i.e. T4)

# IMPORTANT! Keep this value above .5, especially for large spectra files. The score results might get *extremely* large (100s of GB)
# for low sparsity thresholds. This value dictates the minimum cosine greedy similarity threshold at which we keep the result
# similarity results with a score below threshold are discarded.
threshold = .75

kernel = compile_cuda_cosine_greedy_kernel(
    tolerance=.1,
    shift=0,
    mz_power=0,
    int_power=1,
    match_limit=match_limit,
    batch_size=batch_size,
)

We will run a pairwise cosine similarity on the entirety of the GNPS dataset (around 500_000 spectra).

Parsing these many spectra takes a while, so I already have a pickled version of the same dataset ready to go in `ALL_GNPS.pickle`.

Alternatively, you can use `ALL_GNPS.mgf` and wait for the parsing to finish.

In [5]:
from cudams.utils import download
from pathlib import Path
from joblib import Parallel, delayed
from matchms.filtering import default_filters, normalize_intensities, reduce_to_number_of_peaks
from matchms.importing import load_from_mgf
import pickle

spectra_file = download('ALL_GNPS.mgf')
def parse_spectrum(spectrum):
    spectrum = default_filters(spectrum)
    spectrum = reduce_to_number_of_peaks(spectrum, n_max=max_peaks)
    spectrum = normalize_intensities(spectrum)
    return spectrum

spectrums = Parallel(-1)(delayed(parse_spectrum)(spec) for spec in tqdm(load_from_mgf(spectra_file)))
spectrums = [spe for spe in spectrums if spe is not None]

## Download and read prepared pickle
# spectra_file = download('ALL_GNPS.pickle')
# spectrums = tuple(pickle.load(open(spectra_file, 'rb')))

1682it [00:04, 615.42it/s]



1871it [00:05, 274.54it/s]



3344it [00:08, 462.60it/s]



3445it [00:08, 393.71it/s]



3752it [00:09, 423.67it/s]



3979it [00:09, 539.23it/s]



4525it [00:10, 698.70it/s]



5030it [00:11, 691.48it/s]



5191it [00:11, 565.83it/s]



6289it [00:13, 585.16it/s]



7434it [00:15, 908.69it/s]



11777it [00:22, 820.06it/s]



11867it [00:22, 642.40it/s]



13365it [00:25, 401.02it/s]



14942it [00:28, 666.83it/s]



15552it [00:29, 692.93it/s]



16066it [00:30, 803.96it/s]



16651it [00:31, 747.30it/s]



93422it [02:31, 610.35it/s] 



93618it [02:32, 742.89it/s]



93812it [02:32, 758.21it/s]



98798it [02:38, 703.93it/s]



98980it [02:39, 802.71it/s]



129526it [03:15, 744.73it/s] 



129699it [03:15, 742.44it/s]



147186it [03:41, 730.04it/s]



147477it [03:41, 875.55it/s]



153786it [03:50, 636.19it/s]



157630it [03:56, 670.31it/s]



163725it [04:04, 696.45it/s]



175136it [04:22, 805.80it/s]



187464it [04:41, 644.86it/s]



511815it [11:01, 1048.28it/s]



512682it [11:02, 884.83it/s] 



512861it [11:02, 850.96it/s]



513056it [11:03, 885.49it/s]



513252it [11:03, 869.41it/s]



513621it [11:03, 868.98it/s] 



513824it [11:03, 860.10it/s]



514147it [11:04, 889.30it/s]



514446it [11:04, 922.66it/s]



514668it [11:04, 892.88it/s]



515150it [11:05, 749.14it/s]



515586it [11:05, 906.81it/s]



516114it [11:06, 956.23it/s]



516214it [11:06, 735.18it/s]



516549it [11:07, 838.23it/s]



516730it [11:07, 756.58it/s]



516988it [11:07, 817.13it/s]



517229it [11:07, 983.39it/s]



517420it [11:08, 693.62it/s]



517857it [11:08, 864.68it/s]



518321it [11:09, 1048.19it/s]



518552it [11:09, 824.15it/s] 



518816it [11:09, 892.13it/s]



520066it [11:11, 809.64it/s] 



520415it [11:11, 859.82it/s]



520508it [11:11, 751.89it/s]



521072it [11:12, 1023.25it/s]



521504it [11:12, 904.31it/s] 



521599it [11:12, 810.19it/s]



521915it [11:13, 815.54it/s]



522721it [11:14, 883.76it/s] 



522968it [11:14, 1016.09it/s]



560642it [12:39, 822.32it/s] 



563576it [12:42, 783.48it/s] 



563804it [12:42, 765.26it/s]



591667it [13:14, 744.51it/s] 


In [6]:
# Pairwise similarity between all
references, queries = spectrums, spectrums

references = references[:100_000]
queries = queries[:100_000]

print(f"We have {len(references) + len(queries):.3e} spectra")
print(f"Pairwise comparisons have {len(references)*len(queries):.3e} pairs in total")

We have 2.000e+05 spectra
Pairwise comparisons have 1.000e+10 pairs in total


In [7]:
from cudams.utils import spectra_peaks_to_tensor
from itertools import product
dtype = np.float32
padding = None

batches_r = []
for bstart, bend in tqdm(
    argbatch(references, batch_size), desc="Batch all references",
    total=len(references)//batch_size
):
    rbatch = references[bstart:bend]
    rspec, rlen = spectra_peaks_to_tensor(rbatch, dtype=dtype)
    batches_r.append([rspec, rlen, bstart, bend])

batches_q = []
for bstart, bend in tqdm(
    argbatch(queries, batch_size), desc="Batch all queries",
    total=len(queries)//batch_size
):
    qbatch = queries[bstart:bend]
    qspec, qlen = spectra_peaks_to_tensor(qbatch, dtype=dtype)
    batches_q.append([qspec, qlen, bstart, bend])
    
batched_inputs = tuple(product(batches_r, batches_q))

Batch all references: 13it [00:02,  4.52it/s]                        
Batch all queries: 13it [00:02,  4.55it/s]                        


In [8]:
device = torch.device('cuda')
host = torch.device('cpu')

! rm -rf data/output
! mkdir -p data/output

with torch.no_grad():
    for batch_i in tqdm(range(len(batched_inputs))):
        (rspec, rlen, rstart, rend), (qspec, qlen, qstart, qend) = batched_inputs[
            batch_i
        ]
        
        lens = torch.zeros(2, batch_size, dtype=torch.int32)
        lens[0, :len(rlen)] = torch.from_numpy(rlen)
        lens[1, :len(qlen)] = torch.from_numpy(qlen)
        
        lens = lens.to(device)
        
        rspec = torch.from_numpy(rspec).to(device)
        qspec = torch.from_numpy(qspec).to(device)
    
        rspec = cuda.as_cuda_array(rspec)
        qspec = cuda.as_cuda_array(qspec)
        lens = cuda.as_cuda_array(lens)
            
        out = torch.empty(3, batch_size, batch_size, dtype=torch.float32, device=device)
        out = cuda.as_cuda_array(out)
        
        kernel(rspec, qspec, lens, out)
        
        out = torch.as_tensor(out, device=device)
        mask = out[0] >= threshold
        row, col = torch.nonzero(mask, as_tuple=True)
        rabs = rstart + row
        qabs = qstart + col
        score, matches, overflow = out[:, mask].to(host)
        
        np.savez_compressed(
            f'data/output/{rstart}-{rend}-{qstart}-{qend}.npz', 
            rabs=rabs.int().to(host), 
            qabs=qabs.int().to(host), 
            score=score.float(),
            matches=matches.int(),
            overflow=overflow.bool()
        )

100%|██████████| 169/169 [14:31<00:00,  5.16s/it] 


In [9]:
! du -hs data/output/

667M	data/output/


In [10]:
from pathlib import Path

! du -hs data/output/

total_size = sum(f.stat().st_size for f in Path('data/output').glob('**/*') if f.is_file())
print(f'Total file size {total_size/1e9:.3f} GB')

667M	data/output/
Total file size 0.699 GB


In [11]:
qabs = []
rabs = []
score = []
matches = []
overflow = []
for file in tqdm(Path('data/output').glob('*.npz')):
    bunch = np.load(file)
    qabs += [bunch['qabs']]
    rabs += [bunch['rabs']]
    score += [bunch['score']]
    matches += [bunch['matches']]
    overflow += [bunch['overflow']]

169it [00:07, 23.78it/s]


In [12]:
qabs = np.concatenate(qabs)
rabs = np.concatenate(rabs)
score = np.concatenate(score)
matches = np.concatenate(matches)
overflow = np.concatenate(overflow)

In [13]:
# Suppose we want to query these absolute query IDs, and sort their results
query = np.array([1, 42, 121, 99_999])

In [14]:
import pandas as pd
from IPython.display import display
for q in query:
    idx = qabs == q
    res = np.stack([rabs[idx], score[idx], matches[idx], overflow[idx]],axis=1)
    res = pd.DataFrame(res, columns='ReferenceID Score Matches Overflow'.split())
    print(f"Similarity for chemical with QueryID={q}")
    display(res)

Similarity for chemical with QueryID=1


Unnamed: 0,ReferenceID,Score,Matches,Overflow
0,1.0,1.0,335.0,0.0


Similarity for chemical with QueryID=42


Unnamed: 0,ReferenceID,Score,Matches,Overflow
0,42.0,1.0,14.0,0.0
1,44.0,0.920637,10.0,0.0
2,31928.0,0.767068,4.0,0.0


Similarity for chemical with QueryID=121


Unnamed: 0,ReferenceID,Score,Matches,Overflow
0,121.0,1.0,16.0,0.0
1,137.0,0.996308,6.0,0.0
2,7200.0,0.90934,1.0,0.0
3,27747.0,0.898213,1.0,0.0


Similarity for chemical with QueryID=99999


Unnamed: 0,ReferenceID,Score,Matches,Overflow
0,2250.0,0.860784,2.0,0.0
1,16953.0,0.883292,6.0,0.0
2,17925.0,0.926368,1.0,0.0
3,24743.0,0.84118,8.0,0.0
4,24823.0,0.854928,2.0,0.0
5,24949.0,0.769974,5.0,0.0
6,31472.0,0.887954,3.0,0.0
7,31681.0,0.848997,1.0,0.0
8,31824.0,0.755243,6.0,0.0
9,34241.0,0.848997,1.0,0.0
