Here's what I'll do.

1. Get 100 random spectra.
1. Run msbuddy in serial
-> 
1. See how many candidate formulas we have
1. See what's inside each candidate, and how `enumerate_subformula` and `_calc_subform_mass` work

->->

1. `enumerate_subformula` - what does it do?

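Input: `pre_charged_arr`, a 1D integer array holding the count of each element in the precursor formula.

Inside:

1. For the given precursor formula, enumerate every way of keeping between 0 and the full count of each element.
1. Create an array with all possible resulting subformulas; the number of rows is the product of `count + 1` over all elements (e.g. 11 × 21 × 6 = 1386 for C10H20O5).

Output: 2D array; each row is one subformula array.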


In [1]:
from msbuddy.utils import enumerate_subformula

In [7]:
import numpy as np

In [84]:
from msbuddy import read_formula
from msbuddy import form_arr_to_str
from msbuddy import enumerate_subform_arr

# Parse a formula string into an element-count array (12 slots in the printed output).
formula_array = read_formula("C10H20O5")
print(formula_array)

# Opposite direction: element-count array -> formula string ("C10H20O5" in the output).
formula_str = form_arr_to_str([10, 20, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0])
print(formula_str)

# Library enumeration of all subformulas of the same count array, one subformula per row.
# This is the reference result for the re-implementations further down.
all_subform_arr = enumerate_subform_arr([10, 20, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0])
print(all_subform_arr)

[10 20  0  0  0  0  0  0  0  5  0  0]
C10H20O5
[[ 0  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 [ 2  0  0 ...  0  0  0]
 ...
 [ 8 20  0 ...  5  0  0]
 [ 9 20  0 ...  5  0  0]
 [10 20  0 ...  5  0  0]]


In [9]:
from numba import njit

@njit
def enumerate_subformula(pre_charged_arr: np.ndarray) -> np.ndarray:
    """
    Enumerate all subformulas of a candidate formula. (Numba version)

    Each row of the result is one combination of per-element counts
    ``0..pre_charged_arr[i]``; together the rows form the full cartesian
    product. The first element varies fastest and the last element slowest.

    :param pre_charged_arr: precursor charged array (1D, non-negative element counts)
    :return: 2D int16 array with ``prod(pre_charged_arr + 1)`` rows and
             ``len(pre_charged_arr)`` columns, each row is a subformula array
    :raises ValueError: if any element count is negative
    """
    n = len(pre_charged_arr)

    # Number of rows = product of (count + 1); validate while accumulating so a
    # negative count fails with a clear message instead of a bad allocation.
    total_subform_cnt = 1
    for i in range(n):
        count = int(pre_charged_arr[i])
        if count < 0:
            raise ValueError("element counts must be non-negative")
        total_subform_cnt *= count + 1

    subform_arr = np.zeros((total_subform_cnt, n), dtype=np.int16)

    # block_size = number of consecutive rows that share the same value in column i.
    block_size = 1
    for i in range(n):
        level_cnt = int(pre_charged_arr[i]) + 1
        # The column repeats with this period: level_cnt blocks of block_size rows.
        period = block_size * level_cnt

        for start in range(0, total_subform_cnt, period):
            # Level 0 is already in place because the array is zero-initialised.
            for level in range(1, level_cnt):
                lo = start + level * block_size
                subform_arr[lo:lo + block_size, i] = level

        block_size = period

    return subform_arr

In [58]:
# Run the Numba implementation on the parsed precursor formula and inspect the result.
arr = enumerate_subformula(formula_array)
print(arr.shape)
arr

(1386, 12)


array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 1,  0,  0, ...,  0,  0,  0],
       [ 2,  0,  0, ...,  0,  0,  0],
       ...,
       [ 8, 20,  0, ...,  5,  0,  0],
       [ 9, 20,  0, ...,  5,  0,  0],
       [10, 20,  0, ...,  5,  0,  0]], dtype=int16)

In [74]:
import torch

# Use the GPU when one is available; the tensors below are created on `device`.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# One axis per element: the admissible counts 0..e, stored as float32.
dims = []
for e in formula_array:
    dims.append(torch.arange(0, e+1, dtype=torch.float32, device=device))

# Cartesian product of all axes = every subformula, one per row.
# Note the row order in the output: the LAST non-trivial element varies fastest,
# whereas enumerate_subformula varies the FIRST element fastest.
prod = torch.cartesian_prod(*dims)
print(prod.shape)
prod[:30]

torch.Size([1386, 12])


tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 3., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 4., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 5., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 2., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 3., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 4., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 5., 0., 0.],
        [0., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 2., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 2., 0., 0., 0., 0., 0., 0., 0., 2., 0., 0.],
        [0., 2., 0., 0., 0., 0., 0., 0., 0., 3., 0., 0.],
        [0., 2., 0., 0., 0., 0., 0., 0., 0., 4., 0., 0.],
        [0., 2

In [87]:
# Per-element masses, one entry per slot of the 12-element formula array.
# NOTE(review): the values look like monoisotopic masses of
# C, H, Br, Cl, F, I, K, N, Na, O, P, S — confirm the order against msbuddy.
ele_mass_arr = torch.tensor(
    [
        12.000000, 1.007825, 78.918336, 34.968853,
        18.998403, 126.904473, 38.963707, 14.003074,
        22.989769, 15.994915, 30.973762, 31.972071,
    ],
    dtype=torch.float32,
    device=device,
)
electron_mass = 0.0005486
adduct_charge = 1

# Mass of every enumerated subformula: counts . element masses, minus the mass
# of the electrons removed by the charge.
charge_correction = adduct_charge * electron_mass
mass_arr = torch.matmul(prod, ele_mass_arr) - charge_correction
mass_arr

tensor([-5.4860e-04,  1.5994e+01,  3.1989e+01,  ...,  1.8814e+02,
         2.0414e+02,  2.2013e+02], device='cuda:0')

In [94]:
# NOTE(review): each argument here is a scalar count, so every returned grid holds a
# single element (see output) — this does not enumerate 0..count per element.
np.meshgrid(*formula_array)

[array([[[[[[[[[[[[10]]]]]]]]]]]], dtype=int16),
 array([[[[[[[[[[[[20]]]]]]]]]]]], dtype=int16),
 array([[[[[[[[[[[[0]]]]]]]]]]]], dtype=int16),
 array([[[[[[[[[[[[0]]]]]]]]]]]], dtype=int16),
 array([[[[[[[[[[[[0]]]]]]]]]]]], dtype=int16),
 array([[[[[[[[[[[[0]]]]]]]]]]]], dtype=int16),
 array([[[[[[[[[[[[0]]]]]]]]]]]], dtype=int16),
 array([[[[[[[[[[[[0]]]]]]]]]]]], dtype=int16),
 array([[[[[[[[[[[[0]]]]]]]]]]]], dtype=int16),
 array([[[[[[[[[[[[5]]]]]]]]]]]], dtype=int16),
 array([[[[[[[[[[[[0]]]]]]]]]]]], dtype=int16),
 array([[[[[[[[[[[[0]]]]]]]]]]]], dtype=int16)]

In [83]:
# One mass per enumerated subformula (1386 for this formula).
mass_arr.shape

torch.Size([1386])

In [73]:
# Per-element count axes fed to torch.cartesian_prod.
# NOTE(review): the saved output shows int32 tensors while the defining cell uses
# float32 — the output appears to come from an earlier run; re-execute in order.
dims

[tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10], device='cuda:0',
        dtype=torch.int32),
 tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20], device='cuda:0', dtype=torch.int32),
 tensor([0], device='cuda:0', dtype=torch.int32),
 tensor([0], device='cuda:0', dtype=torch.int32),
 tensor([0], device='cuda:0', dtype=torch.int32),
 tensor([0], device='cuda:0', dtype=torch.int32),
 tensor([0], device='cuda:0', dtype=torch.int32),
 tensor([0], device='cuda:0', dtype=torch.int32),
 tensor([0], device='cuda:0', dtype=torch.int32),
 tensor([0, 1, 2, 3, 4, 5], device='cuda:0', dtype=torch.int32),
 tensor([0], device='cuda:0', dtype=torch.int32),
 tensor([0], device='cuda:0', dtype=torch.int32)]

In [67]:
# First 30 rows of the cartesian product (one subformula per row).
# NOTE(review): the saved output shows integer entries, so it likely predates the
# float32 version of `dims` — re-execute in order.
prod[:30]

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0],
        [0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],
        [0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0],
        [0, 2, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0],
        [0, 2, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0],
        [0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 3, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],
        [0, 3, 0, 0, 0, 0, 0, 0, 0

In [23]:
import matplotlib.pyplot as plt

# plt.figure(figsize=(10,50))
# plt.imshow(arr)

In [24]:
# Fraction of entries in the subformula table that are non-zero (sparsity check).
(arr > 0).mean()

0.22456709956709955

In [14]:
# Element-count array for C10H20O5 as returned by read_formula (int16).
formula_array

array([10, 20,  0,  0,  0,  0,  0,  0,  0,  5,  0,  0], dtype=int16)

In [None]:
import torch