# Mass Spec Data Processing

In [1]:
import os
import json
from tqdm import tqdm
MS_DATA_PATH = '/root/gurusmart/json'
all_data = []
# Every file in MS_DATA_PATH is expected to hold a JSON list of records; the lists are
# concatenated into one flat list. Files are visited in sorted order because
# os.listdir() returns names in arbitrary order, which would make the order of
# `all_data` differ between runs.
for filepath in tqdm(sorted(os.listdir(MS_DATA_PATH))):
    # Use a context manager so each file handle is closed as soon as it is parsed
    # instead of being left for the garbage collector.
    with open(os.path.join(MS_DATA_PATH, filepath), 'r') as json_file:
        all_data += json.load(json_file)

100%|██████████| 1884/1884 [10:58<00:00,  2.86it/s]


In [2]:
from rdkit import Chem
# RDKit may log "WARNING: not removing hydrogen atom without neighbors" while parsing; this is expected and can be ignored.
def standardize(smiles):
    """Return the canonical, stereochemistry-free SMILES for ``smiles``.

    The input is parsed with RDKit, stereo information is stripped from the
    parsed molecule, and the molecule is written back out as canonical SMILES.
    When RDKit cannot parse the input, a message is printed and ``None`` is
    returned instead.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        # RemoveStereochemistry modifies the molecule in place.
        Chem.RemoveStereochemistry(parsed)
        canonical_smiles = Chem.MolToSmiles(parsed, canonical=True)
        return canonical_smiles
    print(f'Failed to convert smiles string {smiles}, continuing...')
    return None

# Standardise the SMILES of every record in place, counting how many strings were
# rewritten and how many could not be converted (those records keep their original SMILES).
update_cnt = 0
err_cnt = 0
for data in tqdm(all_data):
    original_smiles = data['SMILES']
    standardized = standardize(original_smiles)
    if standardized:
        if original_smiles != standardized:
            update_cnt += 1
        data['SMILES'] = standardized
    else:
        # standardize() returned None (or an empty string): count it and move on.
        err_cnt += 1
print(f'Updated {update_cnt}/{len(all_data)} to canonical smiles, {err_cnt} failed to convert from smiles.')

 53%|█████▎    | 100013/188385 [00:24<00:21, 4034.04it/s][21:52:36] Explicit valence for atom # 20 N, 5, is greater than permitted
 54%|█████▎    | 100837/188385 [00:24<00:21, 4052.71it/s]

Failed to convert smiles string CC=CC1CC2C(=O)OC(C)C(NC(=O)C(CC3CC3[NH2+2]([O-])[O-])NC(=O)c3ccc(Cl)n3O)C(=O)NC(C(C)c3ccccc3)C(=O)NC(CC3CC3[NH2+2]([O-])[O-])C(=O)NC(C(C)c3ccccc3)C(=O)NC(C(C)CC)C(=O)N2C1, continuing...


100%|██████████| 188385/188385 [00:45<00:00, 4110.70it/s]

Updated 0/188385 to canonical smiles, 1 failed to convert from smiles.





In [6]:
# Re-parse, in isolation, the SMILES string that was reported as failing to convert,
# to reproduce the RDKit error message on its own.
Chem.MolFromSmiles(
    'CC=CC1CC2C(=O)OC(C)C(NC(=O)C(CC3CC3[NH2+2]([O-])[O-])NC(=O)c3ccc(Cl)n3O)C(=O)NC(C(C)c3ccccc3)C(=O)NC(CC3CC3[NH2+2]([O-])[O-])C(=O)NC(C(C)c3ccccc3)C(=O)NC(C(C)CC)C(=O)N2C1'
)

[21:29:45] Explicit valence for atom # 20 N, 5, is greater than permitted


In [11]:
import pickle
from collections import defaultdict

HSQC_DATA_PATH = '/workspace/SMILES_dataset'
ONED_DATA_PATH = '/workspace/OneD_Only_Dataset'


def _load_smiles_index(dataset_root, split):
    """Load ``<dataset_root>/<split>/SMILES/index.pkl`` and return the unpickled object.

    The result is used below as a mapping (it must support ``.items()``).
    The file is opened with a context manager so the handle is closed right after loading.
    """
    index_path = os.path.join(dataset_root, split, 'SMILES', 'index.pkl')
    # NOTE: unpickling can execute arbitrary code; only load index files from a trusted source.
    with open(index_path, 'rb') as index_file:
        return pickle.load(index_file)


hsqc_train_data = _load_smiles_index(HSQC_DATA_PATH, 'train')
hsqc_val_data = _load_smiles_index(HSQC_DATA_PATH, 'val')
hsqc_test_data = _load_smiles_index(HSQC_DATA_PATH, 'test')
oned_train_data = _load_smiles_index(ONED_DATA_PATH, 'train')
oned_val_data = _load_smiles_index(ONED_DATA_PATH, 'val')
oned_test_data = _load_smiles_index(ONED_DATA_PATH, 'test')

# Order matters: later cells address these maps by position.
maps = [hsqc_train_data, hsqc_val_data, hsqc_test_data, oned_train_data, oned_val_data, oned_test_data]
# For each map, build the reverse lookup value -> list of keys. A list is needed
# because several keys may share the same value.
inverse_maps = [defaultdict(list) for _ in maps]
for data, rev_data in zip(maps, inverse_maps):
    for k, v in data.items():
        rev_data[v].append(k)

In [12]:
# Sanity check: print the number of entries in each loaded index map, in the order of `maps`.
# NOTE: the loop variable `data` stays bound at module level after this cell runs.
for data in maps:
    print(len(data))

109793
13756
13718
66951
8337
8390


In [13]:
# Sanity check: summing the lengths of all key lists in an inverse map should give back
# the number of entries of the map it was built from.
for data in inverse_maps:
    print(sum(map(len, data.values())))

109793
13756
13718
66951
8337
8390


In [14]:
# Number of distinct keys in each inverse map, i.e. the number of distinct values in the
# corresponding original map. A count smaller than that map's size means several
# keys of the original map share one value.
# NOTE: the loop variable `data` stays bound at module level after this cell runs.
for data in inverse_maps:
    print(len(data))

89208
11151
11150
66951
8337
8390


In [9]:
# NOTE(review): `data` is not defined in this cell; it is whatever object an earlier loop
# left bound to that name, so the result depends on which cells ran last (this cell's
# execution count is lower than that of the cells above it). When the notebook is run
# top to bottom, the most recent binding comes from the `for data in inverse_maps` loop,
# which would not be an index -> SMILES lookup as the saved output suggests.
# TODO: replace `data` with the explicit map that was meant to be inspected.
data[8]

'CCCCCCCCCCCCCCCCCCC(=O)O'

In [19]:
# Make sure a MassSpec output directory exists for every split of both datasets;
# directories that already exist are left untouched.
for split in ('train', 'val', 'test'):
    for ds in ('OneD_Only_Dataset', 'SMILES_dataset'):
        mass_spec_dir = os.path.join('/workspace', ds, split, 'MassSpec')
        os.makedirs(mass_spec_dir, exist_ok=True)

In [25]:
import torch
# Output directory (relative to /workspace) for each element of `inverse_maps`, in the same order.
MASS_SPEC_SPLIT_DIRS = [
    'SMILES_dataset/train', 'SMILES_dataset/val', 'SMILES_dataset/test',
    'OneD_Only_Dataset/train', 'OneD_Only_Dataset/val', 'OneD_Only_Dataset/test',
]

# For every mass-spec record, find the first split whose inverse map contains the record's
# SMILES and write the record's peaks to <split>/MassSpec/<idx>.pt for each matching index.
# If several records resolve to the same index, a later record overwrites the file
# written by an earlier one.
for entry in tqdm(all_data):
    idxs = None
    for data_split, inverse_map in zip(MASS_SPEC_SPLIT_DIRS, inverse_maps):
        # .get() is used on purpose: indexing a defaultdict would insert the missing key.
        idxs = inverse_map.get(entry['SMILES'])
        if idxs is not None:
            break
    if idxs is None:
        # Stop at the first record that matches no split.
        print(f'Failed to match {entry["SMILES"]}')
        break
    # Save the peaks of the CURRENT record. The previous code always saved
    # all_data[0]['peaks'], so every output file contained the first record's spectrum.
    # The tensor is built once per record rather than once per output file.
    peaks = torch.tensor(entry['peaks'], dtype=torch.float64)
    for idx in idxs:
        with open(os.path.join('/workspace', data_split, 'MassSpec', f'{idx}.pt'), 'wb') as f:
            torch.save(peaks, f)
    
    

100%|██████████| 188385/188385 [00:34<00:00, 5450.43it/s]


In [26]:
# Spot check: show the value stored under key 10112 in the HSQC validation index.
hsqc_val_data[10112]


'COC1CCC2(C)C(=CCC3C2CCC2(C)C3CCC2C(C)N(C)C)C1'