In [1]:
# Modules used by the cells below.
import os
import pickle
from pathlib import Path

import torch
import tqdm

# Root of the 1D-only dataset and the entries of its train/oneD_NMR directory.
oneD_dataset_root_path = '/workspace/OneD_Only_Dataset'
files = os.listdir(oneD_dataset_root_path+"/train/oneD_NMR")


In [2]:
import pickle

# Load the pickled index stored under SMILES_dataset/train/Chemical and
# display how many entries it holds.
index_path = '/workspace/SMILES_dataset/train/Chemical/index.pkl'
with open(index_path, 'rb') as index_file:
    index = pickle.load(index_file)
len(index)

109793

In [3]:
import json

# Parse the NP-MRD metadata JSON and map every product name to its accession.
# When two records share a name, the later record wins.
with open('/root/data/NP-MRD-dataset/NP-MRD_metadata/npmrd_natural_products.json') as metadata_file:
    data = json.load(metadata_file)
names_to_npmrd_id = {
    product['name']: product['accession']
    for product in data['np_mrd']['natural_product']
}


In [4]:
# Reverse lookup: NP-MRD accession -> product name.
npmrd_id_to_names = {
    accession: name for name, accession in names_to_npmrd_id.items()
}


In [2]:
def get_nmr_tensors(file_path):
    """Read the distinct C and H shift values from a comma-separated file.

    Every line is split on ','. The first field is taken as the element
    symbol and the third field as the shift value. A line is skipped when it
    has fewer than three fields, when its third field is empty, or when its
    third field is not a number (for example a header row). Bad lines are
    skipped one by one, so a single unparseable line does not discard the
    rest of the file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        tuple: ``(c_tensor, h_tensor)``, two 1-D ``torch.float32`` tensors with
        the distinct values found for element 'C' and element 'H'. Either
        tensor may be empty. The element order comes from set iteration and
        carries no meaning.

    Note:
        This is best-effort: if the file cannot be opened or read, the error
        and the path are printed and whatever was collected so far is
        returned instead of raising.
    """
    c_values = set()
    h_values = set()

    try:
        with open(file_path, 'r') as file:
            for line in file:
                parts = line.strip().split(',')
                if len(parts) < 3 or not parts[2]:
                    continue  # not enough fields, or the value field is empty

                element, _, value, *_ = parts
                try:
                    value = float(value)
                except ValueError:
                    # Header rows / placeholders are not shifts; skip only
                    # this line rather than aborting the whole file.
                    continue

                if element == 'C':
                    c_values.add(value)
                elif element == 'H':
                    h_values.add(value)
    except Exception as e:
        # Deliberately broad: one unreadable file must not stop a bulk scan.
        print("error", e)
        print("file_path", file_path)

    # Convert sets to tensors
    c_tensor = torch.tensor(list(c_values), dtype=torch.float32)
    h_tensor = torch.tensor(list(h_values), dtype=torch.float32)
    return c_tensor, h_tensor

# Scan every file in the NP-MRD shift-assignment directory and remember the
# names of those whose largest H value is below 0.5.
weird_H_file_paths = []
NP_MRD_FILES_dir = '/root/data/NP-MRD-dataset/NP-MRD-shift-assignments'
npmrd_files_txt_only = os.listdir(NP_MRD_FILES_dir)
for file_name in tqdm.tqdm(npmrd_files_txt_only):
    c_shifts, h_shifts = get_nmr_tensors(NP_MRD_FILES_dir + '/' + file_name)
    if len(h_shifts) == 0:
        # No H values at all: nothing to compare.
        continue
    if max(h_shifts) < 0.5:
        weird_H_file_paths.append(str(file_name))


  0%|          | 340/265316 [00:57<12:22:07,  5.95it/s]


KeyboardInterrupt: 

In [7]:

# Re-create the stored (C, H) tensor pair for every entry of the 1D-only
# dataset whose saved tensors are both empty, by re-parsing the matching
# tab-separated NP-MRD shift-assignment file.
# Relies on names defined by earlier cells: names_to_npmrd_id,
# npmrd_id_to_names, npmrd_files_txt_only and NP_MRD_FILES_dir.
dataset_path = Path('/workspace/OneD_Only_Dataset/')

# Third-column values that mark a header / missing-value row, not a shift.
NON_NUMERIC_SHIFT_MARKERS = ("chemical_shift", "shift", "NA")

for split in ['test', 'val', "train"]:

    # NOTE(review): pickle.load can execute arbitrary code; only load index
    # files that come from a trusted source.
    with open(dataset_path / split / 'Chemical/index.pkl', 'rb') as index_file:
        index_to_chemical_name = pickle.load(index_file)
    chemical_name_to_indx = {name: idx for idx, name in index_to_chemical_name.items()}
    path_1d = dataset_path / split
    oneD_NMRs = os.listdir(path_1d / 'oneD_NMR')

    # Tally which tensors are populated and collect files where both are empty.
    only_c, only_h, c_and_h = 0, 0, 0
    empty = []
    for i in oneD_NMRs:
        c_tensor, h_tensor = torch.load(path_1d / 'oneD_NMR' / i)
        if len(c_tensor) > 0 and len(h_tensor) > 0:
            c_and_h += 1
        elif len(c_tensor) > 0:
            only_c += 1
        elif len(h_tensor) > 0:
            only_h += 1
        else:
            empty.append(i)

    print(f'Only C: {only_c} | Only H: {only_h} | Both C and H: {c_and_h}')

    # A set keeps the membership test below O(1); with a list the filter is
    # O(len(npmrd_files_txt_only) * len(empty)).
    np_ids_with_empty_tensors = {
        names_to_npmrd_id[index_to_chemical_name[int(file_index.split(".")[0])]]
        for file_index in empty
    }
    txt_file_locations = [
        txt_name
        for txt_name in npmrd_files_txt_only
        if txt_name.split("_")[0] in np_ids_with_empty_tensors
    ]
    # Every empty entry must have exactly one source text file.
    assert len(txt_file_locations) == len(empty), (
        f"{len(txt_file_locations)} source files for {len(empty)} empty entries"
    )

    for txt_f in tqdm.tqdm(txt_file_locations):
        chemical_name = npmrd_id_to_names[txt_f.split("_")[0]]
        file_index = chemical_name_to_indx[chemical_name]
        save_path = dataset_path / split / 'oneD_NMR' / f"{file_index}.pt"

        c_values = set()
        h_values = set()
        with open(Path(NP_MRD_FILES_dir) / txt_f) as f:
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) < 3 or not parts[2]:
                    continue  # not enough fields, or the value field is empty

                _atom_id, element, value, *_ = parts
                if value in NON_NUMERIC_SHIFT_MARKERS:
                    continue

                try:
                    value = float(value)
                except ValueError:
                    # Show the offending row before re-raising; the names
                    # printed here are always bound at this point.
                    print(txt_f)
                    print(element, value)
                    print(parts)
                    raise

                if element == 'C':
                    c_values.add(value)
                elif element == 'H':
                    h_values.add(value)

        if len(c_values) == 0 and len(h_values) == 0:
            print("No c and H")

        c_tensor = torch.tensor(list(c_values), dtype=torch.float32)
        h_tensor = torch.tensor(list(h_values), dtype=torch.float32)
        torch.save([c_tensor, h_tensor], save_path)
        
                


Only C: 1122 | Only H: 2 | Both C and H: 19011


100%|██████████| 1866/1866 [06:54<00:00,  4.50it/s]


Only C: 1147 | Only H: 4 | Both C and H: 19163


100%|██████████| 1843/1843 [06:33<00:00,  4.69it/s]


Only C: 9127 | Only H: 33 | Both C and H: 153513


100%|██████████| 14989/14989 [52:11<00:00,  4.79it/s] 


In [11]:
from rdkit import Chem

def get_canonical_smiles(datum):
    """Return the canonical SMILES string for one metadata record.

    The record's ``'smiles'`` entry is parsed first. If that yields no
    molecule or raises, the record's ``'inchi'`` entry is tried instead.

    Args:
        datum: Mapping with a ``'smiles'`` key and, for the fallback, an
            ``'inchi'`` key.

    Returns:
        str | None: The canonical SMILES string, or ``None`` when neither
        representation can be converted (including a missing ``'inchi'``
        key).

    Raises:
        KeyError: If ``datum`` has no ``'smiles'`` key.

    Note:
        Only ``Exception`` subclasses are treated as conversion failures.
        ``KeyboardInterrupt`` and ``SystemExit`` propagate, so a long-running
        loop over many records can still be interrupted.
    """
    smiles = datum['smiles']

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        pass  # fall through to the InChI representation

    try:
        mol = Chem.MolFromInchi(datum['inchi'])
        if mol is not None:
            return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        pass  # neither representation is usable

    return None
# Canonical SMILES -> NP-MRD accession for every metadata record. Records that
# get_canonical_smiles cannot convert all share the key None, and on any key
# clash the later record wins.
canonical_smiles_to_npmrd_id = {
    get_canonical_smiles(product): product['accession']
    for product in tqdm.tqdm(data['np_mrd']['natural_product'])
}

 10%|▉         | 27350/280242 [00:06<01:19, 3187.58it/s][01:59:19] Conflicting single bond directions around double bond at index 2.
[01:59:19]   BondStereo set to STEREONONE and single bond directions set to NONE.
 18%|█▊        | 51591/280242 [00:13<01:02, 3655.43it/s][01:59:26] Can't kekulize mol.  Unkekulized atoms: 2 3 31

 21%|██        | 58499/280242 [00:15<00:56, 3921.12it/s][01:59:27] Explicit valence for atom # 2 N, 4, is greater than permitted
[01:59:27] Explicit valence for atom # 9 N, 4, is greater than permitted
[01:59:27] ERROR: Explicit valence for atom # 9 N, 4, is greater than permitted

 21%|██        | 58899/280242 [00:15<00:58, 3807.48it/s][01:59:27] Explicit valence for atom # 10 O, 3, is greater than permitted
[01:59:27] Explicit valence for atom # 40 B, 7, is greater than permitted
[01:59:27] ERROR: Explicit valence for atom # 40 B, 7, is greater than permitted

 23%|██▎       | 64253/280242 [00:16<00:45, 4797.80it/s][01:59:28] Explicit valence for atom # 20 N, 

In [14]:
# Re-create the stored (C, H) tensor pair for every entry of the SMILES
# dataset whose saved tensors are both empty, by re-parsing the matching
# tab-separated NP-MRD shift-assignment file.
# Relies on names defined by earlier cells: canonical_smiles_to_npmrd_id,
# npmrd_files_txt_only and NP_MRD_FILES_dir.

# Reverse lookup: NP-MRD accession -> canonical SMILES.
npmrd_id_to_smiles = {
    accession: canonical for canonical, accession in canonical_smiles_to_npmrd_id.items()
}

dataset_path = Path('/workspace/SMILES_dataset')

# Third-column values that mark a header / missing-value row, not a shift.
NON_NUMERIC_SHIFT_MARKERS = ("chemical_shift", "shift", "NA")

for split in ['test', 'val', "train"]:

    # NOTE(review): pickle.load can execute arbitrary code; only load index
    # files that come from a trusted source.
    with open(dataset_path / split / 'SMILES/index.pkl', 'rb') as index_file:
        index_to_smiles = pickle.load(index_file)
    smiles_to_index = {smi: idx for idx, smi in index_to_smiles.items()}
    path_1d = dataset_path / split
    oneD_NMRs = os.listdir(path_1d / 'oneD_NMR')

    # Tally which tensors are populated and collect files where both are empty.
    only_c, only_h, c_and_h = 0, 0, 0
    empty = []
    for i in oneD_NMRs:
        c_tensor, h_tensor = torch.load(path_1d / 'oneD_NMR' / i)
        if len(c_tensor) > 0 and len(h_tensor) > 0:
            c_and_h += 1
        elif len(c_tensor) > 0:
            only_c += 1
        elif len(h_tensor) > 0:
            only_h += 1
        else:
            empty.append(i)

    print(f'Only C: {only_c} | Only H: {only_h} | Both C and H: {c_and_h}')

    # A set keeps the membership test below O(1); with a list the filter is
    # O(len(npmrd_files_txt_only) * len(empty)).
    np_ids_with_empty_tensors = {
        canonical_smiles_to_npmrd_id[index_to_smiles[int(file_index.split(".")[0])]]
        for file_index in empty
    }
    txt_file_locations = [
        txt_name
        for txt_name in npmrd_files_txt_only
        if txt_name.split("_")[0] in np_ids_with_empty_tensors
    ]
    # Every empty entry must have exactly one source text file.
    assert len(txt_file_locations) == len(empty), (
        f"{len(txt_file_locations)} source files for {len(empty)} empty entries"
    )
    print(len(empty))

    for txt_f in tqdm.tqdm(txt_file_locations):
        smiles = npmrd_id_to_smiles[txt_f.split("_")[0]]
        file_index = smiles_to_index[smiles]
        save_path = dataset_path / split / 'oneD_NMR' / f"{file_index}.pt"

        c_values = set()
        h_values = set()
        with open(Path(NP_MRD_FILES_dir) / txt_f) as f:
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) < 3 or not parts[2]:
                    continue  # not enough fields, or the value field is empty

                _atom_id, element, value, *_ = parts
                if value in NON_NUMERIC_SHIFT_MARKERS:
                    continue

                try:
                    value = float(value)
                except ValueError:
                    # Show the offending row before re-raising; the names
                    # printed here are always bound at this point.
                    print(txt_f)
                    print(element, value)
                    print(parts)
                    raise

                if element == 'C':
                    c_values.add(value)
                elif element == 'H':
                    h_values.add(value)

        if len(c_values) == 0 and len(h_values) == 0:
            print("No c and H")

        c_tensor = torch.tensor(list(c_values), dtype=torch.float32)
        h_tensor = torch.tensor(list(h_values), dtype=torch.float32)
        torch.save([c_tensor, h_tensor], save_path)
        
                


Only C: 215 | Only H: 0 | Both C and H: 3930
118


  3%|▎         | 4/118 [00:01<00:41,  2.77it/s]

100%|██████████| 118/118 [00:35<00:00,  3.33it/s]


Only C: 220 | Only H: 0 | Both C and H: 3986
126


100%|██████████| 126/126 [00:33<00:00,  3.79it/s]


Only C: 1851 | Only H: 0 | Both C and H: 31938
1017


100%|██████████| 1017/1017 [03:31<00:00,  4.80it/s]


In [2]:
import os

# Display how many entries the training oneD_NMR directory contains.
train_oned_nmr_dir = '/workspace/OneD_Only_Dataset/train/oneD_NMR/'
file_paths = os.listdir(train_oned_nmr_dir)
len(file_paths)

6713

In [26]:
# Load the pickled object into out_2. The with-block closes the file handle
# deterministically instead of leaving it to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('/root/MorganFP_prediction/reproduce_previous_works/smart4.5/out_2.pkl', 'rb') as out_2_file:
    out_2 = pickle.load(out_2_file)

In [2]:
import pickle

# Load the pickled object into a. The with-block closes the file handle
# deterministically instead of leaving it to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('/root/MorganFP_prediction/reproduce_previous_works/smart4.5/notebooks/dataset_building/FP_on_bits_pickles/Exact_FP_on_bits_r0_r15_len_1024_2d_train.pkl', 'rb') as fp_pickle_file:
    a = pickle.load(fp_pickle_file)

In [10]:
# Display item 34 of the first element of the loaded object.
first_entry = a[0]
first_entry[34]

1478.0

In [2]:
import numpy as np

# Quick check: indexing an array with a list built from another array picks
# the elements at those positions.
a = np.arange(1, 6)
b = np.array([2, 3])
a[list(b)]

array([3, 4])