2024/4/19

Build r0–r'n' fingerprints (one fingerprint per radius from 0 up to n, with n in [1, 15]),
using both the exact-radius FP and the normal (cumulative) Morgan FP.

In the end, we will (usually) pick the 6144 positions with the highest entropy.


In [1]:

import pickle
from pathlib import Path
import time, torch, os
from fingerprint_utils import FP_generator
batch_size=64
import tqdm
import numpy as np
from matplotlib import pyplot as plt
from rdkit.Chem import AllChem
from rdkit import Chem

import sys, pathlib
# Repository root used by later cells to build output paths.
# parents[2] is the directory three levels above the current working directory,
# so this value depends on where the notebook kernel was started.
repo_path = pathlib.Path.cwd().parents[2]
repo_path

PosixPath('/root/MorganFP_prediction/reproduce_previous_works')

In [2]:
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

def generate_FP_with_exact_radius(mol, radius=2, length=6144):
    """Return a 0/1 list fingerprint keeping only bits set by environments of exactly ``radius``.

    Args:
        mol: molecule handed to ``rdMolDescriptors.GetMorganFingerprintAsBitVect``.
        radius: Morgan radius; also the radius an environment must have for its bit to be kept.
        length: number of bits in the fingerprint (length of the returned list).

    Returns:
        A Python list of ``length`` ints (0 or 1).
    """
    # Filled in by the fingerprint call: bit index -> entries whose element [1]
    # is the radius of the environment that set the bit.
    environments_by_bit = {}
    rdMolDescriptors.GetMorganFingerprintAsBitVect(
        mol, radius=radius, nBits=length, bitInfo=environments_by_bit
    )

    exact_fp = [0] * length
    for bit_index, environments in environments_by_bit.items():
        for environment in environments:
            # One environment with the requested radius is enough to keep the bit.
            if environment[1] == radius:
                exact_fp[bit_index] = 1
                break

    return exact_fp


def generate_normal_FP(mol, radius=2, length=6144):
    """Return the Morgan bit fingerprint of ``mol`` as a 0/1 Python list.

    Every bit reported in the ``bitInfo`` mapping of the fingerprint call is
    set to 1, whatever the radius of the environment that produced it.

    Args:
        mol: molecule handed to ``rdMolDescriptors.GetMorganFingerprintAsBitVect``.
        radius: Morgan radius passed to the fingerprint call.
        length: number of bits in the fingerprint (length of the returned list).

    Returns:
        A Python list of ``length`` ints (0 or 1).
    """
    # Populated by the fingerprint call; only its keys (bit indices) are used here.
    reported_bits = {}
    rdMolDescriptors.GetMorganFingerprintAsBitVect(
        mol, radius=radius, nBits=length, bitInfo=reported_bits
    )

    dense_fp = [0] * length
    for position in reported_bits:
        dense_fp[position] = 1
    return dense_fp
   


In [3]:

def generate_FP_on_bits_with_exact_radius(mol, radius=2, length=6144):
    """Return the indices of bits set by environments of exactly ``radius``.

    Args:
        mol: molecule handed to ``rdMolDescriptors.GetMorganFingerprintAsBitVect``.
        radius: Morgan radius; also the radius an environment must have for its bit to be kept.
        length: number of bits in the fingerprint.

    Returns:
        A 1-D ``np.int64`` array of bit indices, in the iteration order of the
        ``bitInfo`` mapping. The array is empty (but still integer-typed) when
        no environment has the requested radius.
    """
    # Filled in by the fingerprint call: bit index -> entries whose element [1]
    # is the radius of the environment that set the bit.
    bitInfo = {}
    rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=length, bitInfo=bitInfo)

    on_bits = [
        bit
        for bit, environments in bitInfo.items()
        if any(environment[1] == radius for environment in environments)
    ]

    # An explicit integer dtype matters: np.array([]) defaults to float64, and a
    # single empty float array would promote a whole np.concatenate() of index
    # arrays to float64.
    return np.array(on_bits, dtype=np.int64)


def generate_normal_FP_on_bits(mol, radius=2, length=6144):
    """Return the indices of the on bits of the Morgan bit fingerprint of ``mol``.

    Args:
        mol: molecule handed to ``rdMolDescriptors.GetMorganFingerprintAsBitVect``.
        radius: Morgan radius passed to the fingerprint call.
        length: number of bits in the fingerprint.

    Returns:
        A 1-D ``np.int64`` array holding the values of ``fp.GetOnBits()``; empty
        (but still integer-typed) when no bit is set.
    """
    # No bitInfo is requested: only the on-bit indices are needed here.
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=length)

    # An explicit integer dtype keeps an empty result from defaulting to float64.
    return np.array(list(fp.GetOnBits()), dtype=np.int64)

In [4]:
'''This generates exact-r FP and save the count at each position !!!
Gonna take you a few minutes 
'''
# NOTE(review): the banner above mentions counts, but the function defined in this
# cell stores the on-bit indices of each molecule (the count code further down is
# commented out).

# Make sure the output directory for the pickled on-bit indices exists.
os.makedirs(Path(f'{repo_path}/notebooks/dataset_building/FP_on_bits_pickles'), exist_ok=True)
def generate_FP_indices_of_r0_r15(split, FP_length, generation_method, dataset="2d"):
    """Compute and pickle the on-bit indices of the radius-0..15 fingerprints of a split.

    For every molecule, one fingerprint of ``FP_length`` bits is computed per
    radius in 0..15. The on-bit indices of the radius-``r`` fingerprint are
    shifted by ``r * FP_length``, so all 16 fingerprints share one index space of
    size ``16 * FP_length``, and the shifted indices are concatenated.

    Args:
        split: name of the split sub-directory (the notebook uses "train", "val", "test").
        FP_length: number of bits of each per-radius fingerprint.
        generation_method: "exact" (only environments of exactly that radius)
            or "normal" (plain Morgan fingerprint of that radius).
        dataset: "2d" or "1d"; selects the dataset root directory.

    Returns:
        dict mapping each key of ``SMILES/index.pkl`` to a 1-D ``np.int64``
        array of on-bit indices. The same dict is pickled under
        ``{repo_path}/notebooks/dataset_building/FP_on_bits_pickles``.

    Raises:
        ValueError: if ``generation_method`` or ``dataset`` is not recognised,
            or if a SMILES string cannot be parsed by RDKit.
    """
    num_plain_FPs = 16  # radii 0..15, matching the "r0_r15" in the file names
    if generation_method == "exact":
        generate_FP_on_bits = generate_FP_on_bits_with_exact_radius
        save_name = f"Exact_FP_on_bits_r0_r15_len_{FP_length}_{dataset}_{split}.pkl"
    elif generation_method == "normal":
        generate_FP_on_bits = generate_normal_FP_on_bits
        save_name = f"Normal_FP_on_bits_r0_r15_len_{FP_length}_{dataset}_{split}.pkl"
    else:
        raise ValueError("generation_method should be exact or normal")
    if dataset == "2d":
        path_dir = Path("/workspace/SMILES_dataset/")
    elif dataset == "1d":
        path_dir = Path("/workspace/OneD_Only_Dataset/")
    else:
        raise ValueError("dataset should be 2d or 1d")

    # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
    # The context manager closes the handle that the bare open() call used to leak.
    with open(path_dir / split / "SMILES/index.pkl", "rb") as f:
        smile_nmr = pickle.load(f)

    FP_on_bits = {}
    for file_idx, smile_str in tqdm.tqdm(smile_nmr.items()):
        mol = Chem.MolFromSmiles(smile_str)
        if mol is None:
            # Fail with the offending entry instead of an opaque error from AddHs.
            raise ValueError(
                f"RDKit could not parse SMILES {smile_str!r} (file index {file_idx!r})"
            )
        mol_H = Chem.AddHs(mol)  # make the hydrogens explicit atoms of the molecule
        all_plain_fps = [
            # The integer cast keeps an empty per-radius result (float64 by NumPy's
            # default) from promoting the concatenated indices to float.
            np.asarray(
                generate_FP_on_bits(mol_H, radius=radius, length=FP_length),
                dtype=np.int64,
            )
            + radius * FP_length
            for radius in range(num_plain_FPs)
        ]
        FP_on_bits[file_idx] = np.concatenate(all_plain_fps)

    save_dir = Path(f'{repo_path}/notebooks/dataset_building/FP_on_bits_pickles')
    # Create the directory here so the function does not rely on another cell having run.
    save_dir.mkdir(parents=True, exist_ok=True)

    FP_on_bits_path = save_dir / save_name
    with open(FP_on_bits_path, 'wb') as f:
        pickle.dump(FP_on_bits, f)

    return FP_on_bits

# count = np.zeros(6144*num_plain_FPs)
    
    
    
    # count+= concated_FP
# np.save(f"count_exact_r0_to_r{num_plain_FPs-1}_FP.npy", count)

In [5]:
# Build the test/val pickles: 6144-bit normal FPs first, then 1024-bit exact FPs.
# Within each configuration the order is 2d test, 2d val, 1d test, 1d val.
for fp_length, method in ((6144, "normal"), (1024, "exact")):
    for dataset_kind in ("2d", "1d"):
        for split_name in ("test", "val"):
            generate_FP_indices_of_r0_r15(split_name, fp_length, method, dataset=dataset_kind)

print("done")
# DONE

100%|██████████| 13718/13718 [02:41<00:00, 84.84it/s] 
100%|██████████| 13756/13756 [02:41<00:00, 85.05it/s] 
100%|██████████| 8390/8390 [01:35<00:00, 87.90it/s] 
100%|██████████| 8337/8337 [01:34<00:00, 87.91it/s] 
100%|██████████| 13718/13718 [01:59<00:00, 114.35it/s]
100%|██████████| 13756/13756 [02:00<00:00, 114.14it/s]
100%|██████████| 8390/8390 [01:10<00:00, 119.36it/s]
100%|██████████| 8337/8337 [01:09<00:00, 120.29it/s]


done


In [6]:
# Build and pickle the 6144-bit normal-FP on-bit indices for the training split
# of both datasets. The returned dicts are also kept in notebook variables.
# These are the slowest runs (about 13 and 22 minutes in the recorded output below).
FP_on_bits_6144_1d_train_normal = generate_FP_indices_of_r0_r15("train", 6144, "normal", dataset="1d")
FP_on_bits_6144_2d_train_normal = generate_FP_indices_of_r0_r15("train", 6144, "normal", dataset="2d")
# DONE 





  0%|          | 26/66951 [00:00<09:24, 118.64it/s]

100%|██████████| 66951/66951 [13:14<00:00, 84.32it/s] 
100%|██████████| 109793/109793 [22:14<00:00, 82.30it/s] 


In [7]:
# Done
# FP_on_bits_1024_2d_train_exact = generate_FP_indices_of_r0_r15("train", 1024, "exact", dataset="2d")
# FP_on_bits_1024_1d_train_exact = generate_FP_indices_of_r0_r15("train", 1024, "exact", dataset="1d")

In [11]:
# Sanity check: load one saved HSQC tensor from the 2d training set.
# (torch is already imported in the first cell; the import is repeated here.)
import torch
# presumably one row per HSQC peak with (13C shift, 1H shift, signed intensity) — TODO confirm
a = torch.load("/workspace/SMILES_dataset/train/HSQC/10.pt")

In [12]:
# Display the tensor loaded in the previous cell.
a

tensor([[ 1.1858e+02,  6.6889e+00,  5.1766e+03],
        [ 1.1525e+02,  6.6885e+00,  3.4739e+03],
        [ 1.1416e+02,  6.4485e+00,  5.7442e+03],
        [ 1.1269e+02,  7.4327e+00,  5.2823e+03],
        [ 5.8650e+01,  3.3635e+00,  5.4089e+03],
        [ 5.6020e+01,  2.8700e+00, -3.4106e+03],
        [ 5.6020e+01,  2.4000e+00, -4.4193e+03],
        [ 4.9340e+01,  3.1400e+00, -4.6795e+03],
        [ 4.9340e+01,  2.4500e+00, -5.8852e+03],
        [ 3.6460e+01,  3.1278e+00, -5.4089e+03],
        [ 3.6460e+01,  2.4146e+00, -8.6082e+03],
        [ 2.8980e+01,  3.1750e+00, -6.1454e+03],
        [ 2.8980e+01,  2.7850e+00, -7.5481e+03],
        [ 1.9100e+01,  1.5700e+00, -1.2291e+04],
        [ 1.2000e+01,  9.4000e-01,  1.7656e+04]], dtype=torch.float64)