In [3]:
# Download and unpack data
# pandas is imported here (not only in a later cell) because this cell already
# calls pd.read_csv; without it a fresh kernel fails with NameError.
import pandas as pd

import utils.data_prep.config as c
from utils.data_prep.download import download


download()

# Get all the CIDs in PCBA
# Only the first column of the tab-separated file is read (the first line is
# treated as a header); its values are flattened into a set so that later
# membership tests on a CID are O(1).
pcba_cid_set = set(pd.read_csv(
    c.PCBA_CID_FILE_PATH,
    sep='\t',
    header=0,
    index_col=None,
    usecols=[0]).values.reshape(-1))


In [4]:
# Structures for overall data storage
import pandas as pd
from rdkit import Chem
from utils.data_prep.smiles_prep import mol_to_smiles

# All three accumulators are (re)created here so the cell can be re-run from a
# clean state instead of appending to whatever is left in the kernel.
# CIDs whose InChI could not be parsed or whose molecule exceeded the limits
unused_cid_list = []
# Element symbol -> number of kept molecules that contain that element
atom_dict = {}
# (CID, Mol) pair for every molecule that passed all the filters
cid_mol_list = []

# Iterate through all the CIDs and get Mol
# The CID -> InChI table is streamed in chunks of 2 ** 12 rows; column 0 becomes
# the index (the CID) and column 1 holds the InChI string.
for chunk_idx, chunk_cid_inchi_df in enumerate(
        pd.read_csv(c.CID_INCHI_FILE_PATH,
                    sep='\t',
                    header=None,
                    index_col=[0],
                    usecols=[0, 1],
                    chunksize=2 ** 12)):

    for cid, row in chunk_cid_inchi_df.iterrows():

        # Skip this compound if it is not in PCBA and the dataset is PCBA
        # (the flag is tested first so the set lookup is skipped when it is off;
        # such compounds are NOT recorded in unused_cid_list)
        if c.PCBA_ONLY and cid not in pcba_cid_set:
            continue

        # Convert to Mol and get SMILES
        mol: Chem.rdchem.Mol = Chem.MolFromInchi(row[1])
        if mol is None:
            # InChI could not be converted into a molecule
            unused_cid_list.append(cid)
            continue
        smiles: str = mol_to_smiles(mol)

        # Skip the molecules with too many atoms or lengthy SMILES
        if mol.GetNumAtoms() > c.MAX_NUM_ATOMS or \
                len(smiles) > c.MAX_LEN_SMILES:
            unused_cid_list.append(cid)
            continue

        # Count the atoms in this molecule
        # Same atoms in a molecule only count once
        atom_set = {atom.GetSymbol() for atom in mol.GetAtoms()}
        for a in atom_set:
            atom_dict[a] = atom_dict.get(a, 0) + 1

        cid_mol_list.append((cid, mol))


KeyboardInterrupt: 

In [None]:
# Saving metadata into files
import h5py
import json

import numpy as np

# HDF5 has no type for arbitrary Python objects, so RDKit molecules are stored
# as their binary serialisation (Mol.ToBinary()); a molecule should be
# recoverable with Chem.Mol(bytes) -- verify against the reader of this file.
SAME_DATASET = True
with h5py.File(c.CID_MOL_HDF5_PATH, 'w') as f:

    # Different approaches: Same dataset or different ones?
    if SAME_DATASET:
        # Same dataset: row i of 'CID-Mol' is the serialised molecule (a
        # variable-length uint8 array) whose CID is row i of 'CID'.
        f.create_dataset(
            name='CID',
            data=np.array([cid for cid, _ in cid_mol_list], dtype=np.int64))
        mol_dataset = f.create_dataset(
            name='CID-Mol',
            shape=(len(cid_mol_list),),
            dtype=h5py.special_dtype(vlen=np.dtype('uint8')))
        for i, (_, mol) in enumerate(cid_mol_list):
            mol_dataset[i] = np.frombuffer(mol.ToBinary(), dtype=np.uint8)
    else:
        # Different datasets in the same group: one opaque blob per molecule,
        # named by its CID (HDF5 object names must be strings).
        g = f.create_group(name='CID-Mol')
        for cid, mol in cid_mol_list:
            g.create_dataset(name=str(cid), data=np.void(mol.ToBinary()))

# Convert atom count into frequency for further usage
# i.e. the fraction of kept molecules that contain each element.
# NOTE: atom_dict is rescaled in place, so running this cell a second time
# without re-running the counting cell divides the values again.
for a in atom_dict:
    atom_dict[a] = atom_dict[a] / len(cid_mol_list)
with open(c.ATOM_DICT_TXT_PATH, 'w') as f:
    json.dump(atom_dict, f, indent=4)

# Dump unused CIDs for further usage
with open(c.UNUSED_CID_TXT_PATH, 'w') as f:
    json.dump(unused_cid_list, f, indent=4)
