This file unpacks and splits the SDF files downloaded from the PubChem FTP server.

In [1]:
import gzip
import shutil
from glob import glob
from os import makedirs
from os.path import abspath, basename, join

from rdkit import RDLogger
from rdkit.Chem import Mol, SDMolSupplier, SDWriter, MolToSmiles
RDLogger.logger().setLevel(RDLogger.CRITICAL)



In [2]:
# Number of unique molecules (keyed by PubChem CID) to keep in the processed file.
NUM_MOLS = 4096

# Data locations, resolved against the notebook's current working directory.
RAW_DATA_DIR = abspath('../data/raw')
INTERIM_DATA_DIR = abspath('../data/interim')
PROCESSED_DATA_DIR = abspath('../bcgraph/faker/data')
PROCESSED_SDF_PATH = join(PROCESSED_DATA_DIR, 'pubchem_mols.sdf')

# Echo the resolved locations so a wrong working directory is spotted early.
for _label, _location in (
    ('Raw data directory', RAW_DATA_DIR),
    ('Interim data directory', INTERIM_DATA_DIR),
    ('Processed data directory', PROCESSED_DATA_DIR),
    ('Processed SDF file path', PROCESSED_SDF_PATH),
):
    print(f'{_label}: {_location}')

Raw data directory: /Users/xduan7/projects/biochem-graph/data/raw
Interim data directory: /Users/xduan7/projects/biochem-graph/data/interim
Processed data directory: /Users/xduan7/projects/biochem-graph/bcgraph/faker/data
Processed SDF file path: /Users/xduan7/projects/biochem-graph/bcgraph/faker/data/pubchem_mols.sdf


In [3]:
# Unpack every '*.sdf.gz' archive found in the raw data directory into the
# interim data directory, and collect the unpacked file paths in
# `sdf_file_paths` for the following cell.

# Create the interim directory if it is missing so the cell also runs on a
# fresh checkout.
makedirs(INTERIM_DATA_DIR, exist_ok=True)

sdf_file_paths = []
# glob() returns matches in arbitrary, filesystem-dependent order; sorting
# makes the order of `sdf_file_paths` (and therefore which molecules are picked
# later) the same on every machine.
for _sdf_gz_file_path in sorted(glob(join(RAW_DATA_DIR, '*.sdf.gz'))):

    # Drop only the trailing '.gz' (guaranteed by the glob pattern);
    # str.replace would also remove a '.gz' appearing earlier in the name.
    _sdf_file_path = join(
        INTERIM_DATA_DIR,
        basename(_sdf_gz_file_path)[:-len('.gz')],
    )

    # Stream the decompressed bytes to disk without loading the file in memory.
    with gzip.open(_sdf_gz_file_path, 'rb') as _f_in, \
            open(_sdf_file_path, 'wb') as _f_out:
        shutil.copyfileobj(_f_in, _f_out)

    sdf_file_paths.append(_sdf_file_path)

In [4]:
# Save the first NUM_MOLS unique molecules (unique by PubChem CID) found in
# the unpacked SDF files into a single processed SDF file. Every property
# except the CID is stripped from each molecule before it is written.

# Create the output directory if it is missing so the writer can open the file.
makedirs(PROCESSED_DATA_DIR, exist_ok=True)

_sd_writer = SDWriter(PROCESSED_SDF_PATH)
_included_cids = set()
_mol: Mol
try:
    for _sdf_file_path in sdf_file_paths:

        # Enough molecules were collected from the previous files.
        if len(_included_cids) >= NUM_MOLS:
            break

        for _mol in SDMolSupplier(_sdf_file_path):

            # The supplier yields None for records RDKit fails to parse;
            # skip them instead of crashing on `None.GetProp`.
            if _mol is None:
                continue

            _cid = _mol.GetProp('PUBCHEM_COMPOUND_CID')
            if _cid in _included_cids:
                continue

            # Keep the CID as the only property of the written molecule.
            # GetPropsAsDict() returns a separate dict, so clearing
            # properties while looping over its keys is safe.
            for _p in _mol.GetPropsAsDict().keys():
                if _p != 'PUBCHEM_COMPOUND_CID':
                    _mol.ClearProp(_p)

            _included_cids.add(_cid)
            _sd_writer.write(_mol)
            if len(_included_cids) >= NUM_MOLS:
                break
finally:
    # Always flush and release the output file, even if a record fails.
    _sd_writer.close()

In [5]:
# Sanity-check the processed SDF file: it must hold exactly NUM_MOLS records,
# and every record must be readable back as a molecule.
_sd_supp = SDMolSupplier(PROCESSED_SDF_PATH)
_num_saved_mols = len(_sd_supp)
assert _num_saved_mols == NUM_MOLS

# A fresh supplier is opened for the read-back pass; for a visual check,
# print(MolToSmiles(...)) on individual records.
assert all(
    _saved_mol is not None
    for _saved_mol in SDMolSupplier(PROCESSED_SDF_PATH)
)
