This notebook unpacks the SDF files downloaded from the PubChem FTP server and splits them into one SDF file per compound (named by PubChem CID).

In [1]:
import gzip
import shutil
from glob import glob
from os import makedirs
from os.path import abspath, basename, join

from rdkit import RDLogger
from rdkit.Chem import Mol, SDMolSupplier, SDWriter, MolToSmiles

RDLogger.logger().setLevel(RDLogger.CRITICAL)



In [2]:
# Resolve every data location to an absolute path once, up front
# (relative paths are resolved against the current working directory).
RAW_DATA_DIR, INTERIM_DATA_DIR, PROCESSED_DATA_DIR = (
    abspath(_relative_dir)
    for _relative_dir in ('../data/raw', '../data/interim', '../bcgraph/data')
)

# Echo the resolved locations so the reader can confirm them before any I/O.
print(
    f'Raw data directory: {RAW_DATA_DIR}',
    f'Interim data directory: {INTERIM_DATA_DIR}',
    f'Processed data directory: {PROCESSED_DATA_DIR}',
    sep='\n',
)

Raw data directory: /Users/xduan7/projects/biochem-graph/data/raw
Interim data directory: /Users/xduan7/projects/biochem-graph/data/interim
Processed data directory: /Users/xduan7/projects/biochem-graph/bcgraph/data


In [3]:
# unpack the .gz files into interim data directory
# create the destination first so the cell also works on a fresh checkout
makedirs(INTERIM_DATA_DIR, exist_ok=True)

sdf_file_paths = []
# sorted() gives a reproducible processing order (glob order is arbitrary)
for _sdf_gz_file_path in sorted(glob(join(RAW_DATA_DIR, '*.sdf.gz'))):

    # strip only the trailing '.gz' suffix; str.replace would also remove
    # any '.gz' that happens to occur earlier in the file name
    _sdf_file_path = join(
        INTERIM_DATA_DIR,
        basename(_sdf_gz_file_path)[:-len('.gz')],
    )

    # copy the decompressed stream to disk chunk by chunk
    with gzip.open(_sdf_gz_file_path, 'rb') as _f_in, \
            open(_sdf_file_path, 'wb') as _f_out:
        shutil.copyfileobj(_f_in, _f_out)

    sdf_file_paths.append(_sdf_file_path)

In [4]:
# split and save molecules into SDF files (one file per PubChem CID)
# create the destination first so the cell also works on a fresh checkout
makedirs(PROCESSED_DATA_DIR, exist_ok=True)

_num_unparsable = 0
_mol: Mol
for _sdf_file_path in sdf_file_paths:
    for _mol in SDMolSupplier(_sdf_file_path):
        # SDMolSupplier yields None for records RDKit cannot parse; count
        # and skip them instead of crashing on `None.GetProp(...)`
        if _mol is None:
            _num_unparsable += 1
            continue
        _cid = _mol.GetProp('PUBCHEM_COMPOUND_CID')
        _sd_writer = SDWriter(join(PROCESSED_DATA_DIR, f'{_cid}.sdf'))
        try:
            _sd_writer.write(_mol)
        finally:
            # always release the file handle, even if the write fails
            _sd_writer.close()

# RDKit logging is silenced above CRITICAL, so report skipped records here
if _num_unparsable:
    print(f'Skipped {_num_unparsable} unparsable molecule record(s)')

In [5]:
# test the saved SDF files: each file must contain at least one molecule,
# and every record in it must be parsable by RDKit
for _sdf_file_path in glob(join(PROCESSED_DATA_DIR, '*.sdf')):
    _num_mols = 0
    for _mol in SDMolSupplier(_sdf_file_path):
        # name the offending file so a failure is actionable
        assert _mol is not None, f'unparsable molecule in {_sdf_file_path}'
        _num_mols += 1
    # a file that yields no records would otherwise pass vacuously
    assert _num_mols > 0, f'no molecules found in {_sdf_file_path}'