In [1]:
import os
import subprocess

# Local and FTP addresses
DATA_ROOT = './data/'
RAW_DATA_PATH = './data/raw/'
PROCESSED_DATA_PATH = './data/processed/'

PCBA_CID_FILE_NAME = 'Cid2BioactivityLink'
CID_INCHI_FILE_NAME = 'CID-InChI-Key'

PCBA_CID_FTP_ADDRESS = 'ftp://ftp.ncbi.nlm.nih.gov/pubchem/Bioassay/Extras/%s.gz' % PCBA_CID_FILE_NAME
CID_INCHI_FTP_ADDRESS = 'ftp://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Extras/%s.gz' % CID_INCHI_FILE_NAME

PCBA_CID_FILE_PATH = os.path.join(RAW_DATA_PATH, PCBA_CID_FILE_NAME)
CID_INCHI_FILE_PATH = os.path.join(RAW_DATA_PATH, CID_INCHI_FILE_NAME)


def _download_and_unpack(ftp_address, target_path, download_dir=RAW_DATA_PATH):
    """Fetch a gzipped file with wget and gunzip it, unless it is already there.

    Args:
        ftp_address: URL of the ``.gz`` archive to download.
        target_path: Path the unpacked file is expected at; if it already
            exists nothing is downloaded.
        download_dir: Directory passed to ``wget -P``; every ``*.gz`` file
            found under it is gunzipped afterwards.

    Raises:
        subprocess.CalledProcessError: If ``wget`` or ``find``/``gunzip``
            exits with a non-zero status.
        OSError: If one of the external tools cannot be launched.
        RuntimeError: If both commands succeed but ``target_path`` is still
            missing.
    """
    if os.path.exists(target_path):
        return

    # Argument lists (no shell) avoid quoting problems, and check_call makes
    # a failed download or decompression stop the notebook right here instead
    # of surfacing later as a missing-file error.
    subprocess.check_call(
        ['wget', '-r', '-nd', '-nc', ftp_address, '-P', download_dir])
    subprocess.check_call(
        ['find', download_dir, '-type', 'f', '-iname', '*.gz',
         '-exec', 'gunzip', '{}', '+'])

    if not os.path.exists(target_path):
        raise RuntimeError(
            'Downloading %s did not produce %s' % (ftp_address, target_path))


# Download and unpack data
_download_and_unpack(PCBA_CID_FTP_ADDRESS, PCBA_CID_FILE_PATH)
_download_and_unpack(CID_INCHI_FTP_ADDRESS, CID_INCHI_FILE_PATH)


In [20]:
import json
import pandas as pd
from rdkit import Chem
try:
    import cPickle as pickle
except ModuleNotFoundError:
    import pickle

# Number of rows of the CID/InChI table processed (and pickled) at a time.
CHUNK_SIZE = 2 ** 18
# One pickle per chunk; '%03i' is filled in with the chunk index.
CID_MOL_FILE_NAME = 'CID-Mol_%03i.pkl'
CID_MOL_FILE_PATH = os.path.join(PROCESSED_DATA_PATH, CID_MOL_FILE_NAME)
# JSON map: CID -> index of the chunk file that contains it.
CID_CHUNK_FILE_NAME = 'CID-chunk_num.txt'
CID_CHUNK_FILE_PATH = os.path.join(PROCESSED_DATA_PATH, CID_CHUNK_FILE_NAME)
# JSON list of CIDs whose InChI could not be converted to a molecule.
UNUSED_CID_FILE_NAME = 'unused_CID.txt'
UNUSED_CID_FILE_PATH = os.path.join(PROCESSED_DATA_PATH, UNUSED_CID_FILE_NAME)

# The output directory is not created anywhere else in this notebook, so make
# sure it exists before the first chunk is written.
if not os.path.isdir(PROCESSED_DATA_PATH):
    os.makedirs(PROCESSED_DATA_PATH)

unused_cid = []
cid_chunk_num_dict = {}
for chunk_idx, cid_inchi_df_chunk in enumerate(
        pd.read_csv(CID_INCHI_FILE_PATH,
                    sep='\t',
                    header=None,
                    index_col=[0],
                    usecols=[0, 1],
                    chunksize=CHUNK_SIZE)):

    # Rows of this chunk whose InChI string RDKit managed to parse.
    cid_mol_chunk = []
    for cid, row in cid_inchi_df_chunk.iterrows():

        # Column 0 became the index (the CID); the remaining column keeps its
        # original label 1 and holds the InChI string.
        inchi = row[1]
        mol = Chem.MolFromInchi(inchi)
        # Explicit check instead of `assert`: asserts are stripped when Python
        # runs with -O, which would let unparsable compounds slip through.
        if not mol:
            unused_cid.append(cid)
            continue

        cid_chunk_num_dict[cid] = chunk_idx
        # NOTE(review): the row (CID + InChI) is stored, not the parsed `mol`;
        # the molecule is only used to validate the InChI - confirm intended.
        cid_mol_chunk.append(row)

    with open(CID_MOL_FILE_PATH % chunk_idx, 'wb') as f:
        pickle.dump(cid_mol_chunk, f, pickle.HIGHEST_PROTOCOL)

with open(CID_CHUNK_FILE_PATH, 'w') as f:
    json.dump(cid_chunk_num_dict, f, indent=4)
with open(UNUSED_CID_FILE_PATH, 'w') as f:
    json.dump(unused_cid, f, indent=4)
