In [1]:
import glob
import gzip
import os
import re

In [None]:
def get_smiles_list(filename):
    """Read a gzip-compressed compound list and return "SMILES ID" strings.

    Every data line must hold three whitespace-separated fields:
    external ID, internal ID and SMILES. Lines whose first character is
    '#' are skipped, and so are blank lines.

    Args:
        filename: Path to the gzip-compressed compound list.

    Returns:
        A list with one "<SMILES> <external ID>" string per data line,
        in file order.

    Raises:
        ValueError: If a data line does not have exactly three fields.
    """
    ret_list = []
    # The context manager closes the gzip handle even when parsing fails.
    with gzip.open(filename, 'rb') as handle:
        for line_no, raw_line in enumerate(handle, start=1):
            line = raw_line.decode('utf-8').rstrip()
            # A blank line would otherwise break the three-way unpack below.
            if not line or line[0] == '#':
                continue
            # structure of line: [external ID, internal ID, SMILES]
            fields = line.split()
            if len(fields) != 3:
                raise ValueError(
                    f"{filename}:{line_no}: expected 3 fields "
                    f"(external ID, internal ID, SMILES), got {len(fields)}"
                )
            ext_id, _inter_id, smiles = fields
            ret_list.append(f"{smiles} {ext_id}")
    return ret_list

In [41]:
# ChEMBL: write actives.smi / inactives.smi into one directory per target.
# All ChEMBL targets share a single decoy file, so it is parsed once up front.
active_files = glob.glob("./data/benchmarking_platform/compounds/ChEMBL/*_actives.dat.gz")
decoy_file = "./data/benchmarking_platform/compounds/ChEMBL/cmp_list_ChEMBL_zinc_decoys.dat.gz"
decoy_smiles_list = get_smiles_list(decoy_file)
# The decoy text is the same for every target; build it once, not per file.
decoy_text = "\n".join(decoy_smiles_list)

pattern = r'ChEMBL_\d+'

for fname in active_files:
    match = re.search(pattern, fname)
    if match is None:
        # Fail with the offending path instead of an AttributeError on None.
        raise ValueError(f"Cannot derive a ChEMBL target name from {fname!r}")
    target_name = match.group()
    base_dir = os.path.dirname(fname)
    output_dir = os.path.join(base_dir, target_name)
    os.makedirs(output_dir, exist_ok=True)
    active_smiles_list = get_smiles_list(fname)
    with open(os.path.join(output_dir, "actives.smi"), 'w') as f:
        f.write("\n".join(active_smiles_list))
    with open(os.path.join(output_dir, "inactives.smi"), 'w') as f:
        f.write(decoy_text)

In [61]:
# DUD: write actives.smi / inactives.smi into one directory per target.
# Each DUD target has its own decoy file next to the actives file.
active_files = glob.glob("./data/benchmarking_platform/compounds/DUD/*_actives.dat.gz")

# Group 1 captures the target name between "DUD_" and "_actives".
pattern = r'DUD_([a-z0-9_]+)_actives'

for fname in active_files:
    match = re.search(pattern, fname)
    if match is None:
        # Fail with the offending path instead of an AttributeError on None.
        raise ValueError(f"Cannot derive a DUD target name from {fname!r}")
    target_name = match.group(1)
    base_dir = os.path.dirname(fname)
    output_dir = os.path.join(base_dir, target_name)
    os.makedirs(output_dir, exist_ok=True)
    active_smiles_list = get_smiles_list(fname)
    decoy_fname = os.path.join(base_dir, f"cmp_list_DUD_{target_name}_decoys.dat.gz")
    if not os.path.exists(decoy_fname):
        # A bare `raise` here has no active exception to re-raise and only
        # yields an unrelated RuntimeError; name the missing file instead.
        raise FileNotFoundError(f"Decoy file not found for target {target_name!r}: {decoy_fname}")
    decoy_smiles_list = get_smiles_list(decoy_fname)
    with open(os.path.join(output_dir, "actives.smi"), 'w') as f:
        f.write("\n".join(active_smiles_list))
    with open(os.path.join(output_dir, "inactives.smi"), 'w') as f:
        f.write("\n".join(decoy_smiles_list))

In [64]:
# Scratch check of the target-name regex. `pattern` and `fname` are whatever
# the most recently executed dataset cell left in the kernel, so the value
# shown here depends on the order in which cells were run.
re.search(pattern, fname).group(0)

'MUV_466'

In [65]:
# MUV: write actives.smi / inactives.smi into one directory per target.
# Each MUV target has its own decoy file next to the actives file.
active_files = glob.glob("./data/benchmarking_platform/compounds/MUV/*_actives.dat.gz")

pattern = r'MUV_\d+'

for fname in active_files:
    match = re.search(pattern, fname)
    if match is None:
        # Fail with the offending path instead of an AttributeError on None.
        raise ValueError(f"Cannot derive a MUV target name from {fname!r}")
    target_name = match.group()
    base_dir = os.path.dirname(fname)
    output_dir = os.path.join(base_dir, target_name)
    os.makedirs(output_dir, exist_ok=True)
    active_smiles_list = get_smiles_list(fname)
    decoy_fname = os.path.join(base_dir, f"cmp_list_{target_name}_decoys.dat.gz")
    if not os.path.exists(decoy_fname):
        # A bare `raise` here has no active exception to re-raise and only
        # yields an unrelated RuntimeError; name the missing file instead.
        raise FileNotFoundError(f"Decoy file not found for target {target_name!r}: {decoy_fname}")
    decoy_smiles_list = get_smiles_list(decoy_fname)
    with open(os.path.join(output_dir, "actives.smi"), 'w') as f:
        f.write("\n".join(active_smiles_list))
    with open(os.path.join(output_dir, "inactives.smi"), 'w') as f:
        f.write("\n".join(decoy_smiles_list))

In [35]:
from rdkit.Chem.Scaffolds import MurckoScaffold 
# Chem and Draw are only needed by the optional drawing line at the end of this cell.
from rdkit import Chem
from rdkit.Chem import Draw

# For every target, compute (number of distinct Murcko scaffolds) / (number of
# actives), rounded to two decimals, from the actives.smi files written above.
all_active_fnames = glob.glob("./data/benchmarking_platform/compounds/*/*/actives.smi")
ba_ratio_dict = {}
for fname in all_active_fnames:
    # The target name is the directory that holds actives.smi.
    target_name = os.path.basename(os.path.dirname(fname))
    with open(fname, 'r') as f:
        # Each line is "<SMILES> <external ID>"; keep the SMILES and ignore
        # blank lines so no empty token is handed to RDKit.
        smiles_list = [l.split()[0] for l in f if l.strip()]
    if not smiles_list:
        # An empty actives file would divide by zero; record NaN so the
        # target stays visible in the result instead of aborting the loop.
        ba_ratio_dict[target_name] = float('nan')
        continue
    bms_set = {MurckoScaffold.MurckoScaffoldSmilesFromSmiles(s) for s in smiles_list}
    ba_ratio_dict[target_name] = round(len(bms_set) / len(smiles_list), 2)
# Optional: draw the scaffolds of the last target processed by the loop.
#Draw.MolsToGridImage([Chem.MolFromSmiles(s) for s in bms_set])

In [36]:
# Show the per-target ratio of distinct Murcko scaffolds to actives
# (keys are target directory names, values are rounded to two decimals).
ba_ratio_dict

{'ChEMBL_100126': 0.78,
 'ChEMBL_100166': 0.54,
 'ChEMBL_100579': 0.7,
 'ChEMBL_100': 0.91,
 'ChEMBL_10188': 0.95,
 'ChEMBL_10193': 0.71,
 'ChEMBL_10198': 0.69,
 'ChEMBL_10260': 0.86,
 'ChEMBL_10280': 0.97,
 'ChEMBL_10378': 0.83,
 'ChEMBL_10417': 0.65,
 'ChEMBL_10434': 0.94,
 'ChEMBL_10475': 0.45,
 'ChEMBL_10498': 0.9,
 'ChEMBL_104': 0.73,
 'ChEMBL_10579': 0.64,
 'ChEMBL_105': 0.66,
 'ChEMBL_10752': 0.64,
 'ChEMBL_10773': 0.55,
 'ChEMBL_107': 0.91,
 'ChEMBL_108': 0.86,
 'ChEMBL_10927': 0.59,
 'ChEMBL_10980': 0.98,
 'ChEMBL_11085': 0.71,
 'ChEMBL_11140': 0.86,
 'ChEMBL_11225': 0.81,
 'ChEMBL_11265': 0.59,
 'ChEMBL_11279': 0.65,
 'ChEMBL_11336': 0.77,
 'ChEMBL_11359': 0.81,
 'ChEMBL_11365': 0.96,
 'ChEMBL_11442': 0.55,
 'ChEMBL_11488': 0.47,
 'ChEMBL_11489': 0.99,
 'ChEMBL_114': 0.96,
 'ChEMBL_11534': 0.91,
 'ChEMBL_11536': 0.87,
 'ChEMBL_11575': 0.8,
 'ChEMBL_11631': 0.77,
 'ChEMBL_11682': 0.76,
 'ChEMBL_116': 0.64,
 'ChEMBL_121': 0.87,
 'ChEMBL_12209': 0.66,
 'ChEMBL_12252': 0.93,
 'Ch

In [11]:
# Peek at the last path returned by the actives.smi glob. glob makes no
# ordering guarantee, so which file is "last" may differ between runs.
all_active_fnames[-1]

'./data/benchmarking_platform/compounds/MUV/MUV_859/actives.smi'