In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem 
from rdkit.Chem import AllChem
from rdkit.DataStructs.cDataStructs import ExplicitBitVect
from io import StringIO
import subprocess
import re
import os 
from butina import ParallelClusterData 
from multiprocessing import Pool, cpu_count


In [2]:
#download the ExCAPE-DB dump if it doesn't exist in the current folder
EXCAPE_FILE = "pubchem.chembl.dataset4publication_inchi_smiles_v2.tsv.xz"
EXCAPE_URL = "https://zenodo.org/record/2543724/files/pubchem.chembl.dataset4publication_inchi_smiles_v2.tsv.xz?download=1"
if not os.path.exists(EXCAPE_FILE):
    # wget creates its -O target before the transfer completes, so an interrupted
    # download would leave a truncated file that passes the exists() check on the
    # next run. Download to a temporary name and move it into place on success.
    partial_file = EXCAPE_FILE + ".part"
    try:
        # check_call raises CalledProcessError when wget exits with a non-zero status
        subprocess.check_call(["wget", "-O", partial_file, EXCAPE_URL])
        os.replace(partial_file, EXCAPE_FILE)
    finally:
        # only still present when the download (or the rename) failed
        if os.path.exists(partial_file):
            os.remove(partial_file)
    

In [3]:
# Gene symbols to build datasets for; the driver loop at the end of the notebook
# processes each one in turn.
targets = ["DRD2", "HTR1A"]
# pXC50 cut-off: compounds with pXC50 >= this value are treated as actives,
# everything below as inactives.
activity_threshold = 7.0

# Seed passed as random_state to every pandas .sample() call in this notebook.
rnd_seed = 1234

# Distance threshold handed to ParallelClusterData when clustering the actives
# (distances come from bulk_counted_tanimoto, i.e. 1 - MinMax similarity).
# NOTE(review): presumably the Butina neighbour cut-off -- confirm against the butina module.
DIST_THRESHOLD = 0.6

In [4]:
import numba
from numba import njit, prange

@njit(parallel=True,fastmath=True)
def _minmaxkernel_numba(data_1, data_2):
    """
    Pairwise MinMax kernel between the rows of two fingerprint matrices
        K(x, y) = SUM_i min(x_i, y_i) / SUM_i max(x_i, y_i)
    bounded by [0,1] as defined in:
    "Graph Kernels for Chemical Informatics"
    Liva Ralaivola, Sanjay J. Swamidass, Hiroto Saigo and Pierre Baldi
    Neural Networks
    http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.92.483&rep=rep1&type=pdf

    Returns a float64 array of shape (data_1.shape[0], data_2.shape[0]) whose
    entry [i, j] is the kernel value of data_1[i] and data_2[j].
    """
    n_rows = data_1.shape[0]
    n_cols = data_2.shape[0]
    kernel = np.zeros((n_rows, n_cols), dtype=np.float64)

    for row in prange(n_rows):
        left = data_1[row]
        for col in prange(n_cols):
            kernel[row, col] = _minmax_two_fp(left, data_2[col])
    return kernel


@njit(fastmath=True)
def _minmax_two_fp(fp1, fp2):
    """MinMax (count-based Tanimoto) similarity of two count vectors.

    Walks over ``len(fp1)`` positions, accumulating the element-wise minimum
    and the element-wise maximum of ``fp1`` and ``fp2``, and returns
    sum(min) / sum(max) as a float64.

    NOTE: the denominator is not checked for zero, i.e. the case where both
    vectors contain only zeros is not handled here.
    """
    shared = numba.int32(0)
    total = numba.int32(0)

    for pos in range(len(fp1)):
        a = fp1[pos]
        b = fp2[pos]
        if a > b:
            # fp1 holds the larger count at this position
            shared += b
            total += a
        else:
            shared += a
            total += b

    return numba.float64(shared) / numba.float64(total)

def counted_tanimoto_similarity(f1, fp2, return_distance=True):
    """MinMax (count-based Tanimoto) comparison of two count fingerprints.

    Parameters
    ----------
    f1, fp2 : count vectors accepted by ``_minmax_two_fp``.
    return_distance : when true (default) return ``1 - similarity``,
        otherwise return the similarity itself.
    """
    # The first parameter is named `f1`; the body previously read the undefined
    # name `fp1`, so every call raised NameError.
    similarity = _minmax_two_fp(f1, fp2)
    if return_distance:
        return 1. - similarity
    return similarity
    
def bulk_counted_tanimoto(fp1, fps, return_distance=True):
    """Compare one count fingerprint against each fingerprint in ``fps``.

    Returns a list with one value per element of ``fps`` (same order):
    ``1 - similarity`` when ``return_distance`` is true, the MinMax similarity
    from ``_minmax_two_fp`` otherwise.
    """
    similarities = [_minmax_two_fp(fp1, other) for other in fps]
    if not return_distance:
        return similarities
    return [1. - sim for sim in similarities]



In [5]:
def applyParallel(df, func, n_jobs=None):
    """Apply ``func`` to consecutive chunks of ``df`` and concatenate the results.

    Parameters
    ----------
    df : pandas Series/DataFrame (anything ``np.array_split`` accepts also works).
    func : callable taking one chunk and returning a pandas object. It must be
        picklable (a module-level function) when more than one worker is used.
    n_jobs : number of chunks / worker processes. ``None`` (default) uses
        ``cpu_count()``, which is the previous behaviour. ``1`` runs serially
        in the current process without creating a pool.

    Returns
    -------
    ``pd.concat`` of the per-chunk results, in chunk order.
    """
    if n_jobs is None:
        n_jobs = cpu_count()
    n_jobs = max(1, int(n_jobs))

    if hasattr(df, "iloc"):
        # Split row positions rather than the pandas object itself:
        # np.array_split on a Series/DataFrame goes through the deprecated
        # pandas swapaxes path. Chunk sizes are the same as array_split's.
        positions = np.array_split(np.arange(len(df)), n_jobs)
        df_split = [df.iloc[pos] for pos in positions]
    else:
        df_split = np.array_split(df, n_jobs)

    if n_jobs == 1:
        return pd.concat([func(chunk) for chunk in df_split])

    # The context manager shuts the pool down even when func raises in a worker.
    with Pool(n_jobs) as pool:
        return pd.concat(pool.map(func, df_split))


def normalize_smiles(smi):
    """Return the RDKit SMILES of ``smi`` written with ``isomericSmiles=False``.

    Returns ``np.nan`` when the input cannot be parsed (RDKit returns ``None``)
    or when parsing/writing raises.
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            return np.nan
        return Chem.MolToSmiles(mol, isomericSmiles=False)
    except Exception:
        # `np.nan` instead of `np.NaN`: the capitalised alias was removed in NumPy 2.0.
        # `Exception` instead of a bare except so Ctrl-C / SystemExit still propagate.
        return np.nan
    
def normalize_smiles_from_inchi(inchi):
    """Return the RDKit SMILES (``isomericSmiles=False``) for an InChI string.

    Returns ``np.nan`` when the InChI cannot be parsed (RDKit returns ``None``)
    or when parsing/writing raises.
    """
    try:
        mol = Chem.MolFromInchi(inchi)
        if mol is None:
            return np.nan
        return Chem.MolToSmiles(mol, isomericSmiles=False)
    except Exception:
        # `np.nan` instead of `np.NaN`: the capitalised alias was removed in NumPy 2.0.
        # `Exception` instead of a bare except so Ctrl-C / SystemExit still propagate.
        return np.nan

    
def normalize_smiles_pandas(df):
    """Run ``normalize_smiles`` over every element of a Series, keeping its index."""
    normalized = list(map(normalize_smiles, df))
    return pd.Series(normalized, index=df.index)

def normalize_smiles_from_inchi_pandas(df):
    """Run ``normalize_smiles_from_inchi`` over every element of a Series, keeping its index."""
    normalized = list(map(normalize_smiles_from_inchi, df))
    return pd.Series(normalized, index=df.index)

def zgrep_data(f, string):
    """Load the lines of a tab-separated file that match ``string`` into a DataFrame.

    The matching is delegated to ``grep`` (``zgrep`` for ``.gz`` files,
    ``xzgrep`` for ``.xz`` files), so ``string`` is interpreted as a grep pattern.

    Parameters
    ----------
    f : path of the (optionally gz/xz compressed) TSV file.
    string : pattern handed to grep. With the empty string every line matches,
        including the header line, which is then used as the column header.
        Otherwise the column names are read from the first line of ``f`` and
        applied to the matching lines.

    Raises
    ------
    subprocess.CalledProcessError when grep exits with a non-zero status
    (which includes the case of no matching line).
    """
    grep = 'grep'
    if f.endswith(".gz"):
        grep = 'zgrep'
    if f.endswith(".xz"):
        grep = 'xzgrep'

    if string == '':
        columns = None
    else:
        # read only the first row to get the columns
        columns = pd.read_csv(f, sep='\t', nrows=1, header=None).values.tolist()[0]

    # check_output returns bytes; StringIO needs str. The empty-pattern branch
    # used to skip this decode and failed with a TypeError.
    out = subprocess.check_output([grep, string, f]).decode("UTF-8")
    grep_data = StringIO(out)

    if columns is None:
        return pd.read_csv(grep_data, sep='\t')
    return pd.read_csv(grep_data, sep='\t', names=columns, header=None)

def get_data(target):
    """Load the ExCAPE records of one gene symbol with one row per RDKit SMILES.

    Greps the ExCAPE dump for ``target``, keeps the rows whose ``Gene_Symbol``
    equals it, replaces missing ``pXC50`` values by 0.0, derives an
    ``RDKIT_SMILES`` column from the InChI, drops rows without one, and
    de-duplicates on ``RDKIT_SMILES``. The result is indexed by ``Ambit_InchiKey``.
    """
    source_file = "pubchem.chembl.dataset4publication_inchi_smiles_v2.tsv.xz"
    keep_columns = ["Original_Entry_ID","Entrez_ID", "pXC50", "DB", "Gene_Symbol", "InChI"]

    print("    read excape")
    records = zgrep_data(source_file, target).set_index('Ambit_InchiKey')
    # grep matches the pattern anywhere in the line, so filter on the actual column
    records = records[records.Gene_Symbol == target]
    records = records.fillna(value={"pXC50": 0.0})[keep_columns]

    print("    normalize SMILES")
    smiles = applyParallel(records["InChI"], normalize_smiles_from_inchi_pandas)
    smiles = smiles.rename("RDKIT_SMILES")
    merged = pd.concat([records, smiles], axis=1, join='inner')
    merged = merged.dropna(subset=["RDKIT_SMILES"])

    # sort by activity so that, among duplicated SMILES, the row with the
    # lowest activity value is the one that is kept
    by_activity = merged.sort_values(by="pXC50", ascending=True)
    return by_activity.drop_duplicates(subset="RDKIT_SMILES", keep='first')

def cecfp6_from_mol(mol):
    """Build a 2048-long int32 count vector from a radius-3 Morgan fingerprint.

    Every non-zero element of the RDKit count fingerprint is folded into the
    vector at position ``id % 2048``; counts that land on the same position
    are summed.
    """
    n_bits = 2048
    folded = np.zeros(n_bits, np.int32)
    counts = AllChem.GetMorganFingerprint(mol, 3, useCounts=True, useFeatures=False).GetNonzeroElements()
    for feature_id in counts:
        folded[feature_id % n_bits] += int(counts[feature_id])
    return folded

def generate_fingerprints(df):
    """Compute the count fingerprint of every SMILES in a Series.

    Returns a DataFrame with a single ``cfp`` column (one array per row),
    indexed like the input.
    """
    fingerprints = [cecfp6_from_mol(Chem.MolFromSmiles(smiles)) for smiles in df]
    return pd.DataFrame({"cfp": fingerprints}, index=df.index)
    
def create_training_test_validation(df, activity_threshold, training_actives=0.6, test_actives=0.20, validation_actives=0.20, force_specific_smiles_into_test=[]):
    """Label the rows of ``df`` as "training", "test" or "validation".

    Inactives (``pXC50 < activity_threshold``) are capped at 100000 randomly
    sampled rows and split at random. Rows beyond the cap are left untouched.
    Actives (``pXC50 >= activity_threshold``) are clustered with
    ``ParallelClusterData`` on their ``cfp`` fingerprints, and each cluster is
    assigned as a whole to the split that is currently furthest below its
    target share, so the members of a cluster always share one split.

    ``df`` is modified in place (columns ``trainingset_class`` and
    ``cluster_id``) and returned. The module-level ``rnd_seed`` and
    ``DIST_THRESHOLD`` are read for sampling and clustering.

    Parameters
    ----------
    df : DataFrame with at least the columns ``pXC50``, ``cfp`` and ``RDKIT_SMILES``.
    activity_threshold : pXC50 value separating inactives from actives.
    training_actives, test_actives, validation_actives : target share of each
        split. The training and test shares are also used for the inactives,
        whose validation set is whatever remains.
    force_specific_smiles_into_test : collection of SMILES; any cluster that
        contains one of them goes to the test split. Never mutated here.
    """
    print("  assign inactives")
    inactives = df.query('pXC50 < @activity_threshold')
    print(f"\t\tNumber of inactives: {len(inactives)}")
    if len(inactives) < 100000:
        print("\t\t\tless than 100000 inactives. Using only: {}".format(len(inactives)))
        inactives = inactives.sample(n=len(inactives), replace=False, random_state=rnd_seed)
    else:
        inactives = inactives.sample(n=100000, replace=False, random_state=rnd_seed)
    inactives_training = inactives.sample(frac=training_actives, replace=False, random_state=rnd_seed).index
    inactives_test = inactives.drop(inactives_training).sample(n=int(round(len(inactives)*test_actives)), replace=False, random_state=rnd_seed).index
    inactives_validation = inactives.drop(inactives_training).drop(inactives_test).index

    df.loc[inactives_training, "trainingset_class"] = "training"
    df.loc[inactives_test, "trainingset_class"] = "test"
    df.loc[inactives_validation, "trainingset_class"] = "validation"

    actives = df.query('pXC50 >= @activity_threshold')
    print(f"\t\tNumber of actives: {len(actives)}")
    # shuffle; the cluster members below are positions into this shuffled frame
    actives = actives.sample(n=len(actives), replace=False, random_state=rnd_seed)
    n_actives = len(actives)

    def cluster_contains_compounds_to_filter(cluster, force_specific_smiles_into_test):
        """True when any member of the cluster is one of the forced-test SMILES."""
        if len(force_specific_smiles_into_test) == 0:
            return False
        member_smiles = actives["RDKIT_SMILES"].iloc[list(cluster)]
        return any(smi in force_specific_smiles_into_test for smi in member_smiles)

    def progress(assigned, fraction):
        """Fill level of a split relative to its target number of actives."""
        target = n_actives * fraction
        if target == 0:
            # a split with a zero share is always "full" (previously ZeroDivisionError)
            return float("inf")
        return assigned / target

    print("  cluster actives")
    clusters = ParallelClusterData(actives["cfp"].values, len(actives), DIST_THRESHOLD, distFunc=bulk_counted_tanimoto, reordering=True)
    print("  assign actives")

    # Insertion order matters: on ties min() returns the first key, which keeps
    # the original preference test > validation > training.
    fractions = {"test": test_actives, "validation": validation_actives, "training": training_actives}
    assigned = {"test": 0, "validation": 0, "training": 0}

    for cluster_id, cluster in enumerate(clusters):
        members = list(cluster)
        if cluster_contains_compounds_to_filter(members, force_specific_smiles_into_test):
            trainingset_class = "test"
        else:
            trainingset_class = min(assigned, key=lambda name: progress(assigned[name], fractions[name]))
        assigned[trainingset_class] += len(members)

        # translate positions in `actives` to index labels and write the whole
        # cluster at once instead of two scalar .loc writes per compound
        member_index = actives.index[members]
        df.loc[member_index, 'trainingset_class'] = trainingset_class
        df.loc[member_index, 'cluster_id'] = cluster_id

    return df

In [6]:
def do(target):
    """Build, split and pickle the dataset of one target gene symbol.

    Loads the target's ExCAPE records, adds count fingerprints, assigns
    training/test/validation classes, adds a binary ``activity_label`` and
    writes the shuffled frame to ``<target>_df.pkl.gz``.
    """
    print("Process {}".format(target))
    print("  grep data")
    compounds = get_data(target)
    print("  generate fingerprints")
    fingerprint_df = applyParallel(compounds["RDKIT_SMILES"], generate_fingerprints)
    compounds = pd.concat([compounds, fingerprint_df], axis=1, join='inner')
    compounds["trainingset_class"] = None
    compounds["cluster_id"] = None

    forced_test_smiles = []
    is_target = compounds.Gene_Symbol == target
    shuffled = compounds[is_target].sample(frac=1. ,random_state=rnd_seed).copy()
    labelled = create_training_test_validation(shuffled, activity_threshold=activity_threshold, force_specific_smiles_into_test=forced_test_smiles)
    # rows that never received a class (e.g. inactives beyond the cap) are dropped
    labelled = labelled.dropna(subset=["trainingset_class"])
    labelled["trainingset_class"] = labelled["trainingset_class"].astype('category')
    # inactives are not clustered; they get cluster id -1
    labelled["cluster_id"] = labelled["cluster_id"].fillna(-1).astype('int64')
    labelled["activity_label"] = (labelled["pXC50"] >= activity_threshold).astype('int')
    labelled = labelled.sample(frac=1. ,random_state=rnd_seed)

    output_path = target+"_df.pkl.gz"
    print("  save dataframe to {}_df.pkl.gz".format(target))
    labelled.to_pickle(output_path)


for target in targets:
    do(target)

Process DRD2
  grep data
    read excape
    normalize SMILES


RDKit ERROR: [16:16:21] Explicit valence for atom # 19 N, 5, is greater than permitted


  generate fingerprints
  assign inactives
		Number of inactives: 346206
		Number of actives: 2981
  cluster actives
  assign actives
  save dataframe to DRD2_df.pkl.gz
Process HTR1A
  grep data
    read excape
    normalize SMILES
  generate fingerprints
  assign inactives
		Number of inactives: 66684
			less than 100000 inactives. Using only: 66684
		Number of actives: 3599
  cluster actives
  assign actives
  save dataframe to HTR1A_df.pkl.gz
