In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import time
import datetime

import json
import shutil
import argparse
import subprocess

import numpy as np
import pandas as pd

from rdkit import Chem, RDLogger
RDLogger.DisableLog('rdApp.*')

from d360api import d360api

In [2]:
def dataDownload(my_query_id=3539, user_name="yjing@kymeratx.com", tokenFile='yjing_D360.token'):
    """Download the results of a D360 query through the D360 API.

    Args:
        my_query_id: ID of the D360 query whose results are downloaded.
        user_name: D360 user passed to the service-token authentication.
        tokenFile: Path of a text file whose first line holds the service token.

    Returns:
        Whatever ``d360api.download_query_results`` returns. The caller in this
        notebook (``Step_1_load_data``) treats it as the name of the downloaded
        CSV file.

    Raises:
        ValueError: If the first line of ``tokenFile`` is empty.
        OSError: If ``tokenFile`` cannot be opened.
    """
    # NOTE(review): the server address, user name and token file are hardcoded
    # defaults; consider moving them to a config cell / environment variables.
    # Create API connection to the PROD server
    my_d360 = d360api(provider="https://10.3.20.47:8080")  # PROD environment

    # Only the first line carries the token. Strip it so the trailing newline
    # (and any stray whitespace) is not sent as part of the credential.
    with open(tokenFile, 'r') as ofh:
        service_token = ofh.readline().strip()
    if not service_token:
        raise ValueError(f"Token file {tokenFile} does not contain a service token on its first line")

    # Authenticate connection using service token
    print(f"\tThe D360 query ID is {my_query_id}")
    my_d360.authenticate_servicetoken(servicetoken=service_token, user=user_name)
    results = my_d360.download_query_results(query_id=my_query_id)
    return results


def _determine_encoding(dataFile):
    """Guess the text encoding of a file using ``chardet``.

    The complete file is read as bytes and handed to ``chardet.detect``.

    Args:
        dataFile: Path of the file to inspect.

    Returns:
        The value stored under the ``'encoding'`` key of the detection result.
    """
    import chardet

    # Binary mode: the detector works on the raw bytes, not on decoded text.
    with open(dataFile, 'rb') as fh:
        raw_bytes = fh.read()

    detection = chardet.detect(raw_bytes)
    return detection['encoding']
    

def Step_1_load_data(my_query_id=3539, fileName_in=None, tmp_folder="./tmp", sep=','):
    """Load the raw data table from a fresh D360 download or from an existing CSV.

    Args:
        my_query_id: D360 query ID. When it is not None the query results are
            downloaded with ``dataDownload`` and moved into ``tmp_folder``;
            this takes precedence over ``fileName_in``.
        fileName_in: Path of an existing CSV file. Only used when
            ``my_query_id`` is None.
        tmp_folder: Folder that receives the downloaded file. It is created if
            it does not exist yet.
        sep: Column separator passed to ``pandas.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when the CSV could not be
        read (the error is printed instead of raised).

    Raises:
        AssertionError: If both ``my_query_id`` and ``fileName_in`` are None,
            or if ``fileName_in`` does not exist.
    """
    ## count time
    beginTime = time.time()
    ## ------------------------------------------------------------------
    assert my_query_id is not None or fileName_in is not None, "\tError, both <my_query_id> and <fileName_in> are None"
    if my_query_id is not None:
        print(f"\tRun D360 query on ID {my_query_id}")
        ## download data from D360 using API
        dataTableFileName = dataDownload(my_query_id=my_query_id)
        print(f'\tAll data have been downloaded in file {dataTableFileName}')

        ## move the csv file to tmp folder
        # The move fails if the destination folder is missing, so create it first.
        os.makedirs(tmp_folder, exist_ok=True)
        fileName_in = os.path.join(tmp_folder, os.path.basename(dataTableFileName))
        shutil.move(dataTableFileName, fileName_in)
        print(f"\tMove the downloaded file {dataTableFileName} to {fileName_in}")
    else:
        print(f"\tDirectly loading data from {fileName_in}")
        assert os.path.exists(fileName_in), f"File {fileName_in} does not exist"

    try:
        ## determine encoding type
        encoding = _determine_encoding(fileName_in)
        ## read csv file
        print(f"\tNow reading csv data using <{encoding}> encoding from {fileName_in}")
        dataTable = pd.read_csv(fileName_in, sep=sep, encoding=encoding).reset_index(drop=True)
    except Exception as e:
        # Deliberate best-effort: report the problem and hand back None so the
        # caller can decide what to do.
        print(f'\tError: cannot read output file {fileName_in}; error msg: {e}')
        dataTable = None
    else:
        print(f"\tThe loaded raw data has <{dataTable.shape[0]}> rows and {dataTable.shape[1]} columns")

    ## ------------------------------------------------------------------
    costTime = time.time() - beginTime
    # int() truncates exactly like the former "%d" formatting did.
    print(f"==> Step 1 <Loading csv data> complete, costs time = {int(costTime)}s ................\n")

    return dataTable

In [3]:
# Load the previously exported D360 CSV from disk; my_query_id=None skips the API download.
dataTable_raw = Step_1_load_data(
    my_query_id=None,
    fileName_in='D360_dataset_q_id3539_160125_1620.csv',
    tmp_folder="./tmp",
    sep=',',
)
# Step_1_load_data returns None when the CSV cannot be read; fail here with a
# clear message instead of an AttributeError on `.shape`.
assert dataTable_raw is not None, "Step_1_load_data could not read the CSV file; see the error printed above"
print(dataTable_raw.shape)
dataTable_raw.head(3)

	Directly loading data from D360_dataset_q_id3539_160125_1620.csv
	Now reading csv data using <utf-8> encoding from D360_dataset_q_id3539_160125_1620.csv
	The loaded raw data has <348560> rows and 46 columns
==> Step 1 <Loading csv data> complete, costs time = 96s ................

(348560, 46)


Unnamed: 0,Marked,Compound Name,Structure,Concat;Project,Concat;External Id,Created On,Molecular Weight,in Silico PhysChem Property;Mean;ChemAxon PSA;(Mod),in Silico PhysChem Property;Mean;ChemAxon PSA;(Num),in Silico PhysChem Property;Mean;Corr_ChemAxon_bpKa1;(Mod),...,ADME Tox-manual patch hERG 34C;Mean;Average % of hERG inhibition;(Mod),ADME Tox-manual patch hERG 34C;Mean;Average % of hERG inhibition;(Num),ADME Tox-manual patch hERG 34C;Concat;Comments,ADME Tox-manual patch hERG 34C;Mean;Concentration (uM);(Mod),ADME Tox-manual patch hERG 34C;Mean;Concentration (uM);(Num),ADME Tox-manual patch hERG 34C;Concat;Date run,ADME Tox-manual patch hERG 34C;GMean;m-patch hERG IC50 [uM];(Mod),ADME Tox-manual patch hERG 34C;GMean;m-patch hERG IC50 [uM];(Num),ADME Tox-manual patch hERG 34C;Mean;SD;(Mod),ADME Tox-manual patch hERG 34C;Mean;SD;(Num)
0,UNMARKED,KT-0346391,CC(=O)C3=C(C)c1cnc(nc1N(C2CCCC2)C3=O)Nc4ccc(cn...,CBL-C,PH-CMR-CLB-267-0N-001,31-Oct-2024,1121.226,=,217.96,=,...,,,,,,,,,,
1,UNMARKED,KT-0000036,c1(cc(c(cc1)Nc2nc(ncc2Cl)Nc3c(cc(c(c3)OCCOCCOC...,ZAP-70 and Kinases,ZP-028-001H,11-Feb-2017,911.391,=,246.97,=,...,,,,,,,,,,
2,UNMARKED,KT-0000038,c1(cc(c(cc1)Nc2nc(ncc2Cl)Nc3c(cc(c(c3)OCCOCCOC...,ZAP-70 and Kinases,ZP-030-001F,11-Feb-2017,999.497,=,265.43,=,...,,,,,,,,,,


In [4]:
# Keep only the rows whose 'Structure' value is present; the raw frame is left untouched.
dataTable_raw2 = dataTable_raw.loc[dataTable_raw['Structure'].notna()]
print(dataTable_raw2.shape)

(348550, 46)


In [7]:
def _cleanUpSmiles(smi):
    """Tidy up a raw SMILES string and return the SMILES RDKit writes back for it.

    The text before the first ``|`` is kept, line breaks and surrounding
    whitespace are removed, and the result is round-tripped through
    ``Chem.MolFromSmiles`` / ``Chem.MolToSmiles``.

    Args:
        smi: Raw SMILES value. Anything that is not a ``str`` (e.g. NaN or
            None for a missing value) is treated as invalid.

    Returns:
        The SMILES produced by ``Chem.MolToSmiles``, or ``np.nan`` when the
        input is not a string, is empty after clean-up, or cannot be parsed.
    """
    ## text processing
    if not isinstance(smi, str):
        # Missing values are not strings; there is nothing to parse.
        return np.nan

    # Drop everything from the first "|" onwards, then remove line breaks.
    # (Removing "\n" and "\r" individually already covers "\r\n".)
    smi = smi.split("|")[0]
    smi = smi.replace("\n", "").replace("\r", "").strip()
    if not smi:
        # Report an empty string as missing rather than passing it to RDKit.
        return np.nan

    ## rdkit smiles vadality checking
    try:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # RDKit signals an unparsable SMILES by returning None.
            return np.nan
        return Chem.MolToSmiles(mol)
    except Exception:
        # Any RDKit failure marks the structure as invalid; unlike a bare
        # `except:` this still lets KeyboardInterrupt/SystemExit through.
        return np.nan


def Step_2_clean_data(dataTable, dict_prop_cols, colName_mid, colName_smi, tmp_folder="./tmp"):
    """Validate the SMILES column and drop rows without a usable ID or structure.

    The input frame is not modified; all work happens on a copy, so the cell
    calling this function can be re-run safely.

    Args:
        dataTable: Input ``pandas.DataFrame``.
        dict_prop_cols: Not used by this step; kept for interface compatibility.
        colName_mid: Name of the molecule-ID column.
        colName_smi: Name of the SMILES column.
        tmp_folder: Not used by this step; kept for interface compatibility.

    Returns:
        A new DataFrame with a fresh 0..n-1 index in which
        ``<colName_smi>_raw`` holds the original SMILES, ``colName_smi`` holds
        the output of ``_cleanUpSmiles``, and rows with a missing value in
        ``colName_mid`` or ``colName_smi`` have been removed.
    """
    ## count time
    beginTime = time.time()
    print(f"2. Cleaning data ...")
    ## ------------------------------------------------------------------
    print(f'\tChecking the vadality of the SMILES using RDKit ...')
    # Work on a copy so the caller's frame keeps its original SMILES column.
    cleaned = dataTable.copy()
    cleaned[f"{colName_smi}_raw"] = cleaned[colName_smi]
    cleaned[colName_smi] = cleaned[colName_smi].apply(_cleanUpSmiles)

    ## ------------------------- remove invalid smiles -------------------------
    cleaned = cleaned.dropna(subset=[colName_mid, colName_smi]).reset_index(drop=True)
    print(f'\tThere are total <{cleaned.shape[0]}> molecules with valid SMILES<{colName_smi}>')

    ## ------------------------------------------------------------------
    costTime = time.time() - beginTime
    print(f"==> Step 2 <Cleaning data> complete, costs time = {int(costTime)}s ................\n")
    return cleaned

In [None]:
# Clean the frame that already had rows without a structure removed (dataTable_raw2),
# so missing values are not sent through the SMILES cleaner and dataTable_raw is
# never handed to the cleaning step.
dataTable_4mmp = Step_2_clean_data(
    dataTable=dataTable_raw2,
    dict_prop_cols=None,
    colName_mid='Compound Name',
    colName_smi='Structure',
    tmp_folder="./tmp",
)
print(dataTable_4mmp.shape)
dataTable_4mmp.head(3)