In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import chardet
import numpy as np
import pandas as pd

#### 0_load file

In [2]:
dataFile = f'./D360_dataset_q_id3539_101224_1619.csv'

## determine encoding type
def determine_encoding(dataFile):
    # Step 1: Open the CSV file in binary mode
    with open(dataFile, 'rb') as f:
        data = f.read()
    
    # Step 2: Detect the encoding using the chardet library
    encoding_result = chardet.detect(data)

    # Step 3: Retrieve the encoding information
    encoding = encoding_result['encoding']

    # Step 4: Print/export the detected encoding information
    # print("Detected Encoding:", encoding)
    return encoding

encoding = determine_encoding(dataFile)

## read csv file
dataTable = pd.read_csv(dataFile, encoding=encoding)
print(dataTable.shape)
dataTable.head(3)

(346846, 40)


Unnamed: 0,Compound Name,Structure,Concat;Project,Concat;External Id,Created On,Molecular Weight,Marked,ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Mod),ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Num),ADME MDCK(WT) Permeability;Mean;B to A Papp (10^-6 cm/s);(Mod),...,ADME Tox-manual patch hERG 34C;Mean;Average % of hERG inhibition;(Mod),ADME Tox-manual patch hERG 34C;Mean;Average % of hERG inhibition;(Num),ADME Tox-manual patch hERG 34C;Concat;Comments,ADME Tox-manual patch hERG 34C;Mean;Concentration (uM);(Mod),ADME Tox-manual patch hERG 34C;Mean;Concentration (uM);(Num),ADME Tox-manual patch hERG 34C;Concat;Date run,ADME Tox-manual patch hERG 34C;GMean;m-patch hERG IC50 [uM];(Mod),ADME Tox-manual patch hERG 34C;GMean;m-patch hERG IC50 [uM];(Num),ADME Tox-manual patch hERG 34C;Mean;SD;(Mod),ADME Tox-manual patch hERG 34C;Mean;SD;(Num)
0,KT-0346391,CC(=O)C3=C(C)c1cnc(nc1N(C2CCCC2)C3=O)Nc4ccc(cn...,CBL-C,PH-CMR-CLB-267-0N-001,31-Oct-2024,1121.226,UNMARKED,,,,...,,,,,,,,,,
1,KT-0000036,c1(cc(c(cc1)Nc2nc(ncc2Cl)Nc3c(cc(c(c3)OCCOCCOC...,ZAP-70 and Kinases,ZP-028-001H,11-Feb-2017,911.391,UNMARKED,,,,...,,,,,,,,,,
2,KT-0000038,c1(cc(c(cc1)Nc2nc(ncc2Cl)Nc3c(cc(c(c3)OCCOCCOC...,ZAP-70 and Kinases,ZP-030-001F,11-Feb-2017,999.497,UNMARKED,,,,...,,,,,,,,,,


#### 1_clean up hERG data

In [3]:
## ------------------------------------------------------------------
def CheckThePropertyDataStats(dataTable, col_prop_prefix, propName):
    col_mod, col_num = f"{col_prop_prefix}(Mod)", f"{col_prop_prefix}(Num)"
    if (col_mod in dataTable) and (col_num in dataTable):
        cond_1 = (dataTable[col_mod]=='=')
        cond_2 = (dataTable[col_num].notna())
        # print(dataTable[cond_1].shape, dataTable[cond_2].shape)
        data_size_available = dataTable[cond_1 & cond_2].shape[0]
        print(f"\tThere are total {data_size_available} existing data for {propName}")
        passCheck = True
    else:
        print(f"\tWarning! The column {col_prop_prefix}(Mod)/(Num) is not in the table.")
        passCheck = False
    return passCheck

## ------------------------------------------------------------------
def clean_up_prop_data(row, col_prop_prefix, propName):
    colName_mod = f"{col_prop_prefix}(Mod)"
    colName_num = f"{col_prop_prefix}(Num)"

    if row[colName_mod] == '=' and row.notna()[colName_num]:
        result = row[colName_num] 
    else:
        result = np.nan
    return result

## ------------------------------------------------------------------
def rm_elacridar_records(row, col_perctgF='Bioavailability', col_vehicle='ADME PK;Concat;Vehicle'):
    result = row[col_perctgF]
    if row.notna()[col_vehicle]:
        if 'elacridar' in row[col_vehicle]:
            result = np.nan
            print(f"\t------>change from {row[col_perctgF]} to np.nan, {row[col_vehicle]}")
    return result

## ------------------------------------------------------------------
def calc_mean(value_list):
    value_list_clean = []
    for v in value_list:
        if v not in [None, np.nan, '', ' ']:
            try:
                v_num = float(v)
            except Exception as e:
                print(f'\tError, cannot numericalize value {v}', e)
            else:
                value_list_clean.append(v_num)
    return np.mean(value_list_clean)

def calc_eIC50_hERG_from_cmt(comments_str):
    # e.g., comments_str = '21.38% inhibition @ 10 ?M' or '11.17 inhibition @ 3 ?M'
    try:
        [str_inhb, str_conc] = comments_str.split('@')

        if '%' in str_inhb:
            inhb = str_inhb.split('%')[0]
        elif 'inhibit' in str_inhb:
            inhb = str_inhb.split('inhibit')[0]
        else:
            inhb = 'N/A'
        
        try:
            inhb = float(inhb)
        except:
            eIC50 = None
        else:
            inhb = 0.1 if inhb < 0 else (99.99 if inhb > 100 else inhb)
            conc = float(str_conc.split('M')[0][:-1])
            eIC50 = conc*(100-inhb)/inhb
            
    except Exception as e:
        eIC50 = None
        if comments_str not in [' ', '/']:
            print(f'\tError, cannot calc hERG eIC50 from comment data. {comments_str}')
    return eIC50

def calc_hERG_eIC50(row, col_hERG_cmts):
    if col_hERG_cmts in row:
        if row.notna()[col_hERG_cmts]:
            hERG_eIC50_list = []
            for cmnt in row[col_hERG_cmts].split(';'):
                this_eIC50 = calc_eIC50_hERG_from_cmt(cmnt)
                hERG_eIC50_list.append(this_eIC50)
            hERG_eIC50 = calc_mean(hERG_eIC50_list)
        else:
            result = np.nan
            # print(f"\tNo data in this row for column <{col_hERG_cmts}>")
    else:
        result = np.nan
        print(f"\tColumn <{col_hERG_cmts}> is not in the Table")
     
def calc_hERG_mIC50(row, col_hERG_IC50, col_hERG_eIC50):
    if row.notna()[col_hERG_IC50]:
        result = row[col_hERG_IC50]
    elif row.notna()[col_hERG_eIC50]:
        result = row[col_hERG_eIC50]
    else:
        result = np.nan
    return result

## ------------------------------------------------------------------
def calc_EstFa_fromAdm(PKF_PO, Clobs_IV, Species='Rat'):
    dict_IV_ratio = {'Rat': 90, 'Mouse': 70, 'Dog': 30, 'Monkey': 44}    
    try:
        estfa = (PKF_PO/100)/(1-(Clobs_IV/dict_IV_ratio[Species]))
    except Exception as e:
        estfa = np.nan
    return estfa

def calc_EstFa(row, colName_pctF, colName_Clobs, Species='Rat'):
    try:
        pctgF_PO, Clobs_IV = row[colName_pctgF], row[colName_Clobs]
    except Exception as e:
        # print(f"\tWarning! Cannot get data for this row from column <{colName_pctgF}> or <{colName_Clobs}>")
        result = np.nan
    else:
        result = calc_EstFa(pctgF_PO, Clobs_IV, Species=Species)
    return result


## ------------------------------------------------------------------
dict_prop_cols = {
    'Permeability': 'ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);', 
    'Efflux': 'ADME MDCK (MDR1) efflux;Mean;Efflux Ratio;', 
    'Bioavailability': 'ADME PK;Mean;F %;Dose: 10.000 (mg/kg);Route of Administration: PO;Species: Rat;', 
    'Cl_obs': 'Copy 1 ;ADME PK;Mean;Cl_obs(mL/min/kg);Dose: 2.000 (mg/kg);Route of Administration: IV;Species: Rat;',
    'hERG_IC50': 'ADME Tox-manual patch hERG 34C;GMean;m-patch hERG IC50 [uM];',
    'hERG_eIC50': 'ADME Tox-manual patch hERG 34C;Concat;Comments',
    'estFa': 'Not Availale',
    'MW': 'Molecular Weight',
    }

## ------------------------------------------------------------------
for prop in dict_prop_cols:
    passCheck = CheckThePropertyDataStats(dataTable, col_prop_prefix=dict_prop_cols[prop], propName=prop)
    if passCheck:
        dataTable[prop] = dataTable.apply(lambda row: clean_up_prop_data(row, col_prop_prefix=dict_prop_cols[prop], propName=prop), axis=1)

    ## remove the 'elacridar' records
    if prop == 'Bioavailability':
        print(f"\t==>The num rows with cleaned {prop} data (raw) is:", str(dataTable[dataTable[prop].notna()].shape[0]))
        dataTable[prop] = dataTable.apply(lambda row: rm_elacridar_records(row, col_perctgF=prop, col_vehicle='ADME PK;Concat;Vehicle'), axis=1)
        print(f"\t==>The num rows with cleaned {prop} data (no elacridar) is:", str(dataTable[dataTable[prop].notna()].shape[0]))

    ## calc estFa
    if prop == 'estFa':
        dataTable[prop] = dataTable.apply(lambda row: calc_EstFa(row, 'Bioavailability', 'Cl_obs', Species='Rat'), axis=1)

    ## calc hERG eIC50
    if prop == 'hERG_eIC50':
        dataTable[prop] = dataTable.apply(lambda row: calc_hERG_eIC50(row, dict_prop_cols[prop]), axis=1)
        dataTable['hERG_mixedIC50'] = dataTable.apply(lambda row: calc_hERG_mIC50(row, 'hERG_IC50', 'hERG_eIC50'), axis=1)

    ## rename MW
    if prop == 'MW':
        dataTable[prop] = dataTable[dict_prop_cols[prop]].apply(lambda x: x)

    ## report
    print(f"\t==>The num rows with cleaned {prop} data is:", str(dataTable[dataTable[prop].notna()].shape[0]))


	There are total 3730 existing data for Permeability
	==>The num rows with cleaned Permeability data is: 3730
	There are total 2264 existing data for Efflux
	==>The num rows with cleaned Efflux data is: 2264
	There are total 3170 existing data for Bioavailability
	==>The num rows with cleaned Bioavailability data (raw) is: 3170
	==>The num rows with cleaned Bioavailability data (no elacridar) is: 3170
	==>The num rows with cleaned Bioavailability data is: 3170
	There are total 4085 existing data for Cl_obs
	==>The num rows with cleaned Cl_obs data is: 4085
	There are total 806 existing data for hERG_IC50
	==>The num rows with cleaned hERG_IC50 data is: 806
	==>The num rows with cleaned hERG_eIC50 data is: 0
	==>The num rows with cleaned estFa data is: 0
	==>The num rows with cleaned MW data is: 346846


In [4]:
colNames_basic = ['Compound Name', 'Structure', 'Concat;Project', 'Concat;External Id'] 
dataTable_short = dataTable[colNames_basic + list(dict_prop_cols.keys())]
print(dataTable_short.shape)
dataTable_short.head(3)

(346846, 12)


Unnamed: 0,Compound Name,Structure,Concat;Project,Concat;External Id,Permeability,Efflux,Bioavailability,Cl_obs,hERG_IC50,hERG_eIC50,estFa,MW
0,KT-0346391,CC(=O)C3=C(C)c1cnc(nc1N(C2CCCC2)C3=O)Nc4ccc(cn...,CBL-C,PH-CMR-CLB-267-0N-001,,,,,,,,1121.226
1,KT-0000036,c1(cc(c(cc1)Nc2nc(ncc2Cl)Nc3c(cc(c(c3)OCCOCCOC...,ZAP-70 and Kinases,ZP-028-001H,,,,,,,,911.391
2,KT-0000038,c1(cc(c(cc1)Nc2nc(ncc2Cl)Nc3c(cc(c(c3)OCCOCCOC...,ZAP-70 and Kinases,ZP-030-001F,,,,,,,,999.497


In [5]:
# dataTable_short.to_csv(f'Data_4_MMP_2024Dec10.csv', index=False)