In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import chardet
import numpy as np
import pandas as pd

#### 0. Load file

In [2]:
# Relative path of the D360 CSV export that this notebook loads.
dataFile = './D360_dataset_q_id3539_101224_1318.csv'

## determine encoding type
def determine_encoding(dataFile):
    """Guess the text encoding of a file using ``chardet``.

    The complete file is read as raw bytes and handed to ``chardet.detect``.

    Parameters
    ----------
    dataFile : path of the file to inspect.

    Returns
    -------
    The value stored under the ``'encoding'`` key of the detection result.
    """
    with open(dataFile, 'rb') as raw_file:
        raw_bytes = raw_file.read()

    detection = chardet.detect(raw_bytes)
    return detection['encoding']

# Detect the encoding up front so pandas decodes the CSV with it.
encoding = determine_encoding(dataFile)

# Load the export, then show its dimensions and the first three rows.
dataTable = pd.read_csv(filepath_or_buffer=dataFile, encoding=encoding)
print(dataTable.shape)
dataTable.head(n=3)

(48288, 39)


Unnamed: 0,Compound Name,Structure,Concat;Project,Concat;External Id,Created On,Molecular Weight,ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Mod),ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Num),ADME MDCK(WT) Permeability;Mean;B to A Papp (10^-6 cm/s);(Mod),ADME MDCK(WT) Permeability;Mean;B to A Papp (10^-6 cm/s);(Num),...,ADME Tox-manual patch hERG 34C;Mean;Average % of hERG inhibition;(Num),ADME Tox-manual patch hERG 34C;Concat;Comments,ADME Tox-manual patch hERG 34C;Mean;Concentration (uM);(Mod),ADME Tox-manual patch hERG 34C;Mean;Concentration (uM);(Num),ADME Tox-manual patch hERG 34C;Concat;Date run,ADME Tox-manual patch hERG 34C;GMean;m-patch hERG IC50 [uM];(Mod),ADME Tox-manual patch hERG 34C;GMean;m-patch hERG IC50 [uM];(Num),ADME Tox-manual patch hERG 34C;Mean;SD;(Mod),ADME Tox-manual patch hERG 34C;Mean;SD;(Num),Marked
0,KT-0026805,Clc4nc3c(c1c(sc2c1N(C[C@H](NC2=O)C)C)cc3)cc4,MK2,MK2-325-001T,12-Aug-2021,331.828,,,,,...,,,,,,,,,,UNMARKED
1,KT-0026821,F[C@@H]1[C@@H](C1)NC(=O)c3n2nc(cc(c2nc3)NC4CCN...,TYK2,PH-CMR-TK2-191-0N-001,13-Aug-2021,774.818,=,0.904733,=,1.147038,...,,,,,,,,,,UNMARKED
2,KT-0026822,n1n8c(c(cc1NC2=CC=CN(C2=O)c3ccccn3)NCCCN4CCC(C...,TYK2,PH-CMR-TK2-192-0N-001,13-Aug-2021,802.872,=,1.421024,=,2.045365,...,,,,,,,,,,UNMARKED


In [5]:
def extractPropertyDataFromD360Table(row, colName_mod, colName_num):
    """Return a row's numeric value only when its modifier cell is exactly '='.

    Parameters
    ----------
    row : pandas Series (one table row) or any mapping keyed by column name.
    colName_mod : name of the modifier column (holds '=' or another qualifier).
    colName_num : name of the column holding the value itself.

    Returns
    -------
    ``row[colName_num]`` when both columns exist, both cells are non-missing
    and the modifier equals '='; ``np.nan`` in every other case.
    """
    if colName_mod not in row or colName_num not in row:
        return np.nan

    modifier, value = row[colName_mod], row[colName_num]

    # Test just the two cells involved. Calling row.notna() would build a
    # NaN mask over every column of the row on each of the many calls made
    # per row, and it would also restrict `row` to pandas Series.
    if pd.isna(modifier) or pd.isna(value):
        return np.nan

    return value if modifier == '=' else np.nan

def calc_mean(value_list):
    """Average the usable numeric entries of ``value_list``.

    Entries that are ``None``, blank strings or NaN are skipped. Entries that
    cannot be converted with ``float()`` are reported with a printed message
    and skipped as well.

    Parameters
    ----------
    value_list : iterable of numbers, numeric strings, ``None`` or NaN.

    Returns
    -------
    The arithmetic mean of the remaining values, or ``np.nan`` when nothing
    usable is left.
    """
    value_list_clean = []
    for v in value_list:
        if v is None or (isinstance(v, str) and not v.strip()):
            continue
        try:
            v_num = float(v)
        except (TypeError, ValueError, OverflowError) as e:
            print(f'Error, cannot numericalize value {v}', e)
            continue
        # Filter NaN by value: a membership test against np.nan only matches
        # that exact object, so float('nan') or computed NaNs would slip
        # through and turn the whole mean into NaN.
        if np.isnan(v_num):
            continue
        value_list_clean.append(v_num)

    # np.mean([]) raises a RuntimeWarning; return NaN explicitly instead.
    if not value_list_clean:
        return np.nan
    return np.mean(value_list_clean)

def calc_eIC50_hERG(comments_str):
    """Estimate a hERG IC50 from one single-point inhibition comment.

    Expected comment shapes (text before '@' is the inhibition, text after
    it is the concentration):
        '21.38% inhibition @ 10 ?M'
        '11.17 inhibition @ 3 ?M'

    The estimate is ``conc * (100 - inhibition) / inhibition``.

    Parameters
    ----------
    comments_str : one comment string.

    Returns
    -------
    The estimated IC50 as a float, or ``None`` when the comment cannot be
    interpreted. Parse failures are printed, except for the placeholder
    comments ' ' and '/'.
    """
    try:
        inhibition_text, concentration_text = comments_str.split('@')

        if '%' in inhibition_text:
            inhibition_text = inhibition_text.split('%')[0]
        elif 'inhibit' in inhibition_text:
            inhibition_text = inhibition_text.split('inhibit')[0]
        else:
            # No recognisable inhibition value: silently give up.
            return None

        try:
            inhb = float(inhibition_text)
        except ValueError:
            return None

        # Clamp into the open interval (0, 100), boundaries included:
        # exactly 0 % would divide by zero, and exactly 100 % would yield 0,
        # which is lower than the value produced for readings above 100 %.
        if inhb <= 0:
            inhb = 0.1
        elif inhb >= 100:
            inhb = 99.99

        # Take the text before 'M' and drop its last character, i.e. the
        # unit prefix ('u', or '?' as in the examples above).
        conc = float(concentration_text.split('M')[0][:-1])
        return conc * (100 - inhb) / inhb

    except Exception:
        if comments_str not in [' ', '/']:
            print(f'Error, cannot calc hERG eIC50 from comment data. {comments_str}')
        return None

def calc_EstFa(PKF_PO, Clobs_IV, Species='Rat'):
    """Compute ``(PKF_PO / 100) / (1 - Clobs_IV / ratio[Species])``.

    Parameters
    ----------
    PKF_PO : value divided by 100 in the numerator (the caller passes F %).
    Clobs_IV : value divided by the species constant (the caller passes Cl_obs).
    Species : key into the per-species constant table; one of 'Rat',
        'Mouse', 'Dog', 'Monkey'.

    Returns
    -------
    The computed value, or ``np.nan`` when the species is unknown, an input
    is unusable, or the denominator is zero.
    """
    # NOTE(review): these look like species hepatic blood-flow values
    # (mL/min/kg). Literature tables commonly list ~90 for mouse and ~55-70
    # for rat, so please confirm the Rat and Mouse entries are not swapped.
    dict_IV_ratio = {'Rat': 90, 'Mouse': 70, 'Dog': 30, 'Monkey': 44}
    try:
        denominator = 1 - (Clobs_IV / dict_IV_ratio[Species])
        # Python floats raise ZeroDivisionError on division by zero, whereas
        # numpy scalars return inf with a warning. Treat both as "no value".
        # Only scalars are checked, so array-like inputs still pass through.
        if np.ndim(denominator) == 0 and denominator == 0:
            return np.nan
        estfa = (PKF_PO / 100) / denominator
    except Exception:
        # Best effort by design: an unknown species or unusable input gives NaN.
        estfa = np.nan
    return estfa

#########################################################################################
def clean_up_permeability(row):
    """Look up the MDCK(WT) A-to-B Papp value of one table row.

    Builds the '(Mod)' and '(Num)' column names of the
    'ADME MDCK(WT) Permeability' mean A-to-B Papp readout and returns
    whatever ``extractPropertyDataFromD360Table`` yields for them.
    """
    assay = 'ADME MDCK(WT) Permeability'
    mod_col, num_col = (
        f'{assay};Mean;A to B Papp (10^-6 cm/s);({kind})' for kind in ('Mod', 'Num')
    )
    return extractPropertyDataFromD360Table(row, mod_col, num_col)

def clean_up_efflux(row):
    """Look up the MDCK (MDR1) efflux ratio of one table row.

    Builds the '(Mod)' and '(Num)' column names of the
    'ADME MDCK (MDR1) efflux' mean efflux-ratio readout and returns whatever
    ``extractPropertyDataFromD360Table`` yields for them.
    """
    assay = 'ADME MDCK (MDR1) efflux'
    mod_col, num_col = (
        f'{assay};Mean;Efflux Ratio;({kind})' for kind in ('Mod', 'Num')
    )
    return extractPropertyDataFromD360Table(row, mod_col, num_col)

def clean_up_hERG(row, eIC50=False):
    """Collect hERG IC50 information for one table row.

    Parameters
    ----------
    row : pandas Series holding one row of the D360 table.
    eIC50 : when true, also estimate an IC50 from the assay's comment column.

    Returns
    -------
    With ``eIC50=False``: the measured IC50 taken from the ';GMean;' columns
    (NaN when unavailable).
    With ``eIC50=True``: a 4-element Series
    ``[measured IC50, estimated IC50, mixed IC50, flag]``. The mixed value is
    the measured IC50 when present (flag 0), otherwise the estimate (flag 1),
    otherwise NaN (flag NaN).
    """
    assay = 'ADME Tox-manual patch hERG 34C'

    ## measured IC50
    hERG_IC50 = extractPropertyDataFromD360Table(
        row,
        f'{assay};GMean;m-patch hERG IC50 [uM];(Mod)',
        f'{assay};GMean;m-patch hERG IC50 [uM];(Num)',
    )
    if not eIC50:
        return hERG_IC50

    ## IC50 estimated from the ';'-separated entries of the comments column
    comment_col = f'{assay};Concat;Comments'
    estimates = []
    if comment_col in row and row.notna()[comment_col]:
        estimates = [calc_eIC50_hERG(entry) for entry in row[comment_col].split(';')]
    hERG_eIC50 = calc_mean(estimates)

    ## prefer the measured value, fall back to the estimate
    if not np.isnan(hERG_IC50):
        mixed_value, source_flag = hERG_IC50, 0
    elif not np.isnan(hERG_eIC50):
        mixed_value, source_flag = hERG_eIC50, 1
    else:
        mixed_value, source_flag = np.nan, np.nan

    return pd.Series([hERG_IC50, hERG_eIC50, mixed_value, source_flag])

def clean_up_PK(row, Species='Rat', EstFa=False):
    """Extract the oral F % of one table row and optionally an estimated Fa.

    Parameters
    ----------
    row : one row of the D360 table.
    Species : species name embedded in the PK column names; also forwarded
        to ``calc_EstFa``.
    EstFa : when true, also read the IV Cl_obs columns and compute the
        estimate with ``calc_EstFa``.

    Returns
    -------
    With ``EstFa=False``: the F % value (NaN when unavailable).
    With ``EstFa=True``: a 2-element Series ``[F %, estimated Fa]``.
    """
    colName_PKF_mod = f'ADME PK;Mean;F %;Dose: 10.000 (mg/kg);Route of Administration: PO;Species: {Species};(Mod)'
    colName_PKF_num = f'ADME PK;Mean;F %;Dose: 10.000 (mg/kg);Route of Administration: PO;Species: {Species};(Num)'
    PKF_PO = extractPropertyDataFromD360Table(row, colName_PKF_mod, colName_PKF_num)

    if not EstFa:
        return PKF_PO

    colName_Cl_mod = f'Copy 1 ;ADME PK;Mean;Cl_obs(mL/min/kg);Dose: 2.000 (mg/kg);Route of Administration: IV;Species: {Species};(Mod)'
    colName_Cl_num = f'Copy 1 ;ADME PK;Mean;Cl_obs(mL/min/kg);Dose: 2.000 (mg/kg);Route of Administration: IV;Species: {Species};(Num)'
    Clobs_IV = extractPropertyDataFromD360Table(row, colName_Cl_mod, colName_Cl_num)

    # Forward the species: without it calc_EstFa falls back to its 'Rat'
    # default even when the columns read above belong to another species.
    # The result gets its own name so the EstFa flag is not overwritten.
    estimated_fa = calc_EstFa(PKF_PO, Clobs_IV, Species=Species)
    return pd.Series([PKF_PO, estimated_fa])

In [6]:
#########################################################################################
# Species whose PK columns are extracted below.
Species = 'Rat'

# Disabled: dataTable['logD_CDD'] = dataTable['Log D'].apply(lambda x: x)

# Plain copy of the molecular weight under a shorter column name.
dataTable['MW'] = dataTable['Molecular Weight'].copy()

# Row-wise extraction; keyword arguments are forwarded to the helper by apply.
dataTable[[f'F%_{Species}', f'EstFa_{Species}']] = dataTable.apply(
    clean_up_PK, axis=1, Species=Species, EstFa=True
)
dataTable['permeability'] = dataTable.apply(clean_up_permeability, axis=1)
dataTable['efflux'] = dataTable.apply(clean_up_efflux, axis=1)
dataTable[['hERG_IC50', 'hERG_eIC50', 'hERG_mixedIC50', 'ambitiousData']] = dataTable.apply(
    clean_up_hERG, axis=1, eIC50=True
)

print(dataTable.shape)
dataTable.head(n=3)

(48288, 48)


Unnamed: 0,Compound Name,Structure,Concat;Project,Concat;External Id,Created On,Molecular Weight,ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Mod),ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Num),ADME MDCK(WT) Permeability;Mean;B to A Papp (10^-6 cm/s);(Mod),ADME MDCK(WT) Permeability;Mean;B to A Papp (10^-6 cm/s);(Num),...,Marked,MW,F%_Rat,EstFa_Rat,permeability,efflux,hERG_IC50,hERG_eIC50,hERG_mixedIC50,ambitiousData
0,KT-0026805,Clc4nc3c(c1c(sc2c1N(C[C@H](NC2=O)C)C)cc3)cc4,MK2,MK2-325-001T,12-Aug-2021,331.828,,,,,...,UNMARKED,331.828,,,,,,,,
1,KT-0026821,F[C@@H]1[C@@H](C1)NC(=O)c3n2nc(cc(c2nc3)NC4CCN...,TYK2,PH-CMR-TK2-191-0N-001,13-Aug-2021,774.818,=,0.904733,=,1.147038,...,UNMARKED,774.818,,,0.904733,2.446261,,,,
2,KT-0026822,n1n8c(c(cc1NC2=CC=CN(C2=O)c3ccccn3)NCCCN4CCC(C...,TYK2,PH-CMR-TK2-192-0N-001,13-Aug-2021,802.872,=,1.421024,=,2.045365,...,UNMARKED,802.872,,,1.421024,7.164047,,,,


In [None]:
# Write the table, including the derived columns, to CSV without the index.
dataTable.to_csv('Data_4_MMP_2024Dec10.csv', index=False)

In [None]:
# Number of rows per value of the 'Concat;Project' column.
dataTable.loc[:, 'Concat;Project'].value_counts()