In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import chardet
import numpy as np
import pandas as pd

In [2]:
def determine_encoding(dataFile):
    """Guess the text encoding of a file from its raw bytes.

    The whole file is read in binary mode and passed to ``chardet.detect``;
    the value stored under the ``'encoding'`` key of the result is returned.

    Args:
        dataFile: Path of the file to inspect.

    Returns:
        The ``'encoding'`` entry of chardet's detection result.
    """
    with open(dataFile, 'rb') as handle:
        raw_bytes = handle.read()

    # chardet reports a dict; only its 'encoding' entry is needed here.
    return chardet.detect(raw_bytes)['encoding']

In [3]:
# Path of the exported CSV, relative to the notebook's working directory.
# (Plain string: the original f-prefix had no placeholders to interpolate.)
dataFile = './DataView_ADMET_4_MMP_1__export.csv'
# Detect the encoding first so pandas decodes the file with the matching codec.
encoding = determine_encoding(dataFile)

dataTable = pd.read_csv(dataFile, encoding=encoding)
dataTable.head(3)

Unnamed: 0,Compound Name,Structure,Concat;Project,Concat;External Id,ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Mod),ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Num),ADME MDCK(WT) Permeability;Mean;B to A Papp (10^-6 cm/s);(Mod),ADME MDCK(WT) Permeability;Mean;B to A Papp (10^-6 cm/s);(Num),ADME MDCK(WT) Permeability;Concat;Comments,ADME MDCK(WT) Permeability;Concat;Run Date,...,ADME Tox-manual patch hERG 34C;Mean;Average % of hERG inhibition;(Mod),ADME Tox-manual patch hERG 34C;Mean;Average % of hERG inhibition;(Num),ADME Tox-manual patch hERG 34C;Concat;Comments,ADME Tox-manual patch hERG 34C;Mean;Concentration (uM);(Mod),ADME Tox-manual patch hERG 34C;Mean;Concentration (uM);(Num),ADME Tox-manual patch hERG 34C;Concat;Date run,ADME Tox-manual patch hERG 34C;GMean;m-patch hERG IC50 [uM];(Mod),ADME Tox-manual patch hERG 34C;GMean;m-patch hERG IC50 [uM];(Num),ADME Tox-manual patch hERG 34C;Mean;SD;(Mod),ADME Tox-manual patch hERG 34C;Mean;SD;(Num)
0,KT-0194988,C3(CCN(C(=O)c1cc(c(c(c1)N2CCC(=O)NC2=O)Cl)C)CC...,TYK2,PH-CMR-TK2-2575-0N-001,,,,,,,...,=,14.741188,21.36% inhibition @ 3 ?M; ; ;,=,1.433333,5/21/2024;5/21/2024;5/21/2024;5/21/2024,>,3.0,=,0.467074
1,KT-0194990,N1(CCC(=O)NC1=O)c2cc(cc(c2C)Cl)C(=O)N%11CCC%10...,TYK2,PH-CMR-TK2-2610-0N-001,,,,,,,...,=,18.038312,24.47% inhibition @ 3 ?M; ; ;,=,1.433333,5/21/2024;5/21/2024;5/21/2024;5/21/2024,>,3.0,=,1.265248
2,KT-0194991,CNc1cc(nn2c(cnc21)C(=O)N[C@H]3[C@H](OC)CC3)N5c...,TYK2,PH-CMR-TK2-2615-0N-001,,,,,,,...,=,17.836152,,=,1.433333,5/21/2024;5/21/2024;5/21/2024;5/21/2024,=,5.19295,=,4.445823


In [4]:
def extractPropertyDataFromD360Table(row, colName_mod, colName_num):
    """Return a property's numeric value only when its modifier is exactly '='.

    Each property is read from two cells of ``row``: a modifier cell
    (``colName_mod``) and a numeric cell (``colName_num``). The number is
    returned only when both columns exist, both cells are non-missing, and the
    modifier equals ``'='``; any other modifier (for example ``'>'``) yields
    ``np.nan``.

    Args:
        row: A DataFrame row (``pd.Series``) or any mapping keyed by column name.
        colName_mod: Name of the modifier column.
        colName_num: Name of the numeric column.

    Returns:
        ``row[colName_num]`` when the modifier is ``'='``, otherwise ``np.nan``.
    """
    if colName_mod not in row or colName_num not in row:
        return np.nan

    modifier, value = row[colName_mod], row[colName_num]
    # Check only the two cells of interest: building a not-NA mask over the
    # entire row (``row.notna()``) on every call is wasted work when this is
    # applied row by row over a wide table.
    if pd.isna(modifier) or pd.isna(value):
        return np.nan

    return value if modifier == '=' else np.nan

def calc_mean(value_list):
    """Average the numeric entries of a list, skipping blanks and missing values.

    ``None``, ``''`` and ``' '`` are skipped silently. Every other entry is
    converted with ``float``; entries that cannot be converted are reported on
    stdout and skipped. Entries that convert to NaN are skipped as well.

    Args:
        value_list: Iterable of numbers, numeric strings, or placeholders.

    Returns:
        The arithmetic mean of the usable entries, or ``np.nan`` when there are none.
    """
    value_list_clean = []
    for v in value_list:
        # Expected placeholders: skip without a message.
        if v is None or (isinstance(v, str) and v in ('', ' ')):
            continue
        try:
            v_num = float(v)
        except (TypeError, ValueError, OverflowError) as e:
            print(f'Error, cannot numericalize value {v}', e)
            continue
        # A membership test against [np.nan] only matches the np.nan singleton
        # (NaN != NaN), so NaN is detected by value after conversion instead;
        # otherwise a single NaN would turn the whole mean into NaN.
        if np.isnan(v_num):
            continue
        value_list_clean.append(v_num)

    # Return NaN explicitly rather than relying on np.mean([]), which emits a
    # RuntimeWarning before producing the same value.
    if not value_list_clean:
        return np.nan
    return np.mean(value_list_clean)

def calc_eIC50_hERG(comments_str):
    """Estimate a hERG IC50 from one single-point inhibition comment.

    The comment is expected to look like ``'21.38% inhibition @ 10 ?M'``: a
    percent inhibition, an ``@``, then a concentration followed by a
    one-character unit prefix and ``M``. The estimate is
    ``conc * (100 - inhb) / inhb``.

    Args:
        comments_str: One comment fragment (a single ``;``-separated piece).

    Returns:
        The estimated IC50 as a float, or ``None`` when the fragment cannot be
        parsed. Unparseable fragments are reported on stdout unless they are
        blank or a lone ``'/'`` placeholder.
    """
    try:
        [str_inhb, str_conc] = comments_str.split('@')
        inhb = float(str_inhb.split('%')[0])
        # Clamp into (0, 100). The bounds are inclusive so that exactly 0 %
        # does not divide by zero and exactly 100 % does not yield an IC50 of 0,
        # matching how values just outside the range are already treated.
        inhb = 0.1 if inhb <= 0 else (99.99 if inhb >= 100 else inhb)
        # Take the text before 'M' and drop its last character (the unit prefix).
        # NOTE(review): the prefix is discarded, so every concentration is
        # presumably in the same unit -- confirm against the data source.
        conc = float(str_conc.split('M')[0][:-1])
        eIC50 = conc*(100-inhb)/inhb
    except Exception:
        eIC50 = None
        # Splitting a comment on ';' leaves '' and ' ' fragments behind; those
        # and the '/' placeholder are expected, so they are not reported.
        is_placeholder = isinstance(comments_str, str) and comments_str.strip() in ('', '/')
        if not is_placeholder:
            print(f'Error, cannot calc hERG eIC50 from comment data. {comments_str}')
    return eIC50

def calc_EstFa(PKF_PO, Clobs_IV, Species='Rat'):
    """Estimate the fraction absorbed from oral F% and observed IV clearance.

    Computes ``(PKF_PO / 100) / (1 - Clobs_IV / ref)`` where ``ref`` is the
    per-species constant looked up in ``dict_IV_ratio``.

    Args:
        PKF_PO: Oral bioavailability in percent.
        Clobs_IV: Observed IV clearance.
        Species: Key into the per-species constant table.

    Returns:
        The estimate, or ``np.nan`` when the species is unknown, an input is
        not numeric, or the denominator is zero.
    """
    # NOTE(review): presumably reference blood-flow values in the same unit as
    # Clobs_IV (mL/min/kg) -- confirm the source of these constants.
    dict_IV_ratio = {'Rat': 90, 'Mouse': 70, 'Dog': 30, 'Monkey': 44}
    try:
        denominator = 1 - (Clobs_IV / dict_IV_ratio[Species])
        # Python floats raise ZeroDivisionError on x / 0 but NumPy floats return
        # inf, so test explicitly to give NaN for both.
        if denominator == 0:
            return np.nan
        # NOTE(review): a clearance above the reference value makes the
        # denominator negative and the estimate negative; that result is
        # returned unchanged -- confirm whether it should be masked.
        estfa = (PKF_PO / 100) / denominator
    except (KeyError, TypeError, ZeroDivisionError):
        estfa = np.nan
    return estfa

#########################################################################################
def clean_up_permeability(row):
    """Read the MDCK(WT) A-to-B Papp value from one table row.

    Delegates to ``extractPropertyDataFromD360Table`` with the modifier and
    numeric column names of the mean A-to-B Papp measurement.

    Args:
        row: One row of the data table.

    Returns:
        Whatever ``extractPropertyDataFromD360Table`` returns for that column pair.
    """
    papp_a2b = 'ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s)'
    return extractPropertyDataFromD360Table(row, f'{papp_a2b};(Mod)', f'{papp_a2b};(Num)')

def clean_up_efflux(row):
    """Read the MDCK (MDR1) efflux ratio from one table row.

    Only the mean efflux-ratio column pair is consulted; it is passed to
    ``extractPropertyDataFromD360Table``.

    Args:
        row: One row of the data table.

    Returns:
        Whatever ``extractPropertyDataFromD360Table`` returns for that column pair.
    """
    efflux_ratio = 'ADME MDCK (MDR1) efflux;Mean;Efflux Ratio'
    return extractPropertyDataFromD360Table(row, f'{efflux_ratio};(Mod)', f'{efflux_ratio};(Num)')

def clean_up_hERG(row, eIC50=False):
    """Collect hERG IC50 information from one table row.

    Args:
        row: One row of the data table.
        eIC50: When true, also derive an estimated IC50 from the comments
            column and combine it with the measured value.

    Returns:
        With ``eIC50`` false: the measured IC50 as returned by
        ``extractPropertyDataFromD360Table``.
        With ``eIC50`` true: a ``pd.Series`` of four items -- measured IC50,
        mean estimated IC50, the value chosen between them (measured first),
        and a flag that is 0 when the measured value was used, 1 when the
        estimate was used, and NaN when neither is available.
    """
    prefix = 'ADME Tox-manual patch hERG 34C'

    # Measured IC50 from the GMean column pair.
    ic50_base = f'{prefix};GMean;m-patch hERG IC50 [uM]'
    measured = extractPropertyDataFromD360Table(row, f'{ic50_base};(Mod)', f'{ic50_base};(Num)')
    if not eIC50:
        return measured

    # Estimated IC50: one estimate per ';'-separated comment fragment, averaged.
    cmnt_col = f'{prefix};Concat;Comments'
    if cmnt_col in row and row.notna()[cmnt_col]:
        estimates = [calc_eIC50_hERG(fragment) for fragment in row[cmnt_col].split(';')]
    else:
        estimates = []
    estimated = calc_mean(estimates)

    # Prefer the measured value; fall back to the estimate and flag it.
    if not np.isnan(measured):
        mixed, flag = measured, 0
    elif not np.isnan(estimated):
        mixed, flag = estimated, 1
    else:
        mixed, flag = np.nan, np.nan
    return pd.Series([measured, estimated, mixed, flag])

def clean_up_PK(row, Species='Rat', EstFa=False):
    """Extract oral bioavailability, and optionally the estimated fraction absorbed.

    Args:
        row: One row of the data table.
        Species: Species name interpolated into the PK column names and passed
            on to ``calc_EstFa``.
        EstFa: When true, also read the IV observed clearance and compute the
            estimated fraction absorbed.

    Returns:
        With ``EstFa`` false: the F% value returned by
        ``extractPropertyDataFromD360Table``.
        With ``EstFa`` true: a ``pd.Series`` of ``[F%, estimated Fa]``.
    """
    colName_PKF = f'ADME PK;Mean;F %;Dose: 10.000 (mg/kg);Route of Administration: PO;Species: {Species}'
    PKF_PO = extractPropertyDataFromD360Table(row, colName_PKF + ';(Mod)', colName_PKF + ';(Num)')

    if not EstFa:
        return PKF_PO

    colName_Cl = f'Copy 1 ;ADME PK;Mean;Cl_obs(mL/min/kg);Dose: 2.000 (mg/kg);Route of Administration: IV;Species: {Species}'
    Clobs_IV = extractPropertyDataFromD360Table(row, colName_Cl + ';(Mod)', colName_Cl + ';(Num)')
    # Forward Species: without it calc_EstFa falls back to its 'Rat' default and
    # applies the rat constant to every species' clearance.
    # The result gets its own name so the boolean EstFa flag is not overwritten.
    estfa_value = calc_EstFa(PKF_PO, Clobs_IV, Species=Species)
    return pd.Series([PKF_PO, estfa_value])
    
#########################################################################################
Species = 'Rat'

# PK: oral F% and estimated fraction absorbed for the selected species.
# Keyword arguments given to DataFrame.apply are forwarded to the function.
dataTable[[f'F%_{Species}', f'EstFa_{Species}']] = dataTable.apply(
    clean_up_PK, axis=1, Species=Species, EstFa=True)

# Single-value properties: the row is the only argument, so no lambda is needed.
dataTable['permeability'] = dataTable.apply(clean_up_permeability, axis=1)
dataTable['efflux'] = dataTable.apply(clean_up_efflux, axis=1)

# hERG: measured IC50, estimated IC50, the combined value, and its source flag.
dataTable[['hERG_IC50', 'hERG_eIC50', 'hERG_mixedIC50', 'ambitiousData']] = dataTable.apply(
    clean_up_hERG, axis=1, eIC50=True)

In [5]:
# Optional export of the augmented table (currently disabled):
# dataTable.to_csv(f'Data_ADMET_4_MMP.csv', index=False)
# Show the table, now including the derived columns appended by the previous cell.
dataTable

Unnamed: 0,Compound Name,Structure,Concat;Project,Concat;External Id,ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Mod),ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Num),ADME MDCK(WT) Permeability;Mean;B to A Papp (10^-6 cm/s);(Mod),ADME MDCK(WT) Permeability;Mean;B to A Papp (10^-6 cm/s);(Num),ADME MDCK(WT) Permeability;Concat;Comments,ADME MDCK(WT) Permeability;Concat;Run Date,...,ADME Tox-manual patch hERG 34C;Mean;SD;(Mod),ADME Tox-manual patch hERG 34C;Mean;SD;(Num),F%_Rat,EstFa_Rat,permeability,efflux,hERG_IC50,hERG_eIC50,hERG_mixedIC50,ambitiousData
0,KT-0194988,C3(CCN(C(=O)c1cc(c(c(c1)N2CCC(=O)NC2=O)Cl)C)CC...,TYK2,PH-CMR-TK2-2575-0N-001,,,,,,,...,=,0.467074,10.80,0.125258,,,,11.044944,11.044944,1.0
1,KT-0194990,N1(CCC(=O)NC1=O)c2cc(cc(c2C)Cl)C(=O)N%11CCC%10...,TYK2,PH-CMR-TK2-2610-0N-001,,,,,,,...,=,1.265248,3.87,0.048645,,,,9.259910,9.259910,1.0
2,KT-0194991,CNc1cc(nn2c(cnc21)C(=O)N[C@H]3[C@H](OC)CC3)N5c...,TYK2,PH-CMR-TK2-2615-0N-001,,,,,,,...,=,4.445823,,,,,5.19295,,5.192950,0.0
3,KT-0194992,CNc1cc(nn2c(cnc21)C(=O)N[C@H]3[C@H](OC)CC3)N5c...,TYK2,PH-CMR-TK2-2616-0N-001,,,,,,,...,=,1.646552,,,,,3.22913,,3.229130,0.0
4,KT-0194993,CO[C@@H]1CC[C@H]1NC(=O)c2cnc3n2nc(cc3NC)N5CCc4...,TYK2,PH-CMR-TK2-2618-0N-001;PH-CMR-TK2-2618-0N-002,,,,,,,...,=,2.719742,7.30,0.104952,,,,16.841270,16.841270,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6145,KT-0199163,n1(nc(c2c1ccc(c2)-c3ccc(cc3)N4C=NN(C4=O)C)C(=O...,STAT-6,ST6-S-3601-001F,,,,,,,...,,,3.00,0.030241,,,,,,
6146,KT-0199164,Fc3c1c(n(nc1C(=O)Nc2cc(ccc2)C(=O)N)C)ccc3-c4cc...,STAT-6,ST6-S-3602-001N,,,,,,,...,,,2.40,0.024432,,,,,,
6147,KT-0199167,N1(N=CN(C1=O)c2ccc(cc2)-c3cc(c(cc3)C)N4CCN(C4=...,STAT-6,ST6-S-3607-001F,,,,,,,...,,,27.40,0.305652,,,,,,
6148,KT-0199168,Fc1c(cccc1-c2ccc(cc2)N3C=NN(C3=O)C)N4CCN(C4=O)...,STAT-6,ST6-S-3611-001N,,,,,,,...,,,1.08,0.011823,,,,,,


In [6]:
# Rows for which an estimated fraction absorbed could be computed.
# The column name is built from Species, exactly as when the column was created,
# so changing the species in one place does not leave this filter pointing at a
# column that no longer exists.
dataTable[dataTable[f'EstFa_{Species}'].notna()]

Unnamed: 0,Compound Name,Structure,Concat;Project,Concat;External Id,ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Mod),ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Num),ADME MDCK(WT) Permeability;Mean;B to A Papp (10^-6 cm/s);(Mod),ADME MDCK(WT) Permeability;Mean;B to A Papp (10^-6 cm/s);(Num),ADME MDCK(WT) Permeability;Concat;Comments,ADME MDCK(WT) Permeability;Concat;Run Date,...,ADME Tox-manual patch hERG 34C;Mean;SD;(Mod),ADME Tox-manual patch hERG 34C;Mean;SD;(Num),F%_Rat,EstFa_Rat,permeability,efflux,hERG_IC50,hERG_eIC50,hERG_mixedIC50,ambitiousData
0,KT-0194988,C3(CCN(C(=O)c1cc(c(c(c1)N2CCC(=O)NC2=O)Cl)C)CC...,TYK2,PH-CMR-TK2-2575-0N-001,,,,,,,...,=,0.467074,10.80,0.125258,,,,11.044944,11.044944,1.0
1,KT-0194990,N1(CCC(=O)NC1=O)c2cc(cc(c2C)Cl)C(=O)N%11CCC%10...,TYK2,PH-CMR-TK2-2610-0N-001,,,,,,,...,=,1.265248,3.87,0.048645,,,,9.259910,9.259910,1.0
4,KT-0194993,CO[C@@H]1CC[C@H]1NC(=O)c2cnc3n2nc(cc3NC)N5CCc4...,TYK2,PH-CMR-TK2-2618-0N-001;PH-CMR-TK2-2618-0N-002,,,,,,,...,=,2.719742,7.30,0.104952,,,,16.841270,16.841270,1.0
5,KT-0194995,CO[C@@H]1CC[C@H]1NC(=O)c2cnc3n2nc(cc3NC)N5CCc4...,TYK2,PH-CMR-TK2-2620-0N-001;PH-CMR-TK2-2620-0N-002,,,,,,,...,=,4.550594,20.60,0.261864,,,,9.647555,9.647555,1.0
6,KT-0194996,CNc1cc(nn2c(cnc21)C(=O)N[C@H]3[C@H](OC)CC3)N5c...,TYK2,PH-CMR-TK2-2621-0N-001,,,,,,,...,=,4.607898,4.30,0.067422,,,25.5965,,25.596500,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6144,KT-0199157,Fc3c1c(n(cc1C(=O)Nc2cc(ccc2)C(=O)N)C)ccc3-c4c(...,STAT-6,ST6-S-3517-001F,,,,,,,...,,,1.09,0.011158,,,,,,
6145,KT-0199163,n1(nc(c2c1ccc(c2)-c3ccc(cc3)N4C=NN(C4=O)C)C(=O...,STAT-6,ST6-S-3601-001F,,,,,,,...,,,3.00,0.030241,,,,,,
6146,KT-0199164,Fc3c1c(n(nc1C(=O)Nc2cc(ccc2)C(=O)N)C)ccc3-c4cc...,STAT-6,ST6-S-3602-001N,,,,,,,...,,,2.40,0.024432,,,,,,
6147,KT-0199167,N1(N=CN(C1=O)c2ccc(cc2)-c3cc(c(cc3)C)N4CCN(C4=...,STAT-6,ST6-S-3607-001F,,,,,,,...,,,27.40,0.305652,,,,,,
