In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import chardet
import numpy as np
import pandas as pd

In [2]:
def determine_encoding(dataFile):
    """Guess the character encoding of a file using chardet.

    The whole file is read as raw bytes and handed to ``chardet.detect``.

    Args:
        dataFile: Path of the file to inspect.

    Returns:
        The value stored under the ``'encoding'`` key of chardet's result.
    """
    # Binary mode: the bytes must reach chardet undecoded.
    with open(dataFile, 'rb') as handle:
        raw_bytes = handle.read()

    detection = chardet.detect(raw_bytes)
    return detection['encoding']


## clean up projects list
def CleanUpProjects(proj_text, main_projs=None):
    """Normalise a ';'-separated project string into a sorted, unique list.

    Every project name in ``proj_text`` is mapped as follows:
      * a name containing "MGD" is collapsed to "MGD";
      * a name that is not listed in ``main_projs`` becomes "Others".

    All names are processed (the previous version stopped after the first
    one because of a stray ``break``), duplicates are removed and the result
    is sorted.

    Args:
        proj_text: Project names joined by ';', e.g. "STAT-6;TYK2".
        main_projs: Optional collection of project names to keep as-is.
            Defaults to the notebook's main project list. "MGD" is not in
            the default list, so MGD entries end up as "Others" unless the
            caller supplies a list that contains "MGD".

    Returns:
        str: The mapped names, de-duplicated, sorted and joined by ';'.
        A string with no usable names yields "Others".
    """
    if main_projs is None:
        main_projs = ['IRAK4', 'STAT-6', 'IRF5', 'TYK2', 'CDK2', 'MK2']    # 'IGG', , 'FcRn', 'MDM2', "MGD"

    mapped = set()
    for proj in proj_text.split(";"):
        proj = proj.strip()
        if not proj:
            # Ignore empty tokens produced by leading/trailing/double ';'.
            continue
        if "MGD" in proj:
            proj = "MGD"
        if proj not in main_projs:
            proj = "Others"
        mapped.add(proj)

    if not mapped:
        # Keeps the historical result for an empty string.
        return "Others"
    return ';'.join(sorted(mapped))

In [3]:
# Input CSV (file name suggests a D360 export); path is relative to the
# notebook's working directory.
dataFile = './D360_dataset_q_id3539_280824_0045.csv'

# Detect the file's encoding first so pandas decodes it correctly.
encoding = determine_encoding(dataFile)
dataTable = pd.read_csv(dataFile, encoding=encoding)

print(dataTable.shape)
dataTable.head(3)

(25714, 40)


Unnamed: 0,Compound Name,Structure,Smiles,Concat;Project,Concat;External Id,Created On,Log D,ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Mod),ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Num),ADME MDCK(WT) Permeability;Mean;B to A Papp (10^-6 cm/s);(Mod),...,ADME Tox-manual patch hERG 34C;Mean;Average % of hERG inhibition;(Num),ADME Tox-manual patch hERG 34C;Concat;Comments,ADME Tox-manual patch hERG 34C;Mean;Concentration (uM);(Mod),ADME Tox-manual patch hERG 34C;Mean;Concentration (uM);(Num),ADME Tox-manual patch hERG 34C;Concat;Date run,ADME Tox-manual patch hERG 34C;GMean;m-patch hERG IC50 [uM];(Mod),ADME Tox-manual patch hERG 34C;GMean;m-patch hERG IC50 [uM];(Num),ADME Tox-manual patch hERG 34C;Mean;SD;(Mod),ADME Tox-manual patch hERG 34C;Mean;SD;(Num),Marked
0,KT-0026812,C[C@@H]%10CNc2c(sc3ccc1nc(ccc1c32)-c4ccc(cc4)N...,C[C@@H]1CNC2=C(SC3=CC=C4N=C(C5=CC=C(N6CCC(CN7C...,MK2,MK2-361-001F,12-Aug-2021,5.59691,,,,...,,17.94% inhibition @ 10 ?M,,,11/17/2022,>,10.0,,,UNMARKED
1,KT-0026813,CN2C(=O)N(C1CCC(=O)NC1=O)c3cccc(c32)N%10CCC(CN...,CN1C(=O)N(C2CCC(=O)NC2=O)C2=CC=CC(N3CCC(CN4CCC...,MK2,MK2-367-001N,12-Aug-2021,2.20623,,,,...,,,,,,,,,,UNMARKED
2,KT-0026814,Fc1c(cccc1)-c5nc4c(c2c(sc3c2NC[C@H](NC3=O)C)cc...,N1C(=O)C2=C(C3=C(C=CC4=NC(C5=CC=CC=C5F)=CC=C43...,MK2,MK2-368-001H,12-Aug-2021,3.83864,,,,...,,,,,,,,,,UNMARKED


In [4]:
def extractPropertyDataFromD360Table(row, colName_mod, colName_num):
    """Return a property's numeric value only when its modifier is '='.

    Args:
        row: A pandas Series (one table row) indexed by column name.
        colName_mod: Name of the modifier column (holds e.g. '=' or '>').
        colName_num: Name of the matching numeric-value column.

    Returns:
        ``row[colName_num]`` when both columns exist, both are non-null and
        the modifier equals '='; otherwise ``np.nan``.
    """
    # Either column missing from the row -> nothing to extract.
    if colName_mod not in row or colName_num not in row:
        return np.nan

    is_present = row.notna()
    if not (is_present[colName_mod] and is_present[colName_num]):
        return np.nan

    # Qualified values (any modifier other than '=') are discarded.
    return row[colName_num] if row[colName_mod] == '=' else np.nan

def calc_mean(value_list):
    """Average the usable numeric entries of a list.

    Entries that are ``None``, blank strings, or NaN are skipped. Entries
    that cannot be converted with ``float()`` are reported with a printed
    message and skipped.

    Args:
        value_list: Iterable of numbers, numeric strings, ``None`` or NaN.

    Returns:
        The mean of the usable values, or ``np.nan`` when there are none.
    """
    value_list_clean = []
    for v in value_list:
        # Blank markers carry no information; skip them silently.
        if v is None or (isinstance(v, str) and not v.strip()):
            continue
        try:
            v_num = float(v)
        except Exception as e:
            print(f'Error, cannot numericalize value {v}', e)
            continue
        # A membership test against np.nan only matches that exact object,
        # so NaN is checked by value here to catch every NaN float.
        if np.isnan(v_num):
            continue
        value_list_clean.append(v_num)

    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    if not value_list_clean:
        return np.nan
    return np.mean(value_list_clean)

def calc_eIC50_hERG(comments_str):
    """Estimate a hERG IC50 from a single-concentration inhibition comment.

    Parses text shaped like '21.38% inhibition @ 10 ?M' and returns
    ``conc * (100 - inhb) / inhb``.

    Args:
        comments_str: One comment entry of the form
            '<percent>% ... @ <concentration> <prefix>M'.

    Returns:
        The estimated IC50 as a float, or ``None`` when the comment cannot
        be parsed. Unparseable comments other than ' ' and '/' are reported
        with a printed message.
    """
    # e.g., comments_str = '21.38% inhibition @ 10 ?M'
    try:
        [str_inhb, str_conc] = comments_str.split('@')
        inhb = float(str_inhb.split('%')[0])
        # Keep the inhibition inside (0, 100) so the ratio stays finite.
        # Zero is floored together with negative readings; previously it fell
        # through and caused a ZeroDivisionError (result None + error print).
        inhb = 0.1 if inhb <= 0 else (99.99 if inhb > 100 else inhb)
        # Take the text before 'M' and drop its last character (the unit
        # prefix). NOTE(review): the prefix itself is not interpreted, so the
        # number is used as-is whatever the prefix is -- confirm that all
        # comments are reported in the same unit.
        conc = float(str_conc.split('M')[0][:-1])
        eIC50 = conc*(100-inhb)/inhb
    except Exception:
        eIC50 = None
        if comments_str not in [' ', '/']:
            print(f'Error, cannot calc hERG eIC50 from comment data. {comments_str}')
    return eIC50

def calc_EstFa(PKF_PO, Clobs_IV, Species='Rat'):
    """Compute ``(PKF_PO / 100) / (1 - Clobs_IV / k)`` for a species.

    ``k`` is a per-species constant looked up from an internal table.

    Args:
        PKF_PO: Oral bioavailability F, in percent.
        Clobs_IV: Observed IV clearance.
        Species: One of 'Rat', 'Mouse', 'Dog', 'Monkey'.

    Returns:
        The computed value, or ``np.nan`` when anything fails (unknown
        species, division by zero, non-numeric input, ...).
    """
    # Presumably hepatic blood flow in mL/min/kg -- TODO confirm.
    # NOTE(review): commonly quoted values are higher for mouse than for rat;
    # the Rat/Mouse entries here may be swapped -- verify against the source.
    species_constant = {'Rat': 90, 'Mouse': 70, 'Dog': 30, 'Monkey': 44}
    try:
        fraction_bioavailable = PKF_PO / 100
        clearance_ratio = Clobs_IV / species_constant[Species]
        return fraction_bioavailable / (1 - clearance_ratio)
    except Exception:
        return np.nan

#########################################################################################
def clean_up_permeability(row):
    """Extract the MDCK(WT) A-to-B Papp value from a table row.

    Args:
        row: One row of the data table (pandas Series).

    Returns:
        Whatever ``extractPropertyDataFromD360Table`` returns for the
        'A to B Papp' modifier/number column pair.
    """
    a2b_stem = 'ADME MDCK(WT) Permeability' + ';Mean;' + 'A to B Papp (10^-6 cm/s);'
    return extractPropertyDataFromD360Table(row, a2b_stem + '(Mod)', a2b_stem + '(Num)')

def clean_up_efflux(row):
    """Extract the MDCK (MDR1) efflux ratio from a table row.

    Args:
        row: One row of the data table (pandas Series).

    Returns:
        Whatever ``extractPropertyDataFromD360Table`` returns for the
        'Efflux Ratio' modifier/number column pair.
    """
    efflux_stem = 'ADME MDCK (MDR1) efflux' + ';Mean;' + 'Efflux Ratio;'
    return extractPropertyDataFromD360Table(row, efflux_stem + '(Mod)', efflux_stem + '(Num)')

def clean_up_hERG(row, eIC50=False):
    """Extract hERG IC50 data from a table row.

    Args:
        row: One row of the data table (pandas Series).
        eIC50: When True, also estimate an IC50 from the ';'-separated
            comments column and combine both into a "mixed" value.

    Returns:
        * ``eIC50`` False: the measured IC50 (or NaN).
        * ``eIC50`` True: a 4-element ``pd.Series``
          ``[measured IC50, estimated IC50, mixed IC50, flag]`` where the
          flag is 0 when the mixed value is the measured one, 1 when it is
          the estimate, and NaN when neither is available.
    """
    assay = 'ADME Tox-manual patch hERG 34C'

    ## expt IC50
    ic50_stem = assay + ';GMean;' + 'm-patch hERG IC50 [uM];'
    hERG_IC50 = extractPropertyDataFromD360Table(row, ic50_stem + '(Mod)', ic50_stem + '(Num)')
    if not eIC50:
        return hERG_IC50

    ## estimated IC50 by comments column
    comments_col = assay + ';Concat;' + 'Comments'
    estimates = []
    if comments_col in row and row.notna()[comments_col]:
        estimates = [calc_eIC50_hERG(entry) for entry in row[comments_col].split(';')]
    hERG_eIC50 = calc_mean(estimates)

    ## determine mixedIC50: the measured value wins over the estimate
    if not np.isnan(hERG_IC50):
        mixed, source_flag = hERG_IC50, 0
    elif not np.isnan(hERG_eIC50):
        mixed, source_flag = hERG_eIC50, 1
    else:
        mixed, source_flag = np.nan, np.nan
    return pd.Series([hERG_IC50, hERG_eIC50, mixed, source_flag])

def clean_up_PK(row, Species='Rat', EstFa=False):
    """Extract oral bioavailability (and optionally EstFa) from a table row.

    Args:
        row: One row of the data table (pandas Series).
        Species: Species name embedded in the PK column names; it is also
            forwarded to ``calc_EstFa`` so the matching per-species constant
            is used.
        EstFa: When True, also read the IV clearance and compute EstFa.

    Returns:
        * ``EstFa`` False: the PO F% value (or NaN).
        * ``EstFa`` True: ``pd.Series([F%, EstFa])``.
    """
    colName_PKF_mod = f'ADME PK;Mean;F %;Dose: 10.000 (mg/kg);Route of Administration: PO;Species: {Species};(Mod)'
    colName_PKF_num = f'ADME PK;Mean;F %;Dose: 10.000 (mg/kg);Route of Administration: PO;Species: {Species};(Num)'
    PKF_PO = extractPropertyDataFromD360Table(row, colName_PKF_mod, colName_PKF_num)

    if not EstFa:
        return PKF_PO

    colName_Cl_mod = f'Copy 1 ;ADME PK;Mean;Cl_obs(mL/min/kg);Dose: 2.000 (mg/kg);Route of Administration: IV;Species: {Species};(Mod)'
    colName_Cl_num = f'Copy 1 ;ADME PK;Mean;Cl_obs(mL/min/kg);Dose: 2.000 (mg/kg);Route of Administration: IV;Species: {Species};(Num)'
    Clobs_IV = extractPropertyDataFromD360Table(row, colName_Cl_mod, colName_Cl_num)

    # Forward Species: previously it was omitted, so every species was
    # evaluated with calc_EstFa's default ('Rat'). A separate local name is
    # used so the boolean EstFa flag is not overwritten by the result.
    estfa_value = calc_EstFa(PKF_PO, Clobs_IV, Species=Species)
    return pd.Series([PKF_PO, estfa_value])
    
#########################################################################################
# Species whose PK columns are extracted; also used in the new column names.
Species = 'Rat'

# Plain copy of the 'Log D' column under the name used downstream.
dataTable['logD_CDD'] = dataTable['Log D'].apply(lambda x: x)

# Row-wise extraction; DataFrame.apply forwards the extra keyword arguments
# to the function it calls for each row.
dataTable[[f'F%_{Species}', f'EstFa_{Species}']] = dataTable.apply(
    clean_up_PK, axis=1, Species=Species, EstFa=True
)
dataTable['permeability'] = dataTable.apply(clean_up_permeability, axis=1)
dataTable['efflux'] = dataTable.apply(clean_up_efflux, axis=1)
dataTable[['hERG_IC50', 'hERG_eIC50', 'hERG_mixedIC50', 'ambitiousData']] = dataTable.apply(
    clean_up_hERG, axis=1, eIC50=True
)

print(dataTable.shape)
dataTable.head(3)

(25714, 49)


Unnamed: 0,Compound Name,Structure,Smiles,Concat;Project,Concat;External Id,Created On,Log D,ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Mod),ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Num),ADME MDCK(WT) Permeability;Mean;B to A Papp (10^-6 cm/s);(Mod),...,Marked,logD_CDD,F%_Rat,EstFa_Rat,permeability,efflux,hERG_IC50,hERG_eIC50,hERG_mixedIC50,ambitiousData
0,KT-0026812,C[C@@H]%10CNc2c(sc3ccc1nc(ccc1c32)-c4ccc(cc4)N...,C[C@@H]1CNC2=C(SC3=CC=C4N=C(C5=CC=C(N6CCC(CN7C...,MK2,MK2-361-001F,12-Aug-2021,5.59691,,,,...,UNMARKED,5.59691,13.8,-0.17493,,,,45.74136,45.74136,1.0
1,KT-0026813,CN2C(=O)N(C1CCC(=O)NC1=O)c3cccc(c32)N%10CCC(CN...,CN1C(=O)N(C2CCC(=O)NC2=O)C2=CC=CC(N3CCC(CN4CCC...,MK2,MK2-367-001N,12-Aug-2021,2.20623,,,,...,UNMARKED,2.20623,0.0,0.0,,,,,,
2,KT-0026814,Fc1c(cccc1)-c5nc4c(c2c(sc3c2NC[C@H](NC3=O)C)cc...,N1C(=O)C2=C(C3=C(C=CC4=NC(C5=CC=CC=C5F)=CC=C43...,MK2,MK2-368-001H,12-Aug-2021,3.83864,,,,...,UNMARKED,3.83864,,,,,,,,


In [5]:
# Persist the table with the derived columns; the row index is not written.
dataTable.to_csv('Data_ADMET_4_MMP_2024Aug27.csv', index=False)

In [6]:
# Sanity check: list rows that have no project assigned.
dataTable.loc[dataTable['Concat;Project'].isna()]

Unnamed: 0,Compound Name,Structure,Smiles,Concat;Project,Concat;External Id,Created On,Log D,ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Mod),ADME MDCK(WT) Permeability;Mean;A to B Papp (10^-6 cm/s);(Num),ADME MDCK(WT) Permeability;Mean;B to A Papp (10^-6 cm/s);(Mod),...,Marked,logD_CDD,F%_Rat,EstFa_Rat,permeability,efflux,hERG_IC50,hERG_eIC50,hERG_mixedIC50,ambitiousData


In [7]:
# Frequency of each distinct 'Concat;Project' string; multi-project entries
# (names joined by ';') are counted as their own categories.
dataTable['Concat;Project'].value_counts()

Concat;Project
STAT-6                                      5376
IRAK4                                       4178
IRF5                                        2539
TYK2                                        2058
SMARCA2                                     1996
                                            ... 
ASGPR;FcRn;ACVR2                               1
IGG;ASGPR;FcRn;ACVR2                           1
ASGPR;FcRn                                     1
IGG;FcRn                                       1
STAT-6;TYK2;KLHDC2_MGD_Library;FEM1B MGD       1
Name: count, Length: 110, dtype: int64