In [1]:
import os
import pandas as pd

from datacat4ml.const import CURA_LHD_OR_DIR, CURA_MHD_OR_DIR, CURA_MHD_effect_OR_DIR, CURA_HHD_OR_DIR, CURA_DATA_DIR
from datacat4ml.const import CAT_DATA_DIR
from datacat4ml.Scripts.data_prep.data_curate.curate_utils.standardize_structures import remove_dup_mols

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def read_hml_hd(in_dir=CURA_LHD_OR_DIR, dir_base_name ='rmvDupMol0', f_base_name='CHEMBL233_bind_RBA_Ki_CHEMBL3707592_lhd'):
    """Load the CSV in ``in_dir/dir_base_name`` whose file name starts with ``f_base_name``.

    Parameters
    ----------
    in_dir : str
        Parent directory of the curated datasets.
    dir_base_name : str
        Sub-directory of ``in_dir`` to search (e.g. 'rmvDupMol0' or 'rmvDupMol1').
    f_base_name : str
        File-name prefix to look for.

    Returns
    -------
    pd.DataFrame
        Contents of the first matching file, in alphabetical order of file names.

    Raises
    ------
    FileNotFoundError
        If the directory does not exist or no file in it starts with ``f_base_name``.
    """
    data_dir = os.path.join(in_dir, dir_base_name)
    # os.listdir returns entries in arbitrary order; sorting makes the choice
    # reproducible when several files share the prefix.
    for f in sorted(os.listdir(data_dir)):
        if f.startswith(f_base_name):
            return pd.read_csv(os.path.join(data_dir, f))
    # Fail loudly instead of returning None, which would only surface later as
    # an AttributeError on `.shape` in the calling cell.
    raise FileNotFoundError(f'No file starting with {f_base_name!r} found in {data_dir}')

#===================================================================
# DupMols in a SINGLE dataset
#===================================================================
# Column names shared by the helper functions and cells below.
std_smiles_col = 'canonical_smiles_by_Std'
pref_name_col = 'pref_name'
cpd_chemblid_col = 'compound_chembl_id'

# Activity value columns.
pvalue_col1 = 'standard_value'
pvalue_col2 = 'pStandard_value'

# Subset of columns displayed when inspecting duplicated records.
cols_to_show = [
    'activity_id',
    'assay_chembl_id',
    cpd_chemblid_col,
    std_smiles_col,
    pref_name_col,
    'standard_type',
    'standard_relation',
    pvalue_col1,
    pvalue_col2,
]

def sorted_vc(df):
    """Return the occurrence count of each standardized SMILES in ``df``.

    The column is taken from the module-level ``std_smiles_col``. The result is
    a Series indexed by SMILES, ordered from the most to the least frequent.
    """
    vc = df[std_smiles_col].value_counts()
    # sort vc by the value counts in descending order
    return vc.sort_values(ascending=False)

#===================================================================
# Compare DupMols in rmvDupMol0 and rmvDupMol1 datasets
#===================================================================
def plot_diff_col_values(df, df_rmv1, col='pref_name', plot=False, pvalue_col='pStandard_value'):
    """Report the values of ``col`` that occur in ``df`` but not in ``df_rmv1``.

    Parameters
    ----------
    df : pd.DataFrame
        Reference dataset.
    df_rmv1 : pd.DataFrame
        Dataset to compare against.
    col : str
        Column whose distinct (non-NaN) values are compared.
    plot : bool
        If True, draw a histogram of ``pvalue_col`` in ``df`` for every missing value.
    pvalue_col : str
        Column plotted when ``plot`` is True.

    Returns
    -------
    list
        Values present in ``df[col]`` and absent from ``df_rmv1[col]``, in the
        order of ``df[col].value_counts()`` (most frequent first).
    """
    vc = df[col].value_counts()
    print(f'len(vc): {len(vc)}')
    col_values = vc.index.tolist()
    print(f'col: {col_values}')

    vc_rmv1 = df_rmv1[col].value_counts()
    print(f'len(vc_rmv1): {len(vc_rmv1)}')
    col_values_rmv1 = vc_rmv1.index.tolist()
    print(f'col_values_rmv1: {col_values_rmv1}')

    # get the difference between col_values and col_values_rmv1;
    # a plain set difference has no stable order, so filter the ordered list instead
    remaining = set(col_values_rmv1)
    diff_col_values = [v for v in col_values if v not in remaining]
    print(f'len(diff_col_values): {len(diff_col_values)}')
    print(f'diff_col_values: {diff_col_values}')

    # plot the distribution of the diff_col_values in df
    if plot:
        for n in diff_col_values:
            df.loc[df[col] == n, pvalue_col].hist(bins=30)
            plt.title(n)
            plt.show()

    return diff_col_values

# CURA_LHD_OR

In [None]:
# rmvDupMol0
# Curated LHD file names, one entry per dataset.
lhd_or_files = [
    'CHEMBL2014_bind_RBA_IC50_CHEMBL867087_lhd_b50_b50_curated.csv',
    'CHEMBL2014_bind_RBA_Ki_CHEMBL1030622_lhd_b50_b50_curated.csv',
    'CHEMBL2014_bind_RBA_Ki_CHEMBL888952_lhd_b50_s50_curated.csv',
    'CHEMBL2014_bind_RBA_Ki_CHEMBL892111_lhd_b50_b50_curated.csv',
    'CHEMBL233_agon_G-cAMP_EC50_CHEMBL4356649_lhd_b50_b50_curated.csv',
    'CHEMBL233_agon_G-GTP_EC50_CHEMBL3887793_lhd_b50_b50_curated.csv',
    'CHEMBL233_agon_G-GTP_EC50_CHEMBL4201551_lhd_b50_s50_curated.csv',
    'CHEMBL233_antag_G-GTP_IC50_CHEMBL909298_lhd_s50_s50_curated.csv',
    'CHEMBL233_bind_RBA_Ki_CHEMBL1030625_lhd_b50_b50_curated.csv',
    'CHEMBL233_bind_RBA_Ki_CHEMBL3887789_lhd_b50_b50_curated.csv',
    'CHEMBL233_bind_RBA_Ki_CHEMBL4356647_lhd_b50_b50_curated.csv',
    'CHEMBL233_bind_RBA_Ki_CHEMBL892114_lhd_b50_b50_curated.csv',
    'CHEMBL233_bind_RBA_Ki_CHEMBL909851_lhd_b50_s50_curated.csv',
    'CHEMBL236_bind_RBA_Ki_CHEMBL1030623_lhd_b50_b50_curated.csv',
    'CHEMBL236_bind_RBA_Ki_CHEMBL3887791_lhd_b50_b50_curated.csv',
    'CHEMBL236_bind_RBA_Ki_CHEMBL892112_lhd_b50_b50_curated.csv',
    'CHEMBL237_agon_G-GTP_EC50_CHEMBL3887794_lhd_b50_b50_curated.csv',
    'CHEMBL237_agon_G-GTP_EC50_CHEMBL910350_lhd_s50_s50_curated.csv',
    'CHEMBL237_bind_RBA_Ki_CHEMBL1030624_lhd_b50_b50_curated.csv',
    'CHEMBL237_bind_RBA_Ki_CHEMBL3887790_lhd_b50_b50_curated.csv',
    'CHEMBL237_bind_RBA_Ki_CHEMBL5253113_lhd_s50_s50_curated.csv',
    'CHEMBL237_bind_RBA_Ki_CHEMBL892113_lhd_b50_b50_curated.csv',
    'CHEMBL237_bind_RBA_Ki_CHEMBL910349_lhd_b50_s50_curated.csv',
]

## 1. 'CHEMBL233_bind_RBA_Ki_CHEMBL3707592_lhd_b50_s50_curated.csv'

In [None]:
# Load the dataset with read_hml_hd's default in_dir and dir_base_name.
lhd_df1 = read_hml_hd(f_base_name='CHEMBL233_bind_RBA_Ki_CHEMBL3707592_lhd')
print(f'lhd_df1.shape: {lhd_df1.shape}')

# Number of rows per compound; a count above 1 marks a compound recorded more than once.
vc = lhd_df1['compound_chembl_id'].value_counts()
print(f'vc: {vc}')

# Activity ids filtered out of lhd_df1 for compounds that occur more than once.
# The trailing comment on each line gives the compound_chembl_id followed by the
# number noted by the author (presumably its row count in the value_counts
# output — TODO confirm).
# NOTE(review): CHEMBL3664541 is annotated with 2 yet has two ids listed — verify.
wrong_activity_ids_1 = [
    16348719, 16260203,  # CHEMBL3908275    3
    16280459, 16285964,  # CHEMBL3948231    3
    16338796, 16345420,  # CHEMBL3911529    3
    16329322, 16334546,  # CHEMBL3893577    3
    16323198, 16325206,  # CHEMBL3954498    3
    16353251, 16318808,  # CHEMBL3948400    3
    16311427, 16320230,  # CHEMBL3901699    3
    16320904, 16329066,  # CHEMBL3664540    3
    16311300, 16296171,  # CHEMBL3904663    3
    16343355, 16278872,  # CHEMBL3916122    3
    17607502, 17607533,  # CHEMBL4107573    3
    16330728, 16340093,  # CHEMBL3923372    3
    16262862, 16294741,  # CHEMBL3947184    3
    16265478, 16268318,  # CHEMBL3907149    3
    16309495, 16266350,  # CHEMBL3912511    3
    16329500, 16270670,  # CHEMBL3955920    3
    16274120, 16305421,  # CHEMBL3919397    3
    16301326,            # CHEMBL3915488    2
    16272693,            # CHEMBL3951177    2
    16305554,            # CHEMBL3946351    2
    16303895,            # CHEMBL3900482    2
    16346449, 16299045,  # CHEMBL3664541    2
    16278180,            # CHEMBL3944046    2
    16276817,            # CHEMBL3892737    2
    16293587,            # CHEMBL3664525    2
    16260396,            # CHEMBL3664526    2
    16289441,            # CHEMBL3982823    2
    16356146,            # CHEMBL3959791    2
    16279037,            # CHEMBL3952829    2
    16289762,            # CHEMBL3911769    2
]

#lhd_df1_drop = lhd_df1[~lhd_df1['activity_id'].isin(wrong_activity_ids_1)].reset_index(drop=True)
#print(lhd_df1_drop.shape)
#
#lhd_df1_final = remove_dup_mols(lhd_df1_drop)
#print(lhd_df1_final.shape)

In [None]:
# Keep only the rows whose activity_id is not listed in wrong_activity_ids_1.
curated_df = lhd_df1[~lhd_df1['activity_id'].isin(wrong_activity_ids_1)]
print(f'curated_df.shape: {curated_df.shape}')

## 2. 'CHEMBL237_bind_RBA_Ki_CHEMBL4050969_lhd_b50_s50_curated.csv'

In [None]:
# After checking on the ChEMBL website, the duplicates were found to be the same compounds with almost identical activity values,
# so it is safe to merge the duplicates into one by taking the mean of the activity values.
lhd_df2 = read_hml_hd(f_base_name='CHEMBL237_bind_RBA_Ki_CHEMBL4050969_lhd')
print(f'lhd_df2.shape: {lhd_df2.shape}')

# Number of rows per compound; a count above 1 marks a duplicated compound.
vc = lhd_df2['compound_chembl_id'].value_counts()
print(f'len(vc): {len(vc)}\n'
    f'vc: {vc}')

In [None]:
# Collapse duplicated molecules with the project's remove_dup_mols helper
# (imported from curate_utils.standardize_structures; its merging rule is not shown in this notebook).
lhd_df2_final = remove_dup_mols(lhd_df2)
print(lhd_df2_final.shape)

## 3. 'CHEMBL233_bind_RBA_Ki_CHEMBL3888831_lhd_b50_s50_curated.csv'

In [None]:
# Checked against the original document: each duplicated compound is a pair of polar and non-polar diastereomers.
lhd_df3 = read_hml_hd(f_base_name='CHEMBL233_bind_RBA_Ki_CHEMBL3888831_lhd')
print(f'lhd_df3.shape: {lhd_df3.shape}')

# Number of rows per compound; the first 29 entries are displayed.
vc = lhd_df3['compound_chembl_id'].value_counts()
print(f'len(vc): {len(vc)}\n')
vc[:29]

In [None]:
# No activity ids have been selected for this dataset, so the list is empty.
# The comments record the duplicated compound_chembl_ids, each followed by the
# number noted by the author (presumably its row count in the value_counts output).
wrong_activity_ids_3 = [
    # CHEMBL3920864    2
    # CHEMBL3982887    2
    # CHEMBL3899387    2
    # CHEMBL3911970    2
    # CHEMBL3898034    2
    # CHEMBL3934183    2
    # CHEMBL3985824    2
    # CHEMBL3977691    2
    # CHEMBL3909426    2
    # CHEMBL3965425    2
    # CHEMBL3928651    2
    # CHEMBL3967435    2
    # CHEMBL3972755    2
    # CHEMBL3907138    2
    # CHEMBL3919659    2
    # CHEMBL3921976    2
    # CHEMBL3929120    2
    # CHEMBL3982651    2
    # CHEMBL3914240    2
    # CHEMBL3889835    2
    # CHEMBL3982674    2
    # CHEMBL3951213    2
    # CHEMBL3928190    2
    # CHEMBL3969636    2
    # CHEMBL3977883    2
    # CHEMBL3937167    2
    # CHEMBL3906866    2
    # CHEMBL3937815    2
]

## 4. 'CHEMBL2014_bind_RBA_Ki_CHEMBL3888830_lhd_b50_s50_curated.csv'

In [None]:
# Checked against the original document: each duplicated compound is a pair of polar and non-polar diastereomers.
lhd_df4 = read_hml_hd(f_base_name='CHEMBL2014_bind_RBA_Ki_CHEMBL3888830_lhd', dir_base_name='rmvDupMol0')
print(f'lhd_df4.shape: {lhd_df4.shape}')

# Number of rows per compound; the first 25 entries are displayed.
vc = lhd_df4['compound_chembl_id'].value_counts()
print(f'len(vc): {len(vc)}\n')
vc[:25]

In [None]:
# No activity ids have been selected for this dataset, so the list is empty.
# The comments record the duplicated compound_chembl_ids, each followed by the
# number noted by the author (presumably its row count in the value_counts output).
wrong_activity_ids_4 = [
    # CHEMBL3943372    2
    # CHEMBL3985824    2
    # CHEMBL3928651    2
    # CHEMBL3899387    2
    # CHEMBL3911970    2
    # CHEMBL3898034    2
    # CHEMBL3934183    2
    # CHEMBL3967435    2
    # CHEMBL3977691    2
    # CHEMBL3889835    2
    # CHEMBL3909426    2
    # CHEMBL3965425    2
    # CHEMBL3907138    2
    # CHEMBL3919659    2
    # CHEMBL3921976    2
    # CHEMBL3929120    2
    # CHEMBL3914240    2
    # CHEMBL3982651    2
    # CHEMBL3982674    2
    # CHEMBL3977883    2
    # CHEMBL3969636    2
    # CHEMBL3928190    2
    # CHEMBL3937167    2
    # CHEMBL3951213    2
]

## 5. 'CHEMBL236_bind_RBA_Ki_CHEMBL758175_lhd_b50_s50_curated.csv'

In [None]:
# Checked against the original document: the duplicates are the same compound with crystal water, or racemic isomers.
lhd_df5 = read_hml_hd(f_base_name='CHEMBL236_bind_RBA_Ki_CHEMBL758175_lhd', dir_base_name='rmvDupMol0')
print(f'lhd_df5.shape: {lhd_df5.shape}')

# Number of rows per compound; the first 7 entries are displayed.
vc = lhd_df5['compound_chembl_id'].value_counts()
print(f'len(vc): {len(vc)}\n')
vc[:7]

In [None]:
# No activity ids have been selected for this dataset, so the list is empty.
# The comments record the duplicated compound_chembl_ids, each followed by the
# number noted by the author (presumably its row count in the value_counts output).
wrong_activity_ids_5 = [
    # CHEMBL169346     3
    # CHEMBL2367916    3
    # CHEMBL299114     3
    # CHEMBL279968     3
    # CHEMBL56585      2
    # CHEMBL301835     2
]

## 6. 'CHEMBL237_bind_RBA_Ki_CHEMBL3887032_lhd_b50_b50_curated.csv'

In [None]:
# The original document could not be accessed, but standard_value is exactly the same for most duplicates; some differ by less than one order of magnitude.
lhd_df6 = read_hml_hd(f_base_name='CHEMBL237_bind_RBA_Ki_CHEMBL3887032_lhd', dir_base_name='rmvDupMol0')
print(f'lhd_df6.shape: {lhd_df6.shape}')

# Number of rows per compound; the first 10 entries are displayed.
vc = lhd_df6['compound_chembl_id'].value_counts()
print(f'len(vc): {len(vc)}\n')
vc[:10]

In [None]:
# No activity ids have been selected for this dataset, so the list is empty.
# The comments record the duplicated compound_chembl_ids, each followed by the
# number noted by the author (presumably its row count in the value_counts output).
wrong_activity_ids_6 = [
    # CHEMBL3695218    2
    # CHEMBL3647958    2
    # CHEMBL3695217    2
    # CHEMBL3647959    2
    # CHEMBL3698892    2
    # CHEMBL3698893    2
    # CHEMBL3647963    2
    # CHEMBL3695245    2
]

## 7. 'CHEMBL233_bind_RBA_Ki_CHEMBL3887030_lhd_b50_b50_curated.csv'

In [None]:
# The original document could not be accessed, but standard_value is exactly the same for most duplicates; some differ by less than one order of magnitude.
lhd_df7 = read_hml_hd(f_base_name='CHEMBL233_bind_RBA_Ki_CHEMBL3887030_lhd', dir_base_name='rmvDupMol0')
print(f'lhd_df7.shape: {lhd_df7.shape}')

# Number of rows per compound; the first 10 entries are displayed.
vc = lhd_df7['compound_chembl_id'].value_counts()
print(f'len(vc): {len(vc)}\n')
vc[:10]

In [None]:
# No activity ids have been selected for this dataset, so the list is empty.
# The comments record the duplicated compound_chembl_ids, each followed by the
# number noted by the author (presumably its row count in the value_counts output).
wrong_activity_ids_7 = [
    # CHEMBL3695217    2
    # CHEMBL3695218    2
    # CHEMBL3698892    2
    # CHEMBL3698893    2
    # CHEMBL3647959    2
    # CHEMBL3647958    2
    # CHEMBL3695245    2
    # CHEMBL3647963    2
]

## 8. 'CHEMBL236_bind_RBA_Ki_CHEMBL3887031_lhd_b50_b50_curated.csv'

In [None]:
# The original document could not be accessed, but standard_value is exactly the same for most duplicates; some differ by less than one order of magnitude, and some are both inactive.
# So it is safe to merge the duplicates into one.
lhd_df8 = read_hml_hd(f_base_name='CHEMBL236_bind_RBA_Ki_CHEMBL3887031_lhd', dir_base_name='rmvDupMol0')
print(f'lhd_df8.shape: {lhd_df8.shape}')

# Number of rows per compound; the first 10 entries are displayed.
vc = lhd_df8['compound_chembl_id'].value_counts()
print(f'len(vc): {len(vc)}\n')
vc[:10]

In [None]:
# No activity ids have been selected for this dataset, so the list is empty.
# The comments record the duplicated compound_chembl_ids, each followed by the
# number noted by the author (presumably its row count in the value_counts output).
wrong_activity_ids_8 = [
    # CHEMBL3695217    2
    # CHEMBL3695218    2
    # CHEMBL3698892    2
    # CHEMBL3647959    2
    # CHEMBL3698893    2
    # CHEMBL3647958    2
    # CHEMBL3647963    2
    # CHEMBL3695245    2
]

## 9. 'CHEMBL233_agon_G-GTP_EC50_CHEMBL3888720_lhd_b50_b50_curated.csv'

In [None]:
# Checked against the original document: the duplicates are compounds that were recorded twice with different, but very close, activity values.
# So it is safe to merge the duplicates into one by taking the mean of the activity values.
lhd_df9 = read_hml_hd(f_base_name='CHEMBL233_agon_G-GTP_EC50_CHEMBL3888720_lhd', dir_base_name='rmvDupMol0')
print(f'lhd_df9.shape: {lhd_df9.shape}')

# Number of rows per compound; the first 10 entries are displayed.
vc = lhd_df9['compound_chembl_id'].value_counts()
print(f'len(vc): {len(vc)}\n')
vc[:10]

In [None]:
# No activity ids have been selected for this dataset, so the list is empty.
# The comments record the duplicated compound_chembl_ids, each followed by the
# number noted by the author (presumably its row count in the value_counts output).
wrong_activity_ids_9 = [
    # CHEMBL3976681    2
    # CHEMBL4111936    2
    # CHEMBL4108710    2
    # CHEMBL3971169    2
    # CHEMBL4106563    2
    # CHEMBL3948713    2
]

## 10. 'CHEMBL237_agon_G-GTP_EC50_CHEMBL3888721_lhd_b50_s50_curated.csv'

In [None]:
# Checked against the original document: the duplicates are compounds that were recorded twice with different, but very close, activity values.
# So it is safe to merge the duplicates into one by taking the mean of the activity values.
lhd_df10 = read_hml_hd(f_base_name='CHEMBL237_agon_G-GTP_EC50_CHEMBL3888721_lhd', dir_base_name='rmvDupMol0')
print(f'lhd_df10.shape: {lhd_df10.shape}')

# Number of rows per compound; the first 10 entries are displayed.
vc = lhd_df10['compound_chembl_id'].value_counts()
print(f'len(vc): {len(vc)}\n')
vc[:10]

In [None]:
# No activity ids have been selected for this dataset, so the list is empty.
# The comments record the duplicated compound_chembl_ids, each followed by the
# number noted by the author (presumably its row count in the value_counts output).
wrong_activity_ids_10 = [
    # CHEMBL4108710    2
    # CHEMBL3948713    2
    # CHEMBL4106563    2
    # CHEMBL3971169    2
    # CHEMBL4111936    2
    # CHEMBL3976681    2
]

## 11. 'CHEMBL233_bind_RBA_Ki_CHEMBL3880325_lhd_b50_b50_curated.csv'

In [None]:
# The duplicated SMILES correspond to the same primary structure with very close activity values,
# so it is safe to merge the duplicates into one by taking the mean of the activity values.
lhd_df11 = read_hml_hd(f_base_name='CHEMBL233_bind_RBA_Ki_CHEMBL3880325_lhd', dir_base_name='rmvDupMol0')
print(f'lhd_df11.shape: {lhd_df11.shape}')

# Here duplicates are counted per standardized SMILES rather than per compound id.
vc = lhd_df11['canonical_smiles_by_Std'].value_counts()
print(f'len(vc): {len(vc)}\n')
vc[:10]

## 12. 'CHEMBL236_bind_RBA_Ki_CHEMBL3880326_lhd_b50_b50_curated.csv'

In [None]:
# The duplicated SMILES correspond to the same primary structure with very close activity values,
# so it is safe to merge the duplicates into one by taking the mean of the activity values.
lhd_df12 = read_hml_hd(f_base_name='CHEMBL236_bind_RBA_Ki_CHEMBL3880326_lhd', dir_base_name='rmvDupMol0')
print(f'lhd_df12.shape: {lhd_df12.shape}')

# Here duplicates are counted per standardized SMILES rather than per compound id.
#vc = lhd_df12['compound_chembl_id'].value_counts()
vc = lhd_df12['canonical_smiles_by_Std'].value_counts()
print(f'len(vc): {len(vc)}\n')
vc[:10]

# CURA_MHD_OR

In [3]:
# rmvDupMol0
# Curated MHD file names, one entry per dataset.
mhd_or_files = [
    'CHEMBL233_bind_RBA_IC50_mhd_b50_b50_curated.csv',
    'CHEMBL237_bind_RBA_Ki_mhd_b50_b50_curated.csv',
    'CHEMBL237_bind_RBA_IC50_mhd_b50_b50_curated.csv',
    'CHEMBL236_bind_RBA_Ki_mhd_b50_b50_curated.csv',
    'CHEMBL236_bind_RBA_IC50_mhd_b50_b50_curated.csv',
    'CHEMBL2014_bind_RBA_Ki_mhd_b50_b50_curated.csv',
    'CHEMBL2014_bind_RBA_IC50_mhd_b50_b50_curated.csv',
    'CHEMBL233_agon_G-GTP_EC50_mhd_b50_b50_curated.csv',
    'CHEMBL237_agon_G-GTP_EC50_mhd_b50_b50_curated.csv',
    'CHEMBL236_agon_G-GTP_EC50_mhd_b50_b50_curated.csv',
    'CHEMBL2014_agon_G-GTP_EC50_mhd_b50_b50_curated.csv',
    'CHEMBL233_agon_G-Ca_EC50_mhd_b50_b50_curated.csv',
    'CHEMBL237_agon_G-Ca_EC50_mhd_b50_b50_curated.csv',
    'CHEMBL236_agon_G-Ca_EC50_mhd_s50_s50_curated.csv',
    'CHEMBL2014_agon_G-Ca_EC50_mhd_s50_s50_curated.csv',
    'CHEMBL233_agon_G-cAMP_IC50_mhd_b50_s50_curated.csv',
    'CHEMBL233_agon_G-cAMP_EC50_mhd_b50_b50_curated.csv',
    'CHEMBL237_agon_G-cAMP_IC50_mhd_s50_s50_curated.csv',
    'CHEMBL237_agon_G-cAMP_EC50_mhd_b50_b50_curated.csv',
    'CHEMBL236_agon_G-cAMP_IC50_mhd_s50_s50_curated.csv',
    'CHEMBL236_agon_G-cAMP_EC50_mhd_b50_b50_curated.csv',
    'CHEMBL2014_agon_G-cAMP_IC50_mhd_b50_b50_curated.csv',
    'CHEMBL2014_agon_G-cAMP_EC50_mhd_s50_s50_curated.csv',
    'CHEMBL233_agon_B-arrest_EC50_mhd_b50_b50_curated.csv',
    'CHEMBL237_agon_B-arrest_EC50_mhd_s50_s50_curated.csv',
    'CHEMBL236_agon_B-arrest_EC50_mhd_s50_s50_curated.csv',
    'CHEMBL2014_agon_B-arrest_EC50_mhd_s50_s50_curated.csv',
    'CHEMBL233_antag_G-GTP_IC50_mhd_b50_b50_curated.csv',
    'CHEMBL233_antag_G-GTP_Ki_mhd_b50_b50_curated.csv',
    'CHEMBL237_antag_G-GTP_IC50_mhd_b50_b50_curated.csv',
    'CHEMBL237_antag_G-GTP_Ki_mhd_b50_b50_curated.csv',
    'CHEMBL236_antag_G-GTP_IC50_mhd_b50_b50_curated.csv',
    'CHEMBL236_antag_G-GTP_Ki_mhd_b50_b50_curated.csv',
    'CHEMBL2014_antag_G-GTP_IC50_mhd_b50_b50_curated.csv',
    'CHEMBL2014_antag_G-GTP_Ki_mhd_s50_s50_curated.csv',
    'CHEMBL233_antag_B-arrest_IC50_mhd_s50_s50_curated.csv',
    'CHEMBL237_antag_B-arrest_IC50_mhd_b50_s50_curated.csv',
    'CHEMBL236_antag_B-arrest_IC50_mhd_s50_s50_curated.csv',
]

## 1. 'CHEMBL233_bind_RBA_Ki_mhd_b50_b50_curated.csv'

In [None]:
def remove_dupMol(df, std_smiles_col='canonical_smiles_by_Std', pvalue_col='pStandard_value', max_std=1.0) -> pd.DataFrame:
    """Collapse repeated molecules into one row each, dropping inconsistent ones.

    * SMILES that occur in exactly one row are kept unchanged.
    * SMILES that occur in several rows are reduced to their first row, whose
      ``pvalue_col`` is replaced by the group mean, provided the standard
      deviation of ``pvalue_col`` within the group is at most ``max_std``.
    * Groups whose standard deviation exceeds ``max_std`` are removed entirely.
    * Rows whose SMILES is NaN are dropped.

    Parameters
    ----------
    df : pd.DataFrame
        Input records; not modified.
    std_smiles_col : str
        Column identifying a molecule.
    pvalue_col : str
        Numeric activity column that is averaged per molecule.
    max_std : float
        Largest within-molecule standard deviation that is still merged.

    Returns
    -------
    pd.DataFrame
        Single-occurrence rows followed by the merged rows, with a fresh index.
    """
    smiles = df[std_smiles_col]
    # Rows per SMILES. Singletons must be identified by row count: a NaN std
    # also occurs for multi-row groups with fewer than two non-NaN values, and
    # treating those as singletons would leave their duplicate rows in place.
    n_rows = smiles.map(smiles.value_counts())

    single_df = df[n_rows == 1].copy()
    print(f'single_df.shape: {single_df.shape}')

    multi_df = df[n_rows > 1].copy()
    print(f'multi_df.shape: {multi_df.shape}')

    # group stats per multi-appearance smiles
    stats = multi_df.groupby(std_smiles_col)[pvalue_col].agg(['mean', 'std'])

    # Keep groups with a low spread. A NaN std means the spread cannot be
    # computed (at most one usable value), so there is nothing to disagree on.
    low_spread = stats['std'].isna() | (stats['std'] <= max_std)
    keep_multi_idx = stats[low_spread].index
    rmv_dupMol_df = (
        multi_df[multi_df[std_smiles_col].isin(keep_multi_idx)]
        .drop_duplicates(subset=std_smiles_col, keep='first')
        .copy()
    )

    # map the mean pStandard_value to each remaining multi-smiles row
    rmv_dupMol_df[pvalue_col] = rmv_dupMol_df[std_smiles_col].map(stats['mean'])

    # combine single and multi-appearance smiles
    final_df = pd.concat([single_df, rmv_dupMol_df], axis=0).reset_index(drop=True)
    print(f'final_df.shape: {final_df.shape}')

    return final_df

In [38]:
# Load the dataset with this prefix from the rmvDupMol0 and rmvDupMol1 directories to compare them.
mhd_df1 = read_hml_hd(in_dir=CURA_MHD_OR_DIR, dir_base_name ='rmvDupMol0', f_base_name='CHEMBL233_bind_RBA_Ki')
print(f'mhd_df1.shape: {mhd_df1.shape}')

mhd_df1_rmv1 = read_hml_hd(in_dir=CURA_MHD_OR_DIR, dir_base_name ='rmvDupMol1', f_base_name='CHEMBL233_bind_RBA_Ki')
print(f'mhd_df1_rmv1.shape: {mhd_df1_rmv1.shape}')

#plot_diff_pref_names(mhd_df1, mhd_df1_rmv1)

#mhd_df1_rmv = remove_dup_mols(mhd_df1)
#print(f'mhd_df1_rmv.shape: {mhd_df1_rmv.shape}')

mhd_df1.shape: (5150, 50)
mhd_df1_rmv1.shape: (4493, 50)


In [39]:
# Work on a copy so that mhd_df1 itself is left untouched.
df = mhd_df1.copy()
# Mean and standard deviation of pStandard_value (pvalue_col2) per standardized SMILES.
df_group = df.groupby(std_smiles_col)[pvalue_col2].agg(['mean', 'std'])
df_group

Unnamed: 0_level_0,mean,std
canonical_smiles_by_Std,Unnamed: 1_level_1,Unnamed: 2_level_1
C#CCN1CC[C@]23CCCCC2[C@H]1Cc1ccc(OC(=O)CCCCCCCCC(=O)Oc2ccc4c(c2)[C@@]25CCCCC2[C@@H](C4)N(CC#C)CC5)cc13,7.853872,
C#CCN1CC[C@]23CCCC[C@H]2[C@H]1Cc1ccc(O)cc13,11.522879,
C#CCN1CN(c2ccccc2)C2(CCN(C(c3ccccc3)c3ccccc3)CC2)C1=O,5.365120,
C#CCOCC1CC23CCC1(OC)C1Oc4c(OC)ccc5c4[C@@]12CCN(C)[C@@H]3C5,7.376751,
C/C=C(\C)C(=O)O[C@H]1C[C@@H](C(=O)OC)[C@]2(C)CC[C@H]3C(=O)O[C@H](c4ccoc4)C[C@]3(C)[C@H]2C1=O,5.000000,
...,...,...
c1ccc2c(c1)nc(-c1cc[nH]n1)n2[C@H]1C[C@H]2CCC[C@@H](C1)N2[C@@H]1C[C@@H]2CCCC[C@@H](C2)C1,6.276544,
c1ccc2c(c1)nc(C[C@@H]1COCCN1)n2[C@H]1C[C@H]2CCC[C@@H](C1)N2[C@@H]1C[C@@H]2CCCC[C@@H](C2)C1,5.777544,
c1ccc2c(c1)nc(C[C@H]1COCCN1)n2[C@H]1C[C@H]2CCC[C@@H](C1)N2[C@@H]1C[C@@H]2CCCC[C@@H](C2)C1,5.718058,
c1cnc2[nH]cc(C3CCN(CC4CCCCC4)CC3)c2c1,5.301030,


In [40]:
# singletons: SMILES whose std is NaN (std is undefined with fewer than two values)
single_idx = df_group[df_group['std'].isna()].index
single_df = df[df[std_smiles_col].isin(single_idx)].copy()
print(f'single_smiles_df.shape: {single_df.shape}')

single_smiles_df.shape: (4210, 50)


In [41]:
# multi-appearance SMILES (std is not NaN)
multi_idx = df_group[df_group['std'].notna()].index
multi_df = df[df[std_smiles_col].isin(multi_idx)].copy()
print(f'multi_df.shape: {multi_df.shape}')

multi_df.shape: (940, 50)


In [None]:
# Remove multi-appearance SMILES with a high std (> 1), then keep only the
# first row of each remaining SMILES.
keep_multi_idx = df_group.loc[multi_idx].loc[lambda x: x['std'] <=1].index
rmv_dupMol_df = multi_df[multi_df[std_smiles_col].isin(keep_multi_idx)].drop_duplicates(subset=std_smiles_col, keep='first').copy()
print(f'rmv_dupMol_df.shape: {rmv_dupMol_df.shape}')

rmv_dupMol_df.shape: (283, 50)


In [43]:
# Replace pStandard_value of each remaining multi-SMILES row with the group mean;
# the other columns still hold the values of the first record of the group.
rmv_dupMol_df[pvalue_col2] = rmv_dupMol_df[std_smiles_col].map(df_group['mean'])
print(f'rmv_dupMol_df.shape: {rmv_dupMol_df.shape}')

rmv_dupMol_df.shape: (283, 50)


In [50]:
# Combine single- and multi-appearance SMILES (singletons first) and renumber the index.
final_df = pd.concat([single_df, rmv_dupMol_df], axis=0).reset_index(drop=True)
print(f'final_df.shape: {final_df.shape}')

final_df.shape: (4493, 50)


In [29]:
# compound_chembl_id values present in mhd_df1 but absent from mhd_df1_rmv1 (no plots)
diff_cpd_ids = plot_diff_col_values(mhd_df1, mhd_df1_rmv1, col='compound_chembl_id', plot=False)
diff_cpd_ids

len(vc): 65
col: ['CHEMBL240657', 'CHEMBL472669', 'CHEMBL514662', 'CHEMBL475310', 'CHEMBL473700', 'CHEMBL473701', 'CHEMBL473904', 'CHEMBL239822', 'CHEMBL471262', 'CHEMBL511649', 'CHEMBL474283', 'CHEMBL474485', 'CHEMBL474687', 'CHEMBL514489', 'CHEMBL515091', 'CHEMBL514463', 'CHEMBL473272', 'CHEMBL475094', 'CHEMBL474092', 'CHEMBL473278', 'CHEMBL473279', 'CHEMBL473280', 'CHEMBL473898', 'CHEMBL474096', 'CHEMBL515575', 'CHEMBL471882', 'CHEMBL511162', 'CHEMBL516187', 'CHEMBL511161', 'CHEMBL471880', 'CHEMBL472086', 'CHEMBL516389', 'CHEMBL475707', 'CHEMBL510201', 'CHEMBL515142', 'CHEMBL474091', 'CHEMBL475308', 'CHEMBL514606', 'CHEMBL472676', 'CHEMBL472677', 'CHEMBL474708', 'CHEMBL514965', 'CHEMBL241272', 'CHEMBL475496', 'CHEMBL516077', 'CHEMBL473884', 'CHEMBL516024', 'CHEMBL472867', 'CHEMBL473068', 'CHEMBL475096', 'CHEMBL518325', 'CHEMBL456225', 'CHEMBL510745', 'CHEMBL460636', 'CHEMBL517265', 'CHEMBL499169', 'CHEMBL515829', 'CHEMBL457244', 'CHEMBL501088', 'CHEMBL457451', 'CHEMBL464191', 'CHEMB

['CHEMBL475497', 'CHEMBL472086']

In [21]:
# apply wrong_activity_ids filter
# NOTE(review): neither `mhd_df1_rmv` nor `wrong_activity_ids` is assigned in the
# visible notebook (the `mhd_df1_rmv = remove_dup_mols(mhd_df1)` line is commented
# out), so this cell fails on a fresh kernel. It also overwrites its own input, so
# re-running it is not a clean repeat. Confirm where both names should come from.
mhd_df1_rmv = mhd_df1_rmv[~mhd_df1_rmv['activity_id'].isin(wrong_activity_ids)]
mhd_df1_rmv.shape

(4493, 50)

In [23]:
# Delete rows whose SMILES strings fail to be embedded for 3D descriptor featurization.
# NOTE(review): `failed_dict` and `mhd_df1_rmv` are not assigned in the visible
# notebook — confirm which cell or module provides them.
curated_df = mhd_df1_rmv.copy()
file_basename = 'CHEMBL233_bind_RBA_Ki'
# Only the entry of failed_dict keyed by this file's basename is relevant.
if file_basename in failed_dict:
    for smiles in failed_dict[file_basename].values():
        rows_to_drop = curated_df[curated_df['canonical_smiles'] == smiles].index
        print(f'index to drop is {rows_to_drop} for SMILES: {smiles}')
        curated_df = curated_df.drop(index=rows_to_drop)

print(f'curated_df.shape: {curated_df.shape}')

curated_df.shape: (4493, 50)


In [None]:
# SMILES occurrence counts for the three variants of the dataset.
# NOTE(review): `mhd_df1_rmv` is not assigned in the visible notebook — confirm its source.
vc1 = sorted_vc(mhd_df1)
vc1_rmv1 = sorted_vc(mhd_df1_rmv1)
vc1_rmv = sorted_vc(mhd_df1_rmv)

In [None]:
df = mhd_df1
# Rows that share both the standardized SMILES and standard_value with another row,
# ordered by SMILES so that repeated records sit next to each other.
# (alternative ordering: .sort_values(by=[std_smiles_col, pvalue_col1]))
dup_df = (
    df[df.duplicated(subset=[std_smiles_col, pvalue_col1], keep=False)]
    .sort_values(by=[std_smiles_col])
)
print(f'dup_df.shape: {dup_df.shape}')
dup_df[cols_to_show][149:200]

### heatmap

In [None]:
# NOTE(review): hard-coded absolute path to one user's home directory; this cell only
# runs on that machine. Consider building the path from a project constant
# (CAT_DATA_DIR is imported at the top — confirm it points at data_categorize).
mhd_or_df = pd.read_csv('/storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/data_prep/data_categorize/cat_mhd_or/CHEMBL233/bind/RBA/Ki/CHEMBL233_bind_RBA_Ki_mhd_df.csv')

In [None]:
# get rows where `pStandard_value` is NaN
# (the mask and the indexed frame must be the same DataFrame: mhd_or_df, loaded in the previous cell)
nan_pstd_df = mhd_or_df[mhd_or_df['pStandard_value'].isna()]
print(f'nan_pstd_df.shape: {nan_pstd_df.shape}')

NameError: name 'mhd_or_df' is not defined

In [14]:
# SMILES occurrence counts, most frequent first.
# NOTE(review): `mhd_df_rmv0` is not assigned in the visible notebook — confirm its source.
vc = sorted_vc(mhd_df_rmv0)
vc

canonical_smiles_by_Std
C[C@@H](NC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)NCC(=O)N(C)[C@@H](Cc1ccccc1)C(=O)NCCO                                                  41
CN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](O)C=C[C@H]3[C@H]1C5                                                                        35
O=C1CC[C@@]2(O)[C@H]3Cc4ccc(O)c5c4[C@@]2(CCN3CC2CC2)[C@H]1O5                                                                    33
C=CCN1CC[C@]23c4c5ccc(O)c4O[C@H]2C(=O)CC[C@@]3(O)[C@H]1C5                                                                       25
Oc1ccc2c3c1O[C@H]1c4[nH]c5c(c4C[C@@]4(O)[C@@H](C2)N(CC2CC2)CC[C@]314)C[C@@]1(O)[C@H]2Cc3ccc(O)c4c3[C@@]1(CCN2CC1CC1)[C@H]5O4    15
                                                                                                                                ..
Cc1cc(F)ccc1-c1cnc2c(c1)C[C@@]1(OCCCc3ccccc3)[C@H]3Cc4ccc(O)c5c4[C@@]1(CCN3CC1CC1)[C@H]2O5                                       1
CN(C)C1(c2ccccc2)CCC2(CCN(C(=O)CC3CCC3)C2)CC1              

### pStandard_value: mean and std

In [None]:
# Mean and standard deviation of pStandard_value per standardized SMILES.
# NOTE(review): the last line displays every std value as one long list;
# a summary such as .describe() may be easier to read.
df_group = mhd_df1.groupby(std_smiles_col)[pvalue_col2].agg(['mean', 'std'])
df_group['std'].tolist()


# CURA_MHD_effect_OR_DIR

In [None]:
# Load the dataset with this prefix from the rmvDupMol0 and rmvDupMol1 directories to compare them.
mhd_effect_df1 = read_hml_hd(in_dir=CURA_MHD_effect_OR_DIR, dir_base_name ='rmvDupMol0', f_base_name='CHEMBL233_bind')
print(f'mhd_effect_df1.shape: {mhd_effect_df1.shape}')

mhd_effect_df1_rmv1 = read_hml_hd(in_dir=CURA_MHD_effect_OR_DIR, dir_base_name ='rmvDupMol1', f_base_name='CHEMBL233_bind')
print(f'mhd_effect_df1_rmv1.shape: {mhd_effect_df1_rmv1.shape}')

# `plot_diff_pref_names` is not defined in the visible notebook; the helper that
# compares pref_name values is plot_diff_col_values with col='pref_name'.
plot_diff_col_values(mhd_effect_df1, mhd_effect_df1_rmv1, col='pref_name')

# CURA_HHD_OR

In [None]:
# Load the dataset with this prefix from the rmvDupMol0 and rmvDupMol1 directories to compare them.
hhd_df1 = read_hml_hd(in_dir=CURA_HHD_OR_DIR, dir_base_name ='rmvDupMol0', f_base_name='CHEMBL233_Ki')
print(f'hhd_df1.shape: {hhd_df1.shape}')

hhd_df1_rmv1 = read_hml_hd(in_dir=CURA_HHD_OR_DIR, dir_base_name ='rmvDupMol1', f_base_name='CHEMBL233_Ki')
print(f'hhd_df1_rmv1.shape: {hhd_df1_rmv1.shape}')

# `plot_diff_pref_names` is not defined in the visible notebook; the helper that
# compares pref_name values is plot_diff_col_values with col='pref_name'.
plot_diff_col_values(hhd_df1, hhd_df1_rmv1, col='pref_name')