In [1]:
import copy
import pickle
import numpy as np
import pandas as pd
from rdkit import Chem

In [2]:
## location of the pickled dictionary that holds the DB tables
fileName_out = "./MMPs_results"
DB_dict = fileName_out + "/DB_tables/DB_table_all.dict"

## NOTE(review): pickle.load can execute arbitrary code -- only load files from a trusted source
with open(DB_dict, mode="rb") as pickle_handle:
    dataDict_tables = pickle.load(pickle_handle)

In [3]:
def findPropValue(dbTable_propValue, cid, prop_id, average=False):
    """Look up the value(s) of one property for one compound.

    Parameters
    ----------
    dbTable_propValue : pandas.DataFrame
        Table with the columns "compound_id", "property_name_id" and "value".
    cid
        Compound id, compared against the "compound_id" column.
    prop_id
        Property id, compared against the "property_name_id" column.
    average : bool, default False
        If True a single scalar is returned; when several rows match, a
        warning is printed and their mean is returned.
        If False the matching values are returned as one string formatted by
        ``np.array2string`` (comma separated).

    Returns
    -------
    ``np.nan`` when no row matches, otherwise a scalar (``average=True``) or
    a string (``average=False``).
    """
    cond_1 = (dbTable_propValue["compound_id"]==cid)
    cond_2 = (dbTable_propValue["property_name_id"]==prop_id)

    match_data = dbTable_propValue[cond_1 & cond_2]["value"].values

    ## no matching row at all
    if match_data.shape[0] <= 0:
        return np.nan

    if not average:
        return np.array2string(match_data, separator=',')

    if match_data.shape[0] > 1:
        print(f"\t\tWarning! Compound {cid} has multiple <{prop_id}> values")
        ## was `np.meam`, which raised AttributeError whenever this branch ran
        return np.mean(match_data)
    return match_data[0]

In [4]:
dataTable_pair = dataDict_tables["pair"]
dbTable_cmpd = dataDict_tables["compound"]
dbTable_propName = dataDict_tables["property_name"]
dbTable_propValue = dataDict_tables["compound_property"]


def _keyed_by_id(table):
    """Return `table` indexed by its "id" column when it has one, otherwise unchanged."""
    if "id" in table.columns:
        return table.set_index("id", drop=False)
    return table


## The ids stored in the pair / compound_property tables are looked up through
## the "id" column of the compound / property_name tables, instead of assuming
## that the index labels of those frames happen to be equal to the ids.
cmpd_by_id = _keyed_by_id(dbTable_cmpd)
propName_by_id = _keyed_by_id(dbTable_propName)

## compound id -> (public_id, input_smiles, {prop_name: value});
## filled lazily so the property table is scanned once per compound, not once per pair
cmpd_cache = {}


def _compound_record(cid):
    """Return the cached (public_id, input_smiles, property dict) of one compound."""
    if cid not in cmpd_cache:
        prop_values = {}
        for prop_id, prop_name in propName_by_id['name'].items():
            prop_values[prop_name] = findPropValue(dbTable_propValue, cid, prop_id, average=True)
        cmpd_cache[cid] = (cmpd_by_id['public_id'][cid], cmpd_by_id['input_smiles'][cid], prop_values)
    return cmpd_cache[cid]


dataDict = {}
## walk the needed columns in parallel (one pass, no per-row chained indexing)
pair_rows = zip(dataTable_pair['compound1_id'].to_numpy(),
                dataTable_pair['compound2_id'].to_numpy(),
                dataTable_pair['constant_id'].to_numpy(),
                dataTable_pair['rule_environment_id'].to_numpy())
for cid_1, cid_2, const_id, rule_env_id in pair_rows:
    pair_info = f"{cid_1}==>{cid_2}"

    ## initialize the sub-dict the first time this directed pair is seen
    if pair_info not in dataDict:
        KT_id_1, smiles_1, props_1 = _compound_record(cid_1)
        KT_id_2, smiles_2, props_2 = _compound_record(cid_2)

        ## key order matters: it becomes the column order of the final table
        pair_entry = {
            "pair_info": pair_info,
            ## direction-independent id shared by "a==>b" and "b==>a"
            "pair_id": f"({min([cid_1, cid_2])},{max([cid_1, cid_2])})",
            "compound1_id": cid_1,
            "compound2_id": cid_2,
            "pair_detail": {},
            "KT_id_1": KT_id_1,
            "KT_id_2": KT_id_2,
            "Smiles_1": smiles_1,
            "Smiles_2": smiles_2,
        }
        for prop_name in props_1:
            pair_entry[f"{prop_name}_1"] = props_1[prop_name]
            pair_entry[f"{prop_name}_2"] = props_2[prop_name]
        dataDict[pair_info] = pair_entry

    ## pair details: constant part -> list of distinct rule environment ids
    rule_env_ids = dataDict[pair_info]["pair_detail"].setdefault(const_id, [])
    if rule_env_id not in rule_env_ids:
        rule_env_ids.append(rule_env_id)

print(f"Current num_pairs: {len(dataDict)}")

Current num_pairs: 146198


In [5]:
#####################################################################
def _swap_side_key(key):
    """Map a per-compound key to the matching key of the other compound.

    Only the side marker is swapped: "compound1_id" <-> "compound2_id" and a
    trailing "_1" <-> "_2". Digits elsewhere in the key (e.g. inside a property
    name) are left alone. Returns None for keys not tied to one side.
    """
    if key == "compound1_id":
        return "compound2_id"
    if key == "compound2_id":
        return "compound1_id"
    if key.endswith("_1"):
        return key[:-2] + "_2"
    if key.endswith("_2"):
        return key[:-2] + "_1"
    return None


## snapshot of the keys: dataDict grows while we loop
list_pair_info_4loop = list(dataDict.keys())
## pairs not handled yet; a set keeps membership tests and removals O(1)
set_pair_info_4check = set(list_pair_info_4loop)
for pair_info in list_pair_info_4loop:
    if pair_info not in set_pair_info_4check:
        ## this pair was already handled as the reverse of an earlier pair
        continue
    set_pair_info_4check.discard(pair_info)

    ## reverse pair
    cid_1, cid_2 = pair_info.split("==>")
    pair_info_revs = f"{cid_2}==>{cid_1}"
    if pair_info_revs in set_pair_info_4check:
        ## the reverse direction already exists in the dict
        set_pair_info_4check.discard(pair_info_revs)
        continue
    if pair_info_revs in dataDict:
        ## reverse equals the pair itself (compound paired with itself): never overwrite
        continue

    ## the reversed pair is missing: build it from the forward entry
    forward_entry = dataDict[pair_info]
    reversed_entry = {
        "pair_info": pair_info_revs,
        "pair_id": forward_entry["pair_id"],
        ## same constant ids, but no rule environment ids are recorded for the
        ## reversed direction
        "pair_detail": {key: [] for key in forward_entry["pair_detail"]},
    }
    for tmp_key in forward_entry:
        other_side_key = _swap_side_key(tmp_key)
        if other_side_key is not None:
            reversed_entry[tmp_key] = forward_entry[other_side_key]
    dataDict[pair_info_revs] = reversed_entry

print(f"Modified num_pairs: {len(dataDict)}")
print(len(list_pair_info_4loop), len(set_pair_info_4check))

Modified num_pairs: 268948
146198 0


In [7]:
##
## one row per directed pair: the dict keys become rows after the transpose,
## Num_Consts counts the constant parts recorded in pair_detail
dataTable = (
    pd.DataFrame.from_dict(dataDict)
    .transpose()
    .assign(Num_Consts=lambda frame: frame['pair_detail'].apply(len))
    .sort_values(by=["pair_id", "pair_info"], ascending=True)
    .reset_index(drop=True)
)
print(dataTable.shape)
dataTable.head(3)

(268948, 24)


Unnamed: 0,pair_info,pair_id,compound1_id,compound2_id,pair_detail,KT_id_1,KT_id_2,Smiles_1,Smiles_2,F%_Rat_1,...,permeability_2,efflux_1,efflux_2,hERG_IC50_1,hERG_IC50_2,hERG_mixedIC50_1,hERG_mixedIC50_2,logD_CDD_1,logD_CDD_2,Num_Consts
0,1==>2,"(1,2)",1,2,"{1: [1, 2, 3, 4, 5, 6], 17071: [1048406, 10484...",KT-0013567,KT-0013672,C1=C(C2=CC=CC=C2)N(C2=CC=C(NC(C)=O)C=C2)C(=O)/...,C1=C(C2=CC=CC=C2)N(C2=CC=C(NC(C)=O)C=C2)C(=O)/...,,...,,,,,,,,4.74391,4.26768,2
1,2==>1,"(1,2)",2,1,"{1: [], 17071: []}",KT-0013672,KT-0013567,C1=C(C2=CC=CC=C2)N(C2=CC=C(NC(C)=O)C=C2)C(=O)/...,C1=C(C2=CC=CC=C2)N(C2=CC=C(NC(C)=O)C=C2)C(=O)/...,,...,,,,,,,,4.26768,4.74391,2
2,10==>1592,"(10,1592)",10,1592,"{942: [62186, 62187, 62188, 62189, 62190, 62191]}",KT-0035007,KT-0035717,C1=CC(Br)=CC=C1[C@@H]1C[C@H]1C(=O)N1CC(C2=CC=C...,O=C(C1=CNN=C1I)N1CC(C2=CC=C3C(=C2)NC(C(=O)N(C)...,,...,,,,,,,,4.60718,2.16124,1


In [None]:
## spot check: index labels of the property rows of one compound / property
cid = 10
prop_id = 6
is_target_row = (
    (dbTable_propValue["compound_id"] == cid)
    & (dbTable_propValue["property_name_id"] == prop_id)
)
dbTable_propValue.index[is_target_row]

In [None]:
## display the raw rule-environment statistics table as loaded from the pickle
dataDict_tables["rule_environment_statistics"]

In [None]:
## independent copy, so later edits do not touch the table kept in dataDict_tables
dataTable_rule_env_stats = dataDict_tables["rule_environment_statistics"].copy(deep=True)
dataTable_rule_env_stats

In [None]:
## drop the statistics columns that are not used, then attach the property name
## of each row via property_name_id
dataTable_rule_env_stats = (
    copy.deepcopy(dataDict_tables["rule_environment_statistics"])
    .drop(columns=['id', 'kurtosis','skewness', 'paired_t', 'p_value', 'q1', 'median', 'q3'])
    .merge(dataDict_tables["property_name"], left_on='property_name_id', right_on='id')
)
dataTable_rule_env_stats

In [None]:
dataTable_rule_env_fp = copy.deepcopy(dataDict_tables["environment_fingerprint"])
## rename() returns a new frame; the result has to be assigned back, otherwise
## the table displayed below still carries the original column names
dataTable_rule_env_fp = dataTable_rule_env_fp.rename(
    columns={'id':'environment_fingerprint_id',
             'pseudosmiles':'rule_env_fp_pseudosmiles',
             'smarts':'rule_env_fp_smarts',
             'parent_smarts':'rule_env_fp_parent_smarts'})
dataTable_rule_env_fp

#### 2. clean up data

In [None]:
## -------------------- rule table with its from/to SMILES --------------------
dataTable_rules = (
    copy.deepcopy(dataDict_tables["rule"])
    .rename(columns={'id':'rule_id'})
    ## from
    .merge(dataDict_tables["rule_smiles"], left_on=['from_smiles_id'], right_on=['id'])
    .drop(columns=['id', 'num_heavies'])
    .rename(columns={'smiles':'from_smiles'})
    ## to
    .merge(dataDict_tables["rule_smiles"], left_on=['to_smiles_id'], right_on=['id'])
    .drop(columns=['id', 'num_heavies'])
    .rename(columns={'smiles':'to_smiles'})
)

## -------------------- rule env table + rule table --------------------
dataTable_rule_env = (
    copy.deepcopy(dataDict_tables["rule_environment"])
    .rename(columns={'id':'rule_environment_id', 'radius':'rule_env_radius', 'num_pairs':'rule_env_num_pairs'})
    .merge(dataTable_rules, on='rule_id')
)

## -------------------- rule env table + rule_env_stats --------------------
## trimmed statistics table with the property name attached
dataTable_rule_env_stats = (
    copy.deepcopy(dataDict_tables["rule_environment_statistics"])
    .drop(columns=['id', 'kurtosis','skewness', 'paired_t', 'p_value', 'q1', 'median', 'q3'])
    .merge(dataDict_tables["property_name"], left_on='property_name_id', right_on='id')
)

dataTable_rule_env = dataTable_rule_env.merge(
    dataTable_rule_env_stats,
    left_on=['rule_environment_id'],
    right_on=['rule_environment_id'],
)

## -------------------- rule env table + environment fingerprint --------------------
dataTable_rule_env_fp = copy.deepcopy(dataDict_tables["environment_fingerprint"]).rename(
    columns={'id':'environment_fingerprint_id',
             'pseudosmiles':'rule_env_fp_pseudosmiles',
             'smarts':'rule_env_fp_smarts',
             'parent_smarts':'rule_env_fp_parent_smarts'}
)

dataTable_rule_env = (
    dataTable_rule_env
    .merge(dataTable_rule_env_fp, on=['environment_fingerprint_id'])
    # .drop(columns=['id'])    #, 'smarts', 'parent_smarts'
    .rename(columns={'environment_fingerprint_id':'rule_env_fingerprint_id',
                     'pseudosmiles':'rule_env_fp_pseudosmiles',
                     'smarts':'rule_env_fp_smarts',
                     'parent_smarts':'rule_env_fp_parent_smarts', })
)


# cols_in_order = ['rule_id', 'from_smiles_id', 'from_smiles', 'to_smiles_id', 'to_smiles', 
#                  'rule_environment_id', 'rule_env_num_pairs', 'rule_env_radius', 'rule_env_fingerprint_id', 
#                  'rule_env_fp_pseudosmiles', 'rule_env_fp_smarts', 'rule_env_fp_parent_smarts']
# dataTable_rule_env = dataTable_rule_env[cols_in_order]

dataTable_rule_env

In [None]:
## display the raw rule_smiles table as loaded from the pickle
dataDict_tables["rule_smiles"]

In [None]:
## statistics columns that are not carried into the final table
drop_cols = ['kurtosis', 'skewness', 'paired_t', 'p_value', 'q1', 'q3', 'median', 'std']

## Start from a copy of the pair table and attach one lookup table after the
## other. Each step is: merge -> drop the right-hand id / helper columns ->
## give the new columns their final names.
dataTable = (
    copy.deepcopy(dataDict_tables["pair"])

    ## ------------------- compound structure: compound-1 (from) -------------------
    .merge(dataDict_tables["compound"], left_on=['compound1_id'], right_on=['id'])
    .drop(columns=['id_y', 'clean_smiles', 'clean_num_heavies'])
    .rename(columns={'id_x':'id', 'public_id':'KT_number_1', 'input_smiles':'smiles_1'})

    ## ------------------- compound structure: compound-2 (to) -------------------
    .merge(dataDict_tables["compound"], left_on=['compound2_id'], right_on=['id'])
    .drop(columns=['id_y', 'clean_smiles', 'clean_num_heavies'])
    .rename(columns={'id_x':'id', 'public_id':'KT_number_2', 'input_smiles':'smiles_2'})

    ## ------------------- compound property: compound-1 (from) -------------------
    .merge(dataDict_tables["compound_property"], left_on=['compound1_id'], right_on=['compound_id'])
    .drop(columns=['id_y', 'compound_id'])
    .rename(columns={'id_x':'id', 'value':'property_values_1'})

    ## ------------------- compound property: compound-2 (to), same property -------------------
    .merge(dataDict_tables["compound_property"],
           left_on=['compound2_id', 'property_name_id'],
           right_on=['compound_id', 'property_name_id'])
    .drop(columns=['id_y', 'compound_id'])
    .rename(columns={'id_x':'id', 'value':'property_values_2'})

    ## ------------------- property name -------------------
    .merge(dataDict_tables["property_name"], left_on=['property_name_id'], right_on=['id'])
    .drop(columns=['id_y'])
    .rename(columns={'id_x':'id', 'name':'property_name'})

    ## ------------------- constant pieces of the match pair -------------------
    .merge(dataDict_tables["constant_smiles"], left_on=['constant_id'], right_on=['id'])
    .drop(columns=['id_y'])
    .rename(columns={'id_x':'id', 'smiles':'constant_smiles'})

    ## ------------------- rule env data -------------------
    .merge(dataDict_tables["rule_environment"], left_on=['rule_environment_id'], right_on=['id'])
    .drop(columns=['id_y'])
    .rename(columns={'id_x':'id', 'radius':'rule_env_radius', 'num_pairs':'rule_env_num_pairs'})

    ## ------------------- rule info -------------------
    .merge(dataDict_tables["rule"], left_on=['rule_id'], right_on=['id'])
    .drop(columns=['id_y'])    #'rule_id'
    .rename(columns={'id_x':'id'})

    ## rule "from" fragment
    .merge(dataDict_tables["rule_smiles"], left_on=['from_smiles_id'], right_on=['id'])
    .drop(columns=['id_y', 'from_smiles_id', 'num_heavies'])    #'num_heavies'
    .rename(columns={'id_x':'id', 'smiles':'rule_from_smiles'})

    ## rule "to" fragment
    .merge(dataDict_tables["rule_smiles"], left_on=['to_smiles_id'], right_on=['id'])
    .drop(columns=['id_y', 'to_smiles_id', 'num_heavies'])
    .rename(columns={'id_x':'id', 'smiles':'rule_to_smiles'})

    ## ------------------- rule env stats (per rule env and property) -------------------
    .merge(dataDict_tables["rule_environment_statistics"],
           left_on=['rule_environment_id', 'property_name_id'],
           right_on=['rule_environment_id', 'property_name_id'])
    .drop(columns=['id_y']+drop_cols)
    .rename(columns={'id_x':'id', 'count':'rule_env_count', 'avg':'rule_env_avg',
                     'min':'rule_env_min', 'max':'rule_env_max'})

    ## ------------------- remove useless cols -------------------
    .drop(columns=['id', 'compound1_id', 'compound2_id', 'constant_id', 'rule_environment_id', 'property_name_id'])
)

## ------------------- add rule env environment_fingerprint data -------------------
table_merge = dataDict_tables["environment_fingerprint"]
## to be added

print(dataTable.shape)
dataTable.head(3)

In [None]:
def GeneratePairID(row, col_mol_id_1='KT_number_1', col_mol_id_2='KT_number_2'):
    """Build the directed and the direction-independent id of one pair.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide the two compound ids under `col_mol_id_1` / `col_mol_id_2`.
        The part after the first '-' of each id is parsed as an integer
        (e.g. "KT-0013567" -> 13567).
    col_mol_id_1, col_mol_id_2 : str
        Names of the fields holding the first / second compound id.

    Returns
    -------
    pandas.Series with two elements:
        [0] "<id_1>=><id_2>"                      (directed pair id)
        [1] (smaller number, larger number)       (same tuple for both directions)

    Raises
    ------
    IndexError if an id contains no '-', ValueError if the part after it is
    not an integer.
    """
    mol_id_1 = row[col_mol_id_1]
    mol_id_2 = row[col_mol_id_2]
    pair_id = str(mol_id_1) + '=>' + str(mol_id_2)

    mol_id_1_num = int(str(mol_id_1).split('-')[1])
    mol_id_2_num = int(str(mol_id_2).split('-')[1])
    ## builtin min/max keep plain Python ints in the tuple; np.min/np.max would
    ## return numpy scalars, whose repr ("np.int64(...)" in NumPy 2) would show up
    ## whenever the tuple is displayed or written out
    pair_couple = (min(mol_id_1_num, mol_id_2_num), max(mol_id_1_num, mol_id_2_num))
    return pd.Series([pair_id, pair_couple])

## row-wise: Pair_id is the directed id, PairInfo the direction-independent couple
dataTable[['Pair_id', 'PairInfo']] = dataTable.apply(
    GeneratePairID,
    axis=1,
    col_mol_id_1='KT_number_1',
    col_mol_id_2='KT_number_2',
)
print(dataTable.shape)

################################################################################################
def calculate_heavy_atoms(molecule_smiles):
    """Return the number of heavy atoms of a SMILES string, or np.nan on failure.

    Parameters
    ----------
    molecule_smiles : str
        SMILES to parse with RDKit.

    Returns
    -------
    int heavy-atom count, or ``np.nan`` when the SMILES cannot be parsed or
    RDKit rejects the input (a message is printed in both cases).
    """
    try:
        mol = Chem.MolFromSmiles(molecule_smiles)
    except Exception as e:
        ## e.g. non-string input such as a missing value
        print('Error', e)
        return np.nan

    ## MolFromSmiles signals an unparsable SMILES by returning None; check it
    ## explicitly so the message names the offending SMILES
    if mol is None:
        print(f"Error: could not parse SMILES {molecule_smiles!r}")
        return np.nan
    return mol.GetNumHeavyAtoms()

## size (heavy atoms) of the constant part, then order the rows so that, within a
## pair, small radius comes first and, for equal radius, the largest constant part
dataTable = (
    dataTable
    .assign(constant_size=dataTable['constant_smiles'].map(calculate_heavy_atoms))
    .sort_values(by=['PairInfo', 'Pair_id', 'rule_env_radius', 'constant_size'],
                 ascending=[True, True, True, False])
)
print(dataTable.shape)

################################################################################################

In [None]:
## NOTE(review): this function is also defined in the previous cell; this later
## definition is the one in effect after a top-to-bottom run.
def calculate_heavy_atoms(molecule_smiles):
    """Return the number of heavy atoms of a SMILES string, or np.nan on failure.

    Parameters
    ----------
    molecule_smiles : str
        SMILES to parse with RDKit.

    Returns
    -------
    int heavy-atom count, or ``np.nan`` when the SMILES cannot be parsed or
    RDKit rejects the input (a message is printed in both cases).
    """
    try:
        mol = Chem.MolFromSmiles(molecule_smiles)
    except Exception as e:
        ## e.g. non-string input such as a missing value
        print('Error', e)
        return np.nan

    ## MolFromSmiles signals an unparsable SMILES by returning None; check it
    ## explicitly so the message names the offending SMILES
    if mol is None:
        print(f"Error: could not parse SMILES {molecule_smiles!r}")
        return np.nan
    return mol.GetNumHeavyAtoms()

## heavy-atom count of the constant part of every row
constant_size_values = dataTable['constant_smiles'].map(calculate_heavy_atoms)
dataTable['constant_size'] = constant_size_values

## pair first, then increasing radius, then decreasing constant size
sort_columns = ['PairInfo', 'Pair_id', 'rule_env_radius', 'constant_size']
sort_ascending = [True, True, True, False]
dataTable = dataTable.sort_values(by=sort_columns, ascending=sort_ascending)
# dataTable.to_csv(f'./results/Compounds_All_4_informatics.csv', index=False)
dataTable.head(3)

#### 3. Remove the "duplicated" rows

In [None]:
## keep one row per (pair, property): after this ordering the first row of each
## group has the smallest rule_env_radius and, among those, the largest constant part
dataTable = dataTable.sort_values(
    by=['PairInfo', 'rule_env_radius', 'constant_size'],
    ascending=[True, True, False],
)
dataTable_rmDup = dataTable.drop_duplicates(subset=['PairInfo', 'property_name'], keep='first')
print(dataTable_rmDup.shape)
dataTable_rmDup.head(3)

#### Append symmetric rows

In [None]:
## column swaps that turn a row "1 => 2" into the row of the reversed pair "2 => 1"
rename_symetric_dict = {
    'KT_number_1': 'KT_number_2',
    'smiles_1': 'smiles_2',
    'KT_number_2': 'KT_number_1',
    'smiles_2': 'smiles_1',
    'property_values_1': 'property_values_2',
    'property_values_2': 'property_values_1', 
    'rule_from_smiles': 'rule_to_smiles',
    'rule_to_smiles': 'rule_from_smiles'}
dataTable_rmDup_symetric = dataTable_rmDup.rename(columns=rename_symetric_dict, inplace=False)
dataTable_rmDup_symetric['Pair_id'] = dataTable_rmDup_symetric['KT_number_1'] + '=>' + dataTable_rmDup_symetric['KT_number_2']

## Reversing the pair flips the sign of the rule statistics. For the extremes the
## roles also trade places: the reversed minimum is the negated maximum and vice
## versa (negating each in place would leave rule_env_min > rule_env_max).
## Both new columns are computed before either one is overwritten.
reversed_min = dataTable_rmDup_symetric['rule_env_max'] * -1
reversed_max = dataTable_rmDup_symetric['rule_env_min'] * -1
dataTable_rmDup_symetric['rule_env_avg'] = dataTable_rmDup_symetric['rule_env_avg'] * -1
dataTable_rmDup_symetric['rule_env_min'] = reversed_min
dataTable_rmDup_symetric['rule_env_max'] = reversed_max
dataTable_rmDup_symetric.head(3)

In [None]:
## stack the original and the reversed rows (columns are aligned by name);
## pd.concat is the public replacement for the private DataFrame._append
dataTable_rmDup_all = pd.concat([dataTable_rmDup, dataTable_rmDup_symetric], ignore_index=True)

## min / max are rounded to 2 decimals and turned into text to build "(min,max)"
dataTable_rmDup_all['rule_env_min'] = dataTable_rmDup_all['rule_env_min'].apply(lambda x:round(x, 2)).astype('str')
dataTable_rmDup_all['rule_env_max'] = dataTable_rmDup_all['rule_env_max'].apply(lambda x:round(x, 2)).astype('str')
dataTable_rmDup_all['rule_env_range'] = '('+ dataTable_rmDup_all['rule_env_min'] + ',' + dataTable_rmDup_all['rule_env_max'] +')'

In [None]:
## group both directions of a pair together, property by property, with a fresh 0..n-1 index
dataTable_rmDup_all = (
    dataTable_rmDup_all
    .sort_values(by=['PairInfo', 'property_name', 'Pair_id'], ascending=True)
    .reset_index(drop=True)
)
# dataTable_rmDup_all.to_csv(f'./results/Compounds_All_4_informatics_rmDups.csv', index=False)
dataTable_rmDup_all