In [1]:
import copy
import sqlite3
import numpy as np
import pandas as pd
from rdkit import Chem

#### 1. Extracting data from Database

In [2]:
def call_my_query(db_file, my_query):
    ## connect to the SQLIte database
    my_connection = sqlite3.connect(db_file)

    ## create a cursor object
    my_cursor = my_connection.cursor()

    ## excute the query
    my_cursor.execute(my_query)

    ## fetch all the rows
    rows = my_cursor.fetchall()
    
    ## export the results
    data_list = [row for row in rows]

    my_connection.close()
    return data_list

def extract_tables(db_file, table_name):
    ## extract table data from SQLite DB
    my_query_colName = f"PRAGMA table_info({table_name})"
    colName_list = call_my_query(db_file, my_query_colName)

    my_query_data = f"SELECT * FROM {table_name}"
    data_list = call_my_query(db_file, my_query_data)

    ## clean up data
    dataDict = {}
    for row_tuple in data_list:
        idx = row_tuple[0]
        dataDict[idx] = {}

        for col in colName_list:
            colIdx, colName = col[0], col[1]
            dataDict[idx][colName] = row_tuple[colIdx]
    return dataDict

In [3]:
db_file = './results/Compounds_All.mmpdb'
dataDict_tables = {}

for table_name in ["pair", "compound", "compound_property", "property_name", "constant_smiles",
                   "rule", "rule_smiles", "rule_environment", "rule_environment_statistics", "environment_fingerprint"]:
    dataDict_table = extract_tables(db_file, table_name)
    dataDict_tables[table_name] = pd.DataFrame.from_dict(dataDict_table).T
    # print(table_name)

In [4]:
dataDict_tables["property_name"]

Unnamed: 0,id,name
0,0,F%_Rat
1,1,EstFa_Rat
2,2,permeability
3,3,efflux
4,4,hERG_IC50
5,5,hERG_mixedIC50
6,6,logD_CDD


In [5]:
dataTable_rule_env_stats = copy.deepcopy(dataDict_tables["rule_environment_statistics"])

dataTable_rule_env_stats

Unnamed: 0,id,rule_environment_id,property_name_id,count,avg,std,kurtosis,skewness,min,q1,median,q3,max,paired_t,p_value
1,1.0,343.0,0.0,3.0,6.673333,3.715432,-1.500000,0.374010,4.240000,4.38750,4.830000,9.42000,10.95000,3.110958,0.089649
2,2.0,349.0,0.0,34.0,3.912390,8.627412,11.345179,2.765673,-10.023889,0.12560,1.478167,6.92000,43.55000,2.644241,0.012434
3,3.0,433.0,0.0,1.0,6.530000,,,,6.530000,6.53000,6.530000,6.53000,6.53000,,
4,4.0,521.0,0.0,6.0,-3.587667,3.104594,-1.351174,-0.327930,-7.880000,-6.88000,-2.490000,-1.78000,-0.00600,-2.830629,0.036648
5,5.0,569.0,0.0,2.0,0.385000,0.544472,,,0.000000,0.00000,0.385000,0.77000,0.77000,1.000000,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1548577,1548577.0,1350365.0,6.0,1.0,0.890390,,,,0.890390,0.89039,0.890390,0.89039,0.89039,,
1548578,1548578.0,1350366.0,6.0,1.0,0.890390,,,,0.890390,0.89039,0.890390,0.89039,0.89039,,
1548579,1548579.0,1350367.0,6.0,1.0,0.711850,,,,0.711850,0.71185,0.711850,0.71185,0.71185,,
1548580,1548580.0,1350368.0,6.0,1.0,0.711850,,,,0.711850,0.71185,0.711850,0.71185,0.71185,,


In [6]:
dataTable_rule_env_stats = copy.deepcopy(dataDict_tables["rule_environment_statistics"])
dataTable_rule_env_stats.drop(columns=['id', 'kurtosis','skewness', 'paired_t', 'p_value', 'q1', 'median', 'q3'], inplace=True)
dataTable_rule_env_stats = dataTable_rule_env_stats.merge(dataDict_tables["property_name"], left_on='property_name_id', right_on='id')
dataTable_rule_env_stats

Unnamed: 0,rule_environment_id,property_name_id,count,avg,std,min,max,id,name
0,343.0,0.0,3.0,6.673333,3.715432,4.240000,10.95000,0,F%_Rat
1,349.0,0.0,34.0,3.912390,8.627412,-10.023889,43.55000,0,F%_Rat
2,433.0,0.0,1.0,6.530000,,6.530000,6.53000,0,F%_Rat
3,521.0,0.0,6.0,-3.587667,3.104594,-7.880000,-0.00600,0,F%_Rat
4,569.0,0.0,2.0,0.385000,0.544472,0.000000,0.77000,0,F%_Rat
...,...,...,...,...,...,...,...,...,...
1548576,1350365.0,6.0,1.0,0.890390,,0.890390,0.89039,6,logD_CDD
1548577,1350366.0,6.0,1.0,0.890390,,0.890390,0.89039,6,logD_CDD
1548578,1350367.0,6.0,1.0,0.711850,,0.711850,0.71185,6,logD_CDD
1548579,1350368.0,6.0,1.0,0.711850,,0.711850,0.71185,6,logD_CDD


In [7]:
dataTable_rule_env_fp = copy.deepcopy(dataDict_tables["environment_fingerprint"])
dataTable_rule_env_fp.rename(columns={'id':'environment_fingerprint_id', 
                                      'pseudosmiles':'rule_env_fp_pseudosmiles',
                                      'smarts':'rule_env_fp_smarts', 
                                      'parent_smarts':'rule_env_fp_parent_smarts'})
dataTable_rule_env_fp

Unnamed: 0,id,smarts,pseudosmiles,parent_smarts
1,1,[#0;X1;H0;+0;!R:1],[*:1](~*),
2,2,[#0;X1;H0;+0;!R:1]-[C;X3;H1;+0;!R],[*:1]-[CH](~*),[#0;X1;H0;+0;!R:1]
3,3,[#0;X1;H0;+0;!R:1]-[C;X3;H1;+0;!R]=[#6;X3;H0;+...,[*:1]-[CH]=[#6](~*)(~*),[#0;X1;H0;+0;!R:1]-[C;X3;H1;+0;!R]
4,4,[#0;X1;H0;+0;!R:1]-[C;X3;H1;+0;!R]=[#6;X3;H0;+...,[*:1]-[CH]=[C](-[#6](~*)(~*))-[CH](~*),[#0;X1;H0;+0;!R:1]-[C;X3;H1;+0;!R]=[#6;X3;H0;+...
5,5,[#0;X1;H0;+0;!R:1]-[C;X3;H1;+0;!R]=[#6;X3;H0;+...,[*:1]-[CH]=[C](-[CH]=[#6](~*)(~*))-[C](-[#7](~...,[#0;X1;H0;+0;!R:1]-[C;X3;H1;+0;!R]=[#6;X3;H0;+...
...,...,...,...,...
29019,29019,[#0;X1;H0;+0;!R:1]-[#7;X3;H0;+0;R]1:[#7;X2;H0;...,[*:1]-[n]1:[n]:[cH]:[cH]:[c]:1-[NH]-[C](~*)(~*),[#0;X1;H0;+0;!R:1]-[#7;X3;H0;+0;R](:[#7;X2;H0;...
29020,29020,[#0;X1;H0;+0;!R:1]-[#7;X3;H0;+0;R]1:[#7;X2;H0;...,[*:1]-[n]1:[n]:[cH]:[cH]:[c]:1-[NH]-[C](-[#6](...,[#0;X1;H0;+0;!R:1]-[#7;X3;H0;+0;R]1:[#7;X2;H0;...
29021,29021,[#0;X1;H0;+0;!R:1]-[#7;X3;H0;+0;R](:[#7;X2;H0;...,[*:1]-[n](:[n]:[cH](~*)):[c](:[n](~*))-[CH2](~*),[#0;X1;H0;+0;!R:1]-[#7;X3;H0;+0;R](:[#6;X3;H0;...
29022,29022,[#0;X1;H0;+0;!R:1]-[#7;X3;H0;+0;R]1:[#7;X2;H0;...,[*:1]-[n]1:[n]:[cH]:[n]:[c]:1-[CH2]-[O](~*),[#0;X1;H0;+0;!R:1]-[#7;X3;H0;+0;R](:[#7;X2;H0;...


#### 2. clean up data

In [8]:
## -------------------- clean up rule table & merge rule smiles --------------------
dataTable_rules = copy.deepcopy(dataDict_tables["rule"])
dataTable_rules.rename(columns={'id':'rule_id'},  inplace=True)

## from
dataTable_rules = dataTable_rules.merge(dataDict_tables["rule_smiles"], left_on=['from_smiles_id'], right_on=['id'])
dataTable_rules.drop(columns=['id', 'num_heavies'], inplace=True)
dataTable_rules.rename(columns={'smiles':'from_smiles'}, inplace=True)
dataTable_rules.head(3)

## to 
dataTable_rules = dataTable_rules.merge(dataDict_tables["rule_smiles"], left_on=['to_smiles_id'], right_on=['id'])
dataTable_rules.drop(columns=['id', 'num_heavies'], inplace=True)
dataTable_rules.rename(columns={'smiles':'to_smiles'}, inplace=True)

## -------------------- merge rule table and rule env table --------------------
dataTable_rule_env = copy.deepcopy(dataDict_tables["rule_environment"])
dataTable_rule_env.rename(columns={'id':'rule_environment_id', 'radius':'rule_env_radius', 'num_pairs':'rule_env_num_pairs'},  inplace=True)
dataTable_rule_env = dataTable_rule_env.merge(dataTable_rules, on='rule_id')

## -------------------- merge rule env table and rule_env_stats info --------------------
## clean up rule-env-stats table
dataTable_rule_env_stats = copy.deepcopy(dataDict_tables["rule_environment_statistics"])
dataTable_rule_env_stats.drop(columns=['id', 'kurtosis','skewness', 'paired_t', 'p_value', 'q1', 'median', 'q3'], inplace=True)
dataTable_rule_env_stats = dataTable_rule_env_stats.merge(dataDict_tables["property_name"], left_on='property_name_id', right_on='id')

## merge
dataTable_rule_env = dataTable_rule_env.merge(dataTable_rule_env_stats, left_on=['rule_environment_id'], right_on=['rule_environment_id'])


## -------------------- merge rule env table and rule_env_fp info --------------------
dataTable_rule_env_fp = copy.deepcopy(dataDict_tables["environment_fingerprint"])
dataTable_rule_env_fp.rename(columns={'id':'environment_fingerprint_id', 
                                      'pseudosmiles':'rule_env_fp_pseudosmiles',
                                      'smarts':'rule_env_fp_smarts', 
                                      'parent_smarts':'rule_env_fp_parent_smarts'}, inplace=True)

dataTable_rule_env = dataTable_rule_env.merge(dataTable_rule_env_fp, on=['environment_fingerprint_id'])
# dataTable_rule_env.drop(columns=['id'], inplace=True)    #, 'smarts', 'parent_smarts'
dataTable_rule_env.rename(columns={'environment_fingerprint_id':'rule_env_fingerprint_id',
                                   'pseudosmiles':'rule_env_fp_pseudosmiles', 
                                   'smarts':'rule_env_fp_smarts', 
                                   'parent_smarts':'rule_env_fp_parent_smarts', }, inplace=True)


# cols_in_order = ['rule_id', 'from_smiles_id', 'from_smiles', 'to_smiles_id', 'to_smiles', 
#                  'rule_environment_id', 'rule_env_num_pairs', 'rule_env_radius', 'rule_env_fingerprint_id', 
#                  'rule_env_fp_pseudosmiles', 'rule_env_fp_smarts', 'rule_env_fp_parent_smarts']
# dataTable_rule_env = dataTable_rule_env[cols_in_order]

dataTable_rule_env

Unnamed: 0,rule_environment_id,rule_id,rule_env_fingerprint_id,rule_env_radius,rule_env_num_pairs,from_smiles_id,to_smiles_id,from_smiles,to_smiles,property_name_id,count,avg,std,min,max,id,name,rule_env_fp_smarts,rule_env_fp_pseudosmiles,rule_env_fp_parent_smarts
0,1,1,1,0,1,1,2,[*:1]c1ccc(Cl)cc1,[*:1]c1ccc([N+](=O)[O-])cc1,6.0,1.0,-0.476230,,-0.476230,-0.476230,6,logD_CDD,[#0;X1;H0;+0;!R:1],[*:1](~*),
1,7,2,1,0,1,3,4,[*:1]c1cc(C#N)ccc1OC,[*:1]c1cc(O)c(Cl)cn1,6.0,1.0,-0.945850,,-0.945850,-0.945850,6,logD_CDD,[#0;X1;H0;+0;!R:1],[*:1](~*),
2,13,3,1,0,1,4,5,[*:1]c1cc(O)c(Cl)cn1,[*:1]c1ccc(N)cc1Cl,6.0,1.0,1.378780,,1.378780,1.378780,6,logD_CDD,[#0;X1;H0;+0;!R:1],[*:1](~*),
3,19,4,1,0,1,6,4,[*:1]c1cc(Cl)ccc1N,[*:1]c1cc(O)c(Cl)cn1,6.0,1.0,-1.190590,,-1.190590,-1.190590,6,logD_CDD,[#0;X1;H0;+0;!R:1],[*:1](~*),
4,25,5,1,0,1,3,5,[*:1]c1cc(C#N)ccc1OC,[*:1]c1ccc(N)cc1Cl,6.0,1.0,0.432930,,0.432930,0.432930,6,logD_CDD,[#0;X1;H0;+0;!R:1],[*:1](~*),
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1548576,1350294,206070,28983,4,1,17044,17045,[*:1]c1nc(C)cc(=O)[nH]1,[*:1]c1nc(C)cc(OC)n1,6.0,1.0,1.107470,,1.107470,1.107470,6,logD_CDD,[#0;X1;H0;+0;!R:1]-[#7;X3;H0;+0;R]1:[#7;X2;H0;...,[*:1]-[n]1:[n]:[c](:[cH]:[c]:1-[NH]-[C](~*)(~*...,[#0;X1;H0;+0;!R:1]-[#7;X3;H0;+0;R](:[#7;X2;H0;...
1548577,1350295,206070,28984,5,1,17044,17045,[*:1]c1nc(C)cc(=O)[nH]1,[*:1]c1nc(C)cc(OC)n1,6.0,1.0,1.107470,,1.107470,1.107470,6,logD_CDD,[#0;X1;H0;+0;!R:1]-[#7;X3;H0;+0;R]1:[#7;X2;H0;...,[*:1]-[n]1:[n]:[c](:[cH]:[c]:1-[NH]-[C](-[#6](...,[#0;X1;H0;+0;!R:1]-[#7;X3;H0;+0;R]1:[#7;X2;H0;...
1548578,1350342,206071,29000,5,1,12002,248,[*:1]C1CCCCO1,[*:1][H],6.0,1.0,-0.709970,,-0.709970,-0.709970,6,logD_CDD,[#0;X1;H0;+0;!R:1]-[#7;X3;H0;+0;R]1:[#7;X2;H0;...,[*:1]-[n]1:[n]:[cH]:[c]2:[c](:[cH]:[cH]:[cH]:[...,[#0;X1;H0;+0;!R:1]-[#7;X3;H0;+0;R]1:[#7;X2;H0;...
1548579,1350362,206071,29016,4,1,12002,248,[*:1]C1CCCCO1,[*:1][H],6.0,1.0,-0.013268,,-0.013268,-0.013268,6,logD_CDD,[#0;X1;H0;+0;!R:1]-[#7;X3;H0;+0;R]1:[#7;X2;H0;...,[*:1]-[n]1:[n]:[cH]:[cH]:[c]:1-[CH2]-[#6](~*)(~*),[#0;X1;H0;+0;!R:1]-[#7;X3;H0;+0;R](:[#7;X2;H0;...


In [9]:
dataDict_tables["rule_smiles"]

Unnamed: 0,id,smiles,num_heavies
1,1,[*:1]c1ccc(Cl)cc1,7
2,2,[*:1]c1ccc([N+](=O)[O-])cc1,9
3,3,[*:1]c1cc(C#N)ccc1OC,10
4,4,[*:1]c1cc(O)c(Cl)cn1,8
5,5,[*:1]c1ccc(N)cc1Cl,8
...,...,...,...
17041,17041,[*:1][C@H]1O[C@@H](CO)[C@H](O)[C@@H]1O,9
17042,17042,[*:1]C1CCCN(C(=O)C=C)C1,10
17043,17043,[*:1][C@@H]1CCCN(C(=O)C=C)C1,10
17044,17044,[*:1]c1nc(C)cc(=O)[nH]1,8


In [10]:
dataTable = copy.deepcopy(dataDict_tables["pair"])
dataTable.head(3)

## ------------------- add compound structure & property data -------------------
table_merge = dataDict_tables["compound"]

## compound-1 (from)
dataTable = dataTable.merge(table_merge, left_on=['compound1_id'], right_on=['id'])
dataTable.drop(columns=['id_y', 'clean_smiles', 'clean_num_heavies'], inplace=True)
dataTable.rename(columns={'id_x':'id', 'public_id':'KT_number_1', 'input_smiles':'smiles_1'}, inplace=True)

## compound-2 (to)
dataTable = dataTable.merge(table_merge, left_on=['compound2_id'], right_on=['id'])
dataTable.drop(columns=['id_y', 'clean_smiles', 'clean_num_heavies'], inplace=True)
dataTable.rename(columns={'id_x':'id', 'public_id':'KT_number_2', 'input_smiles':'smiles_2'}, inplace=True)

## ------------------- add compound prop data -------------------
table_merge = dataDict_tables["compound_property"]

## compound-1 (from)
dataTable = dataTable.merge(table_merge, left_on=['compound1_id'], right_on=['compound_id'])
dataTable.drop(columns=['id_y', 'compound_id'], inplace=True)
dataTable.rename(columns={'id_x':'id', 'value':'property_values_1'}, inplace=True)

## compound-2 (to)
dataTable = dataTable.merge(table_merge, left_on=['compound2_id', 'property_name_id'], right_on=['compound_id', 'property_name_id'])
dataTable.drop(columns=['id_y', 'compound_id'], inplace=True)
dataTable.rename(columns={'id_x':'id', 'value':'property_values_2'}, inplace=True)

## add property name
table_merge = dataDict_tables["property_name"]
dataTable = dataTable.merge(table_merge, left_on=['property_name_id'], right_on=['id'])
dataTable.drop(columns=['id_y'], inplace=True)
dataTable.rename(columns={'id_x':'id', 'name':'property_name'}, inplace=True)

## ------------------- add constant pieces data of the match pair -------------------
table_merge = dataDict_tables["constant_smiles"]
dataTable = dataTable.merge(table_merge, left_on=['constant_id'], right_on=['id'])
dataTable.drop(columns=['id_y'], inplace=True)
dataTable.rename(columns={'id_x':'id', 'smiles':'constant_smiles'}, inplace=True)

## ------------------- add rule env data -------------------
table_merge = dataDict_tables["rule_environment"]
dataTable = dataTable.merge(table_merge, left_on=['rule_environment_id'], right_on=['id'])
dataTable.drop(columns=['id_y'], inplace=True)
dataTable.rename(columns={'id_x':'id', 'radius':'rule_env_radius', 'num_pairs':'rule_env_num_pairs'}, inplace=True)

## ------------------- add rule info -------------------
table_merge = dataDict_tables["rule"]
dataTable = dataTable.merge(table_merge, left_on=['rule_id'], right_on=['id'])
dataTable.drop(columns=['id_y'], inplace=True)    #'rule_id'
dataTable.rename(columns={'id_x':'id'}, inplace=True)

table_merge = dataDict_tables["rule_smiles"]
dataTable = dataTable.merge(table_merge, left_on=['from_smiles_id'], right_on=['id'])
dataTable.drop(columns=['id_y', 'from_smiles_id', 'num_heavies'], inplace=True)    #'num_heavies'
dataTable.rename(columns={'id_x':'id', 'smiles':'rule_from_smiles'}, inplace=True)

table_merge = dataDict_tables["rule_smiles"]
dataTable = dataTable.merge(table_merge, left_on=['to_smiles_id'], right_on=['id'])
dataTable.drop(columns=['id_y', 'to_smiles_id', 'num_heavies'], inplace=True)
dataTable.rename(columns={'id_x':'id', 'smiles':'rule_to_smiles'}, inplace=True)

## ------------------- add rule env stats -------------------
table_merge = dataDict_tables["rule_environment_statistics"]
dataTable = dataTable.merge(table_merge, 
                            left_on=['rule_environment_id', 'property_name_id'], 
                            right_on=['rule_environment_id', 'property_name_id'])

drop_cols = ['kurtosis', 'skewness', 'paired_t', 'p_value', 'q1', 'q3', 'median', 'std']
dataTable.drop(columns=['id_y']+drop_cols, inplace=True)
dataTable.rename(columns={'id_x':'id', 'count':'rule_env_count', 'avg':'rule_env_avg', 
                          'min':'rule_env_min', 'max':'rule_env_max'}, inplace=True)

## ------------------- add rule env environment_fingerprint data -------------------
table_merge = dataDict_tables["environment_fingerprint"]
## to be added

## ------------------- remove useless cols -------------------
dataTable.drop(columns=['id', 'compound1_id', 'compound2_id', 'constant_id', 'rule_environment_id', 'property_name_id'], inplace=True)
print(dataTable.shape)
dataTable.head(3)

(1857222, 18)


Unnamed: 0,KT_number_1,smiles_1,KT_number_2,smiles_2,property_values_1,property_values_2,property_name,constant_smiles,rule_id,environment_fingerprint_id,rule_env_radius,rule_env_num_pairs,rule_from_smiles,rule_to_smiles,rule_env_count,rule_env_avg,rule_env_min,rule_env_max
0,KT-0013567,Clc1ccc(cc1)/C=C/2\C=C(N(C2=O)c3ccc(cc3)NC(=O)...,KT-0013672,[N+](=O)([O-])c1ccc(cc1)/C=C/2\C=C(N(C2=O)c3cc...,4.74391,4.26768,logD_CDD,[*:1]/C=C1/C=C(c2ccccc2)N(c2ccc(NC(C)=O)cc2)C1=O,1,1,0,1,[*:1]c1ccc(Cl)cc1,[*:1]c1ccc([N+](=O)[O-])cc1,1.0,-0.47623,-0.47623,-0.47623
1,KT-0013567,Clc1ccc(cc1)/C=C/2\C=C(N(C2=O)c3ccc(cc3)NC(=O)...,KT-0013672,[N+](=O)([O-])c1ccc(cc1)/C=C/2\C=C(N(C2=O)c3cc...,4.74391,4.26768,logD_CDD,[*:1]/C=C1/C=C(c2ccccc2)N(c2ccc(NC(C)=O)cc2)C1=O,1,2,1,1,[*:1]c1ccc(Cl)cc1,[*:1]c1ccc([N+](=O)[O-])cc1,1.0,-0.47623,-0.47623,-0.47623
2,KT-0013567,Clc1ccc(cc1)/C=C/2\C=C(N(C2=O)c3ccc(cc3)NC(=O)...,KT-0013672,[N+](=O)([O-])c1ccc(cc1)/C=C/2\C=C(N(C2=O)c3cc...,4.74391,4.26768,logD_CDD,[*:1]/C=C1/C=C(c2ccccc2)N(c2ccc(NC(C)=O)cc2)C1=O,1,3,2,1,[*:1]c1ccc(Cl)cc1,[*:1]c1ccc([N+](=O)[O-])cc1,1.0,-0.47623,-0.47623,-0.47623


In [11]:
def GeneratePairID(row, col_mol_id_1='KT_number_1', col_mol_id_2='KT_number_2'):
    mol_id_1 = row[col_mol_id_1]
    mol_id_2 = row[col_mol_id_2]
    pair_id = str(mol_id_1) + '=>' + str(mol_id_2)
    
    mol_id_1_num = int(str(mol_id_1).split('-')[1])
    mol_id_2_num = int(str(mol_id_2).split('-')[1])
    pair_couple = (np.min([mol_id_1_num, mol_id_2_num]), np.max([mol_id_1_num, mol_id_2_num]))
    return pd.Series([pair_id, pair_couple])

dataTable[['Pair_id', 'PairInfo']] = dataTable.apply(lambda row: GeneratePairID(row, col_mol_id_1='KT_number_1', col_mol_id_2='KT_number_2'), axis=1)
print(dataTable.shape)

################################################################################################
def calculate_heavy_atoms(molecule_smiles):
    try:
        mol = Chem.MolFromSmiles(molecule_smiles)
        num_heavy_atoms = mol.GetNumHeavyAtoms()
    except Exception as e:
        print('Error', e)
        num_heavy_atoms = np.nan
    return num_heavy_atoms

dataTable['constant_size'] = dataTable['constant_smiles'].apply(calculate_heavy_atoms)
dataTable.sort_values(by=['PairInfo', 'Pair_id', 'rule_env_radius', 'constant_size'], ascending=[True, True, True, False], inplace=True)
print(dataTable.shape)

################################################################################################

(1857222, 20)
(1857222, 21)


In [12]:
def calculate_heavy_atoms(molecule_smiles):
    try:
        mol = Chem.MolFromSmiles(molecule_smiles)
        num_heavy_atoms = mol.GetNumHeavyAtoms()
    except Exception as e:
        print('Error', e)
        num_heavy_atoms = np.nan
    return num_heavy_atoms

dataTable['constant_size'] = dataTable['constant_smiles'].apply(calculate_heavy_atoms)
dataTable.sort_values(by=['PairInfo', 'Pair_id', 'rule_env_radius', 'constant_size'], ascending=[True, True, True, False], inplace=True)
# dataTable.to_csv(f'./results/Compounds_All_4_informatics.csv', index=False)
dataTable.head(3)

Unnamed: 0,KT_number_1,smiles_1,KT_number_2,smiles_2,property_values_1,property_values_2,property_name,constant_smiles,rule_id,environment_fingerprint_id,...,rule_env_num_pairs,rule_from_smiles,rule_to_smiles,rule_env_count,rule_env_avg,rule_env_min,rule_env_max,Pair_id,PairInfo,constant_size
64546,KT-0000031,F[C@H](CNC(=O)c1cnc(cc1NC2CC2)Nc4nc3c(cncc3)cc...,KT-0000032,F[C@@H]([C@@H](O)C)CNC(=O)c1cnc(cc1NC2CC2)Nc4n...,2.08638,1.60852,logD_CDD,[*:1][C@@](C)(O)[C@H](F)CNC(=O)c1cnc(Nc2ccc3cn...,1073,1,...,1034,[*:1]C,[*:1][H],1034.0,-0.270035,-2.95251,1.12564,KT-0000031=>KT-0000032,"(31, 32)",30
1474868,KT-0000031,F[C@H](CNC(=O)c1cnc(cc1NC2CC2)Nc4nc3c(cncc3)cc...,KT-0000032,F[C@@H]([C@@H](O)C)CNC(=O)c1cnc(cc1NC2CC2)Nc4n...,2.08638,1.60852,logD_CDD,[*:1]C.[*:2]O.[*:3][C@H](F)CNC(=O)c1cnc(Nc2ccc...,28826,25,...,16,[*:1]C([*:2])([*:3])C,[*:1]C([*:2])[*:3],16.0,-0.230068,-1.1031,0.62476,KT-0000031=>KT-0000032,"(31, 32)",29
482143,KT-0000031,F[C@H](CNC(=O)c1cnc(cc1NC2CC2)Nc4nc3c(cncc3)cc...,KT-0000032,F[C@@H]([C@@H](O)C)CNC(=O)c1cnc(cc1NC2CC2)Nc4n...,2.08638,1.60852,logD_CDD,[*:1]C.[*:2][C@H](F)CNC(=O)c1cnc(Nc2ccc3cnccc3...,55832,8,...,3,[*:1]C([*:2])(C)O,[*:1]C([*:2])O,3.0,-0.224377,-0.47786,0.28259,KT-0000031=>KT-0000032,"(31, 32)",28


#### 3. removed the "duplicated" rows

In [13]:
dataTable.sort_values(by=['PairInfo', 'rule_env_radius', 'constant_size'], ascending=[True, True, False], inplace=True)
dataTable_rmDup = dataTable.drop_duplicates(subset=['PairInfo', 'property_name'], keep='first', inplace=False)
print(dataTable_rmDup.shape)
dataTable_rmDup.head(3)

(154059, 21)


Unnamed: 0,KT_number_1,smiles_1,KT_number_2,smiles_2,property_values_1,property_values_2,property_name,constant_smiles,rule_id,environment_fingerprint_id,...,rule_env_num_pairs,rule_from_smiles,rule_to_smiles,rule_env_count,rule_env_avg,rule_env_min,rule_env_max,Pair_id,PairInfo,constant_size
64546,KT-0000031,F[C@H](CNC(=O)c1cnc(cc1NC2CC2)Nc4nc3c(cncc3)cc...,KT-0000032,F[C@@H]([C@@H](O)C)CNC(=O)c1cnc(cc1NC2CC2)Nc4n...,2.08638,1.60852,logD_CDD,[*:1][C@@](C)(O)[C@H](F)CNC(=O)c1cnc(Nc2ccc3cn...,1073,1,...,1034,[*:1]C,[*:1][H],1034.0,-0.270035,-2.95251,1.12564,KT-0000031=>KT-0000032,"(31, 32)",30
1418526,KT-0000031,F[C@H](CNC(=O)c1cnc(cc1NC2CC2)Nc4nc3c(cncc3)cc...,KT-0000033,FC(CNC(=O)c1cnc(cc1NC2CC2)Nc4nc3c(cncc3)cc4)CO,2.08638,1.18709,logD_CDD,[*:1]CNC(=O)c1cnc(Nc2ccc3cnccc3n2)cc1NC1CC1.[*...,84893,25,...,1,[*:1]C([*:2])C([*:3])(C)C,[*:3]CC([*:1])[*:2],1.0,-0.89929,-0.89929,-0.89929,KT-0000031=>KT-0000033,"(31, 33)",27
1037478,KT-0000031,F[C@H](CNC(=O)c1cnc(cc1NC2CC2)Nc4nc3c(cncc3)cc...,KT-0000055,N([C@@H]1CC[C@H](CC1)C(=O)NC)C(=O)c2cnc(cc2NC3...,2.08638,2.68964,logD_CDD,[*:2]C.[*:1]NC(=O)c1cnc(Nc2ccc3cnccc3n2)cc1NC1CC1,45084,8,...,1,[*:1]C[C@@H](F)C([*:2])(C)O,[*:2]NC(=O)[C@H]1CC[C@H]([*:1])CC1,1.0,0.60326,0.60326,0.60326,KT-0000031=>KT-0000055,"(31, 55)",25


#### append symetric rows

In [14]:
rename_symetric_dict = {
    'KT_number_1': 'KT_number_2',
    'smiles_1': 'smiles_2',
    'KT_number_2': 'KT_number_1',
    'smiles_2': 'smiles_1',
    'property_values_1': 'property_values_2',
    'property_values_2': 'property_values_1', 
    'rule_from_smiles': 'rule_to_smiles',
    'rule_to_smiles': 'rule_from_smiles'}
dataTable_rmDup_symetric = dataTable_rmDup.rename(columns=rename_symetric_dict, inplace=False)
dataTable_rmDup_symetric['Pair_id'] = dataTable_rmDup_symetric['KT_number_1'] + '=>' + dataTable_rmDup_symetric['KT_number_2']
for col in ['rule_env_avg', 'rule_env_min', 'rule_env_max']:
    dataTable_rmDup_symetric[col] = dataTable_rmDup_symetric[col] * -1
dataTable_rmDup_symetric.head(3)

Unnamed: 0,KT_number_2,smiles_2,KT_number_1,smiles_1,property_values_2,property_values_1,property_name,constant_smiles,rule_id,environment_fingerprint_id,...,rule_env_num_pairs,rule_to_smiles,rule_from_smiles,rule_env_count,rule_env_avg,rule_env_min,rule_env_max,Pair_id,PairInfo,constant_size
64546,KT-0000031,F[C@H](CNC(=O)c1cnc(cc1NC2CC2)Nc4nc3c(cncc3)cc...,KT-0000032,F[C@@H]([C@@H](O)C)CNC(=O)c1cnc(cc1NC2CC2)Nc4n...,2.08638,1.60852,logD_CDD,[*:1][C@@](C)(O)[C@H](F)CNC(=O)c1cnc(Nc2ccc3cn...,1073,1,...,1034,[*:1]C,[*:1][H],1034.0,0.270035,2.95251,-1.12564,KT-0000032=>KT-0000031,"(31, 32)",30
1418526,KT-0000031,F[C@H](CNC(=O)c1cnc(cc1NC2CC2)Nc4nc3c(cncc3)cc...,KT-0000033,FC(CNC(=O)c1cnc(cc1NC2CC2)Nc4nc3c(cncc3)cc4)CO,2.08638,1.18709,logD_CDD,[*:1]CNC(=O)c1cnc(Nc2ccc3cnccc3n2)cc1NC1CC1.[*...,84893,25,...,1,[*:1]C([*:2])C([*:3])(C)C,[*:3]CC([*:1])[*:2],1.0,0.89929,0.89929,0.89929,KT-0000033=>KT-0000031,"(31, 33)",27
1037478,KT-0000031,F[C@H](CNC(=O)c1cnc(cc1NC2CC2)Nc4nc3c(cncc3)cc...,KT-0000055,N([C@@H]1CC[C@H](CC1)C(=O)NC)C(=O)c2cnc(cc2NC3...,2.08638,2.68964,logD_CDD,[*:2]C.[*:1]NC(=O)c1cnc(Nc2ccc3cnccc3n2)cc1NC1CC1,45084,8,...,1,[*:1]C[C@@H](F)C([*:2])(C)O,[*:2]NC(=O)[C@H]1CC[C@H]([*:1])CC1,1.0,-0.60326,-0.60326,-0.60326,KT-0000055=>KT-0000031,"(31, 55)",25


In [15]:
dataTable_rmDup_all = dataTable_rmDup._append(dataTable_rmDup_symetric, ignore_index=True)
dataTable_rmDup_all['rule_env_min'] = dataTable_rmDup_all['rule_env_min'].apply(lambda x:round(x, 2)).astype('str')
dataTable_rmDup_all['rule_env_max'] = dataTable_rmDup_all['rule_env_max'].apply(lambda x:round(x, 2)).astype('str')
dataTable_rmDup_all['rule_env_range'] = '('+ dataTable_rmDup_all['rule_env_min'] + ',' + dataTable_rmDup_all['rule_env_max'] +')'

In [16]:
dataTable_rmDup_all.sort_values(by=['PairInfo', 'property_name', 'Pair_id'], ascending=[True, True, True], inplace=True)
dataTable_rmDup_all = dataTable_rmDup_all.reset_index(drop=True)
# dataTable_rmDup_all.to_csv(f'./results/Compounds_All_4_informatics_rmDups.csv', index=False)
dataTable_rmDup_all

Unnamed: 0,KT_number_1,smiles_1,KT_number_2,smiles_2,property_values_1,property_values_2,property_name,constant_smiles,rule_id,environment_fingerprint_id,...,rule_from_smiles,rule_to_smiles,rule_env_count,rule_env_avg,rule_env_min,rule_env_max,Pair_id,PairInfo,constant_size,rule_env_range
0,KT-0000031,F[C@H](CNC(=O)c1cnc(cc1NC2CC2)Nc4nc3c(cncc3)cc...,KT-0000032,F[C@@H]([C@@H](O)C)CNC(=O)c1cnc(cc1NC2CC2)Nc4n...,2.086380,1.608520,logD_CDD,[*:1][C@@](C)(O)[C@H](F)CNC(=O)c1cnc(Nc2ccc3cn...,1073,1,...,[*:1]C,[*:1][H],1034.0,-0.270035,-2.95,1.13,KT-0000031=>KT-0000032,"(31, 32)",30,"(-2.95,1.13)"
1,KT-0000032,F[C@@H]([C@@H](O)C)CNC(=O)c1cnc(cc1NC2CC2)Nc4n...,KT-0000031,F[C@H](CNC(=O)c1cnc(cc1NC2CC2)Nc4nc3c(cncc3)cc...,1.608520,2.086380,logD_CDD,[*:1][C@@](C)(O)[C@H](F)CNC(=O)c1cnc(Nc2ccc3cn...,1073,1,...,[*:1][H],[*:1]C,1034.0,0.270035,2.95,-1.13,KT-0000032=>KT-0000031,"(31, 32)",30,"(2.95,-1.13)"
2,KT-0000031,F[C@H](CNC(=O)c1cnc(cc1NC2CC2)Nc4nc3c(cncc3)cc...,KT-0000033,FC(CNC(=O)c1cnc(cc1NC2CC2)Nc4nc3c(cncc3)cc4)CO,2.086380,1.187090,logD_CDD,[*:1]CNC(=O)c1cnc(Nc2ccc3cnccc3n2)cc1NC1CC1.[*...,84893,25,...,[*:1]C([*:2])C([*:3])(C)C,[*:3]CC([*:1])[*:2],1.0,-0.899290,-0.9,-0.9,KT-0000031=>KT-0000033,"(31, 33)",27,"(-0.9,-0.9)"
3,KT-0000033,FC(CNC(=O)c1cnc(cc1NC2CC2)Nc4nc3c(cncc3)cc4)CO,KT-0000031,F[C@H](CNC(=O)c1cnc(cc1NC2CC2)Nc4nc3c(cncc3)cc...,1.187090,2.086380,logD_CDD,[*:1]CNC(=O)c1cnc(Nc2ccc3cnccc3n2)cc1NC1CC1.[*...,84893,25,...,[*:3]CC([*:1])[*:2],[*:1]C([*:2])C([*:3])(C)C,1.0,0.899290,0.9,0.9,KT-0000033=>KT-0000031,"(31, 33)",27,"(0.9,0.9)"
4,KT-0000031,F[C@H](CNC(=O)c1cnc(cc1NC2CC2)Nc4nc3c(cncc3)cc...,KT-0000055,N([C@@H]1CC[C@H](CC1)C(=O)NC)C(=O)c2cnc(cc2NC3...,2.086380,2.689640,logD_CDD,[*:2]C.[*:1]NC(=O)c1cnc(Nc2ccc3cnccc3n2)cc1NC1CC1,45084,8,...,[*:1]C[C@@H](F)C([*:2])(C)O,[*:2]NC(=O)[C@H]1CC[C@H]([*:1])CC1,1.0,0.603260,0.6,0.6,KT-0000031=>KT-0000055,"(31, 55)",25,"(0.6,0.6)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308113,KT-0345065,Fc1c(c(ccc1)F)N(c2nc(c(cc2)C(=O)N)N3C[C@@H](CC...,KT-0345064,Fc1c(c(ccc1)F)N(c2nc(c(cc2)C(=O)N)N3CC(C3)O)C(...,0.733954,0.442428,logD_CDD,[*:1]O.[*:2]c1nc(N(C(N)=O)c2c(F)cccc2F)ccc1C(N)=O,63645,8,...,[*:1][C@@H]1CCN([*:2])C1,[*:1]C1CN([*:2])C1,9.0,-0.376081,-0.26,-0.59,KT-0345065=>KT-0345064,"(345064, 345065)",22,"(-0.26,-0.59)"
308114,KT-0345064,Fc1c(c(ccc1)F)N(c2nc(c(cc2)C(=O)N)N3CC(C3)O)C(...,KT-0345066,Fc1c(c(ccc1)F)N(c2nc(c(cc2)C(=O)N)OCCOC)C(=O)N,0.442428,0.950584,logD_CDD,[*:1]c1nc(N(C(N)=O)c2c(F)cccc2F)ccc1C(N)=O,199887,1,...,[*:1]N1CC(O)C1,[*:1]OCCOC,1.0,0.508156,0.51,0.51,KT-0345064=>KT-0345066,"(345064, 345066)",21,"(0.51,0.51)"
308115,KT-0345066,Fc1c(c(ccc1)F)N(c2nc(c(cc2)C(=O)N)OCCOC)C(=O)N,KT-0345064,Fc1c(c(ccc1)F)N(c2nc(c(cc2)C(=O)N)N3CC(C3)O)C(...,0.950584,0.442428,logD_CDD,[*:1]c1nc(N(C(N)=O)c2c(F)cccc2F)ccc1C(N)=O,199887,1,...,[*:1]OCCOC,[*:1]N1CC(O)C1,1.0,-0.508156,-0.51,-0.51,KT-0345066=>KT-0345064,"(345064, 345066)",21,"(-0.51,-0.51)"
308116,KT-0345065,Fc1c(c(ccc1)F)N(c2nc(c(cc2)C(=O)N)N3C[C@@H](CC...,KT-0345066,Fc1c(c(ccc1)F)N(c2nc(c(cc2)C(=O)N)OCCOC)C(=O)N,0.733954,0.950584,logD_CDD,[*:1]c1nc(N(C(N)=O)c2c(F)cccc2F)ccc1C(N)=O,199888,1,...,[*:1]N1CC[C@@H](O)C1,[*:1]OCCOC,1.0,0.216630,0.22,0.22,KT-0345065=>KT-0345066,"(345065, 345066)",21,"(0.22,0.22)"
