In [4]:
import copy
import sqlite3
import numpy as np
import pandas as pd
from rdkit import Chem

In [5]:
def call_my_query(db_file, my_query, params=()):
    """Run one SQL statement against a SQLite file and return every row.

    Parameters
    ----------
    db_file : str
        Path of the SQLite database file to open.
    my_query : str
        SQL text to execute.
    params : sequence or dict, optional
        Values bound to the ``?`` / ``:name`` placeholders of ``my_query``.
        Defaults to an empty tuple, i.e. a statement without placeholders.

    Returns
    -------
    list of tuple
        All rows produced by the statement, as returned by ``fetchall()``.

    Raises
    ------
    sqlite3.Error
        Whatever ``execute``/``fetchall`` raises is propagated unchanged.
    """
    my_connection = sqlite3.connect(db_file)
    try:
        my_cursor = my_connection.cursor()
        my_cursor.execute(my_query, params)
        ## fetchall() already returns a list of row tuples
        return my_cursor.fetchall()
    finally:
        ## release the connection even when the statement fails
        my_connection.close()

def extract_tables(db_file, table_name):
    """Read a whole SQLite table into a nested dict.

    The outer key is the value of each row's first column; the inner dict maps
    every column name to that row's value. If two rows share the same first
    value, the later row replaces the earlier one.
    """
    ## each PRAGMA table_info row starts with (column position, column name, ...)
    column_info = call_my_query(db_file, f"PRAGMA table_info({table_name})")
    table_rows = call_my_query(db_file, f"SELECT * FROM {table_name}")

    dataDict = {}
    for record in table_rows:
        dataDict[record[0]] = {info[1]: record[info[0]] for info in column_info}
    return dataDict

In [6]:
## Load every mmpdb table twice: as a nested dict (dataDict_dicts) and as a
## DataFrame indexed by the row key (dataDict_tables).
db_file = './results/Compounds_All.mmpdb'

dataDict_tables = {}
dataDict_dicts = {}

for table_name in ["pair", "compound", "compound_property", "property_name", "constant_smiles",
                   "rule", "rule_smiles", "rule_environment", "rule_environment_statistics", "environment_fingerprint"]:
    dataDict_table = extract_tables(db_file, table_name)
    dataDict_dicts[table_name] = dataDict_table
    ## orient='index' builds one row per dict key directly. Building column-wise
    ## and transposing (`from_dict(...).T`) mixes ints, floats and strings in
    ## each intermediate column, which leaves every final column as `object`.
    dataDict_tables[table_name] = pd.DataFrame.from_dict(dataDict_table, orient='index')

In [7]:
dataDict_pair = dataDict_dicts["pair"]
dataTable_pair = dataDict_tables["pair"].rename(columns={'id': 'pair_id'})

In [8]:
dataDict_pair

{1: {'id': 1,
  'rule_environment_id': 1,
  'compound1_id': 1,
  'compound2_id': 2,
  'constant_id': 1},
 2: {'id': 2,
  'rule_environment_id': 2,
  'compound1_id': 1,
  'compound2_id': 2,
  'constant_id': 1},
 3: {'id': 3,
  'rule_environment_id': 3,
  'compound1_id': 1,
  'compound2_id': 2,
  'constant_id': 1},
 4: {'id': 4,
  'rule_environment_id': 4,
  'compound1_id': 1,
  'compound2_id': 2,
  'constant_id': 1},
 5: {'id': 5,
  'rule_environment_id': 5,
  'compound1_id': 1,
  'compound2_id': 2,
  'constant_id': 1},
 6: {'id': 6,
  'rule_environment_id': 6,
  'compound1_id': 1,
  'compound2_id': 2,
  'constant_id': 1},
 7: {'id': 7,
  'rule_environment_id': 7,
  'compound1_id': 3,
  'compound2_id': 4,
  'constant_id': 2},
 8: {'id': 8,
  'rule_environment_id': 8,
  'compound1_id': 3,
  'compound2_id': 4,
  'constant_id': 2},
 9: {'id': 9,
  'rule_environment_id': 9,
  'compound1_id': 3,
  'compound2_id': 4,
  'constant_id': 2},
 10: {'id': 10,
  'rule_environment_id': 10,
  'compoun

In [9]:
dataTable_pair

Unnamed: 0,pair_id,rule_environment_id,compound1_id,compound2_id,constant_id
1,1,1,1,2,1
2,2,2,1,2,1
3,3,3,1,2,1
4,4,4,1,2,1
5,5,5,1,2,1
...,...,...,...,...,...
1562822,1562822,733663,5487,8266,18769
1562823,1562823,1350235,5487,8266,18769
1562824,1562824,1350367,5487,8266,18769
1562825,1562825,1350368,5487,8266,18769


In [11]:
## report every rule_environment entry whose dict key differs from its own 'id' field
for key, record in dataDict_dicts["rule_environment"].items():
    if key != record['id']:
        print(key, record['id'])

In [12]:
## report every compound entry whose dict key differs from its own 'id' field
for key, record in dataDict_dicts["compound"].items():
    if key != record['id']:
        print(key, record['id'])

In [13]:
## report every rule entry whose dict key differs from its own 'id' field
for key, record in dataDict_dicts["rule"].items():
    if key != record['id']:
        print(key, record['id'])

In [14]:
## report every property_name entry whose dict key differs from its own 'id' field
for key, record in dataDict_dicts["property_name"].items():
    if key != record['id']:
        print(key, record['id'])

In [15]:
## report every constant_smiles entry whose dict key differs from its own 'id' field
for key, record in dataDict_dicts["constant_smiles"].items():
    if key != record['id']:
        print(key, record['id'])

In [None]:
dataTable_pair = dataDict_tables["pair"].rename(columns={'id': 'pair_id'})
print(dataTable_pair.shape)
dataTable_pair.head(3)

In [None]:
## Group the pair rows twice: first by the unordered compound couple
## ("low<=>high"), then by direction ("from=>to"); each direction also gets a
## running "num_pairs_<from=>to>" counter next to it.
pair_dict = {}
for p_idx in dataTable_pair.index:
    pid = dataTable_pair.at[p_idx, 'pair_id']
    cid_1 = dataTable_pair.at[p_idx, 'compound1_id']
    cid_2 = dataTable_pair.at[p_idx, 'compound2_id']
    csid = dataTable_pair.at[p_idx, 'constant_id']
    reid = dataTable_pair.at[p_idx, 'rule_environment_id']

    low_id, high_id = sorted([cid_1, cid_2])
    pair_set = f'{low_id}<=>{high_id}'
    pair_trans = f"{cid_1}=>{cid_2}"

    ## create the couple / direction levels on first sight
    by_direction = pair_dict.setdefault(pair_set, {}).setdefault(pair_trans, {})
    by_direction[pid] = {
        'pair_id': pid,
        'constant_id': csid,
        'rule_environment_id': reid,
    }
    pair_dict[pair_set][f"num_pairs_{pair_trans}"] = len(by_direction)

In [None]:
len(pair_dict)

In [None]:
pd.DataFrame.from_dict(pair_dict).T

In [None]:
pair_dict

In [None]:
## pair table: size and number of distinct rule environments it references
dataTable_pair = dataDict_tables["pair"].rename(columns={'id': 'pair_id'})
print(dataTable_pair.shape)
print(dataTable_pair['rule_environment_id'].unique().size)

## rule_environment table: size and number of distinct ids
dataTable_rule_env = dataDict_tables["rule_environment"].rename(columns={'id': 'rule_environment_id'})
print(dataTable_rule_env.shape)
print(dataTable_rule_env['rule_environment_id'].unique().size)

## join the two on the rule environment id and repeat the same counts
dataTable = dataTable_pair.merge(dataTable_rule_env, on='rule_environment_id')
print(dataTable.shape)
print(dataTable['rule_environment_id'].unique().size)
dataTable.head(3)

In [None]:
print(len(dataTable_pair['compound1_id'].unique()))

In [None]:
print(len(dataTable_pair['compound2_id'].unique()))

In [None]:
dataTable['radius'].value_counts()

In [None]:
dataTable_rule_env['radius'].value_counts()

In [None]:
## NOTE(review): the original assignment had no right-hand side. It is completed
## here as a direction-independent key (smaller compound id, larger compound id),
## mirroring the (min, max) tuple GeneratePairID builds -- confirm this is the intent.
dataTable_pair['pair_couple'] = pd.Series(
    [(min(cid_1, cid_2), max(cid_1, cid_2))
     for cid_1, cid_2 in zip(dataTable_pair['compound1_id'], dataTable_pair['compound2_id'])],
    index=dataTable_pair.index,
)

In [None]:
## shape of the rule_environment subset for each radius value 0..5
for r in range(6):
    radius_mask = dataTable_rule_env['radius'] == r
    dataTable_rule_env_r = dataTable_rule_env.loc[radius_mask]
    print(r, dataTable_rule_env_r.shape)

In [None]:
dataTable_rule_env = dataDict_tables["rule_environment"]
print(dataTable_rule_env.shape)
dataTable_rule_env.head(3)

In [None]:
dataTable_rule_env[dataTable_rule_env['radius']==1]

In [None]:
dataTable_rule_env[dataTable_rule_env['radius']==0]

In [None]:
dataTable_rule = dataDict_tables["rule"]
print(dataTable_rule.shape)
dataTable_rule.head(3)

In [None]:
dataTable = copy.deepcopy(dataDict_tables["pair"])
print("raw:", dataTable.shape)

## ------------------- add compound structure & property data -------------------


In [None]:
## Flatten the mmpdb tables into one wide frame, starting from the pair table.
## Every step is a default (inner) merge, after which the lookup table's own id
## (suffixed `_y`) is dropped and the pair id (suffixed `_x`) is renamed to `id`.
dataTable = copy.deepcopy(dataDict_tables["pair"])
print("raw:", dataTable.shape)

## ------------------- add compound structure & property data -------------------
table_merge = dataDict_tables["compound"]

## compound-1 (from)
dataTable = (
    dataTable
    .merge(table_merge, left_on='compound1_id', right_on='id')
    .drop(columns=['id_y', 'clean_smiles', 'clean_num_heavies'])
    .rename(columns={'id_x': 'id', 'public_id': 'KT_number_1', 'input_smiles': 'smiles_1'})
)

## compound-2 (to)
dataTable = (
    dataTable
    .merge(table_merge, left_on='compound2_id', right_on='id')
    .drop(columns=['id_y', 'clean_smiles', 'clean_num_heavies'])
    .rename(columns={'id_x': 'id', 'public_id': 'KT_number_2', 'input_smiles': 'smiles_2'})
)
print("join cmpd smi:", dataTable.shape)

## ------------------- add compound prop data -------------------
table_merge = dataDict_tables["compound_property"]

## compound-1 (from): one output row per property measured on compound 1
dataTable = (
    dataTable
    .merge(table_merge, left_on='compound1_id', right_on='compound_id')
    .drop(columns=['id_y', 'compound_id'])
    .rename(columns={'id_x': 'id', 'value': 'property_values_1'})
)

## compound-2 (to): matched on the same property as compound 1
dataTable = (
    dataTable
    .merge(table_merge,
           left_on=['compound2_id', 'property_name_id'],
           right_on=['compound_id', 'property_name_id'])
    .drop(columns=['id_y', 'compound_id'])
    .rename(columns={'id_x': 'id', 'value': 'property_values_2'})
)

## add property name
table_merge = dataDict_tables["property_name"]
dataTable = (
    dataTable
    .merge(table_merge, left_on='property_name_id', right_on='id')
    .drop(columns=['id_y'])
    .rename(columns={'id_x': 'id', 'name': 'property_name'})
)
print("join cmpd prop:", dataTable.shape)

## ------------------- add constant pieces data of the match pair -------------------
table_merge = dataDict_tables["constant_smiles"]
dataTable = (
    dataTable
    .merge(table_merge, left_on='constant_id', right_on='id')
    .drop(columns=['id_y'])
    .rename(columns={'id_x': 'id', 'smiles': 'constant_smiles'})
)
print("join constant smi:", dataTable.shape)

## ------------------- add rule env data -------------------
table_merge = dataDict_tables["rule_environment"]
dataTable = (
    dataTable
    .merge(table_merge, left_on='rule_environment_id', right_on='id')
    .drop(columns=['id_y'])
    .rename(columns={'id_x': 'id', 'radius': 'rule_env_radius', 'num_pairs': 'rule_env_num_pairs'})
)
print("join rule env:", dataTable.shape)

## ------------------- add rule info -------------------
table_merge = dataDict_tables["rule"]
dataTable = (
    dataTable
    .merge(table_merge, left_on='rule_id', right_on='id')
    .drop(columns=['id_y'])    ## 'rule_id' is kept
    .rename(columns={'id_x': 'id'})
)

## rule_smiles is joined twice: once for the "from" side, once for the "to" side
table_merge = dataDict_tables["rule_smiles"]
dataTable = (
    dataTable
    .merge(table_merge, left_on='from_smiles_id', right_on='id')
    .drop(columns=['id_y', 'from_smiles_id', 'num_heavies'])
    .rename(columns={'id_x': 'id', 'smiles': 'rule_from_smiles'})
)

table_merge = dataDict_tables["rule_smiles"]
dataTable = (
    dataTable
    .merge(table_merge, left_on='to_smiles_id', right_on='id')
    .drop(columns=['id_y', 'to_smiles_id', 'num_heavies'])
    .rename(columns={'id_x': 'id', 'smiles': 'rule_to_smiles'})
)
print("join rule:", dataTable.shape)

## ------------------- add rule env stats -------------------
table_merge = dataDict_tables["rule_environment_statistics"]
drop_cols = ['kurtosis', 'skewness', 'paired_t', 'p_value', 'q1', 'q3', 'median', 'std']
dataTable = (
    dataTable
    .merge(table_merge, on=['rule_environment_id', 'property_name_id'])
    .drop(columns=['id_y'] + drop_cols)
    .rename(columns={'id_x': 'id', 'count': 'rule_env_count', 'avg': 'rule_env_avg',
                     'min': 'rule_env_min', 'max': 'rule_env_max'})
)

print("join rule stats:", dataTable.shape)
## ------------------- add rule env environment_fingerprint data -------------------
table_merge = dataDict_tables["environment_fingerprint"]
## to be added

## ------------------- remove useless cols -------------------
dataTable = dataTable.drop(columns=['id', 'compound1_id', 'compound2_id', 'constant_id',
                                    'rule_environment_id', 'property_name_id'])
print(dataTable.shape)
dataTable.head(3)

In [None]:
## Group pair rows by the unordered compound couple; each entry records the
## direction (compound 1 -> compound 2) of that row.
pair_dict = {}

for idx in dataTable_pair.index:
    cidx_1 = dataTable_pair['compound1_id'][idx]
    cidx_2 = dataTable_pair['compound2_id'][idx]
    ## the ids are integers: sort them as numbers, then convert to text,
    ## because str.join only accepts strings
    pair_idx = '<=>'.join(str(cidx) for cidx in sorted([cidx_1, cidx_2]))

    pair_dict.setdefault(pair_idx, {})[idx] = {
        'c1_id': cidx_1,
        'c2_id': cidx_2,
    }





#### 1. Read the entire DB

In [None]:
## reload the flattened pair table from its CSV export
dataTable = pd.read_csv('./results/Compounds_All_4_informatics.csv')
print(dataTable.shape)
dataTable.head(3)

In [None]:
## keep only the rows whose rule environment radius equals 0, re-indexed from 0
dataTable_radiu0 = dataTable[dataTable['rule_env_radius']==0].reset_index(drop=True)
print(dataTable_radiu0.shape)
dataTable_radiu0.head(3)

In [None]:
## For every unordered compound pair ("A<=>B", ids sorted as text) record the
## largest constant_size found among its radius-0 rows; the maximum starts at 0.
dataDict_max_const = {}
dataTable_loop = dataTable_radiu0

for idx in dataTable_loop.index:
    KT_id_1 = dataTable_loop.at[idx, 'KT_number_1']
    KT_id_2 = dataTable_loop.at[idx, 'KT_number_2']
    pair_idx = '<=>'.join(sorted((KT_id_1, KT_id_2)))

    ## first sight of this pair: create its record
    if pair_idx not in dataDict_max_const:
        dataDict_max_const[pair_idx] = {'pair_idx': pair_idx, 'max_constant_size': 0}
    pair_stats = dataDict_max_const[pair_idx]

    ## max() returns the new value only when it is strictly larger
    constant_size = dataTable_loop.at[idx, 'constant_size']
    pair_stats['max_constant_size'] = max(pair_stats['max_constant_size'], constant_size)

In [None]:
def keep_row(row, dataDict_stats):
    """Flag whether a row reaches the largest constant size of its compound pair.

    Parameters
    ----------
    row : mapping
        Must provide 'KT_number_1', 'KT_number_2' and 'constant_size'.
    dataDict_stats : dict
        Maps "A<=>B" pair keys (ids sorted as text) to a dict holding
        'max_constant_size'.

    Returns
    -------
    pd.Series
        ``[pair key, 1]`` when the row's constant_size is >= the pair maximum,
        otherwise ``[pair key, 0]``.

    Raises
    ------
    AssertionError
        If the pair key is missing from ``dataDict_stats``.
    """
    first_id, second_id = row['KT_number_1'], row['KT_number_2']
    row_size = row['constant_size']

    pair_idx = '<=>'.join(sorted([first_id, second_id]))
    assert pair_idx in dataDict_stats, f"Error! This pair {pair_idx} is not in the data"

    reaches_max = row_size >= dataDict_stats[pair_idx]['max_constant_size']
    return pd.Series([pair_idx, 1 if reaches_max else 0])

## add the pair key and the keep flag (1/0) to every radius-0 row
dataTable_radiu0[['pair_idx', 'Keep_row']] = dataTable_radiu0.apply(
    keep_row, axis=1, args=(dataDict_max_const,)
)
print(dataTable_radiu0.shape)
dataTable_radiu0.head(3)

In [None]:
dataTable_clean = dataTable_radiu0[dataTable_radiu0['Keep_row']==1].sort_values(by=['pair_idx', 'KT_number_1', 'KT_number_2'])
dataTable_clean

In [None]:
dataTable_clean['pair_idx'].value_counts()

In [None]:
len(list(dataTable_clean['pair_idx'].unique()))

In [None]:
dataTable_clean[dataTable_clean['pair_idx']=='KT-0036156<=>KT-0036845']

In [None]:
dataDict_clean = {}
dataTable_loop = dataTable_radiu0

for idx in dataTable_loop.index:
    KT_id_1, KT_id_2 = dataTable_loop['KT_number_1'][idx], dataTable_loop['KT_number_2'][idx]

In [None]:
## For each property: the row count, then how many distinct compound ids occur
## on side 1 / side 2, on both sides, only on side 1, and only on side 2.
for prop in dataTable_radiu0['property_name'].unique():
    print(f"============================= {prop} =============================")
    dataTable_prop = dataTable_radiu0[dataTable_radiu0['property_name']==prop]
    print(f"Num of rows is {dataTable_prop.shape[0]}")

    list_cid_1 = dataTable_prop['KT_number_1'].unique()
    list_cid_2 = dataTable_prop['KT_number_2'].unique()
    ## sets give constant-time membership; the previous `x in ndarray` test
    ## rescanned the whole array for every id and shadowed the builtin `id`
    set_cid_1, set_cid_2 = set(list_cid_1), set(list_cid_2)
    print(len(list_cid_1), len(list_cid_2))
    print(len(set_cid_1 & set_cid_2))
    print(len(set_cid_1 - set_cid_2))
    print(len(set_cid_2 - set_cid_1))

In [None]:
## Summarise every unordered compound pair ("A<=>B", ids sorted as text) over
## the radius-0 rows.
dataDict_clean = {}
extra_cols = []    ## further columns to copy into each pair record; none selected yet

## `.index` is an attribute, not a method
for idx in dataTable_radiu0.index:
    KT_id_1, KT_id_2 = dataTable_radiu0['KT_number_1'][idx], dataTable_radiu0['KT_number_2'][idx]

    pair_idx = '<=>'.join(sorted([KT_id_1, KT_id_2]))
    if pair_idx not in dataDict_clean:
        dataDict_clean[pair_idx] = {}
        dataDict_clean[pair_idx]['constant_size_max'] = 0
        dataDict_clean[pair_idx]['row_idx'] = []

    ## NOTE(review): the original cell initialised 'constant_size_max' and
    ## 'row_idx' but never updated them. They are filled here as "largest
    ## constant size for the pair" and "row labels reaching it" -- confirm.
    constant_size = dataTable_radiu0['constant_size'][idx]
    if constant_size > dataDict_clean[pair_idx]['constant_size_max']:
        dataDict_clean[pair_idx]['constant_size_max'] = constant_size
        dataDict_clean[pair_idx]['row_idx'] = [idx]
    elif constant_size == dataDict_clean[pair_idx]['constant_size_max']:
        dataDict_clean[pair_idx]['row_idx'].append(idx)

    ## as before, these fields reflect the last row seen for the pair
    dataDict_clean[pair_idx]['KT_number_1'] = KT_id_1
    dataDict_clean[pair_idx]['KT_number_2'] = KT_id_2

    dataDict_clean[pair_idx]['smiles_1'] = dataTable_radiu0['smiles_1'][idx]
    dataDict_clean[pair_idx]['smiles_2'] = dataTable_radiu0['smiles_2'][idx]

    for col in extra_cols:
        dataDict_clean[pair_idx][col] = dataTable_radiu0[col][idx]


    


In [None]:
################################################################################################
## generate "pair id" for each pairs
def GeneratePairID(row, col_mol_id_1='KT_number_1', col_mol_id_2='KT_number_2'):
    """Build the directed and the direction-independent identifier of a pair.

    Parameters
    ----------
    row : mapping
        Row providing the two compound ids; each id is expected to look like
        ``<prefix>-<digits>`` because the text after the first '-' is parsed
        with ``int``.
    col_mol_id_1, col_mol_id_2 : str
        Keys of the "from" and "to" compound ids in ``row``.

    Returns
    -------
    pd.Series
        ``[pair_id, pair_couple]`` where ``pair_id`` is ``"<id1>=><id2>"`` and
        ``pair_couple`` is ``(smaller number, larger number)``.

    Raises
    ------
    IndexError
        If an id contains no '-'.
    ValueError
        If the part after the first '-' is not an integer.
    """
    mol_id_1 = str(row[col_mol_id_1])
    mol_id_2 = str(row[col_mol_id_2])
    pair_id = mol_id_1 + '=>' + mol_id_2

    mol_id_1_num = int(mol_id_1.split('-')[1])
    mol_id_2_num = int(mol_id_2.split('-')[1])
    ## builtin min/max keep plain Python ints; np.min/np.max return NumPy
    ## scalars, and a tuple of those stringifies as "(np.int64(..), np.int64(..))"
    ## under NumPy >= 2 when the column is written to CSV
    pair_couple = (min(mol_id_1_num, mol_id_2_num), max(mol_id_1_num, mol_id_2_num))
    return pd.Series([pair_id, pair_couple])

## the default column names of GeneratePairID are the ones used here
dataTable[['Pair_id', 'PairInfo']] = dataTable.apply(GeneratePairID, axis=1)
print(dataTable.shape)

################################################################################################

################################################################################################

In [None]:
# # dataTable.to_csv(f'./results/Compounds_All_4_informatics.csv', index=False)
# dataTable = pd.read_csv(f'./results/Compounds_All_4_informatics.csv')
# print(dataTable.shape)
# dataTable.head(3)

#### 3. Remove the duplicated rows

In [None]:
## select the radius-0 rows (re-indexed from 0) and add an 'rvs' flag column set to 0
dataTable_radiu0 = dataTable[dataTable['rule_env_radius']==0].reset_index(drop=True)
dataTable_radiu0['rvs'] = 0
print(dataTable_radiu0.shape)

In [None]:
## Stack the radius-0 rows (rvs=0) on top of a copy in which the compound-1 /
## compound-2 columns and the rule from / to columns have swapped names (rvs=1).
dataTable_radiu0 = (
    dataTable[dataTable['rule_env_radius']==0]
    .reset_index(drop=True)
    .assign(rvs=0)
)
print(dataTable_radiu0.shape)


dict_reverse_cols = {'KT_number_1': 'KT_number_2', 'smiles_1': 'smiles_2', 'property_values_1': 'property_values_2',
                     'KT_number_2': 'KT_number_1', 'smiles_2': 'smiles_1', 'property_values_2': 'property_values_1',
                     'rule_from_smiles': 'rule_to_smiles', 'rule_to_smiles': 'rule_from_smiles'}
dataTable_radiu0_reverse = dataTable_radiu0.rename(columns=dict_reverse_cols).assign(rvs=1)
print(dataTable_radiu0_reverse.shape)

## largest constant first within each pair/property; original rows before reversed ones
dataTable_radiu0_dups = (
    pd.concat([dataTable_radiu0, dataTable_radiu0_reverse], ignore_index=True)
    .sort_values(by=['PairInfo', 'property_name', 'constant_size', 'constant_smiles', 'rvs'],
                 ascending=[True, True, False, True, True])
)
print(dataTable_radiu0_dups.shape)

In [None]:
## drop rows that repeat the same compounds, property, constant part and rule
## (the first occurrence in the current sort order is kept), then re-index from 0
dataTable_radiu0_rmdups = dataTable_radiu0_dups.drop_duplicates(subset=['KT_number_1', 'KT_number_2', 'property_name', 'constant_smiles', 'rule_from_smiles', 'rule_to_smiles'])
dataTable_radiu0_rmdups = dataTable_radiu0_rmdups.reset_index(drop=True)
print(dataTable_radiu0_rmdups.shape)
dataTable_radiu0_rmdups.head(3)

In [None]:
## Keep, for every compound pair, only the rows whose constant part is the
## largest seen for that pair. One grouped pass replaces a full-frame scan per
## pair, and an empty input no longer makes pd.concat fail on an empty list.
## Rows keep their current order (the original loop grouped them by pair in
## order of first appearance).
pair_max_constant_size = (
    dataTable_radiu0_rmdups
    .groupby('PairInfo', sort=False)['constant_size']
    .transform('max')
)
dataTable_clean = dataTable_radiu0_rmdups[
    dataTable_radiu0_rmdups['constant_size'] == pair_max_constant_size
]
print(dataTable_clean.shape)
dataTable_clean.head(3)

In [None]:
dataTable_radiu0_dups.to_csv(f'./results/Tmp_Compounds_All_4_informatics_dups.csv', index=False)

In [None]:
dataDict_mmps = {}

## loop all property
for prop in dataTable_radiu0['property_name'].unique():
    ## get all available data with this property in this cid-set
    dataTable_prop = dataTable_radiu0[dataTable_radiu0['property_name']==prop]
    print('====================', prop, ':', dataTable_prop.shape[0], '====================')

    ## loop all cid-sets
    for cid_set in dataTable_prop['PairInfo'].unique():
        ## get all available data with this cid-set
        dataTable_prop_pair = dataTable_prop[dataTable_prop['PairInfo']==cid_set]
        
        ## get pair compound id 
        cid_1, cid_2 = f"KT-{cid_set[0]:07}", f"KT-{cid_set[1]:07}"
        pair_id_a, pair_id_b = f"{cid_1}=>{cid_2}", f"{cid_2}=>{cid_1}"

        num_fragmentations = dataTable_prop_pair['constant_smiles'].unique()






In [None]:
dataTable_radiu0 = dataTable[dataTable['rule_env_radius']==0].reset_index(drop=True)
print(dataTable_radiu0.shape)

dataDict_mmps = {}

## loop all property
for prop in dataTable_radiu0['property_name'].unique():
    ## get all available data with this property in this cid-set
    dataTable_prop = dataTable_radiu0[dataTable_radiu0['property_name']==prop]
    print(prop, ':', dataTable_prop.shape[0])

    ## loop all cid-sets
    for cid_set in dataTable_prop['PairInfo'].unique():
        ## get pair compound id 
        cid_1, cid_2 = f"KT-{cid_set[0]:07}", f"KT-{cid_set[1]:07}"
        pair_id_a, pair_id_b = f"{cid_1}=>{cid_2}", f"{cid_2}=>{cid_1}"

        ## get all available data with this cid-set
        dataTable_prop_pair = dataTable_prop[dataTable_prop['PairInfo']==cid_set]
        num_fragmentations = dataTable_prop_pair['constant_smiles'].unique()

        for const_smi in dataTable_prop_pair['constant_smiles'].unique():
            const_smi_list = const_smi.split('.')


            dataTable_prop_pair_cuts = dataTable_prop_pair[dataTable_prop_pair['constant_smiles']==const_smi]
            if dataTable_prop_pair_cuts.shape[0] > 2:
                print(prop, cid_1, cid_2, dataTable_prop_pair_cuts.shape[0])
                test_dataTable = dataTable_prop_pair_cuts

               
        # dataTable_prop_pair = dataTable_prop_pair.sort_values(by=['constant_size', 'Pair_id'], ascending=[False, True], inplace=False)
        
        # if dataTable_prop_pair.shape[0] > 2:
        #     print(prop, cid_1, cid_2, dataTable_prop_pair.shape[0])

        # ## if this pair-a is available
        # if pair_id_a in dataTable_prop_pair['Pair_id'].unique():
        #     dataTable_prop_pair_a = dataTable_prop_pair[dataTable_prop_pair['Pair_id']==pair_id_a]
        #     dataTable_prop_pair_a

                    

        #     pass
        # else:
        #     pass
        
        # ## if this pair-b is available
        # if pair_id_b in dataTable_prop_pair['Pair_id'].unique():
        #     dataTable_prop_pair_b = dataTable_prop_pair[dataTable_prop_pair['Pair_id']==pair_id_b]
        #     pass
        # else:
        #     pass


In [None]:
dataTable_prop[(dataTable_prop['Pair_id']=='KT-0000058=>KT-0014349')]

In [None]:
test_dataTable['constant_smiles'][265].split('.')[0]

In [None]:
calculate_heavy_atoms(test_dataTable['constant_smiles'][265].split('.')[0])

In [None]:
## NOTE(review): the original line ended in a dangling '.', which is a syntax
## error. It is completed as a heavy-atom count to match the neighbouring
## calculate_heavy_atoms cell -- confirm this is the intended call.
Chem.MolFromSmiles(test_dataTable['constant_smiles'][265].split('.')[0]).GetNumHeavyAtoms()

In [None]:
test_dataTable

In [None]:
unique_cid_1 = dataTable_radiu0['KT_number_1'].unique()
print(len(unique_cid_1))

unique_cid_2 = dataTable_radiu0['KT_number_2'].unique()
print(len(unique_cid_2))

unique_pair_set = dataTable_radiu0['PairInfo'].unique()
print(len(unique_pair_set))

In [None]:
f"KT-{unique_pair_set[0][0]:07}" in unique_cid_1

In [None]:
7990-4500-2677

In [None]:
## Index the radius-0 rows by unordered pair (PairInfo), then by direction (Pair_id).
dataDict_mmps = {}


for idx in dataTable_radiu0.index:
    pair_info = dataTable_radiu0['PairInfo'][idx]
    pair_id = dataTable_radiu0['Pair_id'][idx]
    cid_1, cid_2 = dataTable_radiu0['KT_number_1'][idx], dataTable_radiu0['KT_number_2'][idx]

    if pair_info not in dataDict_mmps:
        dataDict_mmps[pair_info] = {}

    ## NOTE(review): the original cell stopped at the unfinished `if` above.
    ## The record layout below (compound ids plus contributing row labels) is
    ## an assumption -- adjust once the intended content is decided.
    if pair_id not in dataDict_mmps[pair_info]:
        dataDict_mmps[pair_info][pair_id] = {'KT_number_1': cid_1, 'KT_number_2': cid_2, 'row_idx': []}
    dataDict_mmps[pair_info][pair_id]['row_idx'].append(idx)

In [None]:
## radius-0 rows only
dataTable_radiu0 = dataTable[dataTable['rule_env_radius']==0].reset_index(drop=True)
print(dataTable_radiu0.shape)

## `ascending` needs one flag per sort key; the original passed 3 flags for 5
## keys, which raises ValueError.
## NOTE(review): constant_size is sorted descending so that keep='first' below
## retains the largest constant part; the other keys ascend -- confirm.
dataTable_radiu0 = dataTable_radiu0.sort_values(
    by=['PairInfo', 'Pair_id', 'property_name', 'rule_env_radius', 'constant_size'],
    ascending=[True, True, True, True, False],
    inplace=False,
)
## one row per (unordered pair, property): the first in the order above
dataTable_rmDup = dataTable_radiu0.drop_duplicates(subset=['PairInfo', 'property_name'], keep='first', inplace=False)
dataTable_rmDup.to_csv('./results/Compounds_All_4_informatics_rmDups_raw.csv', index=False)
print(dataTable_rmDup.shape)
dataTable_rmDup.head(3)

#### Append symmetric rows

In [None]:
dataTable_rmDup

In [None]:
## Build the reversed (compound 2 -> compound 1) counterpart of every row by
## swapping the names of the side-1 / side-2 and rule from / to columns.
rename_symetric_dict = {
    'KT_number_1': 'KT_number_2',
    'smiles_1': 'smiles_2',
    'KT_number_2': 'KT_number_1',
    'smiles_2': 'smiles_1',
    'property_values_1': 'property_values_2',
    'property_values_2': 'property_values_1', 
    'rule_from_smiles': 'rule_to_smiles',
    'rule_to_smiles': 'rule_from_smiles'}
dataTable_rmDup_symetric = dataTable_rmDup.rename(columns=rename_symetric_dict, inplace=False)
dataTable_rmDup_symetric['Pair_id'] = dataTable_rmDup_symetric['KT_number_1'] + '=>' + dataTable_rmDup_symetric['KT_number_2']

## Reversing the direction negates the statistics, so the bounds also trade
## places: new min = -old max and new max = -old min. Negating each column in
## place (as before) left rule_env_min larger than rule_env_max.
## The right-hand sides read the untouched dataTable_rmDup, which shares its index.
dataTable_rmDup_symetric['rule_env_avg'] = dataTable_rmDup['rule_env_avg'] * -1
dataTable_rmDup_symetric['rule_env_min'] = dataTable_rmDup['rule_env_max'] * -1
dataTable_rmDup_symetric['rule_env_max'] = dataTable_rmDup['rule_env_min'] * -1
dataTable_rmDup_symetric.head(3)

In [None]:
## Stack the original and the reversed rows. pd.concat is the public API;
## DataFrame._append is a private method and may change without notice.
dataTable_rmDup_all = pd.concat([dataTable_rmDup, dataTable_rmDup_symetric], ignore_index=True)
## min / max are rounded to 2 decimals and stored as text, then combined into "(min,max)"
dataTable_rmDup_all['rule_env_min'] = dataTable_rmDup_all['rule_env_min'].apply(lambda x:round(x, 2)).astype('str')
dataTable_rmDup_all['rule_env_max'] = dataTable_rmDup_all['rule_env_max'].apply(lambda x:round(x, 2)).astype('str')
dataTable_rmDup_all['rule_env_range'] = '('+ dataTable_rmDup_all['rule_env_min'] + ',' + dataTable_rmDup_all['rule_env_max'] +')'

In [None]:
## order by pair, property and direction, then renumber the rows from 0
dataTable_rmDup_all = (
    dataTable_rmDup_all
    .sort_values(by=['PairInfo', 'property_name', 'Pair_id'], ascending=[True, True, True])
    .reset_index(drop=True)
)
dataTable_rmDup_all

In [None]:
dataTable_rmDup_all.to_csv(f'./results/Compounds_All_4_informatics_rmDups.csv', index=False)
