In [2]:
import json
from rdkit import Chem
from rdkit.Chem import rdMolAlign
import pandas as pd

# Load the JSON run summary for target 3amb; later cells index `output` by string key.
with open('../final_results/3amb/output.json', 'rt') as json_file:
    output = json.loads(json_file.read())

# Violations

In [3]:
# Add up the per-key counts stored in the run summary.
sum_violations = sum(output['NumDisplacementViolations'].values())
sum_generated_docking_poses = sum(output['GeneratedPoses'].values())

# Displacement violations as a percentage of all generated poses.
print(f'{sum_violations/sum_generated_docking_poses * 100}% of optimized poses were dropped')
# Label each total explicitly; previously both lines printed the same
# "Total number:" prefix, so the two counts could not be told apart.
print(f"Total number of displacement violations: {sum_violations}")
print(f"Total number of generated docking poses: {sum_generated_docking_poses}")

16.722722858518363% of optimized poses were dropped
Total number: 101081
Total number: 604453


In [5]:
# Average displacement: unweighted mean over the per-key mean values in the run summary.
mean_displacements = output['MeanHydeDisplacementIncludingViolations']
print(f"Mean displacement: {sum(mean_displacements.values())/len(mean_displacements)}")


Mean displacement: 1.3780824898020558


In [6]:
# Parse violations: read every record from violations_SP0.sdf ... violations_SP3.sdf,
# in file order, passing removeHs=False so hydrogens in the files are not stripped.
violations = []
for sp_index in range(4):
    supplier = Chem.SDMolSupplier(f"../final_results/3amb/violations_SP{sp_index}.sdf", removeHs=False)
    for mol in supplier:
        violations.append(mol)

In [7]:
# Pair violations: consecutive entries are grouped two at a time, the entry at the
# even position going to the 'before' column and the following one to 'after'.
pairs = []
for start in range(0, len(violations), 2):
    pairs.append([violations[start], violations[start + 1]])
violations_df = pd.DataFrame(pairs, columns=['before', 'after'])

In [8]:
# Preview the first five paired rows.
violations_df.head(n=5)

Unnamed: 0,before,after
0,<rdkit.Chem.rdchem.Mol object at 0x7fe0f1ed3040>,<rdkit.Chem.rdchem.Mol object at 0x7fe0f1ed3220>
1,<rdkit.Chem.rdchem.Mol object at 0x7fe0f1ed3580>,<rdkit.Chem.rdchem.Mol object at 0x7fe0f1ed33a0>
2,<rdkit.Chem.rdchem.Mol object at 0x7fe0f1ed3400>,<rdkit.Chem.rdchem.Mol object at 0x7fe0f1ed3700>
3,<rdkit.Chem.rdchem.Mol object at 0x7fe0f1ed3820>,<rdkit.Chem.rdchem.Mol object at 0x7fe0f1ed3940>
4,<rdkit.Chem.rdchem.Mol object at 0x7fe0f1ed3880>,<rdkit.Chem.rdchem.Mol object at 0x7fe0f1ed38e0>


In [9]:
def _identity_mapped_rms(row):
    """Return CalcRMS between a row's 'before' and 'after' molecules.

    Atom j of 'before' is mapped onto atom j of 'after' for every atom index
    of the 'before' molecule, and that single mapping is handed to CalcRMS.
    """
    identity_map = [[atom_idx, atom_idx] for atom_idx in range(row.before.GetNumAtoms())]
    return rdMolAlign.CalcRMS(row.before, row.after, map=[identity_map])


violations_df['rmsd'] = violations_df.apply(_identity_mapped_rms, axis=1)

In [10]:
# Order rows by ascending rmsd and renumber the index from 0.
violations_df = (
    violations_df
    .sort_values(by='rmsd')
    .reset_index(drop=True)
)

In [11]:
# Ask pandas for the minimum directly instead of reading index label 0, which is
# only the minimum when the frame has just been sorted and re-indexed.
print(f'Minimum rmsd: {violations_df.rmsd.min()}')

Minimum rmsd: 2.000005361055315


In [12]:
# Ask pandas for the maximum directly instead of reading the last index label,
# which is only the maximum when the frame has just been sorted and re-indexed.
print(f'Maximum rmsd: {violations_df.rmsd.max()}')

Maximum rmsd: 10.118315302617418


In [13]:
def _mean_hyde_affinity(row):
    """Return the midpoint of the lower and upper affinity properties of the 'after' molecule.

    Both properties are read as strings from the molecule and converted to float;
    the property names carry the unit suffix "[nM]".
    """
    lower = float(row.after.GetProp('BIOSOLVEIT.HYDE_ESTIMATED_AFFINITY_LOWER_BOUNDARY [nM]'))
    upper = float(row.after.GetProp('BIOSOLVEIT.HYDE_ESTIMATED_AFFINITY_UPPER_BOUNDARY [nM]'))
    return (lower + upper) / 2


violations_df['binding_affinity'] = violations_df.apply(_mean_hyde_affinity, axis=1)

In [14]:
# Summary statistics of the per-pair binding affinity column.
violations_df['binding_affinity'].describe()

count    1.010810e+05
mean     1.214088e+13
std      1.250293e+15
min      4.429400e-02
25%      4.506207e+04
50%      4.757876e+06
75%      1.620926e+08
max      2.432397e+17
Name: binding_affinity, dtype: float64

In [15]:
# Median of the per-pair binding affinity column.
violations_df['binding_affinity'].median()

4757875.7871915