# Postprocessing Docking Results

In [1]:
import os, shutil
import pandas as pd

## Turn off SettingWithCopyWarning
pd.set_option('mode.chained_assignment', None)

## Parse HADDOCK Results

In [2]:
candidates = pd.read_excel("../../PD1_Candidates.xlsx")
candidates.head()

Unnamed: 0,antibody_id,antibody_portion,antibody_sequence_source,antigen,h_chain_version,l_chain_version,h_chain,l_chain,antibody_pdb_path_original,antibody_pdb_path_renumbered,...,haddock_AIRviol,haddock_AIRviol_sd,haddock_dihedviol,haddock_dihedviol_sd,haddock_BSA,haddock_BSA_sd,haddock-score,haddock-score_sd,haddock_prodigy_deltaG_kcalpermol,haddock_prodigy_dissociation_constant_M
0,TUPPD1-001,Fv,"EvoDiff, dd2169c, MSA_OA_DM_MAXSUB",PD1,1,1,EVQLVKSGAEFKKPNDSLKITCKASGYTFTNTGTNVHWVRQAPLKQ...,ELVPTQAIRSLSLFLSEGLKISCSSSRDIDNSSNINTELGSFHTRP...,./docking/inputs/PDBs/diffused_antibodies/comb...,./docking/inputs/PDBs/diffused_antibodies/comb...,...,,,,,,,,,,
1,TUPPD1-002,Fv,"EvoDiff, dd2169c, MSA_OA_DM_MAXSUB",PD1,1,2,EVQLVKSGAEFKKPNDSLKITCKASGYTFTNTGTNVHWVRQAPLKQ...,DIIMTNTPTNLYVSPGESICIICRSSKSGFDGNLVHTYLKWYLQHP...,./docking/inputs/PDBs/diffused_antibodies/comb...,./docking/inputs/PDBs/diffused_antibodies/comb...,...,,,,,,,,,,
2,TUPPD1-003,Fv,"EvoDiff, dd2169c, MSA_OA_DM_MAXSUB",PD1,1,3,EVQLVKSGAEFKKPNDSLKITCKASGYTFTNTGTNVHWVRQAPLKQ...,EIVMTQSPASLPVSPGERATVTCRASESVSDSANGRGWLAWLWQKP...,./docking/inputs/PDBs/diffused_antibodies/comb...,./docking/inputs/PDBs/diffused_antibodies/comb...,...,,,,,,,,,,
3,TUPPD1-004,Fv,"EvoDiff, dd2169c, MSA_OA_DM_MAXSUB",PD1,2,1,QISLVESGPVLVKPNEQLKVACKTSGFTLSSYGTVLNWVRQAPGQR...,ELVPTQAIRSLSLFLSEGLKISCSSSRDIDNSSNINTELGSFHTRP...,./docking/inputs/PDBs/diffused_antibodies/comb...,./docking/inputs/PDBs/diffused_antibodies/comb...,...,,,,,,,,,,
4,TUPPD1-005,Fv,"EvoDiff, dd2169c, MSA_OA_DM_MAXSUB",PD1,2,2,QISLVESGPVLVKPNEQLKVACKTSGFTLSSYGTVLNWVRQAPGQR...,DIIMTNTPTNLYVSPGESICIICRSSKSGFDGNLVHTYLKWYLQHP...,./docking/inputs/PDBs/diffused_antibodies/comb...,./docking/inputs/PDBs/diffused_antibodies/comb...,...,,,,,,,,,,


In [4]:
## Define paths to results directories
results_directory_path = "../../docking/inputs/experiments"
subdirectory_path = 'run1/structures/it1/water/'

In [9]:
## Create empty DataFrame for metrics
experiment_results_df = pd.DataFrame()

missing_experiments = []

## Loop through experiment folders and grab best cluster metrics
for index, candidate in candidates.iterrows():
    candidate_id = candidate['antibody_id']
    experiment_path = f"{results_directory_path}/{candidate_id}/{subdirectory_path}"

    ## Get best cluster based on lowest van der Waals energy
    try:
        vdw_clusters = pd.read_csv(f'{experiment_path}cluster_ener.txt', delimiter=r"\s+").sort_values(by=['Evdw'], ascending = True)
        best_cluster = vdw_clusters.iat[0, 0]
        vdw_clusters = vdw_clusters.loc[vdw_clusters['#Cluster'] == best_cluster]
        vdw_clusters = vdw_clusters.rename(columns={'sd': 'Einter_sd', 'sd.1': 'Enb_sd', 'sd.2': 'Evdw+0.1Eelec_sd', 'sd.3': 'Evdw_sd', 'sd.4': 'Eelec_sd', 'sd.5': 'Eair_sd'})

        ## Get Desolvation Energy metric
        de_clusters = pd.read_csv(f'{experiment_path}cluster_Edesolv.txt', delimiter=r"\s+")#.sort_values(by=['#Edesolv'], ascending = True)
        de_clusters = de_clusters.loc[de_clusters['#Cluster'] == best_cluster]
        de_clusters = de_clusters.rename(columns={'#Edesolv': 'Edesolv', 'sd': 'Edesolv_sd'})

        ## Get Restraints Violation Energy metric
        rve_clusters = pd.read_csv(f'{experiment_path}cluster_viol.txt', delimiter=r"\s+")#.sort_values(by=['#AIRviol'], ascending = True)
        rve_clusters = rve_clusters.loc[rve_clusters['#Cluster'] == best_cluster]
        rve_clusters = rve_clusters.rename(columns={'#AIRviol': 'AIRviol', 'sd.1': 'AIRviol_sd', '#dihedviol': 'dihedviol', 'sd.2': 'dihedviol_sd'})
        rve_clusters = rve_clusters[['#Cluster', 'AIRviol', 'AIRviol_sd', 'dihedviol', 'dihedviol_sd']]

        ## Get Buried Surface Area metric
        bsa_clusters = pd.read_csv(f'{experiment_path}cluster_bsa.txt', delimiter=r"\s+")#.sort_values(by=['BSA'], ascending = True)
        bsa_clusters = bsa_clusters.loc[bsa_clusters['#Cluster'] == best_cluster]
        bsa_clusters = bsa_clusters.rename(columns={'sd': 'BSA_sd'})

        ## Get HADDOCK Score metric
        had_clusters = pd.read_csv(f'{experiment_path}cluster_haddock-score.txt', delimiter=r"\s+")#.sort_values(by=['haddock-score'], ascending = True)
        had_clusters = had_clusters.loc[had_clusters['#Cluster'] == best_cluster]
        had_clusters = had_clusters.rename(columns={'sd': 'haddock-score_sd'})

        ## Get best PDB file from best cluster
        cluster_pdbs = pd.read_csv(f'{experiment_path}{best_cluster}_ener', delimiter=r"\s+").sort_values(by=['Evdw'], ascending = True)
        best_pdb = cluster_pdbs.iat[0, 0]

        ## Make Pandas row and append to results DataFrame
        best_cluster_row = vdw_clusters.iloc[[0]]

        best_cluster_row['antibody_id'] = candidate_id
        best_cluster_row['best_cluster'] = f"Cluster {best_cluster.replace('file.nam_clust', '')}"
        best_cluster_row['best_pdb_path'] = f"{experiment_path}{best_pdb}"

        ### Append all metrics
        best_cluster_row = best_cluster_row.set_index('#Cluster').join(de_clusters.set_index('#Cluster'))
        best_cluster_row = best_cluster_row.join(rve_clusters.set_index('#Cluster'))
        best_cluster_row = best_cluster_row.join(bsa_clusters.set_index('#Cluster'))
        best_cluster_row = best_cluster_row.join(had_clusters.set_index('#Cluster'))

        experiment_results_df = pd.concat([experiment_results_df, best_cluster_row], ignore_index=True)
        
    except Exception:
        missing_experiments.append(candidate_id)
        pass
    
## Check for missing experiments
print(f"There are {len(missing_experiments)} experiments with missing files.")

There are 0 experiments with missing files.


In [10]:
## Make the output DataFrame prettier

## Subset columns to those of interest
# list(experiment_results_df.columns)
experiment_results_df = experiment_results_df[[
    'antibody_id',
    'best_cluster','best_pdb_path', 'Nstruc',
    'Evdw+0.1Eelec', 'Evdw+0.1Eelec_sd', 'Evdw', 'Evdw_sd',
    'Eelec', 'Eelec_sd',
    'Eair', 'Eair_sd',
    'Edesolv', 'Edesolv_sd',
    'AIRviol', 'AIRviol_sd',
    'dihedviol', 'dihedviol_sd',
    'BSA', 'BSA_sd',
    'haddock-score', 'haddock-score_sd'
    ]]

## Show DataFrame
experiment_results_df

Unnamed: 0,antibody_id,best_cluster,best_pdb_path,Nstruc,Evdw+0.1Eelec,Evdw+0.1Eelec_sd,Evdw,Evdw_sd,Eelec,Eelec_sd,...,Edesolv,Edesolv_sd,AIRviol,AIRviol_sd,dihedviol,dihedviol_sd,BSA,BSA_sd,haddock-score,haddock-score_sd
0,TUPPD1-001,Cluster 2,../../docking/inputs/experiments/TUPPD1-001/ru...,13,-77.05,3.59,-60.93,3.61,-161.23,7.4,...,-11.06,1.157,24.15,0.0,0.0,0.0,1818.277,23.655,-3.533,6.467
1,TUPPD1-002,Cluster 1,../../docking/inputs/experiments/TUPPD1-002/ru...,5,-11.09,4.6,-8.42,3.31,-26.73,17.36,...,-5.905,3.138,31.4,0.0,0.0,0.0,404.161,83.706,236.927,39.146
2,TUPPD1-003,Cluster 4,../../docking/inputs/experiments/TUPPD1-003/ru...,7,-97.42,9.7,-83.0,8.12,-144.18,40.03,...,-7.729,2.201,33.86,0.0,0.0,0.0,2367.807,147.604,169.392,25.107
3,TUPPD1-004,Cluster 8,../../docking/inputs/experiments/TUPPD1-004/ru...,4,-82.37,6.71,-63.0,5.28,-193.71,17.82,...,16.718,0.487,21.0,0.0,0.0,0.0,2136.385,41.838,1.296,14.224
4,TUPPD1-005,Cluster 2,../../docking/inputs/experiments/TUPPD1-005/ru...,5,-51.44,6.49,-38.18,7.11,-132.61,9.33,...,-4.699,1.445,26.2,0.0,0.0,0.0,1619.176,32.94,38.75,8.167
5,TUPPD1-006,Cluster 11,../../docking/inputs/experiments/TUPPD1-006/ru...,4,-61.84,13.63,-44.85,11.45,-169.83,66.66,...,1.52,2.059,24.25,0.0,0.0,0.0,1768.843,212.841,25.782,17.354
6,TUPPD1-007,Cluster 6,../../docking/inputs/experiments/TUPPD1-007/ru...,4,-61.67,10.13,-45.68,6.72,-159.88,42.67,...,10.043,3.016,25.5,0.0,0.0,0.0,1781.165,98.063,43.497,15.577
7,TUPPD1-008,Cluster 6,../../docking/inputs/experiments/TUPPD1-008/ru...,4,-46.31,7.26,-36.14,7.27,-101.72,3.97,...,-7.437,1.62,25.5,0.0,0.0,0.0,1624.548,98.11,38.993,11.525
8,TUPPD1-009,Cluster 1,../../docking/inputs/experiments/TUPPD1-009/ru...,10,-55.75,8.04,-40.77,5.09,-149.8,39.79,...,0.708,1.728,24.1,0.0,0.0,0.0,1734.141,86.495,25.305,15.568
9,Acrixolimab,Cluster 2,../../docking/inputs/experiments/Acrixolimab/r...,25,-88.11,10.49,-67.25,11.08,-208.64,36.61,...,-3.568,4.621,13.8,0.0,0.0,0.0,2631.959,213.109,-68.178,18.362


## Generate PRODIGY Predictions

Note: You must have PRODIGY installed.
```sh
git clone https://github.com/haddocking/prodigy
pip install prodigy/.
```

In [13]:
candidates = pd.read_excel("../../PD1_Candidates.xlsx")
candidates.head()

Unnamed: 0,antibody_id,antibody_portion,antibody_sequence_source,antigen,h_chain_version,l_chain_version,h_chain,l_chain,antibody_pdb_path_original,antibody_pdb_path_renumbered,...,haddock_AIRviol,haddock_AIRviol_sd,haddock_dihedviol,haddock_dihedviol_sd,haddock_BSA,haddock_BSA_sd,haddock-score,haddock-score_sd,haddock_prodigy_deltaG_kcalpermol,haddock_prodigy_dissociation_constant_M
0,TUPPD1-001,Fv,"EvoDiff, dd2169c, MSA_OA_DM_MAXSUB",PD1,1,1,EVQLVKSGAEFKKPNDSLKITCKASGYTFTNTGTNVHWVRQAPLKQ...,ELVPTQAIRSLSLFLSEGLKISCSSSRDIDNSSNINTELGSFHTRP...,./docking/inputs/PDBs/diffused_antibodies/comb...,./docking/inputs/PDBs/diffused_antibodies/comb...,...,24.15,1.41,0,0,1818.277,23.655,-3.533,6.467,,
1,TUPPD1-002,Fv,"EvoDiff, dd2169c, MSA_OA_DM_MAXSUB",PD1,1,2,EVQLVKSGAEFKKPNDSLKITCKASGYTFTNTGTNVHWVRQAPLKQ...,DIIMTNTPTNLYVSPGESICIICRSSKSGFDGNLVHTYLKWYLQHP...,./docking/inputs/PDBs/diffused_antibodies/comb...,./docking/inputs/PDBs/diffused_antibodies/comb...,...,24.333333,4.189935,0,0,1654.57,311.075,66.170873,87.999296,,
2,TUPPD1-003,Fv,"EvoDiff, dd2169c, MSA_OA_DM_MAXSUB",PD1,1,3,EVQLVKSGAEFKKPNDSLKITCKASGYTFTNTGTNVHWVRQAPLKQ...,EIVMTQSPASLPVSPGERATVTCRASESVSDSANGRGWLAWLWQKP...,./docking/inputs/PDBs/diffused_antibodies/comb...,./docking/inputs/PDBs/diffused_antibodies/comb...,...,26.8,2.56,0,0,1642.766,26.642,28.774,10.272,,
3,TUPPD1-004,Fv,"EvoDiff, dd2169c, MSA_OA_DM_MAXSUB",PD1,2,1,QISLVESGPVLVKPNEQLKVACKTSGFTLSSYGTVLNWVRQAPGQR...,ELVPTQAIRSLSLFLSEGLKISCSSSRDIDNSSNINTELGSFHTRP...,./docking/inputs/PDBs/diffused_antibodies/comb...,./docking/inputs/PDBs/diffused_antibodies/comb...,...,24.3,2.72,0,0,42.563,15.062,42.563,15.062,,
4,TUPPD1-005,Fv,"EvoDiff, dd2169c, MSA_OA_DM_MAXSUB",PD1,2,2,QISLVESGPVLVKPNEQLKVACKTSGFTLSSYGTVLNWVRQAPGQR...,DIIMTNTPTNLYVSPGESICIICRSSKSGFDGNLVHTYLKWYLQHP...,./docking/inputs/PDBs/diffused_antibodies/comb...,./docking/inputs/PDBs/diffused_antibodies/comb...,...,24.666667,2.054805,0,0,1778.14,84.603261,39.039853,13.968612,,


In [21]:
## Create empty DataFrame for PRODIGY results
prodigy_results_df = pd.DataFrame()

## To run on local machine (Windows)
local_base_path = "C:\\Users\\Colby\\Documents\\GitHub\\PD1_Fab_Diffusion\\"

for index, candidate in candidates.iterrows():
    ## Create full path to best PDB file
    best_pdb_path = candidate['haddock_best_pdb_path'].replace('../../', '').replace('/', '\\')
    pdb_path = f'{local_base_path}{best_pdb_path}'

    print(f"Running PRODIGY on {pdb_path}")

    ## Run PRODIGY and parse stdout
    prodigy_output = os.popen(f'prodigy {pdb_path}').read()
    prodigy_output_lines = prodigy_output.split('\n')
    predicted_binding_affinity = float(prodigy_output_lines[-3].split(':')[1].replace(' ', ''))
    predicted_dissociation_constant = float(prodigy_output_lines[-2].split(':')[1].replace(' ', ''))

    ## Add values to DataFrame
    prodigy_results_df.loc[index, 'antibody_id'] = candidate['antibody_id']
    prodigy_results_df.loc[index, 'prodigy_deltaG_kcalpermol'] = predicted_binding_affinity
    prodigy_results_df.loc[index, 'prodigy_dissociation_constant_M'] = predicted_dissociation_constant

prodigy_results_df

Unnamed: 0,antibody_id,prodigy_deltaG_kcalpermol,prodigy_dissociation_constant_M
0,TUPPD1-001,-10.9,1e-08
1,TUPPD1-002,-12.0,1.6e-09
2,TUPPD1-003,-12.0,1.6e-09
3,TUPPD1-004,-8.8,3.3e-07
4,TUPPD1-005,-10.5,1.8e-08
5,TUPPD1-006,-11.1,7.7e-09
6,TUPPD1-007,-10.9,1.1e-08
7,TUPPD1-008,-9.0,2.4e-07
8,TUPPD1-009,-12.5,6.8e-10
9,Acrixolimab,-13.1,2.5e-10


In [22]:
## Write out results DataFrame
# prodigy_results_df.to_csv("prodigy_results.csv", index=False)

In [24]:
## Grab Best PDBs
for index, candidate in candidates.iterrows():
    best_pdb_path = candidate['haddock_best_pdb_path']
    shutil.copy(best_pdb_path, '../../docking/outputs/best_pdbs')