# Postprocessing HADDOCK Results

In [1]:
import os
import pandas as pd

## Silence pandas' SettingWithCopyWarning for the whole notebook session
pd.options.mode.chained_assignment = None

In [47]:
## Define paths to results directories
## - results_directory_path: folder holding one sub-folder per experiment
## - subdirectory_path: location of the cluster tables inside each experiment
##   folder (the trailing '/' is required; file names are appended directly)
results_directory_path = "./example_results"
subdirectory_path = 'run1/structures/it1/water/'

## Get list of experiment directories
## os.listdir returns every entry (plain files and hidden folders such as
## .DS_Store or .ipynb_checkpoints included) in arbitrary order, so:
##   - keep only real, non-hidden sub-directories
##   - sort them so the row order of the results table is reproducible
results_directories = sorted(
    entry
    for entry in os.listdir(results_directory_path)
    if not entry.startswith('.')
    and os.path.isdir(os.path.join(results_directory_path, entry))
)

## Create empty DataFrame for metrics
experiment_results_df = pd.DataFrame()

In [48]:
## Loop through experiment folders and grab best cluster metrics

## Column renames per cluster table. pandas renames repeated 'sd' headers to
## 'sd.1', 'sd.2', ...; these maps give them metric-specific names.
ENER_RENAMES = {
    'sd': 'Einter_sd', 'sd.1': 'Enb_sd', 'sd.2': 'Evdw+0.1Eelec_sd',
    'sd.3': 'Evdw_sd', 'sd.4': 'Eelec_sd', 'sd.5': 'Eair_sd',
}
EDESOLV_RENAMES = {'#Edesolv': 'Edesolv', 'sd': 'Edesolv_sd'}
VIOL_RENAMES = {
    '#AIRviol': 'AIRviol', 'sd.1': 'AIRviol_sd',
    '#dihedviol': 'dihedviol', 'sd.2': 'dihedviol_sd',
}
VIOL_COLUMNS = ['AIRviol', 'AIRviol_sd', 'dihedviol', 'dihedviol_sd']
BSA_RENAMES = {'sd': 'BSA_sd'}
HADDOCK_SCORE_RENAMES = {'sd': 'haddock-score_sd'}


def _read_cluster_table(table_path):
    """Read one whitespace-delimited results table into a DataFrame."""
    return pd.read_csv(table_path, delimiter=r"\s+")


def _best_cluster_metrics(table_path, best_cluster, renames, keep=None):
    """Load one metrics table and keep only the best cluster's row(s).

    Parameters
    ----------
    table_path : str
        Path of the whitespace-delimited table; it must have a '#Cluster' column.
    best_cluster : str
        Value of '#Cluster' identifying the cluster to keep.
    renames : dict
        Mapping of original column names to output column names.
    keep : list of str, optional
        Renamed columns to retain (besides '#Cluster'). All columns are kept
        when omitted.

    Returns
    -------
    pandas.DataFrame
        The matching row(s), indexed by '#Cluster' and ready to be joined.
    """
    metrics = _read_cluster_table(table_path)
    metrics = metrics.loc[metrics['#Cluster'] == best_cluster].rename(columns=renames)
    if keep is not None:
        metrics = metrics[['#Cluster', *keep]]
    return metrics.set_index('#Cluster')


## Rows are collected in a list local to this cell and concatenated once at
## the end, so re-running the cell rebuilds the table instead of appending
## duplicate rows to whatever a previous run left behind.
best_cluster_rows = []

for experiment in results_directories:
    experiment_path = f"{results_directory_path}/{experiment}/{subdirectory_path}"

    ## Get best cluster based on lowest van der Waals energy
    vdw_clusters = _read_cluster_table(f'{experiment_path}cluster_ener.txt').sort_values(by=['Evdw'], ascending=True)
    best_cluster = vdw_clusters.iat[0, 0]  # first column of the lowest-Evdw row

    ## Get best PDB file from best cluster (lowest Evdw within that cluster's table)
    cluster_pdbs = _read_cluster_table(f'{experiment_path}{best_cluster}_ener').sort_values(by=['Evdw'], ascending=True)
    best_pdb = cluster_pdbs.iat[0, 0]

    ## Make the row for this experiment; .assign returns a new frame, so no
    ## values are written onto a slice of vdw_clusters
    best_cluster_row = (
        vdw_clusters.loc[vdw_clusters['#Cluster'] == best_cluster]
        .rename(columns=ENER_RENAMES)
        .iloc[[0]]
        .assign(
            experiment_name=experiment,
            best_cluster=best_cluster.replace('file.nam_clust', ''),
            best_pdb_path=f"{experiment_path}{best_pdb}",
        )
        .set_index('#Cluster')
    )

    ### Append all metrics: desolvation energy, restraint violations,
    ### buried surface area and HADDOCK score (joined in this order)
    metric_tables = [
        _best_cluster_metrics(f'{experiment_path}cluster_Edesolv.txt', best_cluster, EDESOLV_RENAMES),
        _best_cluster_metrics(f'{experiment_path}cluster_viol.txt', best_cluster, VIOL_RENAMES, keep=VIOL_COLUMNS),
        _best_cluster_metrics(f'{experiment_path}cluster_bsa.txt', best_cluster, BSA_RENAMES),
        _best_cluster_metrics(f'{experiment_path}cluster_haddock-score.txt', best_cluster, HADDOCK_SCORE_RENAMES),
    ]
    for metric_table in metric_tables:
        best_cluster_row = best_cluster_row.join(metric_table)

    best_cluster_rows.append(best_cluster_row)

## Build the results table in one concat; pd.concat raises on an empty list,
## so fall back to an empty DataFrame when no experiments were found
experiment_results_df = (
    pd.concat(best_cluster_rows, ignore_index=True)
    if best_cluster_rows
    else pd.DataFrame()
)

In [52]:
## Make the output DataFrame prettier

## Columns of interest, in display order: identifiers first, then each
## metric followed by its `_sd` column
columns_of_interest = [
    'experiment_name', 'best_cluster', 'best_pdb_path', 'Nstruc',
    'Evdw+0.1Eelec', 'Evdw+0.1Eelec_sd',
    'Evdw', 'Evdw_sd',
    'Eelec', 'Eelec_sd',
    'Eair', 'Eair_sd',
    'Edesolv', 'Edesolv_sd',
    'AIRviol', 'AIRviol_sd',
    'dihedviol', 'dihedviol_sd',
    'BSA', 'BSA_sd',
    'haddock-score', 'haddock-score_sd',
]
experiment_results_df = experiment_results_df.loc[:, columns_of_interest]

## Show DataFrame
experiment_results_df

Unnamed: 0,experiment_name,best_cluster,best_pdb_path,Nstruc,Evdw+0.1Eelec,Evdw+0.1Eelec_sd,Evdw,Evdw_sd,Eelec,Eelec_sd,...,Edesolv,Edesolv_sd,AIRviol,AIRviol_sd,dihedviol,dihedviol_sd,BSA,BSA_sd,haddock-score,haddock-score_sd
0,SARS-CoV-2-B.1.1.7-N__CCL3,4,./example_results/SARS-CoV-2-B.1.1.7-N__CCL3/r...,5,-97.67,18.76,-76.53,14.92,-211.37,59.69,...,13.32,4.858,47.2,0.0,0.0,0.0,2358.872,311.43,180.798,29.661
1,SARS-CoV-2-B.1.1.7-N__CCL5,1,./example_results/SARS-CoV-2-B.1.1.7-N__CCL5/r...,8,-115.48,11.49,-88.08,11.01,-274.04,33.67,...,0.737,5.024,47.25,0.0,0.0,0.0,2586.364,255.102,150.538,29.638
2,SARS-CoV-2-B.1.1.7-N__CCL7,6,./example_results/SARS-CoV-2-B.1.1.7-N__CCL7/r...,4,-105.72,6.64,-76.4,9.05,-293.23,28.69,...,20.126,1.838,42.25,0.0,0.0,0.0,2682.547,136.492,169.214,19.125
3,SARS-CoV-2-B.1.1.7-N__CXCL12beta,5,./example_results/SARS-CoV-2-B.1.1.7-N__CXCL12...,4,-104.95,7.53,-70.94,4.87,-340.05,32.72,...,7.557,2.745,49.0,0.0,0.0,0.0,2684.883,129.006,157.299,12.184


In [53]:
## Save the results table as CSV in the current working directory
output_csv_path = "experiment_results.csv"
experiment_results_df.to_csv(output_csv_path)