In [1]:
import os
from utils import *

# Set the working directory and the root paths used by the rest of the notebook.
# The project checkout location is machine-specific, so it can be overridden via
# the HDXRANK_PROJECT_DIR environment variable. When the variable is unset the
# original development path is used, so existing behaviour is unchanged.
os.chdir(os.environ.get('HDXRANK_PROJECT_DIR', '/home/lwang/ProteinComplex_HDX_prediction'))
# Both values are relative, so they are resolved against the directory set above.
root_dir = './example'
task_folder = '1UGH_docking'

# Path of the task configuration file.
# TODO: config_path should become an input parameter instead of being derived here.
config_path = f'{root_dir}/{task_folder}/state.json'

In [None]:
import os
from utils import *

# Parse the task configuration into the settings mapping and the apo/complex
# state lists.
settings, apo_states, complex_states = parse_config(config_path)

# --- Values read from the settings mapping ---
# Note: root_dir and task_folder are re-bound here to the configured values.
root_dir, task_folder = settings['root_dir'], settings['task_folder']
protein_name = settings['protein_name']
cluster_id, pred_cluster = settings['cluster_id'], settings['pred_cluster']
N_decoys = settings['N_decoys']
timepoints = settings['timepoints']
mapping = settings['chain_mapping']

# Source HDX file, located inside the task folder.
HDX_fpath = f'{root_dir}/{task_folder}/{settings["HDX_file"]}'

# Structure file names for decoys 1..N_decoys: all REVISED models first,
# followed by all NATIVE models.
global_list = [f'MODEL_{i}_REVISED.pdb' for i in range(1, N_decoys + 1)]
native_list = [f'MODEL_{i}_NATIVE.pdb' for i in range(1, N_decoys + 1)]
model_list = [*global_list, *native_list]

# Batch names in the same order as model_list, without the '.pdb' extension.
complex_batch_list = [
    *(f'MODEL_{i}_REVISED' for i in range(1, N_decoys + 1)),
    *(f'MODEL_{i}_NATIVE' for i in range(1, N_decoys + 1)),
]

# Column labels for prediction tables.
pred_labels = [
    'Batch',
    'Y_Pred_short',
    'Y_Pred_middle',
    'Y_Pred_long',
    'Chain',
    'Range',
]

In [2]:
# Read the apo and complex state definitions from the task's state.json and
# echo them, one list per line.
apo_states, complex_states = parse_states_from_json(f'{root_dir}/{task_folder}/state.json')
print(apo_states, complex_states, sep='\n')

# Walk the states pairwise (apo[i] with complex[i]). For every pair keep the
# first value returned by get_true_diff and the second value returned by
# get_hdx_epitopes; the other return values are discarded.
hdx_true_diffs, hdx_epitope_peps = [], []
for apo, complex_ in zip(apo_states, complex_states):
    print(apo, complex_)
    true_diff, _ = get_true_diff(
        HDX_fpath, apo, complex_, cluster_id, timepoints
    )
    _, epitope_pep = get_hdx_epitopes(true_diff)
    hdx_true_diffs += [true_diff]
    hdx_epitope_peps += [epitope_pep]

[('hUNG', 'apo', -9, '1AKZ'), ('UGI', 'apo', 0, '1UGI')]
[('hUNG-UGI', 'complex', -9, '1UGH'), ('UGI-hUNG', 'complex', 0, '1UGH')]
('hUNG', 'apo', -9, '1AKZ') ('hUNG-UGI', 'complex', -9, '1UGH')
Common peptides num: 69
('UGI', 'apo', 0, '1UGI') ('UGI-hUNG', 'complex', 0, '1UGH')
Common peptides num: 57


In [3]:
# Directory holding the prediction files, and the suffix handed to
# parse_predictions (presumably used to select files; verify in utils).
pred_dir = f'{root_dir}/{task_folder}/predictions'
pred_suffix = 'prediction_HDXRank_global'
pred_df = parse_predictions(pred_dir, suffix=pred_suffix)

# Preview the first rows, then report the number of distinct 'Batch' values
# (missing values, if any, are counted as a value, as with len(unique())).
print(pred_df.head())
print(pred_df['Batch'].nunique(dropna=False))

  Batch  Y_Pred_short  Y_Pred_middle  Y_Pred_long  Chain    Range
0  1AKZ      0.267078       0.379798     0.522689      0    90-93
1  1AKZ      0.292748       0.424624     0.575526      0    96-99
2  1AKZ      0.289542       0.418995     0.609270      0  101-106
3  1AKZ      0.285510       0.416634     0.680649      0  102-117
4  1AKZ      0.249326       0.385646     0.669758      0  102-118
1002


In [4]:
# Split the predictions into one DataFrame per 'Batch' value so that each
# batch's rows can be looked up directly inside the scoring loop.
pred_df_dict = dict(iter(pred_df.groupby('Batch')))

# Only batches whose name contains 'MODEL' are scored.
complex_batch_list = [batch for batch in pred_df_dict if 'MODEL' in batch]

HDX_scores = {}
y_true_list, y_pred_list = [], []
for complex_batch in tqdm(complex_batch_list):
    y_true, y_pred = prepare_data(
        pred_df_dict,
        complex_batch,
        apo_states,
        hdx_true_diffs,
        hdx_epitope_peps=hdx_epitope_peps,
        pred_cluster=pred_cluster,
    )
    # Batches for which prepare_data returns y_true=None are left out entirely.
    if y_true is not None:
        y_true_list.append(y_true)
        y_pred_list.append(y_pred)
        # The score of a batch is the RMSE between its true and predicted values.
        HDX_scores[complex_batch] = {
            'batch': complex_batch,
            'HDXRank_score': root_mean_square_error(y_true, y_pred),
        }


  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [00:04<00:00, 212.63it/s]


In [5]:
# Build the score table with one row per scored batch, in insertion order.
# The frame is constructed from the per-batch record dicts rather than by
# transposing a dict-of-dicts: the transpose mixes strings and numbers in each
# column and leaves 'HDXRank_score' as object dtype, whereas records let pandas
# infer a proper dtype per column. Passing `columns` keeps both columns present
# even when no batch was scored, so the sort below still works on an empty table.
score_df = pd.DataFrame(list(HDX_scores.values()), columns=['batch', 'HDXRank_score'])
# Lowest HDXRank_score first.
score_df = score_df.sort_values(by='HDXRank_score', ascending=True)
display(score_df)

Unnamed: 0,batch,HDXRank_score
111,MODEL_1_REVISED,0.148521
447,MODEL_502_REVISED,0.153306
258,MODEL_332_REVISED,0.15348
577,MODEL_61_REVISED,0.153938
222,MODEL_2_REVISED,0.15433
...,...,...
607,MODEL_647_REVISED,0.205552
959,MODEL_964_REVISED,0.207597
926,MODEL_934_REVISED,0.208801
459,MODEL_513_REVISED,0.209768
