In [None]:
import pandas as pd
import os

pred_method = "af2"  ## method

# All locations are resolved to absolute paths here because later cells call
# os.chdir(result_path); a relative path would point somewhere else after that.
folder_path = os.path.abspath(os.path.expanduser(f"{pred_method}"))
## file hierarchy should be like this
# af2
# ├── 7ar0_B_A_af2
# │   ├── 7ar0_B_A_af2.a3m
# │   ├── 7ar0_B_A_af2.png
# │   ├── 7ar0_B_A_af2_unrelaxed_rank_00*.pdb   <- read by the DockQ and pDockQ2 cells
# │   ├── 7ar0_B_A_af2_scores_rank_00*.json     <- read by the pDockQ2 cell
# │   ├── log.txt                               <- read by the log-metrics cell
# │   ├── ...
# ├── 7bnv_H_L_A_af2
# ├── ...

# One entry per prediction sub-folder; hidden entries (".ipynb_checkpoints", ...)
# are ignored. Sorted so that runs are reproducible.
complex_list = sorted(
    f for f in os.listdir(folder_path)
    if os.path.isdir(os.path.join(folder_path, f)) and not f.startswith(".")
)
print(complex_list)  # should print prediction folders ['7ar0_B_A_af2', '...', '...']

original_directory = os.path.abspath(os.path.expanduser("path/to/native_PDBs"))  ## Native PDB directory

## create results folder
result_path = os.path.abspath(os.path.expanduser("path/to/results_folder"))
os.makedirs(result_path, exist_ok=True)  # later cells chdir into / write to this folder
dockq_output = f"{pred_method}_dockq_fnat_scores.csv"  ## dockq output
pdockq_output = f"{pred_method}_pdockq2_fit.csv"  ## pdockq2 output
combined = f"{pred_method}_combined.csv"  ### combined results with dockq_pdock2 and model scores

In [None]:
## run dockq function
import os
import sys
import csv
import glob
from DockQ.DockQ import load_PDB, run_on_all_native_interfaces
from statistics import mean

def merge_chains(model, chains_to_merge):
    """
    Merges specified chains in the given model (in place).

    Every residue of ``chains_to_merge[1:]`` is moved into the first listed
    chain, the emptied chains are detached, and the surviving chain is renamed
    to the concatenation of all merged chain IDs (e.g. ``["H", "L"]`` -> ``"HL"``).

    Parameters
    ----------
    model : Bio.PDB.Structure
        The model in which the chains are to be merged.
    chains_to_merge : list of str
        The list of chain IDs to be merged. The first entry is the chain that
        receives the residues of all the others.

    Returns
    -------
    model : Bio.PDB.Structure
        The same model object, with the specified chains merged.
    """
    print(f"Merging chains {chains_to_merge} in model")
    target_id = chains_to_merge[0]
    for chain in chains_to_merge[1:]:
        # list(...) takes a snapshot: residue ids are rewritten while looping.
        for res in list(model[chain]):
            # Tag the residue with its *source* chain in the first id field.
            # Using the target chain id here would give residues coming from
            # two different source chains identical ids whenever their
            # numbering overlaps.
            res.id = (chain, res.id[1], res.id[2])
            model[target_id].add(res)
        model.detach_child(chain)
    model[target_id].id = "".join(chains_to_merge)
    return model

def calculate_dockq(model, native, chain_map):
    """
    Score ``model`` against ``native`` with DockQ for one chain mapping.

    Thin logging wrapper around ``run_on_all_native_interfaces``.

    Parameters
    ----------
    model : Bio.PDB.Structure
        The predicted structure.
    native : Bio.PDB.Structure
        The reference structure.
    chain_map : dict
        Chain mapping handed to DockQ as ``chain_map``; the callers in this
        file build it as ``{native_chain_id: model_chain_id}``.

    Returns
    -------
    results : dict
        First element returned by ``run_on_all_native_interfaces``.
    dockq_score : float
        Second element returned by ``run_on_all_native_interfaces``.
    """
    print(f"Calculating DockQ score with chain_map: {chain_map}")
    per_interface, total_score = run_on_all_native_interfaces(
        model, native, chain_map=chain_map
    )
    return per_interface, total_score

def process_models(models):
    """
    Processes the provided models and calculates DockQ scores.

    Models with three chains have their first two chains merged (in both the
    model and the native) so that a single interface is scored against the
    third chain. Models with two chains are scored directly. Chains are paired
    with native chains by position. Anything that cannot be scored is reported
    and skipped rather than aborting the whole batch.

    Parameters
    ----------
    models : list of tuple
        List of tuples where each tuple contains the model file path and native file path.

    Returns
    -------
    results_list : list of tuple
        One ``(model_id, DockQ, fnat, iRMSD, LRMSD, F1)`` tuple per scored
        model, taken from the first interface DockQ reports.
    """
    results_list = []
    for model_file, native_file in models:
        print(f"Processing model: {model_file}, native: {native_file}")
        model_id = os.path.basename(model_file).split(".")[0]
        model = load_PDB(model_file)
        native = load_PDB(native_file)

        chain_ids = list(model.child_dict.keys())
        print(chain_ids)
        native_chain_ids = list(native.child_dict.keys())

        if len(chain_ids) not in (2, 3):
            print(f"Model {model_id} does not have 2 or 3 chains: {chain_ids}, skipping.")
            continue
        # Chains are paired by position, so the native needs at least as many
        # chains as the model; otherwise the indexing below would raise.
        if len(native_chain_ids) < len(chain_ids):
            print(f"Native for {model_id} has fewer chains ({native_chain_ids}) "
                  f"than the model ({chain_ids}), skipping.")
            continue

        if len(chain_ids) == 3:
            print(f"Model {model_id} has 3 chains: {chain_ids}")
            model = merge_chains(model, chain_ids[:2])
            print(model)
            native = merge_chains(native, native_chain_ids[:2])
            chain_map = {
                native_chain_ids[2]: chain_ids[2],
                "".join(native_chain_ids[:2]): "".join(chain_ids[:2]),
            }
        else:
            print(f"Model {model_id} has 2 chains: {chain_ids}")
            chain_map = {
                native_chain_ids[0]: chain_ids[0],
                native_chain_ids[1]: chain_ids[1],
            }

        results, _ = calculate_dockq(model, native, chain_map)
        if not results:
            print(f"DockQ returned no interface for {model_id}, skipping.")
            continue

        # Only the first reported interface is kept.
        first = next(iter(results.values()))
        results_list.append((model_id, first['DockQ'], first['fnat'],
                             first['iRMSD'], first['LRMSD'], first['F1']))
    return results_list

def save_results_to_csv(results, filename):
    """
    Write DockQ result tuples to a CSV file, echoing each row to stdout.

    Parameters
    ----------
    results : iterable of tuple
        ``(model_id, DockQ, fnat, iRMSD, LRMSD, F1)`` tuples.
    filename : str
        Destination path; an existing file is overwritten.
    """
    print(f"Saving results to CSV file: {filename}")
    header = ['model_id', 'DockQ', 'fnat', "iRMSD", "LRMSD", "F1"]
    with open(filename, mode='w', newline='') as handle:
        out = csv.writer(handle)
        out.writerow(header)

        for record in results:
            model_id, dockq, fnat, irms, lrms, f1 = record
            print(f"Writing row: {model_id}, {dockq}, {fnat}, {irms},{lrms},{f1}")
            out.writerow([model_id, dockq, fnat, irms, lrms, f1])

def main(directory, original_directory):
    """
    Main function to find PDB files, process models, and save results to a CSV file.

    Every ``*_unrelaxed_rank_00*.pdb`` file in ``directory`` is paired with the
    native ``<prefix>.pdb`` in ``original_directory``, where ``<prefix>`` is
    the model file name up to ``_unrelaxed`` with its last ``_``-separated
    token removed. The pairs are scored and written to
    ``<prefix>_dockq_fnat_scores.csv`` inside the module-level ``result_path``.

    Side effect: the working directory is changed to ``result_path``.

    Parameters
    ----------
    directory : str
        Path to the directory containing PDB files to be analyzed.
    original_directory : str
        Path to the directory containing original PDB files for comparison.

    Returns
    -------
    None
    """
    # Resolve to absolute paths *before* the os.chdir() below: the globbed
    # file names are opened after the directory change, so relative names
    # would no longer point at the files that were found.
    directory = os.path.abspath(directory)
    original_directory = os.path.abspath(original_directory)

    pdb_files = sorted(glob.glob(f"{directory}/*_unrelaxed_rank_00*.pdb"))
    if not pdb_files:
        print(f"No PDB files found in directory: {directory}")
        return

    print(f"Found PDB files: {pdb_files}")
    models = []

    for pdb_file in pdb_files:
        pdb_id_chains = os.path.basename(pdb_file).split('_unrelaxed')[0]  # for af2.3
        # Drop the trailing token of the prediction name to get the native file stem.
        pdb_id_chains = "_".join(pdb_id_chains.split("_")[:-1])
        print(f"Searching for original files for: {pdb_id_chains}")
        original_pdb_files = glob.glob(f"{original_directory}/{pdb_id_chains}.pdb")  # Find matching original PDB files
        print(f"Found original PDB files: {original_pdb_files} for {pdb_file}")
        if not original_pdb_files:
            print(f"No matching original PDB file found for {pdb_file}, skipping.")
            continue
        for original_pdb_file in original_pdb_files:
            models.append((pdb_file, original_pdb_file))
            print(f"Adding model-native pair: {pdb_file}, {original_pdb_file}")

    if not models:
        print("No valid model-native pairs found. Exiting.")
        return

    # Kept for compatibility: the cell that combines the per-complex CSVs
    # writes its output relative to the working directory.
    os.chdir(result_path)
    results = process_models(models)
    # The output name uses the prefix of the last file seen in the loop above.
    save_results_to_csv(results, str(pdb_id_chains)+'_dockq_fnat_scores.csv')

In [None]:
# Set directory paths
import pandas as pd
import glob
import os



# main() calls os.chdir(result_path). Resolve every configured location to an
# absolute path once, up front, so that relative settings still point at the
# same place for the second and later complexes (and for the cells below).
folder_path = os.path.abspath(os.path.expanduser(folder_path))
original_directory = os.path.abspath(os.path.expanduser(original_directory))
result_path = os.path.abspath(os.path.expanduser(result_path))

for i in complex_list:
    fpath = f"{folder_path}/{i}"
    directory = fpath
    main(directory, original_directory)

def combine_csv_files(result_path, output_file=None, pattern="*dockq_fnat_scores.csv"):
    """
    Concatenate the per-complex CSV files found in ``result_path``.

    Parameters
    ----------
    result_path : str
        Folder that holds the per-complex CSV files.
    output_file : str, optional
        If given, the combined table is also written to this path.
    pattern : str, optional
        Glob pattern (relative to ``result_path``) selecting the input files.

    Returns
    -------
    combined_df : pandas.DataFrame
        All input rows stacked with a fresh integer index.

    Raises
    ------
    ValueError
        If no input file matches ``pattern``.
    """
    # Sorted so the row order does not depend on directory listing order.
    csv_files = sorted(glob.glob(os.path.join(result_path, pattern)))

    # The combined file usually matches the same pattern; leave it out so that
    # re-running the cell does not feed a previous output back in as input.
    if output_file:
        output_abs = os.path.abspath(output_file)
        csv_files = [f for f in csv_files if os.path.abspath(f) != output_abs]

    if not csv_files:
        raise ValueError(f"No CSV files matching {pattern!r} found in {result_path}")

    # Read and combine all CSVs
    combined_df = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)

    # Save to CSV if an output file is provided
    if output_file:
        combined_df.to_csv(output_file, index=False)
        print(f"Combined CSV saved to {output_file}")
    return combined_df
# Write the combined table explicitly into result_path (the final cell reads it
# from there) instead of relying on whatever the working directory happens to be.
combined_df = combine_csv_files(result_path, os.path.join(result_path, dockq_output))

In [None]:
### pdockq2 functions

from Bio.PDB import PDBIO
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.Selection import unfold_entities
import numpy as np
import sys,os
import argparse
import pickle
import itertools
import pandas as pd
from scipy.optimize import curve_fit

def retrieve_IFplddt(structure, chain1, chain2_lst, max_dist):
    """
    Average the B-factor of ``chain1`` atoms that sit at an interface.

    For every residue pair (one from ``chain1``, one from a chain in
    ``chain2_lst``) the CA-CA distance is used when both residues have a CA
    atom, otherwise the CB-CB distance when both have a CB atom. Each pair
    within ``max_dist`` contributes the B-factor of the ``chain1`` atom once,
    so a residue with several contacts is counted several times.

    Parameters
    ----------
    structure : Bio.PDB structure
        Only the first model (``structure[0]``) is read.
    chain1 : str
        ID of the chain whose B-factors are collected.
    chain2_lst : list of str
        IDs of the candidate partner chains.
    max_dist : float
        Contact distance cut-off.

    Returns
    -------
    IF_plddt_avg : float
        Mean of the collected B-factors, or ``0`` when there is no contact.
    contact_chain_lst : list of str
        Sorted IDs of the partner chains with at least one contact.
    """
    model = structure[0]
    interface_bfactors = []
    partners = set()
    for res1 in model[chain1]:
        for chain2 in chain2_lst:
            hits = 0
            for res2 in model[chain2]:
                if res1.has_id('CA') and res2.has_id('CA'):
                    atom_name = 'CA'
                elif res1.has_id('CB') and res2.has_id('CB'):
                    atom_name = 'CB'
                else:
                    continue
                ## add criteria to filter out disorder res
                if abs(res1[atom_name] - res2[atom_name]) <= max_dist:
                    interface_bfactors.append(res1[atom_name].get_bfactor())
                    hits += 1
            if hits > 0:
                partners.add(chain2)
    contact_chain_lst = sorted(partners)
    IF_plddt_avg = np.mean(interface_bfactors) if interface_bfactors else 0
    return IF_plddt_avg, contact_chain_lst


def retrieve_IFPAEinter(structure, paeMat, contact_lst, max_dist):
    """
    Compute one normalised interface-PAE score per chain.

    For every chain, the PAE values of all residue pairs (chain residue,
    residue of a contacting chain) whose CA atoms are within ``max_dist`` are
    collected, transformed with ``1 / (1 + (pae / 10) ** 2)`` and averaged.

    Parameters
    ----------
    structure : Bio.PDB structure
        Only the first model (``structure[0]``) is read.
    paeMat : array-like
        Square PAE matrix. Rows and columns are assumed to follow the residues
        of the model chain by chain, in model order, one row per residue.
    contact_lst : list of list of str
        For each chain (in model order), the IDs of the chains it contacts.
    max_dist : float
        CA-CA distance cut-off.

    Returns
    -------
    ifpae_avg : list
        One score per chain; ``0`` for chains with no pair within the cut-off.
    """
    model = structure[0]
    chain_lst = [chain.id for chain in model]
    seqlen = [len(chain) for chain in model]
    pae = np.asarray(paeMat)  # converted once instead of once per contact chain
    d = 10  # scale of the PAE normalisation
    ifpae_avg = []
    for ch1_idx, ch1_id in enumerate(chain_lst):
        ch1_sta = sum(seqlen[:ch1_idx])  # first PAE row of this chain
        ifpae_col = []
        for contact_ch in contact_lst[ch1_idx]:
            ch_sta = sum(seqlen[:chain_lst.index(contact_ch)])  # first PAE column
            for mat_x, res1 in enumerate(model[ch1_id]):
                # Residues without a CA atom keep their matrix slot but cannot
                # take part in a CA-CA contact.
                if not res1.has_id('CA'):
                    continue
                for mat_y, res2 in enumerate(model[contact_ch]):
                    if not res2.has_id('CA'):
                        continue
                    if res1['CA'] - res2['CA'] <= max_dist:
                        ifpae_col.append(pae[ch1_sta + mat_x, ch_sta + mat_y])
        if not ifpae_col:
            ifpae_avg.append(0)
        else:
            norm_if_interpae = np.mean(1 / (1 + (np.array(ifpae_col) / d) ** 2))
            ifpae_avg.append(norm_if_interpae)
    return ifpae_avg

def calc_pmidockq(ifpae_norm, ifplddt):
    """
    Build the per-chain pDockQ2 table.

    Parameters
    ----------
    ifpae_norm : sequence of float
        Normalised interface PAE per chain.
    ifplddt : sequence of float
        Interface pLDDT per chain, in the same order.

    Returns
    -------
    pandas.DataFrame
        Columns ``ifpae_norm``, ``ifplddt``, ``prot`` (their product) and
        ``pmidockq`` (``sigmoid`` of ``prot`` with the fixed parameters below).
    """
    # Sigmoid parameters (L, x0, k, b), from the original fit function.
    fit_L, fit_x0, fit_k, fit_b = (
        1.31034849e+00, 8.47326239e+01, 7.47157696e-02, 5.01886443e-03,
    )
    table = pd.DataFrame()
    table['ifpae_norm'] = ifpae_norm
    table['ifplddt'] = ifplddt
    table['prot'] = table['ifpae_norm'] * table['ifplddt']
    table['pmidockq'] = sigmoid(table['prot'].values, fit_L, fit_x0, fit_k, fit_b)
    return table

def sigmoid(x, L, x0, k, b):
    """Return the logistic curve ``L / (1 + exp(-k * (x - x0))) + b`` evaluated at ``x``."""
    exponent = -k*(x-x0)
    denominator = 1 + np.exp(exponent)
    return L / denominator + b

def process_pdb_file(pdb_file, json_file, distance, file_id, chains_part=""):
    """
    Compute interface pLDDT, interface PAE and pDockQ2 for one predicted model.

    Parameters
    ----------
    pdb_file : str
        Path of the predicted structure.
    json_file : str
        Path of the scores file; it is read as line-delimited JSON and the
        ``"pae"`` entry of its first record is used as the PAE matrix.
    distance : float
        Contact distance cut-off passed to the interface helpers.
    file_id : str
        Stored unchanged in the ``"pdb_id"`` field of the result.
    chains_part : str, optional
        Appended to the leading token of the file name to form
        ``"pdb_id_with_chains"``.

    Returns
    -------
    dict
        ``model_id`` (the given ``pdb_file`` path), the ids, and the
        ``*_avg`` (mean over chains) and ``*_ag`` (value of the last chain in
        the model) variants of the normalised PAE, interface pLDDT and pDockQ2.
    """
    pdbp = PDBParser(QUIET=True)
    structure = pdbp.get_structure('', pdb_file)
    chains = [chain.id for chain in structure[0]]
    remain_contact_lst = []
    plddt_lst = []
    for chain_id in chains:
        # Compare whole chain IDs: a set difference on the ID string would
        # split a multi-character ID into characters, and its ordering is
        # not deterministic.
        chain2_lst = [other for other in chains if other != chain_id]
        IF_plddt, contact_lst = retrieve_IFplddt(structure, chain_id, chain2_lst, distance)
        plddt_lst.append(IF_plddt)
        remain_contact_lst.append(contact_lst)
    pae_data = pd.read_json(json_file, lines=True)
    avgif_pae = retrieve_IFPAEinter(structure, pae_data["pae"][0], remain_contact_lst, distance)
    res = calc_pmidockq(avgif_pae, plddt_lst)
    pdb_id = os.path.basename(pdb_file).split('_')[0]
    result = {
        "model_id" : pdb_file,
        "pdb_id": file_id,
        "pdb_id_with_chains": '{0}_{1}'.format(pdb_id, chains_part),
        # "*_ag" fields take the value of the last chain in the model.
        "ipae_norm_ag": res['ifpae_norm'].tolist()[-1],
        "ipae_norm_avg": np.mean(res['ifpae_norm']),
        "iplddt_ag": res['ifplddt'].tolist()[-1],
        "iplddt_avg": np.mean(res['ifplddt']),
        "pDockQ2_ag": res['pmidockq'].tolist()[-1],
        "pDockQ2_avg": np.mean(res['pmidockq'])}
    return result

In [None]:
import os
import glob
import pandas as pd

def find_matching_json(pdb_file, directory):
    """
    Locate the scores JSON that belongs to a predicted PDB file.

    The PDB name is expected to look like
    ``<id>_<chains...>_unrelaxed_<model tag>.pdb``. The counterpart
    ``<id>_<chains...>_scores_<model tag>.json`` (same rank/model/seed tag) is
    returned when it exists, so that each model is paired with its own scores.
    Otherwise the first (sorted) file matching
    ``<id>_<chains...>_scores_rank_00*.json`` is used as a fallback.

    Parameters
    ----------
    pdb_file : str
        Path or file name of the predicted structure.
    directory : str
        Folder searched for the JSON file.

    Returns
    -------
    str or None
        Path of the JSON file, or ``None`` (after printing a message) when
        nothing matches.

    Raises
    ------
    IndexError
        If the file name contains no ``unrelaxed`` token.
    """
    parts = os.path.basename(pdb_file).split('_')
    pdb_id = parts[0]
    unrelaxed_idx = [i for i, p in enumerate(parts) if p.startswith('unrelaxed')][0]
    chains_part = "_".join(parts[1:unrelaxed_idx])

    # Everything after "unrelaxed" identifies the individual model.
    model_tag = os.path.splitext("_".join(parts[unrelaxed_idx + 1:]))[0]
    exact = os.path.join(directory, f"{pdb_id}_{chains_part}_scores_{model_tag}.json")
    if os.path.isfile(exact):
        return exact

    pattern = f"{pdb_id}_{chains_part}_scores_rank_00*.json"
    json_files = sorted(glob.glob(os.path.join(directory, pattern)))
    if json_files:
        return json_files[0]
    print(pattern + " not found")
    return None

def run_processing(directory):
    """
    Compute pDockQ2 metrics for every predicted model in one prediction folder.

    Each ``*_unrelaxed_rank_00*.pdb`` file is paired with its scores JSON and
    passed to ``process_pdb_file`` with a contact cut-off of 8. Models without
    a scores file are reported and skipped. The collected rows are written to
    ``<id>_<chains>_pdockq2_fit.csv``.

    Side effect: when there is something to save, the working directory is
    changed to the module-level ``result_path`` and the CSV is written there.

    Parameters
    ----------
    directory : str
        Folder that holds the predicted PDB files and their scores JSON files.

    Returns
    -------
    None
    """
    pdb_files = sorted(glob.glob(os.path.join(directory, "*_unrelaxed_rank_00*.pdb")))
    results = []
    file_id = None
    for pdb_file in pdb_files:
        parts = os.path.basename(pdb_file).split('_')
        pdb_id = parts[0]
        chains_part = "_".join(parts[1:parts.index([p for p in parts if p.startswith('unrelaxed')][0])])
        json_file_name = find_matching_json(pdb_file, directory)
        # find_matching_json returns None when nothing matches, which
        # os.path.exists() cannot take. One missing file should not discard
        # the results already collected for the other models.
        if not json_file_name or not os.path.exists(json_file_name):
            print(f"JSON file for {pdb_file} not found.")
            continue
        file_id = f"{pdb_id}_{chains_part}"
        print(file_id)
        result = process_pdb_file(pdb_file, json_file_name, 8, pdb_id, chains_part)  # Adjust Ca-ca distance
        print(f"Result for {pdb_file}: {result}")
        results.append(result)
    if results:
        df = pd.DataFrame(results)
        # Kept for compatibility: the cell that combines the per-complex CSVs
        # writes its output relative to the working directory.
        os.chdir(result_path)
        csv_file_path = str(file_id)+"_pdockq2_fit.csv"
        df.to_csv(csv_file_path, index=False)
        print(f"Data saved to {csv_file_path}")
    else:
        print("No results to save.")

### run pDockQ2 on every prediction folder
for i in complex_list:
    directory = fpath = f"{folder_path}/{i}"
    run_processing(directory)

def combine_csv_files(result_path, output_file=None, pattern="*_pdockq2_fit.csv"):
    """
    Concatenate the per-complex pDockQ2 CSV files found in ``result_path``.

    Parameters
    ----------
    result_path : str
        Folder that holds the per-complex CSV files.
    output_file : str, optional
        If given, the combined table is also written to this path.
    pattern : str, optional
        Glob pattern (relative to ``result_path``) selecting the input files.

    Returns
    -------
    combined_df : pandas.DataFrame
        All input rows stacked with a fresh integer index.

    Raises
    ------
    ValueError
        If no input file matches ``pattern``.
    """
    # Sorted so the row order does not depend on directory listing order.
    csv_files = sorted(glob.glob(os.path.join(result_path, pattern)))

    # The combined file usually matches the same pattern; leave it out so that
    # re-running the cell does not feed a previous output back in as input.
    if output_file:
        output_abs = os.path.abspath(output_file)
        csv_files = [f for f in csv_files if os.path.abspath(f) != output_abs]

    if not csv_files:
        raise ValueError(f"No CSV files matching {pattern!r} found in {result_path}")

    combined_df = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)
    if output_file:
        combined_df.to_csv(output_file, index=False)
        print(f"Combined CSV saved to {output_file}")
    return combined_df
    
# Write the combined table explicitly into result_path (the final cell reads it
# from there) instead of relying on whatever the working directory happens to be.
combined_df = combine_csv_files(result_path, os.path.join(result_path, pdockq_output))

In [None]:
#extract log metrics
import pandas as pd
import re

def parse_log_file(log_file):
    """
    Extract per-model confidence metrics from a prediction log.

    A ``Query i/n: <name> `` line sets the current query; every later line
    that reports ``pLDDT``, ``pTM``, ``ipTM`` and ``actifpTM`` for a
    ``rank_*_alphafold2_multimer_v3_model_*_seed_*`` model adds one row for
    that query. Model lines seen before any query line are ignored.

    Parameters
    ----------
    log_file : str
        Path of the log file.

    Returns
    -------
    pandas.DataFrame
        Columns ``Query_PDB``, ``Model_Name``, ``pLDDT``, ``pTM``, ``ipTM``
        and ``actifpTM``.
    """
    query_re = re.compile(r'Query \d+/\d+: (\S+) ')
    model_re = re.compile(r'(rank_\d+_alphafold2_multimer_v3_model_\d+_seed_\d+) '
                          r'pLDDT=([\d.]+) pTM=([\d.]+) ipTM=([\d.]+) actifpTM=([\d.]+)')
    rows = []
    current_query = None

    with open(log_file, 'r') as handle:
        for raw in handle:
            text = raw.strip()
            query_hit = query_re.search(text)
            if query_hit:
                current_query = query_hit.group(1)
            model_hit = model_re.search(text)
            if not (model_hit and current_query):
                continue
            model_name, *metrics = model_hit.groups()
            rows.append([current_query, model_name] + [float(m) for m in metrics])
    return pd.DataFrame(
        rows, columns=["Query_PDB", "Model_Name", "pLDDT", "pTM", "ipTM","actifpTM"]
    )

In [None]:
### combine all files

# Per-model confidence metrics parsed from the log.txt of every prediction folder.
dfy = pd.concat([parse_log_file(f"{folder_path}/{i}/log.txt") for i in complex_list])

# Rebuild the model file stem ("<query>_unrelaxed_rank_...") so the log rows can
# be joined with the DockQ / pDockQ2 tables on "model_id".
dfy['Model_Name'] = dfy['Model_Name'].str.replace("rank", "unrelaxed_rank", regex=False)
dfy['model_id'] = dfy['Query_PDB'] + "_" + dfy['Model_Name']
dfy = dfy[["model_id", "pLDDT", "pTM", "ipTM", "actifpTM"]]
print(list(dfy))

# pDockQ2 table: "model_id" holds the full PDB path, reduce it to the file stem.
df = pd.read_csv(f"{result_path}/{pdockq_output}")
df['model_id'] = df['model_id'].map(os.path.basename)
# Anchored, escaped pattern: only a trailing ".pdb" extension is removed
# (an unescaped "." would match any character when treated as a regex).
df['model_id'] = df['model_id'].str.replace(r"\.pdb$", "", regex=True)

df2 = pd.read_csv(f"{result_path}/{dockq_output}")

dfx = pd.merge(df, df2, on="model_id")

df_fin = pd.merge(dfx, dfy, on="model_id")
df_fin = df_fin.drop_duplicates()
print(list(df))
# Strip the prediction-method suffix (e.g. "_af2") from the complex name.
df_fin["pdb_id_wchains"] = df_fin["pdb_id_with_chains"].str.replace(f"_{pred_method}", "", regex=False)
# NOTE(review): the DockQ columns merged in from df2 (DockQ, fnat, iRMSD, LRMSD,
# F1) are not part of this selection, so they do not reach the combined file —
# confirm whether that is intended.
# .copy() so that the AntiConf column below is added to an independent frame.
df_fin = df_fin[['model_id', 'pdb_id', 'pdb_id_wchains', 'ipae_norm_ag', 'ipae_norm_avg', 'iplddt_ag', 'iplddt_avg',
                 'pDockQ2_ag', 'pDockQ2_avg', 'pLDDT', 'pTM', 'ipTM', 'actifpTM']].copy()

# AntiConf = 0.3pDockQ2_ag + 0.7pTM
df_fin["AntiConf"] = 0.3 * df_fin["pDockQ2_ag"] + 0.7 * df_fin["pTM"]
df_fin.to_csv(f"{result_path}/{combined}", index=False)