In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import friedmanchisquare

# Methods used together with `no_cyc_type_list` in the plotting cells below.
# Each name is also a sub-directory of `score_dir` and a key of `color_map`.
no_cyc_method_list = [
    'AlphaFold 2',
    'HelixFold 3',
    'Chai-1',
    'Chai-1_wo_MSA',
    'ProteniX',
    'ProteniX_wo_MSA',
    'AlphaFold 3'
]

# Input modes plotted on the x axis for the `no_cyc_*` figures; each one maps
# to a `<type>_all.csv` file inside a method directory.
# NOTE(review): presumably CAA/NCAA = canonical / non-canonical amino acids — confirm.
no_cyc_type_list = [
    'CAA_seq',
    'CAA_smiles',
    'NCAA_ptm',
    'NCAA_smiles',
]

# Input modes used together with `cyc_method_list`.
# NOTE(review): presumably cyclic-peptide inputs — confirm.
cyc_type_list = [
    'CYC_ptm',
    'CYC_smiles',
    'CYC_ptm_bond'
]

# Fixed hex colour per method so every figure uses the same hue for a method;
# passed to seaborn as `palette`.
color_map = {
  'AlphaFold 2': '#4C72B0',
  'HelixFold 3': '#55A868',
  'Chai-1': '#8172B3',
  'Chai-1_wo_MSA': '#AEA8D3',
  'ProteniX': '#64B5CD',
  'ProteniX_wo_MSA': '#A1D3DE',
  'AlphaFold 3': '#1F77B4'
}

# Subset of `no_cyc_method_list` (no 'AlphaFold 2' / 'HelixFold 3') used with
# `cyc_type_list`.
cyc_method_list = [
    'Chai-1',
    'Chai-1_wo_MSA',
    'ProteniX',
    'ProteniX_wo_MSA',
    'AlphaFold 3'
]

# Score files are read from `{score_dir}/{method}/{pep_type}_all.csv`; figures
# are written to `{save_dir}/<name>.png`.
# NOTE(review): both are hard-coded absolute paths; `save_dir` is never created
# by this notebook, so it must already exist before `plt.savefig` runs.
score_dir = '/mnt/data/public/AF3_benchmark/all_scores_v3'
save_dir = '/mnt/data/public/AF3_benchmark/all_figures_v4_m'

In [None]:

def get_all_score_1(method_list,peptide_type_list,idx):
    """Collect one score column for every (method, peptide type) pair.

    Args:
        method_list: Method names; each is a sub-directory of ``score_dir``.
        peptide_type_list: Input-mode names; each selects ``<type>_all.csv``.
        idx: Zero-based index of the comma-separated field to read.

    Returns:
        dict: ``{method: {pep_type: scores}}`` where ``scores`` is whatever
        ``get_score_1`` returns for that file (an empty list when the file
        does not exist).
    """
    return {
        method: {
            pep_type: get_score_1(f'{score_dir}/{method}/{pep_type}_all.csv', idx)
            for pep_type in peptide_type_list
        }
        for method in method_list
    }

def get_score_1(file_name, idx):
    """Read one numeric column from a comma-separated score file.

    Every non-blank line is parsed, so the file must not contain a header row
    whose selected field is non-numeric.

    Args:
        file_name: Path to the score file.
        idx: Zero-based index of the comma-separated field to read.

    Returns:
        list[float]: One value per non-blank line, in file order. An empty
        list when ``file_name`` is not an existing file.

    Raises:
        ValueError: If the selected field cannot be parsed as a float.
        IndexError: If a non-blank line has fewer than ``idx + 1`` fields.
    """
    if not os.path.isfile(file_name):
        return []
    # Use a context manager so the handle is closed deterministically; the
    # bare ``open()`` inside a ``for`` statement never closed it explicitly.
    with open(file_name, 'r') as handle:
        return [
            float(line.split(',')[idx])
            for line in handle
            # Blank lines (e.g. a trailing newline at EOF) carry no score and
            # would otherwise raise while indexing/parsing.
            if line.strip()
        ]





In [None]:

this_peptide_type_list = cyc_type_list
this_method_list = cyc_method_list
# Field 3 of every score CSV is what this figure labels "Center Distance".
score_dicts = get_all_score_1(this_method_list, this_peptide_type_list,3)

# Flatten {method: {peptide_type: [score, ...]}} into one long-format row per score.
rows = [
  {'Method': method, 'Peptide_Type': peptide_type, 'Score': score}
  for method, type_scores in score_dicts.items()
  for peptide_type, scores in type_scores.items()
  for score in scores
]

df = pd.DataFrame(rows)
palette = {method: color_map[method] for method in df['Method'].unique()}

# Friedman test across methods, run separately for each input mode.
p_values = {}
for pep_type, group in df.groupby("Peptide_Type"):
  methods = group["Method"].unique()
  if len(methods) < 3:
    # friedmanchisquare raises ValueError for fewer than three samples, which
    # happens when score files are missing for some methods. Skip the test so
    # the figure is still drawn; the label loop below omits missing p-values.
    print(f"Skipping Friedman test for {pep_type}: only {len(methods)} method(s) have scores (need at least 3)")
    continue
  method_scores_raw = [group[group["Method"] == m]["Score"].values for m in methods]
  # NOTE(review): truncating every method to the shortest series pairs
  # observations by row position; that is only a valid paired test if row k
  # refers to the same target in every method's CSV — confirm.
  min_len = min(len(s) for s in method_scores_raw)
  method_scores_trimmed = [s[:min_len] for s in method_scores_raw]
  stat, p = friedmanchisquare(*method_scores_trimmed)
  p_values[pep_type] = p

plt.figure(figsize=(8, 6),dpi=100)
ax = sns.boxplot(data=df, x="Peptide_Type", y="Score", hue="Method", palette=palette)
# This panel is saved without a legend.
ax.legend_.remove()
plt.ylabel("Center Distance")
plt.xlabel("Input Mode")
plt.grid(axis='y', linestyle='--', alpha=0.3)

# Place the p-value labels slightly (2% of the data range) below the lowest score.
y_min = df["Score"].min()
label_y = y_min - 0.02 * (df["Score"].max() - y_min)
# unique() already yields the input modes in first-appearance order, which is
# exactly the order the previous index-based sort reproduced.
for i, pep_type in enumerate(df["Peptide_Type"].unique()):
  pval = p_values.get(pep_type)
  if pval is not None:
    label = f"p = {pval:.3g}"
    ax.text(i, label_y, label, ha='center', va='top', fontsize=9, color='black')


plt.tight_layout()

plt.savefig(f"{save_dir}/f4_a.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# get_score_2
def get_all_score_2(method_list,peptide_type_list,idx,th=1):
    """Compute one success rate for every (method, peptide type) pair.

    Args:
        method_list: Method names; each is a sub-directory of ``score_dir``.
        peptide_type_list: Input-mode names; each selects ``<type>_all.csv``.
        idx: Zero-based index of the comma-separated field to read.
        th: Threshold forwarded to ``get_score_2``.

    Returns:
        dict: ``{method: {pep_type: rate}}`` where ``rate`` is the value
        returned by ``get_score_2`` for that file.
    """
    return {
        method: {
            pep_type: get_score_2(f'{score_dir}/{method}/{pep_type}_all.csv', idx, th)
            for pep_type in peptide_type_list
        }
        for method in method_list
    }

def get_score_2(file_name, idx, th):
    """Return the fraction of rows whose selected field is strictly below ``th``.

    Args:
        file_name: Path to a comma-separated score file.
        idx: Zero-based index of the comma-separated field to read.
        th: Threshold; a row counts as a success when its value is ``< th``.

    Returns:
        float: ``successes / rows`` over the non-blank lines of the file, or
        the sentinel ``-0.05`` when there is nothing to score (the file does
        not exist, or it contains no non-blank lines).

    Raises:
        ValueError: If the selected field cannot be parsed as a float.
        IndexError: If a non-blank line has fewer than ``idx + 1`` fields.
    """
    if not os.path.isfile(file_name):
        return -0.05
    right_count = 0
    all_count = 0
    # Context manager closes the handle deterministically (the bare open() in
    # a for statement did not).
    with open(file_name, 'r') as handle:
        for line in handle:
            # Blank lines (e.g. a trailing newline) are not rows.
            if not line.strip():
                continue
            all_count += 1
            if float(line.split(',')[idx]) < th:
                right_count += 1
    if all_count == 0:
        # An empty file used to raise ZeroDivisionError; report it with the
        # same "no data" sentinel as a missing file.
        return -0.05
    return right_count/all_count

In [None]:

this_type_list = no_cyc_type_list
this_method_list = no_cyc_method_list

# One success-rate table per cutoff, all computed on field 3 of the score CSVs.
groups = [
  get_all_score_2(this_method_list, this_type_list, 3, th=cutoff)
  for cutoff in (1, 4.5, 6)
]
# Keep the per-threshold names bound as before.
score_dicts_1, score_dicts_2, score_dicts_3 = groups

# Panel titles, in the same order as `groups`.
group_names = [
  'Success Rate(threshold=1)',
  'Success Rate(threshold=4.5)',
  'Success Rate(threshold=6)',
]


def convert_group_to_df(group, group_name):
  """Flatten a nested ``{method: {peptide_type: score}}`` mapping into a DataFrame.

  Args:
    group: Mapping of method name to a mapping of peptide type to score.
    group_name: Value written to the ``Group`` column of every row.

  Returns:
    pandas.DataFrame: One row per (method, peptide type) pair with columns
    ``Method``, ``Peptide_Type``, ``Score`` and ``Group``, in iteration order.
  """
  records = [
    {
      'Method': method,
      'Peptide_Type': peptide_type,
      'Score': score,
      'Group': group_name,
    }
    for method, scores in group.items()
    for peptide_type, score in scores.items()
  ]
  return pd.DataFrame(records)

# Stack the per-threshold frames into one long-format table; the `Group`
# column identifies the threshold each row belongs to.
df_all = pd.concat([convert_group_to_df(g, name) for g, name in zip(groups, group_names)], ignore_index=True)

# One bar-chart panel per threshold, laid out side by side with a shared y axis.
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)

for i, (group_name, ax) in enumerate(zip(group_names, axes)):
  df_sub = df_all[df_all['Group'] == group_name]
  sns.barplot(
    data=df_sub,
    x='Peptide_Type',
    y='Score',
    hue='Method',
    palette=color_map,
    ax=ax
  )
  ax.set_title(group_name)
  ax.set_xlabel("Input Mode")
  # Only the left-most panel carries the y label.
  ax.set_ylabel("Score" if i == 0 else "") 
  # get_score_2 returns -0.05 for a missing score file; that value lies below
  # this lower limit, so such bars are not visible in the panel.
  ax.set_ylim(0, 1)
  ax.grid(axis='y', linestyle='--', alpha=0.3)
  # Drop the per-panel legend seaborn adds for `hue`.
  ax.get_legend().remove()

# NOTE(review): `handles` / `labels` are collected here but never used in this
# cell, and the tight_layout rect below keeps the right 8% of the figure free.
# This looks like room for a shared `fig.legend(...)` that is not drawn —
# confirm whether the legend was removed on purpose.
handles, labels = axes[0].get_legend_handles_labels()


plt.tight_layout(rect=[0, 0, 0.92, 1])  
plt.savefig(f"{save_dir}/f2_b.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:


def get_all_score_1(method_list,peptide_type_list,idx):
    """Gather the raw scores of one CSV field for each method and input mode.

    NOTE(review): an earlier cell already defines a function with this name
    and the same body; this later definition is the one in effect once this
    cell has run. Consider keeping a single definition.

    Args:
        method_list: Method names; each is a sub-directory of ``score_dir``.
        peptide_type_list: Input-mode names; each selects ``<type>_all.csv``.
        idx: Zero-based index of the comma-separated field to read.

    Returns:
        dict: ``{method: {pep_type: scores}}`` with ``scores`` as returned by
        ``get_score_1`` for the corresponding file.
    """
    def scores_for(method):
        # Inner mapping for a single method: input mode -> list of scores.
        return {
            pep_type: get_score_1(f'{score_dir}/{method}/{pep_type}_all.csv', idx)
            for pep_type in peptide_type_list
        }

    return {method: scores_for(method) for method in method_list}

def get_score_1(file_name, idx):
    """Read one numeric column from a comma-separated score file.

    NOTE(review): an earlier cell also defines a function with this name; this
    later definition is the one in effect once this cell has run.

    Args:
        file_name: Path to the score file.
        idx: Zero-based index of the comma-separated field to read.

    Returns:
        list[float]: One value per non-blank line, in file order. An empty
        list when ``file_name`` is not an existing file.

    Raises:
        ValueError: If the selected field cannot be parsed as a float.
        IndexError: If a non-blank line has fewer than ``idx + 1`` fields.
    """
    if not os.path.isfile(file_name):
        return []
    ret_score = []
    # The with-block guarantees the file is closed; iterating over a bare
    # open() call left the handle open.
    with open(file_name, 'r') as handle:
        for line in handle:
            # Ignore blank lines (such as a trailing newline) instead of
            # failing while indexing/parsing them.
            if not line.strip():
                continue
            ret_score.append(float(line.split(',')[idx]))
    return ret_score

this_method_list = no_cyc_method_list
this_peptide_type_list = no_cyc_type_list
# Field 2 of every score CSV is what this figure labels "RMSD(Protein)".
score_dicts = get_all_score_1(this_method_list, this_peptide_type_list,2)

# Flatten {method: {peptide_type: [score, ...]}} into one long-format row per score.
rows = [
  {'Method': method, 'Peptide_Type': peptide_type, 'Score': score}
  for method, type_scores in score_dicts.items()
  for peptide_type, scores in type_scores.items()
  for score in scores
]

df = pd.DataFrame(rows)

palette = {method: color_map[method] for method in df['Method'].unique()}


# === 5. Compute p-values (Friedman test across methods, per input mode) ===
p_values = {}
for pep_type, group in df.groupby("Peptide_Type"):
  methods = group["Method"].unique()
  if len(methods) < 3:
    # friedmanchisquare raises ValueError for fewer than three samples, which
    # happens when score files are missing for some methods. Skip the test so
    # the figure is still drawn; the label loop below omits missing p-values.
    print(f"Skipping Friedman test for {pep_type}: only {len(methods)} method(s) have scores (need at least 3)")
    continue
  method_scores_raw = [group[group["Method"] == m]["Score"].values for m in methods]
  # NOTE(review): truncating every method to the shortest series pairs
  # observations by row position; that is only a valid paired test if row k
  # refers to the same target in every method's CSV — confirm.
  min_len = min(len(s) for s in method_scores_raw)
  method_scores_trimmed = [s[:min_len] for s in method_scores_raw]
  stat, p = friedmanchisquare(*method_scores_trimmed)
  p_values[pep_type] = p


plt.figure(figsize=(8, 6),dpi=100)
ax = sns.boxplot(data=df, x="Peptide_Type", y="Score", hue="Method", palette=palette)
# This panel is saved without a legend.
ax.legend_.remove()
plt.ylabel("RMSD(Protein)")
plt.xlabel("Input Mode")
# Fixed y range; scores above 10 are cut off by the upper limit.
plt.ylim((-0.3,10))
plt.grid(axis='y', linestyle='--', alpha=0.3)
plt.tight_layout()

# Fixed label height inside the band between the lower y limit (-0.3) and 0.
label_y = -0.07
# unique() already yields the input modes in first-appearance order, which is
# exactly the order the previous index-based sort reproduced.
for i, pep_type in enumerate(df["Peptide_Type"].unique()):
  pval = p_values.get(pep_type)
  if pval is not None:
    label = f"p = {pval:.3g}"
    ax.text(i, label_y, label, ha='center', va='top', fontsize=9, color='black')


plt.savefig(f"{save_dir}/f3_a.png", dpi=300, bbox_inches='tight')
plt.show()


In [None]:

def get_all_score_8(method_list, peptide_type_list,idx):
    """Collect one score column for every (method, peptide type) pair.

    Args:
        method_list: Method names; each is a sub-directory of ``score_dir``.
        peptide_type_list: Input-mode names; each selects ``<type>_all.csv``.
        idx: Zero-based index of the comma-separated field to read.

    Returns:
        dict: ``{method: {pep_type: scores}}`` where ``scores`` is whatever
        ``get_score_8`` returns for that file (an empty list when the file
        does not exist).
    """
    return {
        method: {
            pep_type: get_score_8(f'{score_dir}/{method}/{pep_type}_all.csv', idx)
            for pep_type in peptide_type_list
        }
        for method in method_list
    }

def get_score_8(file_name, idx):
    """Read one numeric column from a comma-separated score file.

    NOTE(review): this duplicates the parsing logic of ``get_score_1``;
    consider keeping a single reader.

    Args:
        file_name: Path to the score file.
        idx: Zero-based index of the comma-separated field to read.

    Returns:
        list[float]: One value per non-blank line, in file order. An empty
        list when ``file_name`` is not an existing file.

    Raises:
        ValueError: If the selected field cannot be parsed as a float.
        IndexError: If a non-blank line has fewer than ``idx + 1`` fields.
    """
    if not os.path.isfile(file_name):
        return []
    # Context manager closes the handle deterministically; the bare open() in
    # a for statement never closed it explicitly.
    with open(file_name, 'r') as handle:
        return [
            float(line.split(',')[idx])
            for line in handle
            # Skip blank lines (e.g. a trailing newline at EOF), which hold no
            # score and would otherwise raise.
            if line.strip()
        ]


p_types = no_cyc_type_list
method_list = no_cyc_method_list
# Field 5 of every score CSV is what this figure labels
# "Peptide RMSD after Protein Alignment".
score_dicts = get_all_score_8(method_list,p_types, 5)


# Flatten {method: {peptide_type: [score, ...]}} into one long-format row per
# score, visiting the input modes in `p_types` order.
rows = [
  {'Method': method, 'Peptide_Type': peptide_type, 'Score': score}
  for method, type_scores in score_dicts.items()
  for peptide_type in p_types
  for score in type_scores[peptide_type]
]

df = pd.DataFrame(rows)

palette = {method: color_map[method] for method in df['Method'].unique()}


# Friedman test across methods, run separately for each input mode.
p_values = {}
for pep_type, group in df.groupby("Peptide_Type"):
  methods = group["Method"].unique()
  if len(methods) < 3:
    # friedmanchisquare raises ValueError for fewer than three samples, which
    # happens when score files are missing for some methods. Skip the test so
    # the figure is still drawn; the label loop below omits missing p-values.
    print(f"Skipping Friedman test for {pep_type}: only {len(methods)} method(s) have scores (need at least 3)")
    continue
  method_scores_raw = [group[group["Method"] == m]["Score"].values for m in methods]
  # NOTE(review): truncating every method to the shortest series pairs
  # observations by row position; that is only a valid paired test if row k
  # refers to the same target in every method's CSV — confirm.
  min_len = min(len(s) for s in method_scores_raw)
  method_scores_trimmed = [s[:min_len] for s in method_scores_raw]
  stat, p = friedmanchisquare(*method_scores_trimmed)
  p_values[pep_type] = p


plt.figure(figsize=(10, 6),dpi=100)
ax = sns.boxplot(data=df, x="Peptide_Type", y="Score", hue="Method", palette=palette)
plt.ylabel("Peptide RMSD after Protein Alignment")
plt.xlabel("Input Mode")
plt.legend(title="Method", loc="upper right")
plt.grid(axis='y', linestyle='--', alpha=0.3)
plt.tight_layout()

# Place the p-value labels slightly (2% of the data range) below the lowest score.
y_min = df["Score"].min()
label_y = y_min - 0.02 * (df["Score"].max() - y_min)
# unique() already yields the input modes in first-appearance order, which is
# exactly the order the previous index-based sort reproduced.
for i, pep_type in enumerate(df["Peptide_Type"].unique()):
  pval = p_values.get(pep_type)
  if pval is not None:
    label = f"p = {pval:.3g}"
    ax.text(i, label_y, label, ha='center', va='top', fontsize=9, color='black')

plt.savefig(f"{save_dir}/f7.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:


def get_all_score_8(method_list, peptide_type_list,idx):
    """Gather the raw scores of one CSV field for each method and input mode.

    NOTE(review): an earlier cell already defines a function with this name
    and the same body; this later definition is the one in effect once this
    cell has run. Consider keeping a single definition.

    Args:
        method_list: Method names; each is a sub-directory of ``score_dir``.
        peptide_type_list: Input-mode names; each selects ``<type>_all.csv``.
        idx: Zero-based index of the comma-separated field to read.

    Returns:
        dict: ``{method: {pep_type: scores}}`` with ``scores`` as returned by
        ``get_score_8`` for the corresponding file.
    """
    def scores_for(method):
        # Inner mapping for a single method: input mode -> list of scores.
        return {
            pep_type: get_score_8(f'{score_dir}/{method}/{pep_type}_all.csv', idx)
            for pep_type in peptide_type_list
        }

    return {method: scores_for(method) for method in method_list}

def get_score_8(file_name, idx):
    """Read one numeric column from a comma-separated score file.

    NOTE(review): an earlier cell also defines a function with this name; this
    later definition is the one in effect once this cell has run.

    Args:
        file_name: Path to the score file.
        idx: Zero-based index of the comma-separated field to read.

    Returns:
        list[float]: One value per non-blank line, in file order. An empty
        list when ``file_name`` is not an existing file.

    Raises:
        ValueError: If the selected field cannot be parsed as a float.
        IndexError: If a non-blank line has fewer than ``idx + 1`` fields.
    """
    if not os.path.isfile(file_name):
        return []
    ret = []
    # The with-block guarantees the file is closed; iterating over a bare
    # open() call left the handle open.
    with open(file_name, 'r') as handle:
        for line in handle:
            # Ignore blank lines (such as a trailing newline) instead of
            # failing while indexing/parsing them.
            if not line.strip():
                continue
            ret.append(float(line.split(',')[idx]))
    return ret



p_types = no_cyc_type_list
p_method = no_cyc_method_list
# Field 4 of every score CSV is what this figure labels "RMSD(Peptide)".
score_dicts = get_all_score_8(p_method,p_types, 4)


# Flatten {method: {peptide_type: [score, ...]}} into one long-format row per
# score, visiting the input modes in `p_types` order.
rows = [
  {'Method': method, 'Peptide_Type': peptide_type, 'Score': score}
  for method, type_scores in score_dicts.items()
  for peptide_type in p_types
  for score in type_scores[peptide_type]
]

df = pd.DataFrame(rows)

palette = {method: color_map[method] for method in df['Method'].unique()}


# Friedman test across methods, run separately for each input mode.
p_values = {}
for pep_type, group in df.groupby("Peptide_Type"):
  methods = group["Method"].unique()
  if len(methods) < 3:
    # friedmanchisquare raises ValueError for fewer than three samples, which
    # happens when score files are missing for some methods. Skip the test so
    # the figure is still drawn; the label loop below omits missing p-values.
    print(f"Skipping Friedman test for {pep_type}: only {len(methods)} method(s) have scores (need at least 3)")
    continue
  method_scores_raw = [group[group["Method"] == m]["Score"].values for m in methods]
  # NOTE(review): truncating every method to the shortest series pairs
  # observations by row position; that is only a valid paired test if row k
  # refers to the same target in every method's CSV — confirm.
  min_len = min(len(s) for s in method_scores_raw)
  method_scores_trimmed = [s[:min_len] for s in method_scores_raw]
  stat, p = friedmanchisquare(*method_scores_trimmed)
  p_values[pep_type] = p



plt.figure(figsize=(8, 6),dpi=100)
ax = sns.boxplot(data=df, x="Peptide_Type", y="Score", hue="Method", palette=palette)
plt.ylabel("RMSD(Peptide)")
plt.xlabel("Input Mode")
plt.legend(title="Method", loc="upper right")
plt.grid(axis='y', linestyle='--', alpha=0.3)
plt.tight_layout()


# Place the p-value labels slightly (2% of the data range) below the lowest score.
y_min = df["Score"].min()
label_y = y_min - 0.02 * (df["Score"].max() - y_min)
# unique() already yields the input modes in first-appearance order, which is
# exactly the order the previous index-based sort reproduced.
for i, pep_type in enumerate(df["Peptide_Type"].unique()):
  pval = p_values.get(pep_type)
  if pval is not None:
    label = f"p = {pval:.3g}"
    ax.text(i, label_y, label, ha='center', va='top', fontsize=9, color='black')

plt.savefig(f"{save_dir}/f3_b.png", dpi=300, bbox_inches='tight')
plt.show()