In [251]:
from biopandas.pdb import PandasPdb
import pandas as pd
from Bio.PDB import PDBParser, PDBIO, Select

In [252]:
class NonHetSelect(Select):
    """Selection filter for ``PDBIO.save`` that keeps every atom except hydrogens.

    Note: despite the name, this class does not filter on HETATM records;
    it only rejects hydrogen atoms.
    """

    def accept_atom(self, atom):
        """Return True for non-hydrogen atoms and False for hydrogens.

        The decision is based on the atom's element rather than its name,
        because a name prefix test misclassifies heavy atoms whose name
        starts with "H" (e.g. mercury "HG") and misses hydrogens whose
        name starts with a digit (e.g. "1HB").
        """
        element = (getattr(atom, "element", None) or "").strip().upper()
        if element:
            return element != "H"
        # No element recorded: fall back to the atom name, ignoring any
        # leading digits used by legacy hydrogen naming.
        name = atom.id.strip().lstrip("0123456789").upper()
        return not name.startswith("H")

def remove_hydrogens(input_pdb, output_pdb):
    """Parse ``input_pdb`` and write it to ``output_pdb`` filtered by ``NonHetSelect``.

    Args:
        input_pdb: Path of the PDB file to read.
        output_pdb: Path of the PDB file to write.
    """
    writer = PDBIO()
    # QUIET=True is passed to the parser exactly as before.
    writer.set_structure(
        PDBParser(QUIET=True).get_structure("structure", input_pdb)
    )
    writer.save(output_pdb, select=NonHetSelect())

# Source structure and destination for its hydrogen-free copy
input_pdb = "./tmp/2lueB/2lueB_clean.pdb"
output_pdb = "./tmp/2lueB/2lueB_clean_noH.pdb"

# Write the filtered structure to disk
remove_hydrogens(input_pdb=input_pdb, output_pdb=output_pdb)

print(f"Hydrogen atoms removed and saved to {output_pdb}")

Hydrogen atoms removed and saved to ./tmp/2lueB/2lueB_clean_noH.pdb


In [253]:
# Load the predicted structure and the hydrogen-stripped reference structure.
ppdb_pred = PandasPdb().read_pdb("./tmp/2lueB/2lueB_tmp.pdb")
ppdb_true = PandasPdb().read_pdb("./tmp/2lueB/2lueB_clean_noH.pdb")

# Merge the HETATM and ATOM records of each structure into a single frame.
df_ppdb_pred = pd.concat([ppdb_pred.df["HETATM"], ppdb_pred.df["ATOM"]], axis=0)
df_ppdb_true = pd.concat([ppdb_true.df["HETATM"], ppdb_true.df["ATOM"]], axis=0)

# Drop hydrogens. `_get_heavy` returns a filtered frame instead of modifying
# its argument, so the result has to be assigned back to take effect.
df_ppdb_true = PandasPdb()._get_heavy(df_ppdb_true, invert=False)

# First residue number that is unused in both structures.
start_idx = max(df_ppdb_pred["residue_number"].max(), df_ppdb_true["residue_number"].max()) + 1

In [254]:
# Chain roles differ between the two files: the reference uses chain B for the
# peptide and chain A for the protein, the prediction uses A and B respectively.
peptide_true = df_ppdb_true.loc[df_ppdb_true["chain_id"].eq("B")]
protein_true = df_ppdb_true.loc[df_ppdb_true["chain_id"].eq("A")]

peptide_pred = df_ppdb_pred.loc[df_ppdb_pred["chain_id"].eq("A")]
protein_pred = df_ppdb_pred.loc[df_ppdb_pred["chain_id"].eq("B")]

# Sanity check: number of distinct protein residue numbers in each structure.
n_protein_true = len(set(protein_true["residue_number"]))
n_protein_pred = len(set(protein_pred["residue_number"]))
print(n_protein_true, n_protein_pred)

119 119


In [255]:
def renumber_atom_number(df, start_idx=1):
    """Overwrite ``df["atom_number"]`` in place with consecutive integers.

    Args:
        df: DataFrame to modify; rows are numbered in their current order.
        start_idx: Number given to the first row (default 1).
    """
    stop_idx = start_idx + len(df)
    df["atom_number"] = range(start_idx, stop_idx)

def renumber_residue_number(df, set_start_idx=1):
    """Renumber residues consecutively, in place, starting at ``set_start_idx``.

    The index of ``df`` is reset to 0..n-1 first. Rows are then scanned in
    order; a new residue begins whenever the (original) ``residue_number`` or
    the ``residue_name`` differs from that of the current residue.

    Args:
        df: DataFrame with "residue_number" and "residue_name" columns.
        set_start_idx: Number assigned to the first residue (default 1).
    """
    df.reset_index(drop=True, inplace=True)
    assigned = set_start_idx
    prev_number = None
    prev_name = None

    for row in range(len(df)):
        # Read the original values before this row is overwritten below.
        number = df.at[row, "residue_number"]
        name = df.at[row, "residue_name"]

        if (prev_number is None) and (prev_name is None):
            prev_number, prev_name = number, name

        if (number != prev_number) or (name != prev_name):
            # Residue boundary: advance the counter and remember the new key.
            assigned += 1
            prev_number, prev_name = number, name

        df.at[row, "residue_number"] = assigned


In [256]:
def copy_residue(res_from, res_to):
    """Copy coordinates, occupancy and B-factor from ``res_from`` into ``res_to``.

    Rows are paired by position (first row to first row, and so on); all other
    columns of ``res_to`` are left as they are. Both frames get their index
    reset to 0..n-1 in place.

    NOTE(review): pairing by position assumes both frames list their atoms in
    the same order; nothing here verifies that — confirm against the inputs.

    Args:
        res_from: DataFrame supplying the values.
        res_to: DataFrame updated in place.

    Raises:
        ValueError: If a required column is missing from either frame, or the
            frames differ in length. Validation runs before any modification,
            so a rejected call leaves both inputs untouched.
    """
    required_columns = ["x_coord", "y_coord", "z_coord", "occupancy", "b_factor"]

    # Ensure the necessary columns exist in both DataFrames
    for col in required_columns:
        if col not in res_from.columns or col not in res_to.columns:
            raise ValueError(f"Column '{col}' must be present in both DataFrames.")

    # Ensure the DataFrames have the same length
    if len(res_from) != len(res_to):
        raise ValueError("Both DataFrames must have the same number of rows.")

    res_from.reset_index(inplace=True, drop=True)
    res_to.reset_index(inplace=True, drop=True)

    # Copy all rows at once; going through a plain array pairs rows by
    # position and bypasses index alignment.
    res_to.loc[:, required_columns] = res_from[required_columns].to_numpy()

In [257]:
# Collect every ligand (HETATM) residue of the peptide chain first.
# Grouping by residue_number alone keeps both lists in sequence order; grouping
# by residue_name first would sort them alphabetically by name, which does not
# match the residue-number order of the loop below.
LG_true_list = [
    group
    for _, group in peptide_true[peptide_true["record_name"] == "HETATM"].groupby("residue_number")
]
LG_pred_list = [
    group
    for _, group in peptide_pred[peptide_pred["record_name"] == "HETATM"].groupby("residue_number")
]

# Ligands are paired by their order along the chain, so the counts must agree.
# NOTE(review): this assumes the k-th HETATM residue of the prediction
# corresponds to the k-th HETATM residue of the reference — confirm.
if len(LG_true_list) != len(LG_pred_list):
    raise ValueError(
        f"Reference has {len(LG_true_list)} HETATM residues but prediction has {len(LG_pred_list)}."
    )

# Walk the reference peptide residue by residue and build the new peptide:
# ATOM residues are kept as they are, HETATM residues keep the reference rows
# but receive the predicted coordinates / occupancy / B-factor.
new_peptide_pred = []
ligand_idx = 0
for (residue_number, record_name), group in peptide_true.groupby(["residue_number", "record_name"]):
    print(residue_number, record_name)
    if record_name == "ATOM":
        new_peptide_pred.append(group)
    elif record_name == "HETATM":
        # Use the group of the residue being visited, so the ligand is
        # inserted at its own position in the sequence.
        LG_true = group.copy()
        LG_pred = LG_pred_list[ligand_idx].copy()
        ligand_idx += 1
        copy_residue(res_from=LG_pred, res_to=LG_true)
        new_peptide_pred.append(LG_true)

169 ATOM
170 HETATM
171 HETATM
172 ATOM
173 HETATM
174 HETATM
175 ATOM
176 ATOM
177 HETATM
178 ATOM
179 ATOM
180 ATOM
181 ATOM
182 ATOM
183 ATOM
184 ATOM
185 ATOM


In [258]:
# Stack the per-residue frames into one frame. The ligand frames had their
# index reset to 0..n-1, so the original labels would collide; ignore_index
# gives the result a single clean 0..n-1 index instead.
df_new_peptide_pred = pd.concat(new_peptide_pred, axis=0, ignore_index=True)

In [259]:
# Renumber in place: residues continue from start_idx, atoms restart at 1.
renumber_residue_number(df_new_peptide_pred, set_start_idx=start_idx)
renumber_atom_number(df_new_peptide_pred)

# Number of distinct residues after renumbering.
n_residues = len(set(df_new_peptide_pred['residue_number']))
print(n_residues)

17


In [260]:
# Empty PandasPdb container; its record tables are filled in by a later cell.
new_pdb_pred = PandasPdb()

In [261]:
# Split the assembled peptide by record type and install the tables on the
# container; ANISOU and OTHERS are left as empty frames.
new_pdb_pred._df = {
    "ATOM": df_new_peptide_pred.loc[df_new_peptide_pred["record_name"].eq("ATOM")],
    "HETATM": df_new_peptide_pred.loc[df_new_peptide_pred["record_name"].eq("HETATM")],
    'ANISOU': pd.DataFrame(),
    'OTHERS': pd.DataFrame(),
}

In [262]:
# Write the assembled records out as a PDB file.
new_pdb_pred.to_pdb(path="./tmp/some.pdb")