In [1]:
import os
import re
import shlex
import subprocess

import biotite.sequence as seq
import numpy as np
import torch
from biotite.structure.io.pdb import PDBFile, get_structure
from rdkit import Chem
from tqdm import tqdm



In [2]:
def extract_leap_info(log_text):
    """Parse the LEaP exit summary out of a piece of log text.

    Searches ``log_text`` for a summary of the form
    ``Exiting LEaP: Errors = E; Warnings = W; Notes = N``.

    Args:
        log_text (str): A line (or a larger chunk) of tleap log output.

    Returns:
        tuple[int, int, int] | None: ``(errors, warnings, notes)`` when the
        summary is present, otherwise ``None``.
    """
    # The pattern deliberately stops right after the notes count. A trailing
    # unescaped "." (any character) would swallow the last digit of the count
    # whenever the sentence-ending period is missing, e.g. after stripping.
    match = re.search(r"Exiting LEaP: Errors = (\d+); Warnings = (\d+); Notes = (\d+)", log_text)
    if match is None:
        return None
    errors, warnings, notes = map(int, match.groups())
    return errors, warnings, notes


def minimize_use_amber(input_pdb_file, output_path, AMBERHOME='/home/xiaoruiwang/software/amber20'):
    """Run the Amber minimization shell script on one PDB file and inspect its tleap log.

    The script ``./minimize_script_folder/minimize_protein.sh`` (resolved
    relative to the current working directory) is called with three
    arguments: the absolute input PDB path, the absolute output directory and
    ``AMBERHOME``. Afterwards the file ``<name>_minimized_tleap.log`` inside
    the output directory is scanned for the LEaP exit summary, where
    ``<name>`` is the input file name up to its first ``.``.

    Args:
        input_pdb_file (str): Path of the PDB file to minimize.
        output_path (str): Directory handed to the script as output location.
        AMBERHOME (str): Amber installation directory handed to the script.

    Returns:
        tuple[str, bool]: The absolute output directory and a flag that is
        ``True`` when a LEaP exit summary reporting zero errors was found in
        the tleap log. A missing log file yields ``False``.

    NOTE(review): a non-zero exit status of the script is only reported on
    stdout; it does not influence the returned flag.
    """
    abs_input_pdb_path = os.path.abspath(input_pdb_file)
    abs_output_path = os.path.abspath(output_path)

    # Quote every interpolated value so that paths containing spaces or shell
    # metacharacters reach the script as exactly one argument each.
    minimize_command = ' '.join([
        './minimize_script_folder/minimize_protein.sh',
        shlex.quote(abs_input_pdb_path),
        shlex.quote(abs_output_path),
        shlex.quote(str(AMBERHOME)),
    ])
    completed_process = subprocess.run(minimize_command, shell=True)
    if completed_process.returncode != 0:
        print("Error: The command did not run successfully!")

    # Look for errors in the tleap log written next to the minimized structure.
    pdb_name = os.path.split(abs_input_pdb_path)[-1].split('.')[0]
    tleap_log_path = os.path.join(abs_output_path, f'{pdb_name}_minimized_tleap.log')
    if not os.path.isfile(tleap_log_path):
        # No log was produced, so there is nothing that could confirm success.
        return abs_output_path, False

    is_success = False
    with open(tleap_log_path, 'r') as f:
        for line in f:
            info = extract_leap_info(line.strip())
            if info:
                errors, _warnings, _notes = info
                if errors == 0:
                    is_success = True

    return abs_output_path, is_success
        
        
def read_sequence_from_pdb(pdb_path):
    """Return the one-letter residue sequence of a PDB file as a NumPy array.

    One entry is produced per atom named ``CA``; its three-letter residue
    name is converted with ``ProteinSequence.convert_letter_3to1``.

    Args:
        pdb_path (str): Path of the PDB file to read.

    Returns:
        numpy.ndarray: Array of one-letter residue codes, in file order.
    """
    structure = get_structure(PDBFile.read(pdb_path))
    # Select the residue name carried by every atom called "CA".
    ca_mask = structure.atom_name == 'CA'
    three_letter_codes = structure.res_name[ca_mask]
    one_letter_codes = [
        seq.ProteinSequence.convert_letter_3to1(code) for code in three_letter_codes
    ]
    return np.array(one_letter_codes)


def check_af2_and_desigh_structure(af2_str_path, rfdiffusion_str_path):
    """Check that an AF2 model keeps the non-glycine residues of an RFdiffusion design.

    Every position of the RFdiffusion sequence that is not ``'G'`` is treated
    as an active-site position, and the AF2 sequence is compared with the
    RFdiffusion sequence at exactly those positions.

    Args:
        af2_str_path (str): Path of the AF2 structure (PDB).
        rfdiffusion_str_path (str): Path of the RFdiffusion structure (PDB).

    Returns:
        tuple[bool, numpy.ndarray]: Whether all active-site residues agree,
        and the 0-based indices of the active-site positions. If the AF2
        sequence is too short to contain every active-site index, the
        structures are reported as not matching instead of raising
        ``IndexError``.
    """
    af2_sequence_np = read_sequence_from_pdb(af2_str_path)
    rfdiffusion_sequence_np = read_sequence_from_pdb(rfdiffusion_str_path)

    # Positions that are not glycine in the RFdiffusion sequence (ascending order).
    active_site_index = np.flatnonzero(rfdiffusion_sequence_np != 'G')

    # Indices are ascending, so the last one is the largest that must exist in AF2.
    if active_site_index.size and active_site_index[-1] >= len(af2_sequence_np):
        return False, active_site_index

    active_site_aa_af2 = af2_sequence_np[active_site_index]
    active_site_aa_rfdiffusion = rfdiffusion_sequence_np[active_site_index]
    return np.all(active_site_aa_af2 == active_site_aa_rfdiffusion).item(), active_site_index


def find_corresponding_files(dir1, dir2):
    """Pair files in ``dir1`` with their ``EC{i}_design_{j}.pdb`` partner in ``dir2``.

    A name in ``dir1`` is split on ``_``. ``i`` is the third character of the
    first part (the character after ``EC`` in ``EC4_AF2_19.pdb``) and ``j`` is
    the third part up to its first ``.``. Names with fewer than three parts,
    or a first part shorter than three characters, cannot be parsed this way
    and are skipped.

    Args:
        dir1 (str): Directory with the files to look up (e.g. ``EC4_AF2_19.pdb``).
        dir2 (str): Directory searched for ``EC{i}_design_{j}.pdb``.

    Returns:
        dict[str, str]: Maps each parsable ``dir1`` file name to the matching
        ``dir2`` file name; names without a partner are omitted. Entries are
        inserted in sorted order of the ``dir1`` names.
    """
    # A set gives constant-time membership checks for the partner lookup.
    files2 = set(os.listdir(dir2))
    corresponding_files = {}
    # Sorting makes the result order independent of the filesystem listing order.
    for file1 in sorted(os.listdir(dir1)):
        parts = file1.split('_')
        if len(parts) < 3 or len(parts[0]) < 3:
            # Not of the form <XXi...>_<tag>_<j>...; e.g. ".DS_Store" or "notes.txt".
            continue
        i, j = parts[0][2], parts[2].split('.')[0]
        corresponding_file2 = f'EC{i}_design_{j}.pdb'
        if corresponding_file2 in files2:
            corresponding_files[file1] = corresponding_file2
    return corresponding_files

In [3]:
# Directory holding the RFdiffusion enzyme scaffolds; the cells below read its
# 'af2' and 'rfdiffusion' sub-directories. The default is machine specific, so
# it can be overridden through the RFDIFFUSION_ENZYME_STRUCTURE_PATH
# environment variable without editing the notebook.
rfdiffusion_enzyme_structure_path = os.environ.get(
    'RFDIFFUSION_ENZYME_STRUCTURE_PATH',
    '/home/xiaoruiwang/data/ubuntu_work_beta/protein_work/rfdiffusion_nature/PUBLIC_RELEASE/insilico_tested/Enzyme_active_site_scaffolds',
)

# Output directory for everything this notebook writes (created if missing).
rfdiffusion_enzyme_structure_minimized_path = '../dataset/rfdiffusion_enzyme_design'
os.makedirs(rfdiffusion_enzyme_structure_minimized_path, exist_ok=True)

In [4]:
# Pair each AF2 model with its RFdiffusion design, then keep the active-site
# indices of every pair whose active-site residues agree.
af2_dir = os.path.join(rfdiffusion_enzyme_structure_path, 'af2')
rfdiffusion_dir = os.path.join(rfdiffusion_enzyme_structure_path, 'rfdiffusion')
corresponding_files = find_corresponding_files(af2_dir, rfdiffusion_dir)

not_same_list = []            # (af2 file, rfdiffusion file) pairs that disagree
pdb_to_active_site_dict = {}  # (af2 file, rfdiffusion file) -> active-site indices

for file_pair in corresponding_files.items():
    af2_file, rfdiffusion_file = file_pair
    is_active_site_same, active_site_index = check_af2_and_desigh_structure(
        af2_str_path=os.path.join(af2_dir, af2_file),
        rfdiffusion_str_path=os.path.join(rfdiffusion_dir, rfdiffusion_file),
    )
    if is_active_site_same:
        pdb_to_active_site_dict[file_pair] = active_site_index
    else:
        not_same_list.append(file_pair)

print(len(not_same_list))
print(len(pdb_to_active_site_dict))



0
157


In [5]:
# Display the mapping from (AF2 file, RFdiffusion file) pairs to active-site indices.
pdb_to_active_site_dict

{('EC4_AF2_19.pdb', 'EC4_design_19.pdb'): array([16, 39, 94]),
 ('EC4_AF2_9.pdb', 'EC4_design_9.pdb'): array([25, 49, 99]),
 ('EC3_AF2_10.pdb', 'EC3_design_10.pdb'): array([ 23,  92, 125]),
 ('EC4_AF2_34.pdb', 'EC4_design_34.pdb'): array([ 48,  98, 130]),
 ('EC4_AF2_7.pdb', 'EC4_design_7.pdb'): array([ 10,  75, 107]),
 ('EC3_AF2_19.pdb', 'EC3_design_19.pdb'): array([ 33,  77, 128]),
 ('EC1_AF2_8.pdb', 'EC1_design_8.pdb'): array([ 17,  50, 112]),
 ('EC4_AF2_54.pdb', 'EC4_design_54.pdb'): array([ 42,  90, 126]),
 ('EC4_AF2_11.pdb', 'EC4_design_11.pdb'): array([ 58,  85, 128]),
 ('EC1_AF2_29.pdb', 'EC1_design_29.pdb'): array([ 24,  56, 120]),
 ('EC4_AF2_25.pdb', 'EC4_design_25.pdb'): array([ 74,  94, 121]),
 ('EC4_AF2_8.pdb', 'EC4_design_8.pdb'): array([ 50,  99, 118]),
 ('EC5_AF2_28.pdb', 'EC5_design_28.pdb'): array([ 56,  75, 106]),
 ('EC3_AF2_24.pdb', 'EC3_design_24.pdb'): array([ 38,  71, 135]),
 ('EC3_AF2_14.pdb', 'EC3_design_14.pdb'): array([ 11,  62, 118]),
 ('EC3_AF2_18.pdb', 'EC3

In [11]:
# Store the (AF2 file, RFdiffusion file) -> active-site index mapping in the output directory.
active_site_save_path = os.path.join(
    rfdiffusion_enzyme_structure_minimized_path,
    'file_name_and_active_site.pkl',
)
torch.save(pdb_to_active_site_dict, active_site_save_path)

In [7]:
# Minimize every AF2 model with Amber and remember the inputs for which the
# run was not reported as successful.
af2_rfdiffusion_enzyme_structure_minimized_path = os.path.join(rfdiffusion_enzyme_structure_minimized_path, 'af2')
os.makedirs(af2_rfdiffusion_enzyme_structure_minimized_path, exist_ok=True)

# Resolve both directories once instead of on every loop iteration.
af2_input_dir = os.path.abspath(os.path.join(rfdiffusion_enzyme_structure_path, 'af2'))
af2_minimized_dir = os.path.abspath(af2_rfdiffusion_enzyme_structure_minimized_path)

minimize_failure_list_test = []
for pdb_file in sorted(os.listdir(af2_input_dir)):
    if not pdb_file.endswith('.pdb'):
        # Only PDB structures are meaningful inputs for the minimization script.
        continue
    abs_input_path = os.path.join(af2_input_dir, pdb_file)
    # Pass AMBERHOME=... as a third argument if Amber is not installed at the
    # default location used by minimize_use_amber.
    _, is_success = minimize_use_amber(abs_input_path, af2_minimized_dir)
    if not is_success:
        minimize_failure_list_test.append(abs_input_path)

In [8]:
# Try to load each minimized structure with RDKit and collect the PDB files
# for which no molecule object is returned.
erro_pdb = []

for file_name in tqdm(os.listdir(af2_rfdiffusion_enzyme_structure_minimized_path)):
    if not file_name.endswith('.pdb'):
        continue
    minimized_pdb_path = os.path.join(af2_rfdiffusion_enzyme_structure_minimized_path, file_name)
    if Chem.MolFromPDBFile(minimized_pdb_path) is None:
        erro_pdb.append(file_name)
        

[15:35:21] Explicit valence for atom # 1862 O, 4, is greater than permitted
[15:35:22] Explicit valence for atom # 34 O, 3, is greater than permitted
100%|██████████| 314/314 [00:12<00:00, 25.42it/s]


In [9]:
# Number of minimized PDB files collected in erro_pdb.
print(len(erro_pdb))

2


In [10]:
# Display the names of the files collected in erro_pdb.
erro_pdb

['EC1_AF2_22_minimized.pdb', 'EC2_AF2_1_minimized.pdb']