In [1]:
import pandas as pd
import os
import shutil
from tqdm import tqdm
import random

In [2]:
def get_mcsa_normal_dataset(dir, subset=['train', 'valid'], flag='mcsa', read_new=False):
    """Load the per-split CSV files of a dataset directory into one DataFrame.

    For every split name in ``subset`` the file
    ``<dir>/<split>_dataset/<flag>_<split>.csv`` is read (or
    ``<dir>/new_<split>_dataset/<flag>_<split>.csv`` when ``read_new`` is true).
    A ``dataset_flag`` column holding the split name is added to each frame
    before the frames are concatenated.

    Parameters:
    - dir (str): Root directory that contains the ``*_dataset`` folders.
    - subset (list): Split names to load, in the order they should appear.
    - flag (str): File-name prefix of the CSV files.
    - read_new (bool): Read from the ``new_<split>_dataset`` folders instead.

    Returns:
    pandas.DataFrame: Rows of all requested splits. The row index of each CSV
    is kept as read (it is not reset); an empty ``subset`` gives an empty frame.
    """
    folder_prefix = 'new_' if read_new else ''
    frames = []
    for dataset_flag in subset:
        csv_path = os.path.join(dir, f'{folder_prefix}{dataset_flag}_dataset', f'{flag}_{dataset_flag}.csv')
        sub_df = pd.read_csv(csv_path)
        sub_df['dataset_flag'] = dataset_flag
        frames.append(sub_df)

    # pd.concat rejects an empty list, so keep the "nothing requested" result explicit.
    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame on every iteration.
    return pd.concat(frames)
def site_labels_to_protected_positions(site_labels):
    """Expand 1-based active-site labels into a flat list of 0-based positions.

    Each label is either ``[pos]`` (a single residue) or ``[begin, end]``
    (an inclusive residue range). Labels are 1-based, so every position is
    shifted down by one to become an index.

    Raises:
    ValueError: if a label has a length other than 1 or 2.
    """
    zero_based = []
    for label in site_labels:
        n_items = len(label)
        if n_items not in (1, 2):
            raise ValueError(
                'The label of active site is not standard !!!')
        if n_items == 1:
            # Labels count from 1; subtract 1 to obtain the index.
            zero_based.append(label[0] - 1)
        else:
            start, stop = label
            # Inclusive 1-based range [start, stop] -> 0-based indices start-1 .. stop-1.
            zero_based.extend(range(start - 1, stop))
    return zero_based

def protected_positions_to_site_labels(protected_positions):
    """Turn 0-based positions into single-residue, 1-based site labels.

    Every position ``p`` becomes the one-element label ``[p + 1]``.
    """
    labels = []
    for position in protected_positions:
        # Converting back from index to label adds 1.
        labels.append([position + 1])
    return labels

In [3]:
# Copy the generated PDB files under normalised names (spaces removed) and put the
# matching reference structure, saved as '<id>.pdb', into the same directory.
# aug_dir_flag = 'mcsa_aug_20_mutation_rate_0.2_insertion_rate_0.1_deletion_rate_0.1_max_length_150_seed_123'
aug_dir_flag = 'mcsa_aug_40_mutation_rate_0.35_insertion_rate_0.1_deletion_rate_0.1_max_length_150_seed_123'

org_structure_dir = '../dataset/mcsa_fine_tune/structures/alphafolddb_download'
org_generated_pdb_dir = f'../dataset/mcsa_fine_tune/{aug_dir_flag}/esmfold_generated_aug_pdb'
fixed_generated_structure_dir = f'../dataset/mcsa_fine_tune/{aug_dir_flag}/esmfold_generated_aug_structures'
os.makedirs(fixed_generated_structure_dir, exist_ok=True)

# Reference structure paths that are missing; each one is reported and stored once.
not_exist_path = []
for fname in tqdm(os.listdir(org_generated_pdb_dir)):
    new_fname = fname.replace(' ', '')
    shutil.copyfile(os.path.join(org_generated_pdb_dir, fname), os.path.join(fixed_generated_structure_dir, new_fname))

    # Part of the file name before the first '_' and the first '-'.
    base_id = new_fname.split('_')[0].split('-')[0]
    org_fname = 'AF-{}-F1-model_v4.pdb'.format(base_id)
    org_pdb_path = os.path.join(org_structure_dir, org_fname)
    ref_copy_path = os.path.join(fixed_generated_structure_dir, '{}.pdb'.format(base_id))

    # The reference only has to be copied once per id.
    if os.path.exists(ref_copy_path):
        continue
    try:
        shutil.copyfile(org_pdb_path, ref_copy_path)
    except FileNotFoundError:
        # Only a missing reference file is expected here; any other error
        # (permissions, interrupt, ...) is allowed to surface.
        if org_pdb_path not in not_exist_path:
            print(f'{org_pdb_path} not exist.')
            not_exist_path.append(org_pdb_path)

        
    
    
    

  1%|          | 245/34840 [00:00<00:28, 1234.17it/s]

../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-Q06129-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P15879-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P0ABJ3-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P63098-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P0A6H5-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P01116-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P0A112-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P80643-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-Q7SIF3-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-O66186-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_d

  1%|▏         | 510/34840 [00:00<00:26, 1291.61it/s]

../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P00459-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-Q59472-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P00969-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P13340-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P11439-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P13063-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P80078-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P00806-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P00588-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P0A7F3-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_d

  3%|▎         | 995/34840 [00:00<00:27, 1239.86it/s]

../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P04716-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P00903-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-E3PRJ4-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P21873-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P13449-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P11560-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P03472-F1-model_v4.pdb not exist.


  5%|▍         | 1737/34840 [00:01<00:16, 2058.33it/s]

../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-O43924-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P10507-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P08773-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-Q59471-F1-model_v4.pdb not exist.


  7%|▋         | 2440/34840 [00:01<00:11, 2803.71it/s]

../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P0A836-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P09787-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P00435-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P61926-F1-model_v4.pdb not exist.
../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-Q52060-F1-model_v4.pdb not exist.


 10%|█         | 3630/34840 [00:01<00:08, 3591.00it/s]

../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P00720-F1-model_v4.pdb not exist.


 17%|█▋        | 5896/34840 [00:02<00:07, 3932.70it/s]

../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P06229-F1-model_v4.pdb not exist.


 26%|██▌       | 9031/34840 [00:02<00:05, 5018.93it/s]

../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P03950-F1-model_v4.pdb not exist.


100%|██████████| 34840/34840 [00:09<00:00, 3585.14it/s]


In [4]:
# Number of distinct reference PDB paths recorded in not_exist_path.
print(len(not_exist_path))

63


In [5]:
# Load the test split and copy each entry's reference structure into
# fixed_generated_structure_dir under the name '<id>-c0.pdb'.
mcsa_test_set = get_mcsa_normal_dataset('../dataset/mcsa_fine_tune/normal_mcsa', subset=['test'])
display(mcsa_test_set)

# Reference structure paths of test entries that are missing from org_structure_dir.
test_not_exist_path = []

for structure_id in tqdm(mcsa_test_set['alphafolddb-id'].tolist()):
    structure_id = structure_id.replace(' ', '')
    org_fname = 'AF-{}-F1-model_v4.pdb'.format(structure_id)
    org_pdb_path = os.path.join(org_structure_dir, org_fname)
    try:
        shutil.copyfile(org_pdb_path, os.path.join(fixed_generated_structure_dir, '{}-c0.pdb'.format(structure_id)))
    except FileNotFoundError:
        # Only a missing reference file counts as "not exist"; other errors
        # (permissions, interrupt, ...) are allowed to surface.
        test_not_exist_path.append(org_pdb_path)

print(len(test_not_exist_path))


Unnamed: 0,reaction,ec,alphafolddb-id,aa_sequence,site_labels,site_types,cluster,ec_level1,dataset_flag
0,CC(C)=CCC1=C(C)C(=O)c2ccccc2C1=O.O=C[O-].[H+]|...,1.17.5.3,P0AAJ3,MDVSRRQFFKICAGGMAGTTVAALGFAPKQALAQARNYKLLRAKEI...,"[[169], [196], [197]]",,Cluster 38,1,test
1,S1[Fe]S[Fe+]1.S1[Fe]S[Fe+]1.[H+].[H+]|MSRTVMER...,1.12.7.2,P07598,MSRTVMERIEYEMHTPDPKADPDKLHFVQIDEAKCIGCDTCSQYCP...,"[[156], [159], [178], [198], [237], [240], [24...",,Cluster 8052,1,test
2,O=O.[Fe+2].[Fe+2].[Fe+2].[Fe+2].[H+].[H+].[H+]...,1.9.3.1,Q5SJ80,MAVRASEISRVYEAYPEKKATLYFLVLGFLALIVGSLFGPFQALNY...,"[[86], [88], [233], [237], [384], [385], [386]...",,Cluster 1144,1,test
3,C.NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O...,1.14.13.25,P27353,MSQPQSSQVTKRGLTDPERAAIIAAAVPDHALDTQRKYHYFIQPRW...,"[[114], [144], [147], [209], [243], [246]]",,Cluster 9876,1,test
4,O=C([O-])c1ccc(O)c(O)c1.O=O|MPIELLPETPSQTAGPYV...,1.13.11.3,P00436,MPIELLPETPSQTAGPYVHIGLALEAAGNPTRDQEIWNRLAKPDAP...,"[[109], [148], [158], [161], [163]]",,Cluster 24440,1,test
...,...,...,...,...,...,...,...,...,...
100,NC(=O)CC[C@H]([NH3+])C(=O)[O-].Nc1ncnc2c1ncn2[...,6.3.5.4,P22106,MCSIFGVFDIKTDAVELRKKALELSRLMRHRGPDWSGIYASDNAIL...,"[[2], [51], [75], [76], [322], [325]]",,Cluster 1460,6,test
101,C[C@H]1O[C@H](COP(C)(=O)[O-])[C@@H](OP(=O)([O-...,6.5.1.4,P46849,MKRMIALDGAQGEGGGQILRSALSLSMITGQPFTITSIRAGRAKPG...,"[[13], [308]]",,Cluster 14477,6,test
102,C[C@@H](O)C(=O)[O-].C[C@@H]([NH3+])C(=O)[O-].N...,6.1.2.1,P25051,MNRIKVAILFGGCSEEHDVSVKSAIEIAANINKEKYEPLYIGITKS...,"[[19], [22], [99], [244], [250], [290], [311],...",,Cluster 13982,6,test
103,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)([O-])OP(=O...,7.1.2.2,P01096,MLSVRVAAAVARALPRRAGLVSKNALGSSFIAARNLHASNSRLQKT...,"[[212], [238], [239], [416]]",,Cluster 1502,7,test


  0%|          | 0/105 [00:00<?, ?it/s]

100%|██████████| 105/105 [00:00<00:00, 1189.53it/s]

17





In [6]:
# The 3D structures of the mutated sequences are available at this point, but not every
# mutant necessarily keeps its activity. Following the RFdiffusion criterion, a mutant is
# treated as active when the RMSD of the motif backbone and side-chain residues is
# <= 1.5 A; otherwise its active-site labels are cleared.
mcsa_train_valid_set = get_mcsa_normal_dataset('../dataset/mcsa_fine_tune/normal_mcsa', subset=['train', 'valid'])
mcsa_train_valid_set

Unnamed: 0,reaction,ec,alphafolddb-id,aa_sequence,site_labels,site_types,cluster,ec_level1,dataset_flag
0,CC1C(=O)N(C)C1C.O|MSIQHFRVALIPFFAAFCLPVFAHPETL...,3.5.2.6,P62593,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,"[[68], [71], [128], [164], [232], [235]]",,Cluster 18632,3,train
1,CC1=C(C)C(=O)C(C)=C(C)C1=O.NC(=O)C1=CN([C@@H]2...,1.6.5.2,P15559,MVGRRALIVLAHSERTSFNYAMKEAAAAALKKKGWEVVESDLYAMN...,"[[150], [156], [162]]",,Cluster 19381,1,train
2,NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)O...,1.8.1.7,P00390,MALLPRALSAGAGPSWRRAARAFRGFLLLLPEPAALTRALSRAMAC...,"[[102], [107], [110], [241], [245], [511], [516]]",,Cluster 2616,1,train
3,O=C([O-])c1cccnc1C(=O)[O-].O=P([O-])([O-])OC[C...,2.4.2.19,P9WJJ7,MGLSDWELAAARAAIARGLDEDLRYGPDVTTLATVPASATTTASLV...,"[[105], [140], [172], [201], [222]]",,Cluster 18704,2,train
4,CSCC[C@H]([NH3+])C(=O)[O-].Nc1ncnc2c1ncn2[C@@H...,2.5.1.6,P31153,MNGQLNGFHEAFIEEGTFLFTSESVGEGHPDKICDQISDAVLDAHL...,"[[29], [31], [32], [57], [70], [181], [250], [...",,Cluster 9717,2,train
...,...,...,...,...,...,...,...,...,...
103,CC(=O)C(=O)[O-].Nc1ncnc2c1ncn2[C@@H]1O[C@H](CO...,6.4.1.1,Q2K340,MPISKILVANRSEIAIRVFRAANELGIKTVAIWAEEDKLALHRFKA...,"[[283], [297], [299], [305], [353], [549], [65...",,Cluster 21,6,valid
104,CP(=O)([O-])OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[...,6.1.1.18,P00962,MSEAEARPTNFIRQIIDEDLASGKHTTVHTRFPPEPNGYLHIGHAK...,"[[35], [261], [271]]",,Cluster 1462,6,valid
105,NC(=[NH2+])NCCC[C@H]([NH2+]CCC(=O)[O-])C(=O)[O...,6.3.3.4,P0DJQ7,MGAPVLPAAFGFLASARTGGGRAPGPVFATRGSHTDIDTPQGERSL...,"[[348], [382], [443]]",,Cluster 2977,6,valid
106,CC[C@H](C)[C@H]([NH3+])C(=O)[O-].CP(=O)([O-])O...,6.1.1.5,P56690,MFKEVGEPNFPKLEEEVLAFWKREKIFQKSVENRKGGPRYTVYEGP...,"[[46], [85], [518], [554], [558], [591], [594]]",,Cluster 34,6,valid


In [7]:
# Rows of the train/valid set whose 'alphafolddb-id' is exactly 'P07342'.
P07342_row = mcsa_train_valid_set.loc[mcsa_train_valid_set['alphafolddb-id'] == 'P07342']

In [8]:
# Load the augmented train/valid CSV files (aug_mcsa_<split>.csv) from the directory
# selected by aug_dir_flag, then display the combined frame.
aug_mcsa_train_valid_set = get_mcsa_normal_dataset(f'../dataset/mcsa_fine_tune/{aug_dir_flag}', subset=['train', 'valid'], flag='aug_mcsa')
aug_mcsa_train_valid_set

Unnamed: 0,reaction,ec,alphafolddb-id,aa_sequence,site_labels,site_types,cluster,ec_level1,dataset_flag
0,CC1C(=O)N(C)C1C.O|MSIQHFRVALIPFFAAFCLPVFAHPETL...,3.5.2.6,P62593,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,"[[68], [71], [128], [164], [232], [235]]",,Cluster 18632,3,train
1,CC1C(=O)N(C)C1C.O|MSIQHFRVALIPFFAAFCLPVFAHPETL...,3.5.2.6,P62593-c0_m0,PIPNDAPMPSCVKVSCGSLSRGRAGGVQLGRCIYPQNHIEYSPVTE...,"[[10], [13], [65], [98], [159], [162]]",,Cluster 18632,3,train
2,CC1C(=O)N(C)C1C.O|MSIQHFRVALIPFFAAFCLPVFAHPETL...,3.5.2.6,P62593-c0_m1,FRMEETFSMMSTVKVLKMGAVLPRVGCNEQLSRFIPNSMNDMVAYS...,"[[11], [14], [67], [98], [159], [162]]",,Cluster 18632,3,train
3,CC1C(=O)N(C)C1C.O|MSIQHFRVALIPFFAAFCLPVFAHPETL...,3.5.2.6,P62593-c0_m2,FRPGEQFPVMSHKCELCMAVSRDAQEQLGQRSHTIQNDLVQAPWTE...,"[[11], [13], [64], [94], [157], [160]]",,Cluster 18632,3,train
4,CC1C(=O)N(C)C1C.O|MSIQHFRVALIPFFAAFCLPVFAHPETL...,3.5.2.6,P62593-c0_m3,FEIRFPYMSGFKVLHNAVSRPVAMQEFSGNEVAGRQDYYPVTEKHK...,"[[9], [12], [60], [93], [158], [161]]",,Cluster 18632,3,train
...,...,...,...,...,...,...,...,...,...
4013,NC(=[NH2+])NCCC[C@H]([NH2+]CCC(=O)[O-])C(=O)[O...,6.3.3.4,P0DJQ7-c0_m35,DGCEIPLLMYCPDRPAQLGGMWREDRLPLLKTLNPDMATIWDLCEM...,"[[10], [45], [106]]",,Cluster 2977,6,valid
4014,NC(=[NH2+])NCCC[C@H]([NH2+]CCC(=O)[O-])C(=O)[O...,6.3.3.4,P0DJQ7-c0_m36,DGTERQNGTGYPAATLGGRHRDLEPALNKLAHCMAFGLNEMNKVLD...,"[[11], [40], [107]]",,Cluster 2977,6,valid
4015,NC(=[NH2+])NCCC[C@H]([NH2+]CCC(=O)[O-])C(=O)[O...,6.3.3.4,P0DJQ7-c0_m37,DNPQRGILFTGWYGADQLANFHREDRLPALDTKVPVTMMTCFNGNE...,"[[13], [46], [106]]",,Cluster 2977,6,valid
4016,NC(=[NH2+])NCCC[C@H]([NH2+]CCC(=O)[O-])C(=O)[O...,6.3.3.4,P0DJQ7-c0_m38,DGKTRRILADYGLIRPMGDHEDMDALPVWLAHDMAYTFDEQGVNEM...,"[[11], [45], [107]]",,Cluster 2977,6,valid


In [9]:
# Look up the augmented entry whose id is exactly 'P07342-c0_m1' and display it.
P07342_m1_row = aug_mcsa_train_valid_set.loc[aug_mcsa_train_valid_set['alphafolddb-id'] == 'P07342-c0_m1']
P07342_m1_row

Unnamed: 0,reaction,ec,alphafolddb-id,aa_sequence,site_labels,site_types,cluster,ec_level1,dataset_flag
1232,CC(=O)C(=O)[O-].CC(=O)C(=O)[O-].[H+]|MIRQSTLKN...,2.2.1.6,P07342-c0_m1,DGFNFVFPKHEIFGHMAEGYFASEPGNNGNSMLGATNVDPVMADNA...,"[[11], [65], [66], [108], [409]]",,Cluster 106,2,valid


In [10]:
import MDAnalysis as mda
from MDAnalysis.analysis import align

def calculate_rmsd(ref_pdb_path, target_pdb_path, ref_resid_ids, target_resid_ids, only_backbone=False):
    """
    Align selected residues of a target structure onto a reference and return the RMSD.

    Parameters:
    - ref_pdb_path (str): Path to the reference PDB file.
    - target_pdb_path (str): Path to the target PDB file (the structure that is aligned).
    - ref_resid_ids (list): Residue IDs selected in the reference structure.
    - target_resid_ids (list): Residue IDs selected in the target structure.
    - only_backbone (bool): When true, restrict both selections to backbone atoms.

    Returns:
    float: The single RMSD value stored by the alignment run.
    """
    # Selection strings: "resid i j k ..." for each structure.
    target_resid_str = ' '.join(str(resid) for resid in target_resid_ids)
    ref_resid_str = ' '.join(str(resid) for resid in ref_resid_ids)
    if only_backbone:
        mobile_selection = f"resid {target_resid_str} and backbone"
        reference_selection = f"resid {ref_resid_str}  and backbone"
    else:
        mobile_selection = f"resid {target_resid_str}"
        reference_selection = f"resid {ref_resid_str}"

    # Load both structures.
    ref = mda.Universe(ref_pdb_path)
    target = mda.Universe(target_pdb_path)

    # The target is the mobile universe; the first selection applies to it,
    # the second to the reference.
    aligner = align.AlignTraj(
        target,
        ref,
        select=(mobile_selection, reference_selection),
        in_memory=True,
    )
    aligner.run()

    # .item() requires exactly one stored value.
    return aligner.rmsd.item()


  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# ref_pdb = "../dataset/mcsa_fine_tune/structures/alphafolddb_download/AF-P07342-F1-model_v4.pdb"
# target_pdb = f"../dataset/mcsa_fine_tune/{aug_dir_flag}/esmfold_generated_aug_pdb/P07342_m1.pdb"
# ref_resids = [139, 201, 202, 251, 582]
# target_resids = [11, 69, 70, 113, 408]

# rmsd = calculate_rmsd(ref_pdb, target_pdb, ref_resids, target_resids)

# print(f"RMSD: {rmsd:.2f} Å")


In [12]:
import ast  # cell-local import: parses the stringified site labels without eval()

# Build the filtered augmented datasets. For every original train/valid entry that has
# a reference structure, each augmented structure is compared with the reference on the
# active-site residues; rows within the RMSD threshold keep their labels ("positive"),
# the others get empty site labels ("negative"). Per entry, positives and negatives are
# sub-sampled and written to new_<split>_dataset/aug_mcsa_<split>.csv.
ref_pdb_dir = '../dataset/mcsa_fine_tune/structures/alphafolddb_download/'
target_pdb_dir = f'../dataset/mcsa_fine_tune/{aug_dir_flag}/esmfold_generated_aug_pdb/'

new_aug_save_path = f'../dataset/mcsa_fine_tune/{aug_dir_flag}/'

# Maximum number of positive rows kept per original entry.
# max_pos_number_per_data_point = 6
max_pos_number_per_data_point = 12

# An augmented structure keeps its site labels when its motif RMSD is at most this value.
max_motif_rmsd = 1.5

# Dedicated, seeded RNG so the sub-sampling below gives the same rows on every run.
sampling_rng = random.Random(123)

all_pos_numbers = []
all_neg_numbers = []
# (structure id, error) pairs for augmented structures whose RMSD could not be computed.
rmsd_failures = []

# Position of the 'site_labels' value inside a row converted with row.tolist().
site_labels_col = list(aug_mcsa_train_valid_set.columns).index('site_labels')

for dataset_flag in ['train', 'valid', 'test']:
    this_flag_new_aug_save_path = os.path.join(new_aug_save_path, f'new_{dataset_flag}_dataset')
    os.makedirs(this_flag_new_aug_save_path, exist_ok=True)
    this_flag_csv_path = os.path.join(this_flag_new_aug_save_path, f'aug_mcsa_{dataset_flag}.csv')

    if dataset_flag == 'test':
        # The test split is not filtered: it is read and re-saved under the new directory.
        pd.read_csv(os.path.join(f'../dataset/mcsa_fine_tune/{aug_dir_flag}', 'test_dataset/aug_mcsa_test.csv')).to_csv(this_flag_csv_path, index=False)
        continue

    this_flag_mcsa_train_valid_set = mcsa_train_valid_set.loc[mcsa_train_valid_set['dataset_flag'] == dataset_flag]
    this_flag_aug_mcsa_train_valid_set = aug_mcsa_train_valid_set.loc[aug_mcsa_train_valid_set['dataset_flag'] == dataset_flag]
    # Space-free ids, computed once per split for the per-entry matching below.
    this_flag_aug_ids = this_flag_aug_mcsa_train_valid_set['alphafolddb-id'].str.replace(' ', '', regex=False)

    # Rows of this split, collected in a list and turned into a DataFrame once.
    this_flag_rows = []

    for uniprot_id, site_label in tqdm(zip(this_flag_mcsa_train_valid_set['alphafolddb-id'].tolist(), this_flag_mcsa_train_valid_set['site_labels'].tolist()), total=len(this_flag_mcsa_train_valid_set)):

        uniprot_id = uniprot_id.replace(' ', '')
        ref_pdb_path = os.path.join(ref_pdb_dir, f'AF-{uniprot_id}-F1-model_v4.pdb')
        if not os.path.exists(ref_pdb_path):
            continue

        # Match the entry itself or ids of the form '<id>-...' / '<id>_...'. A plain
        # prefix test would also pick up a different id that merely starts with the
        # same characters.
        belongs_to_entry = (
            this_flag_aug_ids.eq(uniprot_id)
            | this_flag_aug_ids.str.startswith(f'{uniprot_id}-', na=False)
            | this_flag_aug_ids.str.startswith(f'{uniprot_id}_', na=False)
        )
        this_aug_set = this_flag_aug_mcsa_train_valid_set.loc[belongs_to_entry.to_numpy()]
        # Expand ranges into single residues; the result is 1-based again.
        site_label_plus1 = [x + 1 for x in site_labels_to_protected_positions(ast.literal_eval(site_label))]

        this_pos_rows = []
        this_neg_rows = []

        for idx, row in this_aug_set.iterrows():
            # MDAnalysis residue selection counts from 1.
            aug_site_label_plus1 = [x + 1 for x in site_labels_to_protected_positions(ast.literal_eval(row['site_labels']))]
            structure_id = row['alphafolddb-id'].replace(' ', '')

            if structure_id == uniprot_id:
                # The original (non-augmented) entry is always a positive candidate.
                this_pos_rows.append(row.tolist())
                continue

            try:
                rmsd = calculate_rmsd(
                    ref_pdb_path=ref_pdb_path,
                    target_pdb_path=os.path.join(target_pdb_dir, f'{structure_id}.pdb'),
                    ref_resid_ids=site_label_plus1,
                    target_resid_ids=aug_site_label_plus1,
                )
            except Exception as err:
                # Skip structures that cannot be compared, but keep a record of them.
                # KeyboardInterrupt is not caught, so the run can still be stopped.
                rmsd_failures.append((structure_id, repr(err)))
                continue

            if rmsd <= max_motif_rmsd:
                # Augmented entry that still contains the active site.
                this_pos_rows.append(row.tolist())
            else:
                # The motif conformation changed too much: treat the entry as having
                # no active site and clear its site labels.
                neg_row = row.tolist()
                neg_row[site_labels_col] = []
                this_neg_rows.append(neg_row)

        # NOTE(review): the original entry takes part in this sampling, so it can be
        # dropped when there are more positives than the cap - confirm this is intended.
        if len(this_pos_rows) > max_pos_number_per_data_point:
            this_pos_rows_to_df = sampling_rng.sample(this_pos_rows, max_pos_number_per_data_point)
        else:
            this_pos_rows_to_df = this_pos_rows

        # Keep at most twice as many negatives as positives.
        this_neg_rows_to_df = sampling_rng.sample(this_neg_rows, min(2 * len(this_pos_rows_to_df), len(this_neg_rows)))

        this_flag_rows.extend(this_pos_rows_to_df + this_neg_rows_to_df)

        all_pos_numbers.append(len(this_pos_rows_to_df))
        all_neg_numbers.append(len(this_neg_rows_to_df))

    this_flag_new_aug_mcsa_set = pd.DataFrame(this_flag_rows, columns=aug_mcsa_train_valid_set.columns)
    this_flag_new_aug_mcsa_set.to_csv(this_flag_csv_path, index=False)

print(f'{len(rmsd_failures)} augmented structures skipped because the RMSD could not be computed.')
    
        

  0%|          | 0/824 [00:00<?, ?it/s]

  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory.ts.dt * step,
  dt=self.trajectory

In [15]:
# Mean number of positive rows, then negative rows, kept per processed original entry.
print(sum(all_pos_numbers)/len(all_pos_numbers))
print(sum(all_neg_numbers)/len(all_neg_numbers))

2.4191263282172373
3.846517119244392


In [16]:
import numpy as np
# Number of processed entries for which exactly one positive row was kept.
(np.array(all_pos_numbers)==1).sum()

547

In [17]:
# Number of per-entry negative counts recorded (one per processed original entry).
len(all_neg_numbers)

847