In [38]:
from typing import List
from itertools import combinations
import numpy as np
import pandas as pd

import Bio
import Bio.PDB
import Bio.SVDSuperimposer
from Bio.PDB import PDBList

import utils


# PDB identifiers of the structures analysed in this notebook.
pdb_codes = [
    "1adq", "1fbi", "1h0d", "1nsn", "1ob1",
    "1wej", "2ypv", "3raj", "3vrl", "5e94",
]

# Alphabetically ordered copy, used as the row/column labels of the result
# matrix. `sorted` already returns a new list, so `pdb_codes` is not modified.
antigens = sorted(pdb_codes)

In [39]:
def download_pdb(
    pdbs: List[str],
    pdir: str = "data/pdb",
    file_format: str = "pdb",
) -> None:
    """Download the given PDB entries into a local directory.

    Each code is fetched with ``PDBList.retrieve_pdb_file``. With the default
    arguments the files are stored as ``data/pdb/pdb<code>.ent``; entries that
    are already present are reported as "Structure exists" rather than
    downloaded again.

    Args:
        pdbs: PDB identifiers to fetch (e.g. ``["1adq", "1fbi"]``).
        pdir: Directory the structure files are written to.
        file_format: File format passed through to ``retrieve_pdb_file``.
    """
    pdbl = PDBList()
    for pdb_code in pdbs:
        pdbl.retrieve_pdb_file(pdb_code, pdir=pdir, file_format=file_format)

download_pdb(pdb_codes)

Structure exists: 'data/pdb/pdb1adq.ent' 
Structure exists: 'data/pdb/pdb1fbi.ent' 
Structure exists: 'data/pdb/pdb1h0d.ent' 
Structure exists: 'data/pdb/pdb1nsn.ent' 
Structure exists: 'data/pdb/pdb1ob1.ent' 
Structure exists: 'data/pdb/pdb1wej.ent' 
Structure exists: 'data/pdb/pdb2ypv.ent' 
Structure exists: 'data/pdb/pdb3raj.ent' 
Structure exists: 'data/pdb/pdb3vrl.ent' 
Structure exists: 'data/pdb/pdb5e94.ent' 


In [40]:
# Load the antibody summary table and keep exactly one row per PDB entry of
# interest, indexed by PDB code.
summary_raw = pd.read_csv("data/summary.tsv", sep='\t')

# Fail early with a readable message: a code missing from the table would
# otherwise only surface later as a KeyError on `df.loc[...]`.
missing_codes = sorted(set(pdb_codes) - set(summary_raw["pdb"]))
if missing_codes:
    raise ValueError(
        f"PDB codes not found in data/summary.tsv: {missing_codes}"
    )

df = (
    summary_raw.loc[summary_raw["pdb"].isin(pdb_codes)]
    # A stable sort keeps rows sharing a PDB code in file order, so the row
    # retained by drop_duplicates (the first one) is deterministic.
    .sort_values("pdb", kind="stable")
    # WARNING: the table can hold several rows per PDB code; only the first
    # one (and therefore its Hchain/Lchain assignment) is kept.
    .drop_duplicates(["pdb"])
    .assign(
        pdb_filepath=lambda frame: frame["pdb"].map(
            lambda pdb_code: f"./data/pdb/pdb{pdb_code}.ent"
        )
    )
    .set_index("pdb")
)
df

Unnamed: 0_level_0,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_header,date,compound,...,engineered,heavy_subclass,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid,pdb_filepath
pdb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1adq,H,L,0,A,protein,,igg4 rea fc,COMPLEX (IMMUNOGLOBULIN/AUTOANTIGEN),02/18/97,CRYSTAL STRUCTURE OF A HUMAN IGM RHEUMATOID FA...,...,False,IGHV3,IGLV3,Lambda,,,,,,./data/pdb/pdb1adq.ent
1fbi,H,L,0,X,protein,,guinea fowl lysozyme,COMPLEX (ANTIBODY/ANTIGEN),01/19/95,CRYSTAL STRUCTURE OF A CROSS-REACTION COMPLEX ...,...,True,IGHV1,IGKV10,Kappa,,,,,,./data/pdb/pdb1fbi.ent
1h0d,B,A,0,C,protein,,angiogenin,IMMUNE SYSTEM/HYDROLASE,06/19/02,Crystal structure of Human Angiogenin in compl...,...,True,IGHV5,IGKV3,Kappa,,,,,,./data/pdb/pdb1h0d.ent
1nsn,H,L,0,S,protein,,staphylococcal nuclease,COMPLEX (IMMUNOGLOBULIN/HYDROLASE),06/06/95,THE CRYSTAL STRUCTURE OF ANTIBODY N10-STAPHYLO...,...,False,IGHV3,IGKV3,Kappa,1e-10,-13.63,Other,,1704035,./data/pdb/pdb1nsn.ent
1ob1,B,A,0,C,protein,,major merozoite surface protein,IMMUNE SYSTEM,01/22/03,Crystal structure of a Fab complex whith Plas...,...,False,IGHV9,IGKV4,Kappa,2e-10,-12.29,Other,4.0,12729744,./data/pdb/pdb1ob1.ent
1wej,H,L,0,F,protein,,cytochrome c,COMPLEX (ANTIBODY/ELECTRON TRANSPORT),03/26/98,IGG1 FAB FRAGMENT (OF E8 ANTIBODY) COMPLEXED W...,...,False,IGHV14,IGKV12,Kappa,1.58e-08,-10.63,ITC,25.0,7539913,./data/pdb/pdb1wej.ent
2ypv,H,L,0,A,protein,,lipoprotein,IMMUNE SYSTEM,11/01/12,Crystal structure of the Meningococcal vaccine...,...,False,IGHV1,IGKV14,Kappa,3e-11,-14.355775131829494,SPR,,TBD,./data/pdb/pdb2ypv.ent
3raj,H,L,0,A,protein,,adp-ribosyl cyclase 1,HYDROLASE/IMMUNE SYSTEM,03/28/11,Crystal structure of human CD38 in complex wit...,...,True,IGHV2,IGKV13,Kappa,,,,,,./data/pdb/pdb3raj.ent
3vrl,H,L,0,C,protein,,gag protein,IMMUNE SYSTEM/VIRAL PROTEIN,04/12/12,Crystal structure of BMJ4 p24 capsid protein i...,...,False,IGHV5,IGKV12,Kappa,,,,,,./data/pdb/pdb3vrl.ent
5e94,B,A,0,G,protein,,glucagon-like peptide 1 receptor,MEMBRANE PROTEIN,10/14/15,Antibody-bound Glucagon-like Peptide-1 recepto...,...,True,IGHV3,IGKV10,Kappa,,,,,,./data/pdb/pdb5e94.ent


In [43]:
# Pairwise RMSD between the heavy-chain coordinates (as produced by
# utils.extract_variable_regions_residues + utils.extract_calpha_coord) of
# every pair of structures. The result is a symmetric matrix whose diagonal
# stays at its initial value of 0.
n_structures = len(antigens)
df_res = pd.DataFrame(
    data=np.zeros((n_structures, n_structures)),
    columns=antigens,
    index=antigens,
)

# Parse each structure and extract its heavy-chain coordinates once, instead
# of re-parsing both files for every one of the n*(n-1)/2 pairs.
parser = Bio.PDB.PDBParser(PERMISSIVE=0)  # strict parser
heavy_coords = {}
for ag in antigens:
    structure = parser.get_structure(ag, df.loc[ag, "pdb_filepath"])
    h_res, _light_res = utils.extract_variable_regions_residues(
        structure, df.loc[ag, "Hchain"], df.loc[ag, "Lchain"]
    )
    heavy_coords[ag] = utils.extract_calpha_coord(h_res)

# NOTE(review): reusing the cached coordinates assumes utils.compute_rmsd does
# not modify its inputs in place — TODO confirm against utils.
for ag1, ag2 in combinations(antigens, 2):
    sup = Bio.SVDSuperimposer.SVDSuperimposer()  # fresh superimposer per pair
    rmsd_h12 = utils.compute_rmsd(heavy_coords[ag1], heavy_coords[ag2], sup)

    # Fill both triangles so the matrix is symmetric.
    df_res.loc[ag1, ag2] = rmsd_h12
    df_res.loc[ag2, ag1] = rmsd_h12

df_res



Unnamed: 0,1adq,1fbi,1h0d,1nsn,1ob1,1wej,2ypv,3raj,3vrl,5e94
1adq,0.0,2.825719,2.836272,10.516155,4.058735,6.901963,4.99048,5.63027,1.828644,2.735488
1fbi,2.825719,0.0,2.472031,9.761009,2.883387,5.98034,3.856136,4.784676,1.92354,2.913895
1h0d,2.836272,2.472031,0.0,9.226916,2.183493,5.150612,3.117919,3.897954,1.876613,3.214049
1nsn,10.516155,9.761009,9.226916,0.0,8.142435,5.870526,7.495962,7.735213,9.998324,10.57073
1ob1,4.058735,2.883387,2.183493,8.142435,0.0,3.93499,1.977593,3.310788,3.051037,4.317094
1wej,6.901963,5.98034,5.150612,5.870526,3.93499,0.0,3.141066,4.221,6.100825,7.028906
2ypv,4.99048,3.856136,3.117919,7.495962,1.977593,3.141066,0.0,3.106918,4.068642,5.237804
3raj,5.63027,4.784676,3.897954,7.735213,3.310788,4.221,3.106918,0.0,4.871726,5.909105
3vrl,1.828644,1.92354,1.876613,9.998324,3.051037,6.100825,4.068642,4.871726,0.0,2.622618
5e94,2.735488,2.913895,3.214049,10.57073,4.317094,7.028906,5.237804,5.909105,2.622618,0.0


In [44]:
# Persist the pairwise heavy-chain RMSD matrix as a tab-separated file.
rmsd_tsv_path = "data/DATA_SLACK_1_Ab_rmsd_h12.tsv"
df_res.to_csv(rmsd_tsv_path, sep='\t')