In [12]:
import os
import urllib
import shutil

import numpy as np

from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.Polypeptide import PPBuilder
from Bio.PDB.PDBIO import PDBIO

from Bio import SeqIO
from Bio.PDB.Atom import Atom


## functions

In [13]:
def check_structure_seq(InDir):
    
    PepBuilder = PPBuilder()
    parser = PDBParser(PERMISSIVE=1, QUIET=True)

    seq = {}
    for InPDB in os.listdir(InDir):

        if InPDB.endswith(".pdb"):
            InStruct = parser.get_structure("target", f"{InDir}/{InPDB}")
            InSeq = str(PepBuilder.build_peptides(InStruct)[0].get_sequence())

            if InSeq in seq:
                seq[InSeq].append(InPDB)
            else:
                seq[InSeq] = [InPDB]
            # print(InPDB, InSeq)

    print(seq)

    return

def check_atom(InDir):
    
    parser = PDBParser(PERMISSIVE=1, QUIET=True)
    Natoms_dict = {}

    for InPDB in os.listdir(InDir):

        if InPDB.endswith(".pdb"):
            InStruct = parser.get_structure("target", f"{InDir}/{InPDB}")
            # print(InStruct[0]["A"].child_list)
            Natoms = []
            
            for resi in InStruct[0]["A"].child_list:
                Natoms.append(len(resi.child_list))
            
            Natoms = "".join(map(str, Natoms))
            # sum_atoms = sum(Natoms)
            
            # if sum_atoms in Natoms_dict:
            #     Natoms_dict[sum_atoms].append(InPDB)
            # else:
            #     Natoms_dict[sum_atoms] = [InPDB]

            if Natoms in Natoms_dict:
                Natoms_dict[Natoms].append(InPDB)
            else:
                Natoms_dict[Natoms] = [InPDB]
        # break

    print(Natoms_dict)
    return

In [3]:
def download_from_IMGT(PDB_id_dict:dict, RootOutDir:str):
    
    OutDir = RootOutDir

    for allele, PDB_id_list in PDB_id_dict.items():
        
        # OutDir = f"{RootOutDir}/{allele}"
        # if not os.path.exists(OutDir):
        #     os.makedirs(OutDir)
        
        print(f"==={allele}===")
        for i, id in enumerate(PDB_id_list):
            print(f"Downloading: {id}  {i+1}/{len(PDB_id_list)}")
            urllib.request.urlretrieve(f"http://www.imgt.org/3Dstructure-DB/IMGT-FILE/IMGT-{id.upper()}.pdb.gz", f"{OutDir}/{id}.pdb.gz")

    return

In [125]:
# class atom():
#     def __init__(self, coord:list) -> None:
#         self.coord = coord
#         pass

#     def rmsd_per_atom(self):
#         return

class Residue():
    def __init__(self,residue_name: str, atom_coord_dict: dict) -> None:
        self.resname = residue_name
        self.coord = atom_coord_dict
        self.dist = {}
        pass

    def list_atoms(self):
        return list(self.coord.keys())

    def symmetry_swap(self):
        if self.resname == "ARG":
            if all(x in self.coord.keys() for x in ["NH1", "NH2"]):
                self.coord["NH1"], self.coord["NH2"] = self.coord["NH2"], self.coord["NH1"]

        elif self.resname == "ASP":
            if all(x in self.coord.keys() for x in ["OD1", "OD2"]):
                self.coord["OD1"], self.coord["OD2"] = self.coord["OD2"], self.coord["OD1"]

        elif self.resname == "PHE":
            if all(x in self.coord.keys() for x in ["CD1", "CD2", "CE1", "CE2"]):
                self.coord["CD1"], self.coord["CD2"] = self.coord["CD2"], self.coord["CD1"]
                self.coord["CE1"], self.coord["CE2"] = self.coord["CE2"], self.coord["CE1"]

        elif self.resname == "GLU":
            if all(x in self.coord.keys() for x in ["OE1", "OE2"]):
                self.coord["OE1"], self.coord["OE2"] = self.coord["OE2"], self.coord["OE1"]

        elif self.resname == "LEU":
            if all(x in self.coord.keys() for x in ["CD1", "CD2"]):
                self.coord["CD1"], self.coord["CD2"] = self.coord["CD2"], self.coord["CD1"]

        elif self.resname == "TYR":
            if all(x in self.coord.keys() for x in ["CD1", "CD2", "CE1", "CE2"]):
                self.coord["CD1"], self.coord["CD2"] = self.coord["CD2"], self.coord["CD1"]
                self.coord["CE1"], self.coord["CE2"] = self.coord["CE2"], self.coord["CE1"]

        elif self.resname == "VAL":
            if all(x in self.coord.keys() for x in ["CG1", "CG2"]):
                self.coord["CG1"], self.coord["CG2"] = self.coord["CG2"], self.coord["CG1"]

        return

    def calc_dist(self, ref_coord_dict):
        # check if have same atoms
        if self.coord.keys() != ref_coord_dict.keys():
            raise ValueError(f"residue have different atoms! ref: {list(ref_coord_dict.keys())} object: {list(self.coord.keys())}")
            # print(f"residue have different atoms! ref: {list(ref_coord_dict.keys())} object: {list(self.coord.keys())}")
                
        for atom in ref_coord_dict:

            distance = np.linalg.norm(ref_coord_dict[atom]-self.coord[atom])
            self.dist[atom] = distance

            # if target structure lack atoms, atom coord and dist will be NAN
            # self.dist[atom] = np.NAN
            # self.coord[atom] = np.array([np.Nan, np.Nan, np.Nan])

        return

    def resi_dist_sum(self):
        return sum(self.dist.values())

    def coord_flat(self, ref_coord_dict):
        flat_coord = []
        for atom in ref_coord_dict:
            flat_coord.append(self.coord[atom])

        return flat_coord

    def dist_flat(self, ref_coord_dict):
        flat_dist = []
        for atom in ref_coord_dict:
            flat_dist.append(self.dist[atom])
        return flat_dist

    def update_coord(self, coord_list):
        # for ref struct
        if len(self.coord) != len(coord_list):
            raise ValueError("coord_list doesn't match!")

        for coord, atom in zip(coord_list, self.coord.keys()):
            self.coord[atom] = coord

    def update_dist(self, dist_list):
        if len(self.dist) != len(dist_list):
            raise ValueError("dist_list doesn't match!")

        for dist, atom in zip(dist_list, self.dist.keys()):
            self.dist[atom] = dist

class Chain():
    """
    List of Residue() class
    """
    
    def __init__(self, chain) -> None:
        self.chain = chain
        pass

    @classmethod
    def from_pdb(cls, InPDB):
        resi_list = []
        parser = PDBParser(PERMISSIVE=1, QUIET=True)
        InStruct = parser.get_structure("target", InPDB)

        for resi in InStruct[0]["A"]:
            atom_coord_dict = {}

            for atom in resi:
                atom_coord_dict[atom.id] = atom.coord

            resi_list.append(Residue(resi.resname, atom_coord_dict))
            # cls.add_residue(cls, resi.resname, atom_coord_dict)

        return cls(resi_list)

    def symmetry_refine(self, ref_chain):

        for resiT, resiR in zip(self.chain, ref_chain.chain):
            if resiT.resname in ["ARG", "ASP", "PHE", "GLU", "LEU", "TYR", "VAL"]:
                resiT.calc_dist(resiR.coord)
                old_dist = resiT.resi_dist_sum()
                
                resiT.symmetry_swap()
                
                resiT.calc_dist(resiR.coord)
                new_dist = resiT.resi_dist_sum()
                if old_dist <= new_dist:
                    resiT.symmetry_swap() # flip back

        return

    def calc_dist(self, ref_chain):
        dist_list = []
        
        for resiT, resiR in zip(self.chain, ref_chain.chain):
            resiT.calc_dist(resiR.coord)
            dist_list.append(resiT.resi_dist_sum())

        return dist_list

    def coord_flat(self, ref_chain):
        flat_coord = []
        
        for resiT, resiR in zip(self.chain, ref_chain.chain):
            flat_coord += resiT.coord_flat(resiR.coord)
        
        return flat_coord

    def dist_flat(self, ref_chain):
        flat_dist = []

        for resiT, resiR in zip(self.chain, ref_chain.chain):
            flat_dist += resiT.dist_flat(resiR.coord)

        return flat_dist

    def update_coord_from_list(self, coord_list):

        # check atom numbers
        AtomSum = 0
        for resi in self.chain:
            AtomSum += len(resi.list_atoms())

        if AtomSum != len(coord_list):
            raise ValueError("check coord list!!")
        start = 0
        for resi in self.chain:
            Natoms = len(resi.coord)
            resi.update_coord(coord_list[start:start+Natoms])
            start += Natoms

        return

class RMSF():
    def __init__(self) -> None:
        pass

    @classmethod
    def RMSF_from_dir(cls, InDir, outPDB="RMSF.pdb"):
        
        cls().get_mean_struct(InDir, outPDB)
        
        # cls.save_PDB_rmsf(cls, f"{InDir}/{outPDB}")
        
        return

    def fix_missing_atoms(self):
        # The struct with most atoms is used as reference
        # Natom = []
        # for chain in self.struct_collect:
        #     Natom.append(sum([len(resi.list_atoms()) for resi in chain.chain]))

        # max_index = np.argmax(np.array(Natom))
        # RefChain = self.struct_collect[max_index]
        # RefFile = ListofFilename[max_index]
        RefChain = self.struct_collect[0] # use first struct as reference

        # fill chains that lack atoms with NAN atom coord
        for i in range(1, len(self.struct_collect)):
            # skip reference struct
            chain = self.struct_collect[i]
            for resiT, resiR in zip(chain.chain, RefChain.chain):
                if len(resiR.list_atoms()) < len(resiT.list_atoms()): # check if reference structure miss atoms
                    for atomT in resiT.list_atoms():
                        if not atomT in resiR.coord:
                            resiR.coord[atomT] = np.array([np.NAN, np.NAN, np.NAN])

                for atomR in resiR.list_atoms():
                    if not atomR in resiT.coord:
                        resiT.coord[atomR] = np.array([np.NAN, np.NAN, np.NAN])
        
        return RefChain

    def get_mean_struct(self, InDir, outPDB):
        self.struct_collect = []
        coord_collect = []
        # filename_collect = []
        for PDBfile in os.listdir(InDir):
            if PDBfile.endswith(".pdb"):
                # print(PDBfile)
                # filename_collect.append(PDBfile)
                self.struct_collect.append(Chain.from_pdb(f"{InDir}/{PDBfile}"))

        # fix missing atoms
        ref_chain = self.fix_missing_atoms()

        # round 1: rough calculation

        # ref_chain = Chain.from_pdb(f"{InDir}/{RefPDB}") # use the pdb file with most atoms as template

        for struct in self.struct_collect:
            coord_collect.append(struct.coord_flat(ref_chain))

        ref_coord = np.nanmean(coord_collect, axis=0).round(3)
        ref_chain.update_coord_from_list(ref_coord)

        # round 2: flip symmetrical residues to decrease rmsd
        coord_collect = []
        dist_collect = []

        for struct in self.struct_collect:
            struct.symmetry_refine(ref_chain)
            coord_collect.append(struct.coord_flat(ref_chain))

        # rebuild mean structure
        ref_coord = np.nanmean(coord_collect, axis=0).round(3)
        ref_chain.update_coord_from_list(ref_coord)

        # calculate rmsf
        for struct in self.struct_collect:
            struct.calc_dist(ref_chain)
            dist_collect.append(struct.dist_flat(ref_chain))

        rmsf = np.sqrt(np.nanmean(dist_collect, axis=0)).round(3)
        # print(len(rmsf))
        # sys.exit()

        self.save_PDB_rmsf(f"{InDir}/{PDBfile}", ref_chain, ref_coord, rmsf, outPDB) # the last one as template

        return
        """
        if g.resn=='ARG':
        switchName("flippedRes", "NH1", "NH2")
        elif g.resn=='HIS':
        switchName("flippedRes", "ND1", "CD2")
        switchName("flippedRes", "CE1", "NE2")
        elif g.resn=='ASP':
        switchName("flippedRes", "OD1", "OD2")
        elif g.resn=='PHE':
        switchName("flippedRes", "CD1", "CD2")
        switchName("flippedRes", "CE1", "CE2")
        elif g.resn=='GLN':
        switchName("flippedRes", "OE1", "NE2")
        elif g.resn=='GLU':
        switchName("flippedRes", "OE1", "OE2")
        elif g.resn=='LEU':
        switchName("flippedRes", "CD1", "CD2")
        elif g.resn=='ASN':
        switchName("flippedRes", "OD1", "ND2")
        elif g.resn=='TYR':
        switchName("flippedRes", "CD1", "CD2")
        switchName("flippedRes", "CE1", "CE2")
        elif g.resn=='VAL':
        switchName("flippedRes", "CG1", "CG2")
        """

    def save_PDB_rmsf(self, ref_PDB, ref_chain, coord_list, rmsf_list, out_PDB):
        parser = PDBParser(PERMISSIVE=1, QUIET=True)
        Struct = parser.get_structure("target", ref_PDB) # template struct object

        for resiT, resiR in zip(Struct[0]["A"].get_residues(), ref_chain.chain):
            Tatoms = [x.id for x in resiT.get_atoms()]
            if len(Tatoms) < len(resiR.list_atoms()): # if PDB file miss atoms
                for atomR in resiR.list_atoms():
                    if not atomR in Tatoms:
                        resiT.add(Atom(atomR, (0,0,0), 0.0, 1.00, ' ',f' {atomR} ',0))
                        # resiT[atomR] = np.array([0, 0, 0])
        for atomT, coord, rmsf in zip(Struct[0]["A"].get_atoms(), coord_list, rmsf_list):
            atomT.set_coord(coord)
            atomT.set_bfactor(rmsf)

        io = PDBIO()
        io.set_structure(Struct)
        io.save(out_PDB)
        
        return

In [123]:
from Bio.PDB.Atom import Atom

parser = PDBParser(PERMISSIVE=1, QUIET=True)
Struct = parser.get_structure("target", "1i4f_Crown.pdb")
for resiT in Struct[0]["A"].get_residues():
    # Tatoms = [x.id for x in resiT.get_atoms()]
    Tatoms = list(resiT.get_atoms())
    # print("|"+Tatoms[0].fullname+"|")
    resiT.add(Atom('CC', (0,0,0), 0, 1.00, ' ',' CC ',0))
    # break

for atom in Struct[0]["A"].get_atoms():
    atom.set_coord((0,0,0))
    atom.set_bfactor(0.0)

io = PDBIO()
io.set_structure(Struct)
io.save("test.pdb")



## download

In [30]:
## HLA-A
A_id_dict = {"A01_01":["3BO8","4NQX","1W72","4NQV","6AT9"],
    "A30_03":["6J2A","6J29","6J1V"],
    "A30_01":["6J1W"],
    "A02_01":["3MRE","3D25","6TRN","5N1Y","1I4F","6O53","4U6Y","5DDH","6O4Z","2GTW","6O51","6O4Y","2V2W","2V2X","2VLL","3BGM","3GSO","3PWN","3V5H","3BH8","3FQN","3KLA","3PWL","3UTQ",
    "3QFD","4U6X","2GIT","2GTZ","3BH9","3FQR","3FQX","3O3D","3PWJ","2GT9","6OPD","1DUZ","1I7U","1TVB","1TVH","3FQT","3FQU","3FT2","3GSU","3O3A","3GSW","1JF1","3O3E","5ENW","3H7B","5MER",
    "3MYJ","1T1Z","2GUO","2X4Q","3FT4","3GJF","3GSV","3O3B","3REW","5HHP","6PTE","3FQW","2AV1","3FT3","3GSR","1T1Y","2CLR","2X70","3GIV","3HPJ","3V5D","4K7F","5SWQ","5HHN","2AV7","2X4U",
    "3GSX","3IXA","5HHQ","4UQ3","3GSQ","1DUY","1JHT","6PTB","1T21","1I1Y","1I7R","1IM3","1QEW","1S8D","1S9W","1T1W","1T1X","1T20","1T22","3BHB","3I6G","3MGT","6APN","1EEY","1EEZ","1S9Y",
    "2X4O","2X4P","2X4R","2X4T","3MGO","3V5K","2X4N","2C7U","1QR1","1B0G","1HHI","1HHJ","1HHK","1S9X","5HHM","2X4S","1HHG","3HLA","3TO2","1AKJ","5MEP","1I1F","1I7T","3I6K","1B0R","3GJG",
    "3HAE","5HHO","1HHH","4WUU","6NCA","1P7Q","3H9H","3MRG","3MRB","3MRK","3MRR","3MRD","3MRC","3MRJ","3MRM","3MR9","3MRI","3MRP","3MRQ","3MRF","3MRN","3MRO","3MRH","3MRL","4JFO",
    "4JFP","4JFQ","2J8U","2UWE","2JCC","2P5E","2P5W","3HG1","1LP9","3UTT","3UTS","2BNQ","2BNR","2F54","2F53","3QFJ","2GJ6","1AO7","3PWP","3H9S","1QRN","1QSE","1QSF","3D3V","3D39","6RSY",
    "3O4L","1BD2","2PYE","3QEQ","3QDM","3QDJ","3QDG","6TRO","1OGA","2VLR","2VLJ","2VLK","6RPB","6RPA","6RP9","3GSN","5TEZ","5YXN","5YXU"],
    "A02_03":["3OX8"],
    "A02_06":["3OXR","6P64"],
    "A02_07":["3OXS"],
    "A03_01":["3RL1","3RL2","2XPG","6ENY"],
    "A11_01":["1X7Q","2HN7","1QVO","1Q94","4BEO","4UQ2","4N8V"],
    "A68_01":["1HSB","2HLA"],
    "A23_01":["5WWJ","5WWU","5WWI","5XOV"],
    "A24_02":["4F7T","4F7P","3VXN","5HGD","5HGA","5WXC","3NFN","5HGH","3I6L","4F7M","5HGB","3VXP","3VXO","2BCK","3QZW","5WXD","3NFJ","3VXM","3VXS","3VXR","3W0W","3VXU"]
}

B_id_dict = {"B07_02":["6AT5","6AVF","6AVG","4U1H","3VCL","5EO0","5EO1","4U1K"],
    "B35_01":["1XH3","2FYY","1ZHK","4PRN","1ZSD","2CIK","3LKO","3LKP","3LKQ","4PR5","4PRA","2H6P","3LKS","1A1N","2AXG","3LKN","3LKR","4LNR","1A9E","1CG9","4QRR","1A9B","2NX5","3MV7","3MV8","4PRP","3MV9"],
    "B42_01":["4U1M","4U1J"],
    "B51_01":["1E27","1E28","4MJI"],
    "B53_01":["1A1M","1A1O"],
    "B08_01":["3SPV","4QRS","4QRT","6P2C","6P2F","6P27","4QRU","6P23","6P2S","4QRQ","3SKM","3X13","1M05","3X14","1AGD","1AGC","1AGB","1AGF","1AGE","3SKO","3FFC","4QRP","1MI5","3SJV"],
    "B14_02":["3BXN","3BVN"],
    "B27_03":["6PYL","6PZ5"],
    "B27_04":["5DEF"],
    "B27_05":["5IB2","5IB1","5IB3","5IB4","5IB5","6PYJ","4G8I","4G9D","3B6S","3BP4","3LV3","2BSS","2BST","3DTX","1W0V","2BSR","4G9F","4G8G"],
    "B27_06":["5DEG"],
    "B27_09":["1K5N","3CZF","1UXW","3D18","3BP7","3B3I","1JGD","3HCV","1W0W","1OF2"],
    "B39_01":["4O2C","4O2F","4O2E"],
    "B18_01":["6MT3","4XXC"],
    "B37_01":["6MT6","6MT4","6MT5","6MTM"],
    "B40_01":["6IEX"],
    "B40_02":["5IEK"],
    "B44_02":["1M6O","3KPM","3DX6","3L3I","3L3D","3KPL","3L3G","3L3J","3L3K"],
    "B44_03":["3DX7","1N2R","3KPN","3KPO","1SYS"],
    "B57_01":["6D29","6D2T","6BXP","6BXQ","5VUE","6D2R","5VUF","5VUD","3VH8","3X12","3UPR","3WUW","6D2B","5B38","2RFX","5B39"],
    "B58_01":["5VWH","5VWJ","5IND","5IM7","5INC"],
    "B15_01":["5TXS","1XR9","3C9N","1XR8","5VZ5"],
    "B46_01":["4LCY"]
}

C_id_dict = {"C01_02":["5W1W","5W1V"],
    "C03_04":["1EFX"],
    "C08_01":["4NT6","6JTN","6JTP"],
    "C16_04":["5XOS","5XOT"],
    "C04_01":["1QQD","1IM9"],
    "C05_01":["6JTO"],
    "C06_02":["5XS3","5W6A","5W69"],
    "C15_10":["6JOZ","6JP3","5T6X","5T6Y","6PBH","5T6W","5T6Z","5T70","6ID4","5WJN","5V5M","5WJL","6MPP","5WKF","5WKH"]
}

# download_from_IMGT(C_id_dict, "../crystal2")

In [14]:
All_HLA1_dict = {"HLA1_crystals":["3BWA","3LN4","6V2P","2BVP","6MTL","6PYW","5WMQ","6P2C","6VB2","6PYV","6P2F",
"1ZHL","2HJL","3VFN","5IEH","6Q3K","6WZY","3VFV","6X00","5WMR","6P27","3SKO","6P23","6V2Q","2BVO","3TID","3VFU",
"4PRE","6P2S","1SYV","2NW3","3KYO","3VFO","4QRQ","5XOS","3BW9","4PRB","4PRD","6VB1","4U1S","4U1N","2AXF","3SKM",
"3X12","3X13","4HX1","5GSB","5VWD","5VWF","6L9K","6UZM","3KPQ","2HJK","3VFP","3VFR","3VFS","4Z77","1M05","1YDP",
"2FZ3","3KPP","3LN5","3VFM","6VB0","4U1I","3ERY","3VFT","6V3J","1A1N","2BSS","2BVQ","3H9H","3WUW","3X14","5VVP",
"5VCL","1VGK","4U1L","3DX8","3IXA","6GH1","6VB7","3TJH","3X11","6VB5","6VB6","6GH4","3KWW","5GSR","6UZN","3TIE",
"5TRZ","7DZM","6ZKW","1A1M","1A1O","2BSR","3VFW","4JRX","4Z78","5TS1","5VGD","6VB4","6VIU","6UZO","6NPR","2YPL",
"3KYN","5GR7","5JWE","6H6D","6L9L","2AK4","2DYP","3BZE","3BZF","4PRH","5GSX","5JWD","5XS3","6PAG"]}
# All_HLA1_dict = {"HLA1_crystals":["1hsa","1jge","1kpr","1ktl","1mhe","1ogt","1rog","1roh","1roi","1roj",
# "1rok","1rol","1syv","1tmc","1uxs","1vgk","1ydp","1zhl","1zvs","2ak4","2axf","2bvo","2bvp","2bvq","2d31",
# "2dyp","2esv","2f74","2fz3","2hjk","2hjl","2nw3","2ypk","2ypl","3am8","3bw9","3bwa","3bze","3bzf","3cdg",
# "3cii","3dx8","3dxa","3ery","3jts","3kpp","3kpq","3kww","3kxf","3kyn","3kyo","3ln4","3ln5","3m17","3pqy",
# "3rwj","3tid","3tie","3tjh","3vfm","3vfn","3vfo","3vfp","3vfr","3vfs","3vft","3vfu","3vfv","3vfw","3vri",
# "3vrj","3w39","3wl9","3wlb","3x11","4e5x","4eup","4euq","4gkn","4gks","4hkj","4hwz","4hx1","4i48","4i4w",
# "4jqv","4jqx","4jrx","4l29","4l3c","4mj5","4mj6","4nnx","4nny","4no0","4no2","4no3","4no5","4prb","4prd",
# "4pre","4prh","4u1i","4u1l","4u1n","4u1s","4wj5","4wu5","4wu7","4z77","4z78","5c07","5c08","5c09","5c0a",
# "5c0b","5c0c","5c0d","5c0e","5c0f","5c0g","5c0h","5c0i","5c0j","5d9s","5e00","5e9d","5eot","5eu3","5eu5",
# "5euo","5f7d","5f9j","5fa3","5fa4","5fdw","5gr7","5grd","5grg","5gsb","5gsd","5gsr","5gsx","5hyj","5ieh",
# "5iro","5iue","5jwd","5jwe","5meo","5n6b","5nht","5nmh","5nmk","5nqk","5trz","5ts1","5u98","5vcl","5vvp",
# "5vwd","5vwf","5w67","5wmn","5wmo","5wmp","5wmq","5wmr","5wsh","6aee","6amt","6amu","6ei2","6eqa","6ewa",
# "6ewc","6ewo","6g3j","6ggm","6gh1","6gh4","6ghn","6gl1","6h6d","6k60","6l9k","6l9l","6mtl","6npr","6o9b",
# "6o9c","6pa1","6pag","6pyv","6pyw","6q3k","6ss7","6ss8","6ss9","6ssa","6uj7","6uj8","6uj9","6ujo","6ujq",
# "6uk2","6uk4","6uli","6ulk","6uln","6ulr","6uon","6uz1","6uzm","6uzn","6uzo","6uzp","6uzq","6uzs","6v2o",
# "6v2p","6v2q","6v3j","6vb0","6vb1","6vb2","6vb3","6vb4","6vb5","6vb6","6vb7","6viu","6vm7","6vm8","6vm9",
# "6vma","6vmc","6vmx","6vpz","6vq2","6vqd","6vqe","6vqy","6vqz","6vr1","6wl2","6wzy","6x00","6xqa","6y26",
# "6y27","6y28","6y29","6y2a","6y2b","6z9v","6z9w","6z9x","6zkw","6zky","7alo","7duu","7dzm","7dzn","7edo",
# "7ejl","7ejm","7ejn","7f4w","7jyu","7jyv","7kgo","7lfz","7lg0","7lg2","7lg3","7lgd","7m8s","7m8t","7m8u",
# "7mj6","7mj7","7mj8","7mj9","7mja"]}

In [15]:
download_from_IMGT(All_HLA1_dict, "../temp/RAW2")
# check_structure_seq("../temp/ALIGN")

===HLA1_crystals===
Downloading: 3BWA  1/121
Downloading: 3LN4  2/121
Downloading: 6V2P  3/121
Downloading: 2BVP  4/121
Downloading: 6MTL  5/121
Downloading: 6PYW  6/121
Downloading: 5WMQ  7/121
Downloading: 6P2C  8/121
Downloading: 6VB2  9/121
Downloading: 6PYV  10/121
Downloading: 6P2F  11/121
Downloading: 1ZHL  12/121
Downloading: 2HJL  13/121
Downloading: 3VFN  14/121
Downloading: 5IEH  15/121
Downloading: 6Q3K  16/121
Downloading: 6WZY  17/121
Downloading: 3VFV  18/121
Downloading: 6X00  19/121
Downloading: 5WMR  20/121
Downloading: 6P27  21/121
Downloading: 3SKO  22/121
Downloading: 6P23  23/121
Downloading: 6V2Q  24/121
Downloading: 2BVO  25/121
Downloading: 3TID  26/121
Downloading: 3VFU  27/121
Downloading: 4PRE  28/121
Downloading: 6P2S  29/121
Downloading: 1SYV  30/121
Downloading: 2NW3  31/121
Downloading: 3KYO  32/121
Downloading: 3VFO  33/121
Downloading: 4QRQ  34/121
Downloading: 5XOS  35/121
Downloading: 3BW9  36/121
Downloading: 4PRB  37/121
Downloading: 4PRD  38/121
D

HTTPError: HTTP Error 404: Not Found

In [6]:
# HLA2_dict = {"HLA2":["3lqz","3wex","4p4k","4p4r","4p57","4p5k","4p5m","1jk8","1s9v","1uvq","2nna","3pl6","4d8p","4gg6","4grl","4May","4ozf","4ozg","4ozh","4ozi","4z7u","4z7v","4z7w","5ks9","5ksa","5ksb","5ksu","5ksv","5ujt","6dfx","6dig","6mff","6mfg","6px6","6py2","6u3m","6u3n","6u3o","6xc9","6xco","6xcp","6xp6","7kei","1a6a","1aqd","1bx2","1d5m","1d5x","1d5z","1d6e","1dlh","1fv1","1fyt","1h15","1hqr","1hxy","1j8h","1jwm","1jws","1jwu","1kg0","1klg","1klu","1lo5","1pyw","1r5i","1seb","1sje","1sjh","1t5w","1t5x","1ymm","1zgl","2fse","2g9h","2iam","2ian","2icw","2ipk","2oje","2q6w","2seb","2wbj","2xn9","3c5j","3l6f","3o6f","3pdo","3pgc","3pgd","3qxa","3qxd","3s4s","3s5l","3t0e","4aen","4ah2","4c56","4E41","4fqx","4gbx","4h1l","4h25","4h26","4i5b","4is6","4mcy","4mcz","4md0","4md4","4md5","4mdi","4mdj","4ov5","4x5w","4x5x","4y19","4y1a","5jlz","5lax","5ni9","5nig","5v4m","5v4n","6atf","6ati","6atz","6bij","6bil","6bin","6bir","6biv","6bix","6biy","6biz","6cpl","6cpn","6cpo","6cqj","6cql","6cqn","6cqq","6cqr","6hby","6nix","6t3y","6v0y","6v13","6v15","6v18","6v19","6v1a","7n19","6dfs","6dfw"]}
HLA2_dict = {"HLA2":["6cqr","6hby","6nix","6t3y","6v0y","6v13","6v15","6v18","6v19","6v1a","7n19","6dfs","6dfw"]}
download_from_IMGT(HLA2_dict, "../crystal/Class2")

===HLA2===
Downloading: 6cqr  1/13
Downloading: 6hby  2/13
Downloading: 6nix  3/13
Downloading: 6t3y  4/13
Downloading: 6v0y  5/13
Downloading: 6v13  6/13
Downloading: 6v15  7/13
Downloading: 6v18  8/13
Downloading: 6v19  9/13
Downloading: 6v1a  10/13
Downloading: 7n19  11/13
Downloading: 6dfs  12/13
Downloading: 6dfw  13/13


In [52]:
confirmed_dict = {"A01_01":['1W72','3BO8','4NQV','4NQX','6AT9','6MPP'],
"A02_01":['1AKJ','1AO7','1B0G','1B0R','1BD2','1DUY','1DUZ','1EEY','1EEZ','1HHG','1HHH','1HHI','1HHJ','1HHK','1I1F','1I1Y','1I4F','1I7R','1I7T','1I7U','1IM3','1JF1','1JHT','1LP9','1OGA','1P7Q','1QEW','1QR1','1QRN','1QSE','1QSF','1S8D','1S9W','1S9X','1S9Y','1T1W','1T1X','1T1Y','1T1Z','1T20','1T21','1T22','1TVB','1TVH','2BNQ','2BNR','2C7U','2CLR','2F53','2F54','2GIT','2GJ6','2GT9','2GTW','2GTZ','2GUO','2P5E','2P5W','2PYE','2V2W','2V2X','2VLJ','2VLK','2VLL','2VLR','2X4N','2X4O','2X4P','2X4Q','2X4R','2X4S','2X4T','2X4U','2X70','3BGM','3BH8','3BH9','3BHB','3D25','3D39','3D3V','3FQN','3FQR','3FQT','3FQU','3FQW','3FQX','3FT2','3FT3','3FT4','3GIV','3GJF','3GJG','3GSN','3GSO','3GSQ','3GSR','3GSU','3GSV','3GSW','3GSX','3H7B','3H9S','3HAE','3HG1','3HLA','3HPJ','3I6G','3I6K','3KLA','3MGO','3MGT','3MR9','3MRB','3MRC','3MRD','3MRE','3MRF','3MRG','3MRH','3MRI','3MRJ','3MRK','3MRL','3MRM','3MRN','3MRO','3MRP','3MRQ','3MRR','3MYJ','3O3A','3O3B','3O3D','3O3E','3O4L','3PWJ','3PWL','3PWN','3PWP','3QDG','3QDJ','3QDM','3QEQ','3QFD','3QFJ','3REW','3TO2','3UTQ','3UTS','3UTT','3V5D','3V5H','3V5K','4JFO','4JFP','4JFQ','4K7F','4U6X','4U6Y','4UQ3','4WUU','5DDH','5ENW','5HHM','5HHN','5HHO','5HHP','5HHQ','5MEP','5MER','5N1Y','5SWQ','5TEZ','5YXN','5YXU','6NCA','6O4Y','6O4Z','6O51','6O53','6OPD','6PTB','6PTE','6RP9','6RPA','6RPB','6RSY','6TRN','6TRO'],
"A02_03":['3OX8'],
"A02_06":['3OXR','6P64'],
"A02_07":['3OXS'],
"A03_01":['2XPG','3RL1','3RL2','6ENY'],
"A11_01":['1Q94','1QVO','1X7Q','2HN7','4BEO','4N8V','4UQ2','5WJL','5WJN','5WKF','5WKH','6ID4','6JOZ','6JP3'],
"A24_02":['2BCK','3I6L','3NFJ','3NFN','3QZW','3VXM','3VXN','3VXO','3VXP','3VXR','3VXS','3VXU','3W0W','4F7M','4F7P','4F7T','5HGA','5HGB','5HGD','5HGH','5WWI','5WWJ','5WWU','5WXC','5WXD','5XOV'],
"A30_01":['6J1W'],
"A30_03":['6J1V','6J29','6J2A'],
"A68_01":['1HSB','2HLA','6PBH'],
"B07_02":['3VCL','4U1H','4U1K','5EO0','5EO1','6AT5','6AVF','6AVG'],
"B08_01":['1AGB','1AGC','1AGD','1AGE','1AGF','1MI5','3FFC','3SJV','3SPV','4QRP','4QRS','4QRT','4QRU'],
"B14_02":['3BVN','3BXN'],
"B15_01":['1XR8','1XR9','3C9N','5TXS','5VZ5'],
"B18_01":['4XXC','6MT3'],
"B27_03":['6PYL','6PZ5'],
"B27_04":['5DEF'],
"B27_05":['1W0V','2BST','3B6S','3BP4','3DTX','3LV3','4G8G','4G8I','4G9D','4G9F','5IB1','5IB2','5IB3','5IB4','6PYJ'],
"B27_06":['5DEG'],
"B27_09":['1JGD','1K5N','1OF2','1UXW','1W0W','3B3I','3BP7','3CZF','3D18','3HCV','5IB5'],
"B35_01":['1A9B','1A9E','1XH3','1ZHK','1ZSD','2AXG','2CIK','2FYY','2H6P','2NX5','3LKN','3LKO','3LKP','3LKQ','3LKR','3LKS','3MV7','3MV8','3MV9','4LNR','4PR5','4PRA','4PRN','4PRP','4QRR'],
"B37_01":['6MT4','6MT5','6MT6','6MTM'],
"B39_01":['4O2C','4O2E','4O2F'],
"B40_01":['6IEX'],
"B40_02":['5IEK'],
"B42_01":['4U1J','4U1M'],
"B44_02":['1M6O','3DX6','3KPL','3KPM','3L3D','3L3G','3L3I','3L3J','3L3K'],
"B44_03":['1N2R','1SYS','3DX7','3KPN','3KPO'],
"B46_01":['4LCY'],
"B51_01":['1E27','4MJI'],
"B57_01":['2RFX','3UPR','3VH8','5B38','5B39','5T6W','5T6X','5T6Y','5T6Z','5T70','5V5M','5VUD','5VUE','5VUF','6BXP','6BXQ','6D29','6D2B','6D2R','6D2T'],
"B58_01":['5IM7','5INC','5IND','5VWH','5VWJ'],
"C03_04":['1EFX'],
"C04_01":['1IM9','1QQD'],
"C05_01":['6JTO'],
"C06_02":['5W69','5W6A'],
"C08_01":['4NT6'],
"C08_02":['6JTN','6JTP']}

In [None]:
confirmed_dict2 = {"DPA101_03_DPB102_01":["4p5m","4p5k","4p4k","4p4r","3lqz"],
"DPA101_03_DPB104_02":["4p57"],
"DPA102_02_DPB105_01":["3wex"],
"DQA101_02_DQB105_01":["3pl6"],
"DQA101_02_DQB105_02":["4May","4grl"],
"DQA101_02_DQB106_02":["1uvq","6dig"],
"DQA101_11_DQB106_02":["7kei"],
"DQA102_01_DQB102_02":["6py2","6px6"],
"DQA103_01_DQB102_01":["4d8p"],
"DQA103_01_DQB103_02":["2nna","5ks9","4z7v","4z7u","4z7w","4gg6"],
"DQA103_01_I75C_DQB103_01":["6dfx"],
"DQA103_01_I75C_DQB103_02":["5ujt","6xc9","6xcp","6xco"],
"DQA103_02_DQB103_02":["1jk8"],
"DQA105_01_DQB102_01":["6mfg","5ksv","1s9v","6xp6","6mff","5ksu","4ozh","4ozg","4ozi"],
"DQA105_01_C47S_DQB102_01":["6u3m","6u3n","6u3o"],
"DQA105_01_C47S_DQB103_02":["5ksa","5ksb"],
"DQA105_08_DQB102_01":["4ozf"],
"DRA01_01_DRB101_01":["4x5w","1klu","3pdo","6hby","2g9h","1pyw","4aen","4ov5","1sjh","1jwu","2ipk","2xn9","1klg","1t5w","1aqd","1sje","1t5x","1fyt","1jws","4E41","1kg0","3pgc","1jwm","1seb","3qxa","3pgd","6cqj","1dlh","2iam","2ian","4c56","6cqr","1lo5","1hxy","4ah2","4x5x"],
"DRA01_01_DRB103_01":["7n19","1a6a"],
"DRA01_01_DRB104_01":["5ni9","5nig","4md4","5jlz","1d5m","1d5z","6bij","6nix","4md0","6v1a","4mcy","6v18","6bil","4mcz","1d5x","1d6e","2seb","4is6","4y19","5lax","6v19","6v0y","6v13","3o6f","6v15","6biv","3t0e","4y1a"],
"DRA01_01_DRB104_01_L170V":["1j8h"],
"DRA01_01_DRB104_02":["4mdj","4mdi"],
"DRA01_01_DRB104_04":["4md5","6biy","6biz","6bix"],
"DRA01_01_DRB104_05":["6bir"],
"DRA01_01_DRB111_01":["6cpn","6cql","6cpl","6cqn"],
"DRA01_01_DRB114_02":["6atf","6ati","6atz"],
"DRA01_01_DRB115_01":["5v4m","1bx2","2wbj","1ymm"],
"DRA01_01_DRB115_02":["6cpo","6cqq"],
"DRA01_01_DRB301_01":["2q6w"],
"DRA01_01_DRB303_01":["3c5j","4h1l"],
"DRA01_01_DRB303_01_N77T":["4h26"],
"DRA01_01_DRB501_01":["1fv1","1zgl","1h15","1hqr"],
"DRA01_01_A182T_DRB101_01":["5v4n"],
"DRA01_01_F54C_DRB101_01":["3qxd"],
"DRA01_01_H177Q_DRB101_01":["3l6f","3s5l","3s4s"],
"DRA01_01_V65C_DRB101_01_C29S":["4i5b","4fqx","4gbx"],
"DRA01_03_DRB303_01_N77T":["4h25"]}

In [45]:
from Bio import Align

def extract_fasta_pdb(PDB_id_dict:dict, PDBDir, RootOutDir:str, FastaFile:str):
    
    fasta_dict = SeqIO.to_dict(SeqIO.parse(FastaFile, "fasta"))
    PepBuilder = PPBuilder()
    parser = PDBParser(PERMISSIVE=1, QUIET=True)
    aligner = Align.PairwiseAligner()
    
    for allele, id_list in PDB_id_dict.items():
        fullname = f"{allele[0]}*{allele[1:3]}:{allele[4:6]}"
        record = fasta_dict[fullname]

        OutDir = f"{RootOutDir}/{allele}"
        if not os.path.exists(OutDir):
            os.makedirs(OutDir)
        # save sequence of allele in fasta format
        SeqIO.write(record, f"{allele}.faa", "fasta")
        
        for pdb in id_list:
            if os.path.exists(f"{PDBDir}/{pdb}.pdb"):
                InStruct = parser.get_structure("target", f"{PDBDir}/{pdb}.pdb")
                InSeq = str(PepBuilder.build_peptides(InStruct)[0].get_sequence())

                pident = aligner.score(InSeq, record.seq)/len(InSeq)
                if pident == 1.0:
                    shutil.move(f"{PDBDir}/{pdb}.pdb", OutDir)
                else:
                    print(pdb)
                    shutil.copy2(f"../crystal2/{pdb}_A.fasta", "../crystal2/QUESTION")

# extract_fasta_pdb(C_id_dict, "../crystal2/ALIGN", "../crystal2/CONFIRM", "../crystal2/BLAST/HLA.fas")

## mean crystal structure

In [127]:
# allele_list = ["A0101", "A0201", "A3003", "A3001", "A0203", "A0206", "A0207", "A0301", "A1101", "A6801", "A2301", "A2402"]
# allele_list = ["B0702","B3501","B4201","B5101","B5301","B0801","B1402","B2703","B2704","B2705","B2706","B2709","B3901","B1801","B3701","B4001","B4002","B4402","B4403","B5701","B5801","B1501","B4601"]
# allele_list = ["C01_02", "C03_04", "C04_01", "C05_01", "C06_02", "C08_02", "C15_10", "C16_04"]
# for allele in allele_list:
#     print(allele)
#     name = allele+".pdb"
    # check_structure_seq(f"../crystal/{allele}/ALIGN")
    # check_atom(f"../crystal/{allele}/ALIGN")
    # RMSF.RMSF_from_dir(f"../crystal/{allele}/ALIGN", f"../crystal/{name}")

for allele in confirmed_dict.keys():
# for allele in ["C06_02"]:
    print(allele)
    # check_structure_seq(f"../crystal/CONFIRM/{allele}")
    # check_atom(f"../crystal/CONFIRM/{allele}")
    RMSF.RMSF_from_dir(f"../crystal/CONFIRM/{allele}", f"../crystal/CONFIRM/{allele+'.pdb'}")

A01_01
done
A02_01
done
A02_03
done
A02_06
done
A02_07
done
A03_01




done
A11_01
done
A24_02
done
A30_01
done
A30_03
done
A68_01
done
B07_02
done
B08_01
done
B14_02
done
B15_01
done
B18_01
done
B27_03
done
B27_04
done
B27_05
done
B27_06
done
B27_09
done
B35_01
done
B37_01
done
B39_01
done
B40_01
done
B40_02
done
B42_01
done
B44_02
done
B44_03
done
B46_01
done
B51_01
done
B57_01
done
B58_01




done
C03_04
done
C04_01
done
C05_01
done
C06_02
done
C08_01
done
C08_02
done




In [7]:
import matplotlib.pyplot as plt
contact = [7,9,24,45,59,62,63,66,67,69,70,73,74,76,77,80,81,84,95,97,99,114,116,118,143,147,150,152,156,158,159,163,167,171]
# contactB = [5,7,8,9,24,45,59,62,63,65,66,67,70,73,74,76,77,80,81,84,95,97,99,114,116,123,143,146,147,152,155,156,159,160,163,167,171]

B_list = ["B07_02", "B08_01", "B15_01", "B27_05", "B27_09", "B35_01", "B37_01", "B39_01", "B44_02", "B44_03", "B57_01", "B58_01"]
def sum_rmsf(InDir, AlleleList):
    
    parser = PDBParser(PERMISSIVE=1, QUIET=True)
    fig = plt.figure(figsize=(24,8), facecolor="w")

    mean_rmsf_collect = []

    for allele in AlleleList:
        mean_rmsf_list = []
        resi_id_list = []
        InStruct = parser.get_structure("target", f"{InDir}/{allele}.pdb")
        # print(InStruct[0]["A"].child_list)
        
        for resi in InStruct[0]["A"].child_list:
            resi_list = []
            for atom in resi:
                resi_list.append(atom.bfactor)

            resi_id_list.append(resi.id[1])
            mean_rmsf_list.append(np.mean(resi_list))

        plt.plot(resi_id_list, mean_rmsf_list, label=allele)

        # print(rmsf_list)
        mean_rmsf_collect.append(mean_rmsf_list)

    plt.legend()
    plt.vlines(contact, 0, 2, linestyles="dashed")
    plt.show()

    fig = plt.figure(figsize=(24,8), facecolor="w")
    std_list = np.std(np.transpose(mean_rmsf_collect), axis=1)
    mean_list = np.mean(np.transpose(mean_rmsf_collect), axis=1)
    plt.errorbar(resi_id_list, mean_list, std_list)
    plt.vlines(contact, 0, 2, linestyles="dashed")
    plt.show()

    return mean_rmsf_collect