In [1]:
### The code below computes the epitope information used for encoding.
### It uses 6nmv.pdb (Chothia-numbered version) as an example; please change it to your PDB of interest.


In [1]:
# 20230207_0460443_summary.tsv is the antigen-antibody complex summary file downloaded
# from SAbDab.

import pandas as pd
import os

#curdir = os.path.dirname(__file__)


def get_sabdab_details(
    summary_path="../../Feb_2023/Include_antigens/20230207_0460443_summary.tsv",
):
    """Load the SAbDab summary table and keep entries with a distinct H/L chain pair.

    Parameters
    ----------
    summary_path : str
        Tab-separated SAbDab summary file. The default is the path this
        notebook originally hard-coded, so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        De-duplicated rows in which ``Hchain`` and ``Lchain`` are both present
        and differ from each other.
    """
    df = pd.read_csv(summary_path, sep="\t").drop_duplicates()
    # Optional protein-antigen filter, left disabled as in the original:
    # df = df.query(
    #     "antigen_type == antigen_type and antigen_type.str.contains('protein')"
    # )
    df = df.query("Hchain != Lchain")
    # `col == col` is False only for NaN, so this drops rows missing either chain.
    df = df.query("Hchain == Hchain and Lchain == Lchain")
    print(f"SabDab\nAbAg Complexes: {len(df)}\nPDB Files: {len(set(df.pdb.values))}\n")
    return df


def read_pdb_line(line, pqr=False):
    """Slice one fixed-width ATOM record into its fields.

    Parameters
    ----------
    line : str
        A single ATOM record.
    pqr : bool
        When true, the raw text of columns 63-70 is appended to the result.

    Returns
    -------
    tuple
        ``(chain, (resname, resnumb), (aname, anumb), (x, y, z))``, plus the
        raw ``line[63:70]`` slice when ``pqr`` is true. ``resnumb`` is the
        unstripped 5-character slice ``line[22:27]`` (a string, not an int);
        ``anumb`` is an int and the coordinates are floats.
    """
    atom = (line[12:16].strip(), int(line[5:11].strip()))
    resname = line[17:21].strip()[:3]
    chain = line[21]
    residue = (resname, line[22:27])
    # x, y and z occupy three consecutive 8-character fields starting at column 30.
    xyz = tuple(float(line[start:start + 8]) for start in (30, 38, 46))

    fields = (chain, residue, atom, xyz)
    if pqr:
        fields += (line[63:70],)
    return fields


def pqr2xyzr(fin, fout, cdrnumb):
    """Write the coordinates and radius field of every ATOM record in ``fin`` to ``fout``.

    Parameters
    ----------
    fin : str
        Input file; only lines starting with ``"ATOM "`` are used.
    fout : str
        Output file. It is always overwritten.
    cdrnumb
        Value stored, unchanged, as the last element of every atom entry.

    Returns
    -------
    (dict, bool)
        ``atoms`` maps atom serial number to
        ``(chain, resnumb, resname, aname, cdrnumb)``. ``recalc`` is True when
        ``fout`` already existed and its line count differed from the number of
        ATOM records read.

    Notes
    -----
    Records are joined with an empty string, so any line break in ``fout``
    comes from the raw ``line[63:70]`` slice itself.
    """
    records = []
    atom_table = {}
    with open(fin) as handle:
        for raw in (ln for ln in handle if ln.startswith("ATOM ")):
            chain, residue, atom, xyz = read_pdb_line(raw)
            resname, resnumb = residue
            aname, anumb = atom
            x, y, z = xyz
            radius = raw[63:70]
            records.append(f"{x} {y} {z} {radius}")
            atom_table[anumb] = (chain, resnumb, resname, aname, cdrnumb)

    # Compare against any previous output before it is overwritten below.
    recalc = False
    if os.path.isfile(fout):
        with open(fout, "r") as handle:
            print(fout)
            recalc = len(handle.readlines()) != len(records)

    with open(fout, "w") as handle:
        handle.write("".join(records))

    return atom_table, recalc


def remove_redundant(df, group_size=6):
    """Split a CDR table into first-seen and repeated antibodies.

    Rows are consumed in consecutive groups of ``group_size`` (six CDR rows per
    antibody by default). A group's identity is the concatenation of its
    ``cdr_seq`` values; the first group with a given identity is kept and
    later ones are discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``cdr_seq`` column of strings, ordered so that each
        antibody occupies ``group_size`` consecutive rows.
    group_size : int
        Rows per antibody. Defaults to 6, the value previously hard-coded.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(kept, removed)``, both selected positionally from ``df``.

    Notes
    -----
    A trailing group shorter than ``group_size`` is treated as its own
    antibody. The previous code built positions past the end of the frame in
    that case, which made ``iloc`` raise ``IndexError``.
    """
    seen = set()
    keep = []
    discard = []
    n_rows = len(df)
    for start in range(0, n_rows, group_size):
        stop = min(start + group_size, n_rows)  # clamp the last, possibly short, group
        key = "".join(df["cdr_seq"].iloc[start:stop])
        positions = range(start, stop)
        if key in seen:
            discard.extend(positions)
        else:
            seen.add(key)
            keep.extend(positions)

    return df.iloc[keep], df.iloc[discard]

In [2]:
import pandas as pd
from Bio.Data.IUPACData import protein_letters_3to1 as AA_CONVERTER
from abnumber import Chain
#from utils import read_pdb_line, get_sabdab_details, remove_redundant
import numpy as np
from scipy.spatial import distance_matrix
from multiprocessing import Pool


class Structure:
    """One antibody-antigen PDB entry: chain sequences, CDRs and nearby antigen atoms.

    The coordinates are read from ``<idcode>.pdb`` relative to the working
    directory. The intended call order (as used by ``run``) is
    ``get_cdrs`` -> ``get_cdr_epitope`` -> ``save_cdr_epitopes``.
    """

    def __init__(self, idcode, df_pdb):
        """Collect the antibody/antigen chain ids and read the chain sequences.

        Parameters
        ----------
        idcode : str
            PDB identifier; also determines the file name ``<idcode>.pdb``.
        df_pdb : pandas.DataFrame
            Rows for this entry with ``Hchain``, ``Lchain`` and
            ``antigen_chain`` columns.
        """
        self.idcode = idcode
        self.pdb_path = f"{idcode}.pdb"

        # Flat list laid out as [H, L, H, L, ...]; get_cdrs() walks it two at a time.
        self.chains = []
        for _, values in df_pdb.iterrows():
            chains = list(values[["Hchain", "Lchain"]].dropna().unique())
            if len(chains) == 2:
                self.chains += chains

        # Only the first row's antigen chains are used. The result is a plain
        # string of chain ids, so `chain in self.antigen_chains` below is a
        # substring test.
        # NOTE(review): raises if the first antigen_chain value is not a string
        # (e.g. NaN) -- confirm the caller filters such rows.
        self.antigen_chains = df_pdb.antigen_chain.values[0].replace(" | ", "")

        self.get_sequence()
        self.ab_chains = {}

    def get_sequence(self):
        """Read the one-letter sequence and residue-number list of each antibody chain.

        Fills ``self.sequence`` (chain -> str) and ``self.seqnumbs`` (chain ->
        list of raw residue-number strings) from the ATOM records. A new
        residue is recorded whenever ``(resname, chain, resnumb)`` differs from
        the previous ATOM line.
        """
        self.sequence = {chain: "" for chain in self.chains}
        self.seqnumbs = {chain: [] for chain in self.chains}
        last = None
        with open(self.pdb_path) as f:
            for line in f:
                if line.startswith("ATOM"):
                    chain, res, *_ = read_pdb_line(line)
                    (resname, resnumb) = res
                    new = resname, chain, resnumb
                    if chain in self.chains and last != new:
                        res = AA_CONVERTER[resname.capitalize()]
                        self.sequence[chain] += res
                        self.seqnumbs[chain].append(resnumb)
                    last = new

    def get_cdrs(self, scheme):
        """Locate CDR1-3 on every heavy/light pair and register them in ``self.ab_chains``.

        Parameters
        ----------
        scheme : str
            Numbering scheme forwarded to ``abnumber.Chain``.

        A pair is skipped entirely when either of its chains cannot be parsed
        by ``Chain``.
        """
        cdrs = {}
        for ichain in range(0, len(self.chains), 2):
            hchain, lchain = self.chains[ichain : ichain + 2]
            chains = {}
            for chainid in (hchain, lchain):
                seq = self.sequence[chainid]
                print(chainid, seq, scheme)
                try:
                    chain = Chain(seq, scheme=scheme)
                    chains[chainid] = chain
                except Exception:
                    # Not parsable as an antibody chain; the pair is dropped below.
                    # `Exception` (not a bare except) so Ctrl-C still interrupts.
                    continue

            if len(chains) != 2:
                continue

            for chainid in (hchain, lchain):
                seq = self.sequence[chainid]
                chain = chains[chainid]
                ab_chain = AbChain(seq, chainid, chain.chain_type)
                self.ab_chains[chainid] = ab_chain

                cdrs[chainid] = {
                    1: chain.cdr1_seq,
                    2: chain.cdr2_seq,
                    3: chain.cdr3_seq,
                }
                for i, cdr in cdrs[chainid].items():
                    # First occurrence of the CDR string in the structure sequence.
                    b = seq.index(cdr)
                    e = b + len(cdr) - 1

                    # Translate sequence offsets to the PDB residue numbers.
                    b_resnumb = self.seqnumbs[chainid][b]
                    e_resnumb = self.seqnumbs[chainid][e]

                    ab_chain.add_cdr(i, cdr, b_resnumb, e_resnumb)

    def get_cdr_epitope(self, cutoff):
        """Assign atoms to each CDR and record the antigen atoms near it.

        Parameters
        ----------
        cutoff : float
            Distance threshold, in the units of the PDB coordinates. An antigen
            atom is recorded for a CDR when it is strictly closer than
            ``cutoff`` to at least one atom of that CDR.

        Returns early (recording nothing) when no antibody chain was
        registered by ``get_cdrs`` or when no antigen atom is found.
        """
        if not self.ab_chains:
            return

        cur_cdr = None  # key (1/2/3) of the CDR currently being walked, if any
        cdr_i = None  # index of the current residue within that CDR's sequence
        prev_resnumb = None
        # Assign atoms to CDRs
        with open(self.pdb_path) as f:
            for line in f:
                if line.startswith("ATOM"):
                    chain, res, atom, coords = read_pdb_line(line)
                    if chain in self.ab_chains:
                        (resname, resnumb) = res

                        abchain = self.ab_chains[chain]

                        # Residue numbers are the raw fixed-width strings from
                        # read_pdb_line, so these comparisons are string comparisons.
                        if not cur_cdr:
                            for i, cdr in abchain.cdrs.items():
                                if resnumb >= cdr.i_begin and resnumb <= cdr.i_end:
                                    if resnumb == cdr.i_begin:
                                        cur_cdr = i
                                        cdr_i = -1
                                        break

                        if cur_cdr:
                            cdr = abchain.cdrs[cur_cdr]

                            if prev_resnumb != resnumb:
                                cdr_i += 1

                            if resnumb > cdr.i_end or cdr_i >= len(cdr.seq):
                                # Walked past the CDR: reset the tracking state.
                                prev_resnumb = None
                                cur_cdr = None
                            else:
                                assert (
                                    AA_CONVERTER[resname.capitalize()] == cdr.seq[cdr_i]
                                ), f"CDR from sequence does not match the structure\n{AA_CONVERTER[resname.capitalize()]}\n{cdr.seq[cdr_i]}"
                                cdr.add_atom(atom, res, coords)

                                prev_resnumb = resnumb

        # Gather every antigen atom as a candidate epitope atom.
        candidates_details = []
        candidates_coords = []
        with open(self.pdb_path) as f:
            for line in f:
                if line.startswith("ATOM"):
                    chain, res, atom, coords = read_pdb_line(line)
                    if chain in self.antigen_chains:
                        candidates_details.append((chain, res, atom))
                        candidates_coords.append(coords)

        if len(candidates_coords) == 0:
            return

        for chainid, abchain in self.ab_chains.items():
            for cdr in abchain.cdrs.values():
                cdr_atoms = [atom.coords for atom in cdr.atoms.values()]

                if len(cdr_atoms):
                    # Rows are antigen atoms, columns are atoms of this CDR.
                    dm = distance_matrix(candidates_coords, cdr_atoms)
                else:
                    continue
                cutoff_dm = dm < cutoff
                for i in range(dm.shape[0]):
                    if cutoff_dm[i].sum() > 0:
                        anumb = candidates_details[i][2][1]
                        resnumb = candidates_details[i][1][1]
                        cdr.save_epitope(anumb, resnumb)

    def print_cdrs(self):
        """Print chain id, chain type, CDR number, sequence and residue range of every CDR."""
        for chain in self.ab_chains.values():
            for i, cdr in chain.iter_cdrs():
                print(
                    chain.chain_id, chain.chain_type, i, cdr.seq, cdr.i_begin, cdr.i_end
                )

    def save_cdr_epitopes(self):
        """Return one record per CDR.

        Each record is ``[idcode, chain_id, chain_type, cdr number, cdr
        sequence, begin, end, CDR atom serial numbers, epitope atom serial
        numbers, epitope residue numbers]``.
        """
        cdrs = []
        for chain in self.ab_chains.values():
            for i, cdr in chain.iter_cdrs():
                cdrs.append(
                    [
                        self.idcode,
                        chain.chain_id,
                        chain.chain_type,
                        i,
                        cdr.seq,
                        cdr.i_begin,
                        cdr.i_end,
                        [atom.anumb for atom in cdr.atoms.values()],
                        cdr.epitope,
                        cdr.epitope_res,
                    ]
                )

        return cdrs


class AbChain:
    """An antibody chain together with the CDRs registered on it."""

    def __init__(self, seq, chain_id, chain_type):
        """Store the chain's sequence, id and type; start with no CDRs."""
        self.seq, self.chain_id, self.chain_type = seq, chain_id, chain_type
        self.cdrs = {}  # cdr_type -> CDR

    def add_cdr(self, cdr_type, seq, begin, end):
        """Create a ``CDR`` and store it under ``cdr_type``, replacing any previous one."""
        self.cdrs[cdr_type] = CDR(cdr_type, seq, begin, end)

    def iter_cdrs(self):
        """Yield ``(cdr_type, cdr)`` pairs in insertion order."""
        yield from self.cdrs.items()


class CDR:
    """A single CDR: its sequence, residue range, atoms and nearby epitope atoms."""

    def __init__(self, cdr_type, seq, b, e):
        """Store the CDR number, sequence and begin/end residue numbers."""
        self.numb, self.seq = cdr_type, seq
        self.i_begin, self.i_end = b, e
        self.size = len(seq)
        self.atoms = {}  # atom serial number -> Atom
        self.epitope, self.epitope_res = [], []  # parallel lists, one entry per epitope atom

    def add_atom(self, atom, res, coords):
        """Register an atom of this CDR, keyed by ``atom[1]`` (its serial number)."""
        self.atoms[atom[1]] = Atom(atom, res, coords)

    def save_epitope(self, anumb, resnumb):
        """Record one epitope atom and the residue number it belongs to."""
        self.epitope.append(anumb)
        self.epitope_res.append(resnumb)

    def print_pymol_selection(self):
        """Print the CDR and epitope atom numbers as ``+``-joined lists."""
        cdr_selection = "+".join(map(str, self.atoms))
        epitope_selection = "+".join(map(str, self.epitope))
        print("CDR:", len(self.atoms), cdr_selection)
        print("Epitope:", len(self.epitope), epitope_selection)


class Atom:
    """One atom: residue name/number, atom name/serial number and coordinates."""

    def __init__(self, atom, res, coords):
        """Unpack ``res = (resname, resnumb)`` and ``atom = (aname, anumb)``; copy ``coords`` into an array."""
        resname, resnumb = res
        aname, anumb = atom
        self.resname = resname
        self.resnumb = resnumb
        self.aname = aname
        self.anumb = anumb
        self.coords = np.array(coords)


def run(x):
    """Extract the CDR/epitope table for one PDB entry.

    Parameters
    ----------
    x : tuple
        ``(i, pdb)``: a progress counter (only printed) and the PDB id.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame) or None
        ``(kept, removed)`` as produced by ``remove_redundant``, or ``None``
        (after printing ``"skipped"``) when no CDR was extracted.

    Notes
    -----
    Reads the module-level names ``unique_pdbs``, ``df_details``, ``scheme``
    and ``cutoff``. Errors raised while processing the structure propagate to
    the caller, exactly as before (the original try/except was disabled with
    ``if True:``, which left its ``else`` branch unreachable).
    """
    i, pdb = x
    print(i, len(unique_pdbs), pdb)
    # Boolean mask instead of a string-interpolated query: same rows, no quoting pitfalls.
    df_pdb = df_details.loc[
        df_details["pdb"] == pdb, ["Hchain", "Lchain", "antigen_chain"]
    ]

    struct = Structure(pdb, df_pdb)
    struct.get_cdrs(scheme)
    struct.print_cdrs()
    struct.get_cdr_epitope(cutoff)
    cdrs = struct.save_cdr_epitopes()

    if not cdrs:
        print("skipped")
        return None

    df = pd.DataFrame(
        cdrs,
        columns=(
            "idcode",
            "chainID",
            "chain_type",
            "cdr",
            "cdr_seq",
            "cdr_begin",
            "cdr_end",
            "cdr_atoms",
            "epitope_atoms",
            "epitope_residues",
        ),
    )
    df, removed = remove_redundant(df)
    return df, removed


if __name__ == "__main__":
    # The names assigned here are module-level globals: run() reads
    # df_details, scheme, cutoff and unique_pdbs, and later cells read df_r.
    df_details = get_sabdab_details()
    scheme = "chothia"  # numbering scheme handed to Structure.get_cdrs
    cutoff = 5  # distance threshold handed to Structure.get_cdr_epitope

    # Empty accumulators with the same columns run() produces. They are only
    # filled by the batch loop that is currently commented out below.
    df = pd.DataFrame(
        columns=(
            "idcode",
            "chainID",
            "chain_type",
            "cdr",
            "cdr_seq",
            "cdr_begin",
            "cdr_end",
            "cdr_atoms",
            "epitope_atoms",
            "epitope_residues",
        ),
    )
    df_removed = df.copy()
    unique_pdbs = df_details.pdb.unique()

    # Single-entry example; expects 6nmv.pdb in the working directory.
    df_r = run((0, "6nmv"))
    # print(
    #     df_r[0][
    #         [
    #             "idcode",
    #             "chainID",
    #             "chain_type",
    #             "cdr",
    #             "cdr_atoms",
    #             "epitope_residues",
    #         ]
    #     ]
    # )
    # exit()

#     for i, pdb in enumerate(unique_pdbs[:10]):
#         print(f"### {i} ###")
#         df_r = run((i, pdb))
#        # print(df_r[["idcode", "chainID", "chain_type", "cdr", "epitope_residues"]])
#         exit()

    #with Pool(1) as p:
        #rs = p.map(run, [(i, pdb) for i, pdb in enumerate(unique_pdbs[:10])])
#     rs = list(map(run, [(i, pdb) for i, pdb in enumerate(unique_pdbs[1950:1960])]))
#     for r in rs:
#         if r is not None:
#             if r[0] is not None:
#                 df = df.append(r[0], ignore_index=True)
#             if r is not None or r[1] is not None:
#                 df_removed = df_removed.append(r[1], ignore_index=True)

#     print(f"PDBs: {len(df.idcode.unique())}\tCDRs: {len(df)}\tAbAg: {len(df)/6}")
#     df.to_pickle("../data/cdr_epitope.pickle")

#     print(
#         f"PDBs: {len(df_removed.idcode.unique())}\tCDRs: {len(df_removed)}\tAbAg: {len(df_removed)/6}"
#     )
   # df_removed.to_pickle("../data/cdr_epitope_redundant.pickle")

SabDab
AbAg Complexes: 6252
PDB Files: 3123

0 3123 6nmv
H DVQLVESGGGVVRPGESLTLSCTASGFTFTSSTMNWVRQAPGEGLDWVSSISTSGVITYYADSVKGRATISRDNSKNTLYLRLFSLRADDTAIYYCATDTFDHWGPGTLVTVSSASTKGPSVFPLAPALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPK chothia
L ALTQPASVSANPGETVKITCFGSSGNYGWFQQKSPGSAPVTVIHYNNKRPSDIPSRFSGSKSGSTGTLTITGVRAEDEAVYFCGAWETGSATFGAGTTLTVLGQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGSTVEKTVAPTE chothia
H H 1 GFTFTSS   26    32 
H H 2 STSGVI   52    56 
H H 3 DTFDH   95   102 
L L 1 FGSSGNYG   24    34 
L L 2 YNNKRPS   50    56 
L L 3 GAWETGSAT   89    97 


In [3]:
# run() returns a (kept, removed) pair, hence a length of 2.
len(df_r)

2

In [4]:
# First element: the rows kept by remove_redundant (one row per CDR).
df_r[0]

Unnamed: 0,idcode,chainID,chain_type,cdr,cdr_seq,cdr_begin,cdr_end,cdr_atoms,epitope_atoms,epitope_residues
0,6nmv,H,H,1,GFTFTSS,26,32,"[1064, 1065, 1066, 1067, 1068, 1069, 1070, 107...","[641, 642, 643, 645, 649, 650, 830, 831, 832]","[ 87 , 87 , 87 , 88 , 88 , 88 , 11..."
1,6nmv,H,H,2,STSGVI,52,56,"[1266, 1267, 1268, 1269, 1270, 1271, 1272, 127...","[280, 281, 283, 284, 285, 286, 287, 288, 289, ...","[ 40 , 40 , 40 , 40 , 40 , 40 , 4..."
2,6nmv,H,H,3,DTFDH,95,102,"[1634, 1635, 1636, 1637, 1638, 1639, 1640, 164...","[293, 294, 295, 296, 297, 298, 299, 300, 302, ...","[ 42 , 42 , 42 , 42 , 42 , 43 , 4..."
3,6nmv,L,L,1,FGSSGNYG,24,34,"[2568, 2569, 2570, 2571, 2572, 2573, 2574, 257...","[321, 322, 323, 763, 764, 766, 767, 768, 769, ...","[ 46 , 46 , 46 , 102 , 102 , 102 , 10..."
4,6nmv,L,L,2,YNNKRPS,50,56,"[2749, 2750, 2751, 2752, 2753, 2754, 2755, 275...","[297, 321, 322, 688, 774, 775, 776, 777, 802, ...","[ 42 , 46 , 46 , 92 , 103 , 103 , 10..."
5,6nmv,L,L,3,GAWETGSAT,89,97,"[3044, 3045, 3046, 3047, 3048, 3049, 3050, 305...","[273, 275, 277, 294, 295, 297, 298, 299, 300, ...","[ 39 , 39 , 39 , 42 , 42 , 42 , 4..."


In [6]:
# Work on the kept (non-redundant) table for the rest of the notebook.
table = df_r[0]

# One row per CDR: its sequence and begin/end residue numbers, the serial numbers of its atoms,
# and the serial numbers and residue numbers of the antigen (epitope) atoms found near it.

In [7]:
# Preview the per-CDR table.
table.head()

Unnamed: 0,idcode,chainID,chain_type,cdr,cdr_seq,cdr_begin,cdr_end,cdr_atoms,epitope_atoms,epitope_residues
0,6nmv,H,H,1,GFTFTSS,26,32,"[1064, 1065, 1066, 1067, 1068, 1069, 1070, 107...","[641, 642, 643, 645, 649, 650, 830, 831, 832]","[ 87 , 87 , 87 , 88 , 88 , 88 , 11..."
1,6nmv,H,H,2,STSGVI,52,56,"[1266, 1267, 1268, 1269, 1270, 1271, 1272, 127...","[280, 281, 283, 284, 285, 286, 287, 288, 289, ...","[ 40 , 40 , 40 , 40 , 40 , 40 , 4..."
2,6nmv,H,H,3,DTFDH,95,102,"[1634, 1635, 1636, 1637, 1638, 1639, 1640, 164...","[293, 294, 295, 296, 297, 298, 299, 300, 302, ...","[ 42 , 42 , 42 , 42 , 42 , 43 , 4..."
3,6nmv,L,L,1,FGSSGNYG,24,34,"[2568, 2569, 2570, 2571, 2572, 2573, 2574, 257...","[321, 322, 323, 763, 764, 766, 767, 768, 769, ...","[ 46 , 46 , 46 , 102 , 102 , 102 , 10..."
4,6nmv,L,L,2,YNNKRPS,50,56,"[2749, 2750, 2751, 2752, 2753, 2754, 2755, 275...","[297, 321, 322, 688, 774, 775, 776, 777, 802, ...","[ 42 , 46 , 46 , 92 , 103 , 103 , 10..."


In [8]:
from Bio.PDB import PDBParser

def get_atom_coordinates(pdb_file, chain_id, atom_id):
    """Return the coordinates of one atom of a given chain (first model).

    Parameters
    ----------
    pdb_file : str
        Path to the PDB file.
    chain_id : str
        Chain to search.
    atom_id : int
        Atom serial number, matched against ``atom.serial_number`` like the
        other helpers in this cell.
        NOTE(review): the original indexed the chain with this value, which
        yields a residue (no ``get_coord``), so it could never return
        coordinates; confirm that a serial number is what callers pass.

    Returns
    -------
    numpy.ndarray
        The atom's coordinates.

    Raises
    ------
    KeyError
        If the chain does not exist or contains no atom with that serial number.
    """
    parser = PDBParser()
    structure = parser.get_structure('pdb', pdb_file)
    chain = structure[0][chain_id]
    for atom in chain.get_atoms():
        if atom.serial_number == atom_id:
            return atom.get_coord()
    raise KeyError(f"no atom with serial number {atom_id!r} in chain {chain_id!r}")

from Bio.PDB import PDBParser

def get_atom_coordinates_by_number(pdb_file, atom_number):
    """Return the coordinates of the first atom in model 0 whose serial number is ``atom_number``.

    Returns ``None`` when no atom matches.
    """
    model = PDBParser().get_structure('pdb', pdb_file)[0]
    match = next(
        (atom for atom in model.get_atoms() if atom.serial_number == atom_number),
        None,
    )
    return None if match is None else match.get_coord()

                
from Bio.PDB import PDBParser

def get_atom_coordinates_and_residue_type_by_number(pdb_file, atom_number):
    """Return ``(coordinates, residue name)`` for the first atom in model 0 with this serial number.

    Returns ``None`` when no atom matches.
    """
    model = PDBParser().get_structure('pdb', pdb_file)[0]
    for residue in model.get_residues():
        for atom in residue:
            if atom.serial_number != atom_number:
                continue
            return atom.get_coord(), residue.resname
    return None

                
import numpy as np

def euclidean_distance(point1, point2):
    """Return the Euclidean (L2) norm of ``point1 - point2``; both must support array subtraction."""
    return np.linalg.norm(point1 - point2)



from Bio.PDB import PDBParser
import numpy as np

def get_close_atoms(pdb_file, lst1, lst2, distance_threshold=5.0):
    """List every (atom in ``lst1``, atom in ``lst2``) pair closer than a threshold.

    Parameters
    ----------
    pdb_file : str
        Path to the PDB file; only the first model is used.
    lst1, lst2 : iterable of int
        Atom serial numbers. In this notebook ``lst1`` holds CDR atoms and
        ``lst2`` holds epitope atoms.
    distance_threshold : float
        A pair is reported when its distance is strictly below this value.

    Returns
    -------
    list of tuple
        One ``(resname1, resnum1, resname2, resnum2, [names2, coords2])`` entry
        per close pair, in ``lst1``-major / ``lst2``-minor order. ``names2`` and
        ``coords2`` list every atom of the second atom's residue.

    Notes
    -----
    * Serial numbers absent from the structure are skipped, and an atom is
      never paired with itself, as before.
    * The first residue is now always the parent of the ``lst1`` atom. The
      previous code used whichever residue the file scan was on when both
      atoms had been seen, which is the ``lst2`` atom's residue whenever that
      atom comes later in the file.
    * Atoms are indexed by serial number once instead of rescanning the whole
      structure for every pair.
    """
    parser = PDBParser()
    structure = parser.get_structure('pdb', pdb_file)
    model = structure[0]

    # serial number -> atom; keep the first atom seen for a given serial number
    atoms_by_serial = {}
    for atom in model.get_atoms():
        atoms_by_serial.setdefault(atom.serial_number, atom)

    close_atoms = []
    for atom_number1 in lst1:
        atom1 = atoms_by_serial.get(atom_number1)
        if atom1 is None:
            continue
        residue1 = atom1.get_parent()
        for atom_number2 in lst2:
            if atom_number2 == atom_number1:
                continue
            atom2 = atoms_by_serial.get(atom_number2)
            if atom2 is None:
                continue
            distance = np.linalg.norm(atom1.get_coord() - atom2.get_coord())
            if distance < distance_threshold:
                residue2 = atom2.get_parent()
                re2_atom_name = [atom.get_name() for atom in residue2]
                re2_atom_coord = [atom.coord for atom in residue2]
                close_atoms.append(
                    (
                        residue1.resname,
                        residue1.id[1],
                        residue2.resname,
                        residue2.id[1],
                        [re2_atom_name, re2_atom_coord],
                    )
                )
    return close_atoms

In [9]:
# One list of close (CDR atom, epitope atom) pairs per row of `table`.
# Iterating over the columns directly avoids label-based `table[col][i]`
# lookups, which fail when the index is not 0..n-1. The PDB file name is taken
# from each row's idcode ("<idcode>.pdb", as in Structure) instead of being
# hard-coded to 6nmv.pdb.
new_lst = [
    get_close_atoms(f"{idcode}.pdb", cdr_lst, epitope_lst)
    for idcode, cdr_lst, epitope_lst in zip(
        table["idcode"], table["cdr_atoms"], table["epitope_atoms"]
    )
]

In [22]:
import pandas as pd

# Module-level accumulators appended to by update_close_atoms (defined below):
# the_count gets one entry each time its name-only fallback is used,
# num_lst gets the number of partner residues found for each CDR position.
the_count =[]
num_lst = []

def is_list_of_empty_lists(lst):
    """Return True when no element of ``lst`` is truthy (also True for an empty ``lst``)."""
    return not any(lst)

# define function to convert cdr_seq to three letter code
def convert_seq(seq):
    """Translate a one-letter amino-acid string into concatenated upper-case three-letter codes.

    Only the 20 standard residues are known; any other character raises ``KeyError``.
    """
    aa_dict = {
        'A': 'ALA', 'C': 'CYS', 'D': 'ASP', 'E': 'GLU', 'F': 'PHE',
        'G': 'GLY', 'H': 'HIS', 'I': 'ILE', 'K': 'LYS', 'L': 'LEU',
        'M': 'MET', 'N': 'ASN', 'P': 'PRO', 'Q': 'GLN', 'R': 'ARG',
        'S': 'SER', 'T': 'THR', 'V': 'VAL', 'W': 'TRP', 'Y': 'TYR',
    }
    three_letter = []
    for letter in seq:
        three_letter.append(aa_dict[letter])
    return ''.join(three_letter)

# define function to update close_atoms column based on cdr_seq and cdr_begin
def update_close_atoms(row):
    """List, for every CDR position, the antigen residues in contact with it.

    Parameters
    ----------
    row : mapping
        Needs ``cdr_seq`` (one-letter CDR sequence), ``cdr_begin`` (residue
        number of the first CDR residue) and ``close_atoms`` (tuples
        ``(resname1, resnum1, resname2, resnum2, ...)`` as built by
        ``get_close_atoms``).

    Returns
    -------
    list of list of str
        Exactly one list per character of ``cdr_seq``; each holds unique
        labels such as ``"GLY87"`` (partner residue name + number), in no
        particular order.

    Side effects
    ------------
    Appends to the module-level ``num_lst`` (one count per position) and to
    ``the_count`` (one entry whenever the name-only fallback is used).

    Notes
    -----
    * Position ``k`` of the CDR is assumed to be residue ``cdr_begin + k``.
      NOTE(review): residues that differ only by insertion code share the same
      integer number, so this mapping can miss contacts in CDRs that contain
      insertions -- confirm against the numbering of the input structures.
    * Only the leading integer of ``cdr_begin`` is used. If there is none,
      every position gets one empty list. The previous bare ``except``
      appended an empty list and then fell through, producing two entries for
      the same position.
    * If contacts exist but no position matched, the function falls back to
      matching on residue name alone.
    * A later cell in this notebook defines another function with this same
      name, which replaces this one in the kernel once that cell has run.
    """
    import re  # used only to pull the leading integer out of cdr_begin

    cdr_seq = row['cdr_seq']
    close_atoms = row['close_atoms']

    begin_match = re.match(r"\s*(-?\d+)", str(row['cdr_begin']))
    cdr_begin = int(begin_match.group(1)) if begin_match else None

    def collect(is_partner):
        """Unique partner labels among the contacts accepted by ``is_partner``."""
        partners = list(
            {atom[2] + str(atom[3]) for atom in close_atoms if is_partner(atom)}
        )
        num_lst.append(len(partners))
        return partners

    updated_close_atoms = []
    for offset, aa in enumerate(cdr_seq):
        aa3 = convert_seq(aa)
        if cdr_begin is None:
            # Position cannot be resolved: exactly one empty entry for it.
            num_lst.append(0)
            updated_close_atoms.append([])
            continue
        key = (aa3, cdr_begin + offset)
        updated_close_atoms.append(collect(lambda atom, key=key: atom[:2] == key))

    if not is_list_of_empty_lists(close_atoms) and is_list_of_empty_lists(updated_close_atoms):
        the_count.append(1)

        # Fallback: match on the CDR residue name only, ignoring the position.
        updated_close_atoms = []
        for aa in cdr_seq:
            aa3 = convert_seq(aa)
            updated_close_atoms.append(collect(lambda atom, aa3=aa3: atom[0] == aa3))

    return updated_close_atoms

In [19]:
# Attach the close-pair lists (one entry per row of `table`) as a new column.
table["close_atoms"] = new_lst

In [20]:
# Preview the table with the new close_atoms column.
table.head()

Unnamed: 0,idcode,chainID,chain_type,cdr,cdr_seq,cdr_begin,cdr_end,cdr_atoms,epitope_atoms,epitope_residues,close_atoms
0,6nmv,H,H,1,GFTFTSS,26,32,"[1064, 1065, 1066, 1067, 1068, 1069, 1070, 107...","[641, 642, 643, 645, 649, 650, 830, 831, 832]","[ 87 , 87 , 87 , 88 , 88 , 88 , 11...","[(SER, 31, GLY, 87, [['N', 'CA', 'C', 'O'], [a..."
1,6nmv,H,H,2,STSGVI,52,56,"[1266, 1267, 1268, 1269, 1270, 1271, 1272, 127...","[280, 281, 283, 284, 285, 286, 287, 288, 289, ...","[ 40 , 40 , 40 , 40 , 40 , 40 , 4...","[(SER, 52, GLY, 45, [['N', 'CA', 'C', 'O'], [a..."
2,6nmv,H,H,3,DTFDH,95,102,"[1634, 1635, 1636, 1637, 1638, 1639, 1640, 164...","[293, 294, 295, 296, 297, 298, 299, 300, 302, ...","[ 42 , 42 , 42 , 42 , 42 , 43 , 4...","[(ASP, 95, ALA, 42, [['N', 'CA', 'C', 'O', 'CB..."
3,6nmv,L,L,1,FGSSGNYG,24,34,"[2568, 2569, 2570, 2571, 2572, 2573, 2574, 257...","[321, 322, 323, 763, 764, 766, 767, 768, 769, ...","[ 46 , 46 , 46 , 102 , 102 , 102 , 10...","[(SER, 27, VAL, 102, [['N', 'CA', 'C', 'O', 'C..."
4,6nmv,L,L,2,YNNKRPS,50,56,"[2749, 2750, 2751, 2752, 2753, 2754, 2755, 275...","[297, 321, 322, 688, 774, 775, 776, 777, 802, ...","[ 42 , 46 , 46 , 92 , 103 , 103 , 10...","[(TYR, 50, ALA, 42, [['N', 'CA', 'C', 'O', 'CB..."


In [23]:
# Per CDR position: the antigen residues in contact with it
# (uses the update_close_atoms defined above, before it is redefined further down).
table['process_close_atoms2'] = table.apply(update_close_atoms, axis=1)


In [24]:
# Preview the table with the per-position contact lists.
table.head()

Unnamed: 0,idcode,chainID,chain_type,cdr,cdr_seq,cdr_begin,cdr_end,cdr_atoms,epitope_atoms,epitope_residues,close_atoms,process_close_atoms2
0,6nmv,H,H,1,GFTFTSS,26,32,"[1064, 1065, 1066, 1067, 1068, 1069, 1070, 107...","[641, 642, 643, 645, 649, 650, 830, 831, 832]","[ 87 , 87 , 87 , 88 , 88 , 88 , 11...","[(SER, 31, GLY, 87, [['N', 'CA', 'C', 'O'], [a...","[[], [], [], [], [], [THR88, GLY87, GLU111], []]"
1,6nmv,H,H,2,STSGVI,52,56,"[1266, 1267, 1268, 1269, 1270, 1271, 1272, 127...","[280, 281, 283, 284, 285, 286, 287, 288, 289, ...","[ 40 , 40 , 40 , 40 , 40 , 40 , 4...","[(SER, 52, GLY, 45, [['N', 'CA', 'C', 'O'], [a...","[[GLY45, GLU47], [], [], [], [], []]"
2,6nmv,H,H,3,DTFDH,95,102,"[1634, 1635, 1636, 1637, 1638, 1639, 1640, 164...","[293, 294, 295, 296, 297, 298, 299, 300, 302, ...","[ 42 , 42 , 42 , 42 , 42 , 43 , 4...","[(ASP, 95, ALA, 42, [['N', 'CA', 'C', 'O', 'CB...","[[PRO44, GLY43, ALA42, TYR90], [GLY43, ALA42],..."
3,6nmv,L,L,1,FGSSGNYG,24,34,"[2568, 2569, 2570, 2571, 2572, 2573, 2574, 257...","[321, 322, 323, 763, 764, 766, 767, 768, 769, ...","[ 46 , 46 , 46 , 102 , 102 , 102 , 10...","[(SER, 27, VAL, 102, [['N', 'CA', 'C', 'O', 'C...","[[], [], [], [GLU103, VAL102], [], [ARG46, GLU..."
4,6nmv,L,L,2,YNNKRPS,50,56,"[2749, 2750, 2751, 2752, 2753, 2754, 2755, 275...","[297, 321, 322, 688, 774, 775, 776, 777, 802, ...","[ 42 , 46 , 46 , 92 , 103 , 103 , 10...","[(TYR, 50, ALA, 42, [['N', 'CA', 'C', 'O', 'CB...","[[ALA42, SER106, GLU103, VAL92, ARG46], [], []..."


In [26]:
def update_close_atoms(row):
    """Return the unique antigen residues of a row's ``close_atoms``, in first-seen order.

    NOTE: this reuses the name of the function defined in an earlier cell and
    replaces it in the kernel once this cell has run.

    Parameters
    ----------
    row : mapping
        Only ``row['close_atoms']`` is read; each entry is a tuple
        ``(resname1, resnum1, resname2, resnum2, ...)``.

    Returns
    -------
    list
        For each distinct ``resname2 + str(resnum2)`` label, the tail
        ``entry[2:]`` of the first entry carrying it.
    """
    seen_labels = set()  # set membership instead of a repeated linear list search
    unique_partners = []
    for contact in row['close_atoms']:
        label = contact[2] + str(contact[3])
        if label not in seen_labels:
            seen_labels.add(label)
            unique_partners.append(contact[2:])
    return unique_partners

In [27]:
# Unique epitope residues per CDR row (uses the update_close_atoms redefined just above).
table['process_unique_epitope_residues'] = table.apply(update_close_atoms, axis=1)


In [30]:
# Collapse the per-CDR rows into one row per (idcode, chainID): chain_type keeps
# its first value, every other column becomes the list of its per-CDR values.
df_grouped = table.groupby(['idcode', 'chainID'], as_index=False).agg({
    'chain_type': 'first',
    **{
        column: list
        for column in (
            'cdr',
            'cdr_seq',
            'cdr_begin',
            'cdr_end',
            'cdr_atoms',
            'epitope_atoms',
            'epitope_residues',
            'close_atoms',
            'process_unique_epitope_residues',
        )
    },
})


# display the result
df_grouped.head()

Unnamed: 0,idcode,chainID,chain_type,cdr,cdr_seq,cdr_begin,cdr_end,cdr_atoms,epitope_atoms,epitope_residues,close_atoms,process_unique_epitope_residues
0,6nmv,H,H,"[1, 2, 3]","[GFTFTSS, STSGVI, DTFDH]","[ 26 , 52 , 95 ]","[ 32 , 56 , 102 ]","[[1064, 1065, 1066, 1067, 1068, 1069, 1070, 10...","[[641, 642, 643, 645, 649, 650, 830, 831, 832]...","[[ 87 , 87 , 87 , 88 , 88 , 88 , 1...","[[(SER, 31, GLY, 87, [['N', 'CA', 'C', 'O'], [...","[[(GLY, 87, [['N', 'CA', 'C', 'O'], [array([ -..."
1,6nmv,L,L,"[1, 2, 3]","[FGSSGNYG, YNNKRPS, GAWETGSAT]","[ 24 , 50 , 89 ]","[ 34 , 56 , 97 ]","[[2568, 2569, 2570, 2571, 2572, 2573, 2574, 25...","[[321, 322, 323, 763, 764, 766, 767, 768, 769,...","[[ 46 , 46 , 46 , 102 , 102 , 102 , 1...","[[(SER, 27, VAL, 102, [['N', 'CA', 'C', 'O', '...","[[(VAL, 102, [['N', 'CA', 'C', 'O', 'CB', 'CG1..."


In [31]:
def update_process_unique_epitope_residuess(row):
    """Concatenate the per-CDR epitope-residue lists of a grouped row into one flat list.

    Parameters
    ----------
    row : mapping
        ``row['process_unique_epitope_residues']`` is a list of lists, one per CDR.

    Returns
    -------
    list
        All entries in CDR order. Residues contacted by more than one CDR
        appear once per CDR; nothing is de-duplicated here.

    Notes
    -----
    Works for any number of per-CDR lists. The previous version indexed
    ``[0]``, ``[1]`` and ``[2]`` explicitly, so it raised ``IndexError`` for a
    chain with fewer than three CDR rows and ignored any beyond the third.
    """
    merged = []
    for per_cdr in row['process_unique_epitope_residues']:
        merged.extend(per_cdr)
    return merged

In [33]:
# Flatten the per-CDR residue lists into a single list per chain.
df_grouped['process_unique_epitope_residues_merged'] = df_grouped.apply(update_process_unique_epitope_residuess, axis=1)


In [34]:
# Show the grouped table with the merged residue list.
df_grouped

Unnamed: 0,idcode,chainID,chain_type,cdr,cdr_seq,cdr_begin,cdr_end,cdr_atoms,epitope_atoms,epitope_residues,close_atoms,process_unique_epitope_residues,process_unique_epitope_residues_merged
0,6nmv,H,H,"[1, 2, 3]","[GFTFTSS, STSGVI, DTFDH]","[ 26 , 52 , 95 ]","[ 32 , 56 , 102 ]","[[1064, 1065, 1066, 1067, 1068, 1069, 1070, 10...","[[641, 642, 643, 645, 649, 650, 830, 831, 832]...","[[ 87 , 87 , 87 , 88 , 88 , 88 , 1...","[[(SER, 31, GLY, 87, [['N', 'CA', 'C', 'O'], [...","[[(GLY, 87, [['N', 'CA', 'C', 'O'], [array([ -...","[(GLY, 87, [['N', 'CA', 'C', 'O'], [array([ -5..."
1,6nmv,L,L,"[1, 2, 3]","[FGSSGNYG, YNNKRPS, GAWETGSAT]","[ 24 , 50 , 89 ]","[ 34 , 56 , 97 ]","[[2568, 2569, 2570, 2571, 2572, 2573, 2574, 25...","[[321, 322, 323, 763, 764, 766, 767, 768, 769,...","[[ 46 , 46 , 46 , 102 , 102 , 102 , 1...","[[(SER, 27, VAL, 102, [['N', 'CA', 'C', 'O', '...","[[(VAL, 102, [['N', 'CA', 'C', 'O', 'CB', 'CG1...","[(VAL, 102, [['N', 'CA', 'C', 'O', 'CB', 'CG1'..."


In [35]:
# For each row, turn the merged epitope residues into a list of dictionaries (residue name + backbone coordinates).


def update_epitope_residues_for_6d_coords(row):
    """Build one ``{"name", "crds"}`` record per epitope residue of a grouped row.

    Parameters
    ----------
    row : mapping
        ``row['process_unique_epitope_residues_merged']`` is a list of
        ``(resname, resnum, [atom_names, atom_coords])`` entries, where
        ``atom_coords`` are numpy arrays aligned with ``atom_names``.

    Returns
    -------
    list of dict
        ``{"name": resname, "crds": {"CA": [...], "N": [...], "C": [...]}}``
        with each coordinate converted to a plain list of three numbers.

    Notes
    -----
    A residue that lacks one of the three backbone atoms, or whose data is
    otherwise malformed, is skipped and ``"wrong"`` is printed, as before.
    Only ``Exception`` is caught, so a keyboard interrupt is no longer
    swallowed. The length check is an explicit test rather than an ``assert``,
    so it still applies when Python runs with ``-O``.
    """
    res_lst = row['process_unique_epitope_residues_merged']
    tmp_res_list = []
    for pair in res_lst:
        try:
            res_info = {"name": pair[0], "crds": {}}
            atom_names = pair[2][0]
            atom_coords = pair[2][1]

            for backbone_atom in ("CA", "N", "C"):
                res_info["crds"][backbone_atom] = atom_coords[
                    atom_names.index(backbone_atom)
                ].tolist()
            if any(len(xyz) != 3 for xyz in res_info["crds"].values()):
                raise ValueError("backbone coordinate is not 3-dimensional")
            tmp_res_list.append(res_info)
        except Exception:
            print("wrong")
    return tmp_res_list

In [36]:
# Per chain: list of {"name", "crds"} records holding CA/N/C coordinates of each epitope residue.
df_grouped['epitope_tmp_res_lst'] = df_grouped.apply(update_epitope_residues_for_6d_coords, axis=1)


In [37]:
# Show the final grouped table.
df_grouped

Unnamed: 0,idcode,chainID,chain_type,cdr,cdr_seq,cdr_begin,cdr_end,cdr_atoms,epitope_atoms,epitope_residues,close_atoms,process_unique_epitope_residues,process_unique_epitope_residues_merged,epitope_tmp_res_lst
0,6nmv,H,H,"[1, 2, 3]","[GFTFTSS, STSGVI, DTFDH]","[ 26 , 52 , 95 ]","[ 32 , 56 , 102 ]","[[1064, 1065, 1066, 1067, 1068, 1069, 1070, 10...","[[641, 642, 643, 645, 649, 650, 830, 831, 832]...","[[ 87 , 87 , 87 , 88 , 88 , 88 , 1...","[[(SER, 31, GLY, 87, [['N', 'CA', 'C', 'O'], [...","[[(GLY, 87, [['N', 'CA', 'C', 'O'], [array([ -...","[(GLY, 87, [['N', 'CA', 'C', 'O'], [array([ -5...","[{'name': 'GLY', 'crds': {'CA': [-6.0320000648..."
1,6nmv,L,L,"[1, 2, 3]","[FGSSGNYG, YNNKRPS, GAWETGSAT]","[ 24 , 50 , 89 ]","[ 34 , 56 , 97 ]","[[2568, 2569, 2570, 2571, 2572, 2573, 2574, 25...","[[321, 322, 323, 763, 764, 766, 767, 768, 769,...","[[ 46 , 46 , 46 , 102 , 102 , 102 , 1...","[[(SER, 27, VAL, 102, [['N', 'CA', 'C', 'O', '...","[[(VAL, 102, [['N', 'CA', 'C', 'O', 'CB', 'CG1...","[(VAL, 102, [['N', 'CA', 'C', 'O', 'CB', 'CG1'...","[{'name': 'VAL', 'crds': {'CA': [-21.930000305..."


In [None]:
###
### This df_grouped has the same content as the pickle file I provided (processsed_df_table_with_epitope_residue_info_2.pkl).
### To finish the encoding, follow encoding_reurn_read_from_df_epitopes.ipynb.

In [39]:
# Epitope residue records of the first grouped row.
df_grouped["epitope_tmp_res_lst"][0]

[{'name': 'GLY',
  'crds': {'CA': [-6.0320000648498535, 6.785999774932861, -41.98899841308594],
   'N': [-5.4770002365112305, 8.081000328063965, -42.32099914550781],
   'C': [-5.710000038146973, 5.793000221252441, -43.095001220703125]}},
 {'name': 'THR',
  'crds': {'CA': [-5.933000087738037, 3.4579999446868896, -43.86600112915039],
   'N': [-6.076000213623047, 4.5229997634887695, -42.900001525878906],
   'C': [-7.2789998054504395, 3.256999969482422, -44.57099914550781]}},
 {'name': 'GLU',
  'crds': {'CA': [-2.4639999866485596, 4.454999923706055, -46.07600021362305],
   'N': [-2.630000114440918, 3.562999963760376, -47.21900177001953],
   'C': [-2.444999933242798, 5.882999897003174, -46.60900115966797]}},
 {'name': 'GLY',
  'crds': {'CA': [-15.595999717712402, 0.6880000233650208, -36.50299835205078],
   'N': [-15.619999885559082, -0.7820000052452087, -36.46099853515625],
   'C': [-14.949000358581543, 1.2339999675750732, -37.77299880981445]}},
 {'name': 'GLU',
  'crds': {'CA': [-16.478000

In [47]:
# NOTE(review): df_grouped2 is only assigned by the pickle-loading cell further
# down, so this cell fails on a top-to-bottom run unless that cell is run first.
# Index label 2284 is the 6nmv heavy-chain row of df_grouped2.
len(df_grouped2["epitope_tmp_res_lst"][2284])

11

In [48]:
# Same count for the table computed in this notebook, for comparison with the cell above.
len(df_grouped["epitope_tmp_res_lst"][0])

11

In [40]:
import pickle 

# Load the previously pickled table into df_grouped2.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only open files from a trusted source.
# NOTE(review): cells earlier in the file already use df_grouped2, so this cell must be run before them.
with open("pd_from_narval/processsed_df_table_with_epitope_residue_info.pkl","rb") as f:
    df_grouped2= pickle.load(f)

In [10]:
# Number of top-level entries in new_lst (new_lst is built elsewhere in the notebook).
len(new_lst)

6

In [11]:
# Number of items in the second entry of new_lst.
len(new_lst[1])

49

In [17]:
# Inspect the first item of the second entry of new_lst.
new_lst[1][0]

('SER',
 52,
 'GLY',
 45,
 [['N', 'CA', 'C', 'O'],
  [array([-15.62 ,  -0.782, -36.461], dtype=float32),
   array([-15.596,   0.688, -36.503], dtype=float32),
   array([-14.949,   1.234, -37.773], dtype=float32),
   array([-14.446,   2.399, -37.78 ], dtype=float32)]])

In [41]:
# Preview the rows of df_grouped2 whose idcode is "6nmv".
df_grouped2[df_grouped2["idcode"]=="6nmv"].head()

Unnamed: 0,idcode,chainID,chain_type,cdr,cdr_seq,cdr_begin,cdr_end,cdr_atoms,epitope_atoms,epitope_residues,close_atoms,process_unique_epitope_residues,process_unique_epitope_residues_merged,epitope_tmp_res_lst
2284,6nmv,H,H,"[1, 2, 3]","[GFTFTSS, STSGVI, DTFDH]","[ 26 , 52 , 95 ]","[ 32 , 56 , 102 ]","[[1064, 1065, 1066, 1067, 1068, 1069, 1070, 10...","[[641, 642, 643, 645, 649, 650, 830, 831, 832]...","[[ 87 , 87 , 87 , 88 , 88 , 88 , 1...","[[(SER, 31, GLY, 87, [['N', 'CA', 'C', 'O'], [...","[[(GLY, 87, [['N', 'CA', 'C', 'O'], [array([ -...","[(GLY, 87, [['N', 'CA', 'C', 'O'], [array([ -5...","[{'name': 'GLY', 'crds': {'CA': [-6.0320000648..."
2285,6nmv,L,L,"[1, 2, 3]","[FGSSGNYG, YNNKRPS, GAWETGSAT]","[ 24 , 50 , 89 ]","[ 34 , 56 , 97 ]","[[2568, 2569, 2570, 2571, 2572, 2573, 2574, 25...","[[321, 322, 323, 763, 764, 766, 767, 768, 769,...","[[ 46 , 46 , 46 , 102 , 102 , 102 , 1...","[[(SER, 27, VAL, 102, [['N', 'CA', 'C', 'O', '...","[[(VAL, 102, [['N', 'CA', 'C', 'O', 'CB', 'CG1...","[(VAL, 102, [['N', 'CA', 'C', 'O', 'CB', 'CG1'...","[{'name': 'VAL', 'crds': {'CA': [-21.930000305..."


In [16]:
# From the first "6nmv" row of df_grouped, take the second close_atoms sub-list and show its first item.
df_grouped[df_grouped["idcode"]=="6nmv"]["close_atoms"].values[0][1][0]

('SER',
 52,
 'GLY',
 45,
 [['N', 'CA', 'C', 'O'],
  [array([-15.62 ,  -0.782, -36.461], dtype=float32),
   array([-15.596,   0.688, -36.503], dtype=float32),
   array([-14.949,   1.234, -37.773], dtype=float32),
   array([-14.446,   2.399, -37.78 ], dtype=float32)]])

In [44]:
# Size of the third close_atoms sub-list of the first "6nmv" row in df_grouped.
len(df_grouped[df_grouped["idcode"]=="6nmv"]["close_atoms"].values[0][2])

42

In [45]:
# Size of the third entry of new_lst.
len(new_lst[2])

42

In [63]:
# Replace the close_atoms column of table with new_lst (both are defined elsewhere in the notebook).
table["close_atoms"] = new_lst

In [64]:
# Preview the first rows of table.
table.head()

Unnamed: 0,idcode,chainID,chain_type,cdr,cdr_seq,cdr_begin,cdr_end,cdr_atoms,epitope_atoms,epitope_residues,close_atoms,process_close_atoms2
0,6nn3,H,H,1,GASISSGGY,26,32,"[3950, 3951, 3952, 3953, 3954, 3955, 3956, 395...",[],[],[],"[[], [], [], [], [], [], [], [], []]"
1,6nn3,H,H,2,YYSGT,52,56,"[4174, 4175, 4176, 4177, 4178, 4179, 4180, 418...","[312, 313, 314, 320, 321, 323, 324, 325, 326, ...","[ 59 , 59 , 59 , 60 , 60 , 61 , 6...","[(GLY, 55, SER, 62, [['N', 'CA', 'C', 'O', 'CB...","[[], [], [], [ALA60, SER63, SER62, ALA61], [AL..."
2,6nn3,H,H,3,VRVDSVRGAGDNGFDP,95,102,"[4509, 4510, 4511, 4512, 4513, 4514, 4515, 451...","[298, 299, 300, 301, 302, 303, 304, 305, 307, ...","[ 57 , 57 , 57 , 57 , 57 , 57 , 5...","[(ARG, 100, PHE, 57, [['N', 'CA', 'C', 'O', 'C...","[[], [PHE57, SER58, PRO59], [], [], [], [], [P..."
3,6nn3,L,L,1,SGDNLGDKYVY,24,34,"[4843, 4844, 4845, 4846, 4847, 4848, 4849, 485...","[217, 219, 234, 235, 3224]","[ 48 , 48 , 50 , 50 , 475 ]","[(ASN, 27, ILE, 475, [['N', 'CA', 'C', 'O', 'C...","[[], [], [], [PRO50, ILE475], [], [], [PRO50, ..."
4,6nn3,L,L,2,QDNKRPS,50,56,"[5057, 5058, 5059, 5060, 5061, 5062, 5063, 506...",[],[],[],"[[], [], [], [], [], [], []]"


In [46]:
# http://localhost:8888/notebooks/Desktop/backup_for_dell_15/pdb_projects/code/jupterNotebooks/Mar_2023/atigen_aware_generation/test_to_code_cut_epitopes_from_etigens.ipynb

import pandas as pd

# Module-level accumulators mutated by update_close_atoms on every call:
#   the_count gets a 1 appended each time the name-only fallback matching is used;
#   num_lst gets the number of contact labels found for each CDR residue processed.
the_count =[]
num_lst = []


def is_list_of_empty_lists(lst):
    """Return True when no element of ``lst`` is truthy.

    An empty ``lst`` therefore yields True, as does a list whose elements are
    all empty containers.
    """
    return not any(lst)
# define function to convert cdr_seq to three letter code
def convert_seq(seq):
    """Translate a one-letter amino-acid sequence into concatenated three-letter codes.

    Each character of ``seq`` is looked up among the 20 standard residues; a
    character outside that table raises ``KeyError``.
    """
    three_letter_of = {
        'A': 'ALA', 'C': 'CYS', 'D': 'ASP', 'E': 'GLU', 'F': 'PHE',
        'G': 'GLY', 'H': 'HIS', 'I': 'ILE', 'K': 'LYS', 'L': 'LEU',
        'M': 'MET', 'N': 'ASN', 'P': 'PRO', 'Q': 'GLN', 'R': 'ARG',
        'S': 'SER', 'T': 'THR', 'V': 'VAL', 'W': 'TRP', 'Y': 'TYR',
    }
    return ''.join(map(three_letter_of.__getitem__, seq))

# define function to update close_atoms column based on cdr_seq and cdr_begin
def update_close_atoms(row):
    """Group the contact labels of one table row by CDR residue.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row) with the keys
        ``'cdr_seq'``     - one-letter CDR sequence,
        ``'cdr_begin'``   - number of the first CDR residue (anything ``int()`` accepts),
        ``'close_atoms'`` - sequence of contact records. Records are indexed as
                            ``(cdr_resname, cdr_resnumb, res_name, res_numb, ...)``;
                            presumably the last two describe the epitope residue - confirm.

    Returns
    -------
    list[list[str]]
        Exactly one sub-list per residue of ``cdr_seq``. Each sub-list holds the
        de-duplicated labels ``res_name + str(res_numb)`` of the records that
        match that CDR residue, in first-seen order.

    Matching is done on (three-letter name, position). If that finds nothing at
    all although ``close_atoms`` is non-empty (for example because ``cdr_begin``
    could not be parsed or the numbering is shifted), a second pass matches on
    the residue name only and a ``1`` is appended to the global ``the_count``.

    Side effects: appends the size of every produced sub-list to the global
    ``num_lst`` (a row that needs the fallback pass contributes twice).
    """
    cdr_seq = row['cdr_seq']
    close_atoms = row['close_atoms']

    # Parse the start position once; None means "position matching impossible".
    try:
        cdr_begin = int(row['cdr_begin'])
    except (TypeError, ValueError, OverflowError):
        cdr_begin = None

    def _labels(is_match):
        # dict.fromkeys removes duplicates while keeping a reproducible order.
        found = (atom[2] + str(atom[3]) for atom in close_atoms if is_match(atom))
        return list(dict.fromkeys(found))

    updated_close_atoms = []
    for offset, aa in enumerate(cdr_seq):
        aa3 = convert_seq(aa)
        if cdr_begin is None:
            # Still emit exactly one (empty) entry so the result stays aligned with cdr_seq.
            sub_lst = []
        else:
            pos = cdr_begin + offset
            sub_lst = _labels(lambda atom: atom[0] == aa3 and atom[1] == pos)
        num_lst.append(len(sub_lst))
        updated_close_atoms.append(sub_lst)

    # Fallback: contacts exist but none matched by position -> match by residue name only.
    if not is_list_of_empty_lists(close_atoms) and is_list_of_empty_lists(updated_close_atoms):
        the_count.append(1)

        updated_close_atoms = []
        for aa in cdr_seq:
            aa3 = convert_seq(aa)
            sub_lst = _labels(lambda atom: atom[0] == aa3)
            num_lst.append(len(sub_lst))
            updated_close_atoms.append(sub_lst)

    return updated_close_atoms





In [48]:
### Just checking that the results are the same as the provided encoded file. You can ignore the code below.
import pickle 

# Load the previously pickled table into df_grouped.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only open files from a trusted source.
with open("pd_from_narval/processsed_df_table_with_epitope_residue_info.pkl","rb") as f:
    df_grouped= pickle.load(f)


In [56]:
from collections import Counter

def count_repetitions(lst):
    """Tally how many times each distinct item occurs in ``lst``.

    Returns a ``collections.Counter`` keyed by item.
    """
    tally = Counter()
    tally.update(lst)
    return tally

# Example usage: tally how often each name occurs in a small sample list
# and print the resulting Counter.
example_list = ['apple', 'banana', 'apple', 'orange', 'banana', 'apple']
repetition_counts = count_repetitions(example_list)
print(repetition_counts)

Counter({'apple': 3, 'banana': 2, 'orange': 1})


In [61]:
import pickle
# Persist the current df_grouped to a new pickle file in the working directory.
# NOTE(review): by execution count this cell ran after the cell that rewrites "K" to "L"
# in chain_type, although it appears before it in the file; keep that order when re-running.
with open("processsed_df_table_with_epitope_residue_info_2.pkl","wb") as fout:
    pickle.dump(df_grouped,fout)



In [59]:
# Relabel every "K" chain_type as "L" directly on df_grouped, then collect the
# column values into lst for counting.
# A single .loc assignment replaces the former element-wise chained assignment
# (df_grouped["chain_type"][i] = "L"), which may write to a temporary copy and
# only worked when the index was exactly 0..n-1.
df_grouped.loc[df_grouped["chain_type"] == "K", "chain_type"] = "L"
lst = df_grouped["chain_type"].tolist()


In [60]:
# Tally the chain_type values collected in lst.
count_repetitions(lst)

Counter({'H': 2414, 'L': 2414})

In [58]:
# Quick arithmetic check; presumably the separate "L" and "K" counts before relabelling - TODO confirm.
1849+565

2414

In [45]:
import re

def custom_sort_key(s):
    """Build a sort key for labels made of leading digits plus optional letters.

    "26A" becomes ``(26, "A")`` and "26" becomes ``(26, "")``. A string that
    does not start with a digit gets ``(inf, s)`` so that it sorts after every
    numbered label.
    """
    parsed = re.match(r'(\d+)([A-Za-z]*)', s)
    if parsed is None:
        # No leading number: push to the end, ordered by the raw string.
        return (float('inf'), s)
    digits, letters = parsed.groups()
    return (int(digits), letters)

def sort_strings(strings):
    """Return a new list with ``strings`` ordered by ``custom_sort_key``.

    The input iterable is left unmodified.
    """
    ordered = list(strings)
    ordered.sort(key=custom_sort_key)
    return ordered

# Example usage: labels are ordered by their numeric part first, so a label with
# a letter suffix (e.g. "26A") lands right after the same bare number ("26").
strings = ["1", "2", "3", "4", "26A", "27A", "29B", "26", "27", "28", "29"]
sorted_strings = sort_strings(strings)
print(sorted_strings)

['1', '2', '3', '4', '26', '26A', '27', '27A', '28', '29', '29B']
