# Improve chain information based on fasta files

In [105]:
import pandas as pd
import json
import re
import matplotlib.pyplot as plt
import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [106]:
# Load the dataset into a DataFrame; the cells below read its "pdb" and "chains" columns.
data = pd.read_json("data/20240827_data.json")

# Load FASTA files

In [108]:
# Parse the FASTA file and index the records by record id
# (ids look like "7N3T_1|Chains" or "7N2Y_1|Chain", see the output of the next cell).
records = SeqIO.to_dict(SeqIO.parse("./data/designed_sequences_1450.fasta", "fasta"))

In [109]:
# Show only the first few parsed records; displaying the whole dictionary
# floods the notebook output and bloats the saved file.
dict(list(records.items())[:5])

{'7N3T_1|Chains': SeqRecord(seq=Seq('SCPDACCPHGSSGLRCTRDGALDSLHHLPGAENLTELYIENQQHLQHLELRDLR...HHH'), id='7N3T_1|Chains', name='7N3T_1|Chains', description='7N3T_1|Chains A, B|High affinity nerve growth factor receptor|Homo sapiens (9606)', dbxrefs=[]),
 '7N3T_2|Chains': SeqRecord(seq=Seq('MSHHHHHHHHSENLYFQSGGGRDEIKERIFKAVVRAIVTGNPEQLKEAKKLLEK...LRS'), id='7N3T_2|Chains', name='7N3T_2|Chains', description='7N3T_2|Chains C, D|Designed TrkA-binding miniprotein|Synthetic construct (32630)', dbxrefs=[]),
 '7LXP_1|Chains': SeqRecord(seq=Seq('PPGPPGPPGPPGXPGPRXPPGPPGPPGPPG'), id='7LXP_1|Chains', name='7LXP_1|Chains', description='7LXP_1|Chains A, B, C, D, E, F|Collagen mimetic peptide|synthetic construct (32630)', dbxrefs=[]),
 '7LXQ_1|Chains': SeqRecord(seq=Seq('PPGPPGPPGPPGPXGPRXPPGPPGPPGPPG'), id='7LXQ_1|Chains', name='7LXQ_1|Chains', description='7LXQ_1|Chains A, B, C|Collagen mimetic peptide|synthetic construct (32630)', dbxrefs=[]),
 '7N2Y_1|Chain': SeqRecord(seq=Seq('EWEALEKKLAALESKCQA

# Get chain ids (pdb and auth) and sequences per PDB

In [110]:
def extract_chain_labels(fasta_description):
    """Split the chain field of a FASTA description into PDB and author chain ids.

    The second "|"-separated field of ``fasta_description`` is taken, the
    "Chains " / "Chain " prefix is removed, and the remainder is broken into
    alphanumeric tokens. Every token equal to "auth" marks the token that
    follows it as an author chain id; all remaining tokens are PDB chain ids.
    For a field such as ``A[auth B], C`` this yields ``(["A", "C"], ["B"])``.

    Args:
        fasta_description: Full description line of a FASTA record.

    Returns:
        Tuple ``(pdb_chain_ids, auth_chain_ids)`` of two lists of strings.

    Raises:
        IndexError: If the description has no second "|"-separated field, or
            if an "auth" token is the last token of the chain field.
    """
    chain_field = fasta_description.split("|")[1]
    for prefix in ("Chains ", "Chain "):
        chain_field = chain_field.replace(prefix, "")
    chain_field = chain_field.strip()

    tokens = re.findall(r'\b([A-Za-z0-9]+)\b(?![^\[]*\[auth\])', chain_field)

    # Positions of the "auth" markers; the token right after each one is an author id.
    marker_positions = [pos for pos, token in enumerate(tokens) if token == "auth"]
    auth_chain_ids = [tokens[pos + 1] for pos in marker_positions]

    # Walk the tokens once, dropping every marker together with the token after it.
    pdb_chain_ids = []
    skip_following = False
    for pos, token in enumerate(tokens):
        if skip_following:
            skip_following = False
            continue
        if pos in marker_positions:
            skip_following = True
            continue
        pdb_chain_ids.append(token)

    return pdb_chain_ids, auth_chain_ids

In [111]:
# Enrich every chain entry of the JSON dataset with FASTA-derived information.
# For each dataset row, the FASTA records of that PDB entry are collected, and each
# chain dict in the "chains" column is updated in place with "chain_id_pdb",
# "chain_id_auth" and "chain_seq_fasta" from the FASTA record whose chain ids
# share at least one label with the chain's "chain_id".
MAX_ENTITY_INDEX = 1000  # entity numbers 1..999 are probed per PDB entry


def _collect_fasta_records(pdb_id, fasta_index, max_entity_index=MAX_ENTITY_INDEX):
    """Gather the consecutively numbered FASTA records of one PDB entry.

    Keys of the form "<PDB>_<n>|Chain" and "<PDB>_<n>|Chains" are looked up for
    n = 1, 2, ...; the scan stops at the first n for which neither key exists.

    Args:
        pdb_id: PDB identifier; converted to an upper-case string for the lookup.
        fasta_index: Mapping from FASTA record id to a record exposing
            ``description`` and ``seq``.
        max_entity_index: Exclusive upper bound for n.

    Returns:
        Dict mapping n to a dict with the keys "record", "pdb_chain_ids" and
        "auth_chain_ids".
    """
    collected = {}
    for entity_index in range(1, max_entity_index):
        key_prefix = str(pdb_id).upper() + "_" + str(entity_index)

        # Explicit lookups instead of a bare `except:` so that only a missing
        # key ends the scan and unrelated errors are not hidden.
        fasta_record = None
        found_key = None
        for suffix in ("|Chain", "|Chains"):
            found_key = key_prefix + suffix
            fasta_record = fasta_index.get(found_key)
            if fasta_record is not None:
                break
        if fasta_record is None:
            break

        try:
            pdb_chain_ids, auth_chain_ids = extract_chain_labels(fasta_record.description)
        except IndexError:
            # Malformed description: report it and keep going with the next
            # entity rather than silently dropping the rest of the entry.
            print("Skipping " + found_key + ": could not parse chain labels")
            continue

        collected[entity_index] = {
            "record": fasta_record,
            "pdb_chain_ids": pdb_chain_ids,
            "auth_chain_ids": auth_chain_ids,
        }
    return collected


row_count = len(data)
# Iterating the columns directly yields the chain dicts stored in `data`,
# so the in-place updates below end up in the DataFrame.
for i, pdb, designed_chains in zip(data.index, data["pdb"], data["chains"]):
    print(pdb, str(i) + "/" + str(row_count))

    fasta_records = _collect_fasta_records(pdb, records)

    for chain in designed_chains:
        # "chain_id" is a comma-separated list of labels, e.g. "A,B".
        chain_labels = [label.strip() for label in chain["chain_id"].split(",")]

        for entry in fasta_records.values():
            known_ids = set(entry["pdb_chain_ids"]) | set(entry["auth_chain_ids"])
            # A chain matches a FASTA record if any of its labels is one of the
            # record's PDB or author chain ids. If several records match, the
            # last one wins (same as before).
            if any(label in known_ids for label in chain_labels):
                chain["chain_id_pdb"] = entry["pdb_chain_ids"]
                chain["chain_id_auth"] = entry["auth_chain_ids"]
                chain["chain_seq_fasta"] = str(entry["record"].seq)

7n3t 0/1448
7lxp 1/1448
7lxq 2/1448
7n2y 3/1448
7n2z 4/1448
7ome 5/1448
7on6 6/1448
7on7 7/1448
7on8 8/1448
7ona 9/1448
7onc 10/1448
7one 11/1448
7ong 12/1448
7onh 13/1448
7op4 14/1448
7opu 15/1448
7opv 16/1448
7ov7 17/1448
7qnp 18/1448
7qsv 19/1448
7qsw 20/1448
7qsx 21/1448
7qsy 22/1448
7qsz 23/1448
7qt1 24/1448
7qvi 25/1448
7qxj 26/1448
7r0r 27/1448
7r8y 28/1448
7r8z 29/1448
7udv 30/1448
7udw 31/1448
7udx 32/1448
7udy 33/1448
7udz 34/1448
7z71 35/1448
7z72 36/1448
7z73 37/1448
7z7e 38/1448
7zk1 39/1448
7zp5 40/1448
7zp6 41/1448
7zp7 42/1448
8ao0 43/1448
8ao1 44/1448
8d1d 45/1448
8ddf 46/1448
8ddg 47/1448
8ddh 48/1448
1byz 49/1448
1d7t 50/1448
1e0m 51/1448
1hqj 52/1448
1ic9 53/1448
1kyc 54/1448
1l4x 55/1448
1uw1 56/1448
2bkg 57/1448
2jab 58/1448
2jgo 59/1448
2jof 60/1448
2jws 61/1448
2jwu 62/1448
2k6r 63/1448
2kjn 64/1448
2kjo 65/1448
2kl8 66/1448
2koz 67/1448
2kp0 68/1448
2kpo 69/1448
2l69 70/1448
2l82 71/1448
2lci 72/1448
2lhc 73/1448
2lhd 74/1448
2lhe 75/1448
2lhg 76/1448
2ln3 77/1

In [112]:
# Write `data` back as indented JSON records.
# NOTE(review): the target path is the same file that was loaded at the top of the
# notebook, so the input is overwritten and re-running starts from the modified file.
# NOTE(review): per the pandas docs `to_json` returns None when a path is given,
# so `data_result` presumably holds None — confirm whether the name is needed.
data_result = data.to_json("data/20240827_data.json", orient="records", indent=4)

Unnamed: 0,pdb,picture_path,chains,authors,classification,classification_suggested,classification_suggested_reason,subtitle,tags,keywords,...,publication_country,abstract,crystal_structure,symmetry,exptl_method,formula_weight,synthesis_comment,review,previous_design,next_design
0,7n3t,https://cdn.rcsb.org/images/structures/7n3t_as...,"[{'chain_id': 'A,B', 'chain_source': 'Homo sap...","[{'forename': 'L.', 'surname': 'Cao'}, {'foren...",unknown,"[computational, deep-learning based]",[Author is: D. Baker],Trka ecd complex with designed miniprotein ligand,"[complex, de novo protein]","[diagnostic, strengths, targeted, vast, region...",...,UK,The design of proteins that bind to a specific...,"{'length_a': '42.203', 'length_b': '205.695', ...",P 1 21 1,[X-RAY DIFFRACTION],39318.297,,1,8oe6,7lxp
1,7lxp,https://cdn.rcsb.org/images/structures/7lxp_as...,"[{'chain_id': 'A,B,C,D,E,F', 'chain_source': '...","[{'forename': 'S.D.', 'surname': 'Melton'}, {'...",unknown,[],[],Collagen mimetic peptide with an xaa-position ...,"[biosynthetic protein, collagen, aza-amino acid]","[found, description]",...,,No description found.,"{'length_a': '51.139', 'length_b': '19.520', '...",P 1 21 1,[X-RAY DIFFRACTION],2720.884,,1,7n3t,7lxq
2,7lxq,https://cdn.rcsb.org/images/structures/7lxq_as...,"[{'chain_id': 'A,B,C', 'chain_source': 'synthe...","[{'forename': 'S.D.', 'surname': 'Melton'}, {'...",unknown,[],[],Collagen mimetic peptide with a yaa-position a...,"[biosynthetic protein, collagen, aza-amino aci...","[found, description]",...,,No description found.,"{'length_a': '116.871', 'length_b': '19.477', ...",C 1 2 1,[X-RAY DIFFRACTION],2720.884,,1,7lxp,7n2y
3,7n2y,https://cdn.rcsb.org/images/structures/7n2y_as...,"[{'chain_id': 'A', 'chain_source': 'synthetic ...","[{'forename': 'T.B.J.', 'surname': 'Pinter'}, ...",unknown,[rational],[Author is: V.L. Pecoraro],Crystal structure of a de novo three-stranded ...,"[biosynthetic protein, three-straded coiled co...","[metal, ii, evolution, implicated, similar, re...",...,US,The human long interspersed nuclear element 1 ...,"{'length_a': '38.328', 'length_b': '38.328', '...",H 3 2,[X-RAY DIFFRACTION],4104.785,,1,7lxq,7n2z
4,7n2z,https://cdn.rcsb.org/images/structures/7n2z_as...,"[{'chain_id': 'A', 'chain_source': 'synthetic ...","[{'forename': 'T.B.J.', 'surname': 'Pinter'}, ...",unknown,[rational],[Author is: V.L. Pecoraro],Crystal structure of a de novo three-stranded ...,"[desinged protein, heavy metal sites in protei...","[metal, ii, evolution, implicated, similar, re...",...,US,The human long interspersed nuclear element 1 ...,"{'length_a': '38.130', 'length_b': '38.130', '...",H 3 2,[X-RAY DIFFRACTION],4104.785,,1,7n2y,7ome
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1443,8ohp,https://cdn.rcsb.org/images/structures/8ohp_as...,"[{'chain_id': 'A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P...","[{'forename': 'N.', 'surname': 'Louros'}, {'fo...",unknown,[],[],Structure of the fmoc-tau-pam4 type 3 amyloid ...,"[cross-beta, tau, amyloid, neurodegeneration, ...","[preserving, distinct, hampers, synthetic, neu...",...,UK,Tauopathies encompass a group of neurodegenera...,"{'length_a': '1.00', 'length_b': '1.00', 'leng...",P 1,[ELECTRON MICROSCOPY],1652.269,,1,8ohi,8oi0
1444,8oi0,https://cdn.rcsb.org/images/structures/8oi0_as...,"[{'chain_id': 'A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P...","[{'forename': 'N.', 'surname': 'Louros'}, {'fo...",unknown,[],[],Structure of the fmoc-tau-pam4 type 4 amyloid ...,"[cross-beta, tau, amyloid, neurodegeneration, ...","[preserving, distinct, hampers, synthetic, neu...",...,UK,Tauopathies encompass a group of neurodegenera...,"{'length_a': '1.00', 'length_b': '1.00', 'leng...",P 1,[ELECTRON MICROSCOPY],1652.269,,1,8ohp,8g1h
1445,8g1h,https://cdn.rcsb.org/images/structures/8g1h_as...,"[{'chain_id': 'A', 'chain_source': 'synthetic ...","[{'forename': 'P.A.', 'surname': 'Cea'}, {'for...",unknown,[],[],Ancestral protein ancth of phosphomethylpirimi...,"[enzyme evolution, transferase, 5-phosphohydro...","[frequently, improve, reconstruction, stabiliz...",...,US,Natural proteins are frequently marginally sta...,"{'length_a': '59.410', 'length_b': '59.410', '...",P 61 2 2,[X-RAY DIFFRACTION],26798.918,,1,8oi0,8oe2
1446,8oe2,https://cdn.rcsb.org/images/structures/8oe2_as...,"[{'chain_id': 'A,B,C,D,E,F', 'chain_source': '...","[{'forename': 'A.', 'surname': 'Kunka'}, {'for...",unknown,[],[],Structure of hyperstable haloalkane dehalogena...,"[hydrolase, engineered haloalkane dehalogenase]","[provides, compare, stabilization, bioindustry...",...,US,Thermostability is an essential requirement fo...,"{'length_a': '67.350', 'length_b': '143.144', ...",P 1 21 1,[X-RAY DIFFRACTION],34642.508,,1,8g1h,8oe6
