In [18]:
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
from Bio import Entrez
from Bio import SeqIO
from io import StringIO
import re
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [19]:


# Search for a protein (e.g., "rpIL" in NCBI)
Entrez.email = "s230152@dtu.dk"  # Required by NCBI

# Search for the protein sequence
handle = Entrez.esearch(db="protein", term="P0A7R1", retmax=1)
record = Entrez.read(handle)
handle.close()

# Get the first protein ID from the search results
protein_id = record["IdList"][0]

# Fetch the protein sequence by ID
handle = Entrez.efetch(db="protein", id=protein_id, rettype="gb", retmode="text")
record = handle.read()
handle.close()
# print(record)

seq_record = SeqIO.read(StringIO(record), "genbank")
protein_sequence = seq_record.seq

# Print the raw record (it will contain the sequence)
X=ProteinAnalysis(str(protein_sequence))
print(X.molecular_weight()/1000)


15.768856699999986


In [20]:
protein = IP(protein_sequence) #rpil
print("IEP of peptide {} is {:.2f}".format(protein.sequence, protein.pi()))

print("It's charge at pH 7 is {:.2f}".format(protein.charge_at_pH(7.0)))


IEP of peptide MQVILLDKVANLGSLGDQVNVKAGYARNFLVPQGKAVPATKKNIEFFEARRAELEAKLAEVLAAANARAEKINALETVTIASKAGDEGKLFGSIGTRDIADAVTAAGVEVAKSEVRLPNGVLRTTGEHEVSFQVHSEVFAKVIVNVVAE is 6.17
It's charge at pH 7 is -1.29


In [21]:
handle = Entrez.esearch(db="protein", term="uspA", retmax=1)
record = Entrez.read(handle)
handle.close()

# Get the first protein ID from the search results
protein_id = record["IdList"][0]

# Fetch the protein sequence by ID
handle = Entrez.efetch(db="protein", id=protein_id, rettype="gb", retmode="text")
record = handle.read()
handle.close()

seq_record = SeqIO.read(StringIO(record), "genbank")
protein_sequence = seq_record.seq

In [22]:
protein = IP(protein_sequence)
print("IEP of peptide {} is {:.2f}".format(protein.sequence, protein.pi()))

print("It's charge at pH 7 is {:.2f}".format(protein.charge_at_pH(5.0)))

IEP of peptide MAYKHILVAVDLSPESEVLVSKAVSMAKPYNAKVSLIHVDVNYSDLYTGLIDVNLGDMQQRITEETSNSLKNLAKNSGYEIQEMLSGSGDLGQVLVDAIRKYDMDLVVCGHHQDFWSKLMSSARQLINTVHVDMLIVPLRDDDNA is 4.89
It's charge at pH 7 is -0.72


In [23]:
protein = IP("tufA")
print("IEP of peptide {} is {:.2f}".format(protein.sequence, protein.pi()))

print("It's charge at pH 7 is {:.2f}".format(protein.charge_at_pH(7.0)))

IEP of peptide TUFA is 5.18
It's charge at pH 7 is -0.60


In [24]:
handle = Entrez.esearch(db="protein", term="P42212", retmax=1)
record = Entrez.read(handle)
handle.close()

# Get the first protein ID from the search results
protein_id = record["IdList"][0]

# Fetch the protein sequence by ID
handle = Entrez.efetch(db="protein", id=protein_id, rettype="gb", retmode="text")
record = handle.read()
handle.close()

seq_record = SeqIO.read(StringIO(record), "genbank")
protein_sequence = seq_record.seq
print(record)

LOCUS       GFP_AEQVI                238 aa            linear   INV 02-OCT-2024
DEFINITION  RecName: Full=Green fluorescent protein.
ACCESSION   P42212
VERSION     P42212.1
DBSOURCE    UniProtKB: locus GFP_AEQVI, accession P42212;
            class: standard.
            extra accessions:Q17104,Q27903,Q93125
            created: Nov 1, 1995.
            sequence updated: Nov 1, 1995.
            annotation updated: Oct 2, 2024.
            xrefs: M62654.1, AAA27722.1, M62653.1, AAA27721.1, L29345.1,
            AAA58246.1, X96418.1, CAA65278.1, U73901.1, AAB18957.1, JQ1514,
            1B9C_A, 1B9C_B, 1B9C_C, 1B9C_D, 1BFP_A, 1C4F_A, 1CV7_A, 1EMA_A,
            1EMB_A, 1EMC_A, 1EMC_B, 1EMC_C, 1EMC_D, 1EME_A, 1EMF_A, 1EMG_A,
            1EMK_A, 1EML_A, 1EMM_A, 1F09_A, 1F0B_A, 1GFL_A, 1GFL_B, 1H6R_A,
            1H6R_B, 1H6R_C, 1HCJ_A, 1HCJ_B, 1HCJ_C, 1HCJ_D, 1HUY_A, 1JBY_A,
            1JBZ_A, 1JC0_A, 1JC0_B, 1JC0_C, 1JC1_A, 1JC1_B, 1JC1_C, 1KP5_A,
            1KP5_B, 1KYP_A, 1KYR_A, 1KY

In [25]:
protein = IP(protein_sequence)
print("IEP of peptide {} is {:.2f}".format(protein.sequence, protein.pi()))

print("It's charge at pH 7 is {:.2f}".format(protein.charge_at_pH(5.52)))

IEP of peptide MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK is 5.67
It's charge at pH 7 is 1.25


In [31]:
handle = Entrez.esearch(db="protein", term="P0DOX5", retmax=1)
record = Entrez.read(handle)
handle.close()

# Get the first protein ID from the search results
protein_id = record["IdList"][0]

# Fetch the protein sequence by ID
handle = Entrez.efetch(db="protein", id=protein_id, rettype="gb", retmode="text")
record = handle.read()
handle.close()

seq_record = SeqIO.read(StringIO(record), "genbank")
protein_sequence = seq_record.seq
print(record)
protein = IP(protein_sequence)


print("IEP of peptide {} is {:.2f}".format(protein.sequence, protein.pi()))

print("It's charge at pH 6.8 is {:.2f}".format(protein.charge_at_pH(6.8)))

LOCUS       IGG1_HUMAN               449 aa            linear   PRI 02-OCT-2024
DEFINITION  RecName: Full=Immunoglobulin gamma-1 heavy chain; AltName:
            Full=Immunoglobulin gamma-1 heavy chain NIE.
ACCESSION   P0DOX5
VERSION     P0DOX5.2
DBSOURCE    UniProtKB: locus IGG1_HUMAN, accession P0DOX5;
            class: standard.
            created: Mar 15, 2017.
            sequence updated: Jul 18, 2018.
            annotation updated: Oct 2, 2024.
            xrefs: 1N0X_H, 1N0X_K, 3PGF_H, 4R2G_D, 4R2G_J, 4R2G_N, 4R2G_Q,
            5O4E_A, 5O4E_C, 5VJ6_H, 5VU0_A, 5VU0_B, 5VZX_E, 5VZX_H, 5VZY_H,
            5W5L_A, 5W5L_B, 5WAV_A, 5WAV_B, 5XJE_A, 5XJE_B, 5XJF_A, 5XJF_B,
            5XMH_A, 5XMH_B, 5Y56_A, 5Y56_B, 5YC5_A, 5YC5_B, 6APD_J, 6APD_K,
            6APD_N, 6ARP_B, 6ARP_D, 6ARU_C, 6B70_C, 6B70_E, 6B7Z_C, 6B7Z_E,
            6BF7_C, 6BF7_E, 6BF9_C, 6BF9_E, 6BFT_A, 6BFT_H, 6BGT_B, 6BKB_H,
            6BKC_H, 6BZ4_A, 6BZ4_B, 6DKJ_A, 6DKJ_H, 6EAQ_A, 6EAQ_B, 6FCZ_H,
         

In [41]:
handle = Entrez.esearch(db="protein", term="P02662", retmax=1)
record = Entrez.read(handle)
handle.close()

# Get the first protein ID from the search results
protein_id = record["IdList"][0]

# Fetch the protein sequence by ID
handle = Entrez.efetch(db="protein", id=protein_id, rettype="gb", retmode="text")
record = handle.read()
handle.close()

seq_record = SeqIO.read(StringIO(record), "genbank")
protein_sequence = seq_record.seq
protein = IP(protein_sequence)
print(record)

print("IEP of peptide {} is {:.2f}".format(protein.sequence, protein.pi()))

print("It's charge at pH 6.8 is {:.2f}".format(protein.charge_at_pH(6.8)))

LOCUS       CASA1_BOVIN              214 aa            linear   MAM 02-OCT-2024
DEFINITION  RecName: Full=Alpha-S1-casein; AltName: Allergen=Bos d 8; Contains:
            RecName: Full=Antioxidant peptide; Flags: Precursor.
ACCESSION   P02662
VERSION     P02662.2
DBSOURCE    UniProtKB: locus CASA1_BOVIN, accession P02662;
            class: standard.
            extra
            accessions:A5YK80,A8WCP1,A8WCP2,A8WCP3,A8WCP4,A8WCP5,A8WCP6,A8WCP7,
            A8WCP8,A8WCP9,A8WCQ0,A8WCQ1,A8WCQ2,A8WCQ3,A8WCQ4,A8WCQ5,A8WCQ6,
            A8WCQ7,A8WCQ8,A8WCQ9,A8WCR0,A8WCR1,A8WCR2,A8WCR3,A8WCR4,A8WCR5,
            A8WCR6,A8WCR7,A8WCR8,A8WCR9,A8WCS0,A8WCS1,Q28048,Q28069,Q32LE8,
            Q7M2U6,Q9TRH5
            created: Jul 21, 1986.
            sequence updated: Nov 1, 1990.
            annotation updated: Oct 2, 2024.
            xrefs: X00564.1, CAB57792.1, M33123.1, AAA30428.1, M38641.1,
            AAA30429.1, X59856.2, CAA42516.1, EU221551.1, ABW98936.1,
            EU221552.1, ABW9

In [43]:

handle = Entrez.esearch(db="protein", term="P05814", retmax=1)
record = Entrez.read(handle)
handle.close()

# Get the first protein ID from the search results
protein_id = record["IdList"][0]

# Fetch the protein sequence by ID
handle = Entrez.efetch(db="protein", id=protein_id, rettype="gb", retmode="text")
record = handle.read()
handle.close()

seq_record = SeqIO.read(StringIO(record), "genbank")
protein_sequence = seq_record.seq
protein = IP(protein_sequence)
print(record)

print("IEP of peptide {} is {:.2f}".format(protein.sequence, protein.pi()))

print("It's charge at pH 6.8 is {:.2f}".format(protein.charge_at_pH(6.8)))

LOCUS       CASB_HUMAN               226 aa            linear   PRI 02-OCT-2024
DEFINITION  RecName: Full=Beta-casein; Flags: Precursor.
ACCESSION   P05814
VERSION     P05814.4
DBSOURCE    UniProtKB: locus CASB_HUMAN, accession P05814;
            class: standard.
            extra accessions:Q4VAZ9,Q9UCM5
            created: Nov 1, 1988.
            sequence updated: Aug 1, 1992.
            annotation updated: Oct 2, 2024.
            xrefs: X55739.1, CAA39270.1, AF027807.1, AAC82978.1, X17070.1,
            CAA34916.1, BC069554.1, AAH69554.1, BC096194.3, AAH96194.1,
            BC096195.1, AAH96195.1, BC096196.3, AAH96196.1, BC096197.3,
            AAH96197.1, X13766.1, CAA32017.1, KBHU, NP_001289699.1,
            NP_001882.1, XP_016863249.1
            xrefs (non-sequence databases): CCDS:CCDS3532.1,
            AlphaFoldDB:P05814, SMR:P05814, BioGRID:107834, IntAct:P05814,
            MINT:P05814, STRING:9606.ENSP00000341030, Allergome:1064,
            iPTMnet:P05814, PhosphoSi