### ***Lab 12 - Working with Biopython to Manipulate Sequence and Structure Objects***

***Aim:*** *In this exercise, you will learn how to manipulate sequence and structure objects using Biopython.*

### ***1. Manipulating Sequence Objects:***

In [None]:
! pip install biopython

Collecting biopython
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.81


In [None]:
from Bio.Seq import Seq

# Build the DNA sequence object that the following cells operate on
seq_obj = Seq('AGTACAGATACAGGATACGCTACATCGCAGCTT')
print(f'Original Sequence is {seq_obj}\n')

# Report the total length, then one line per nucleotide
print(f'No. of Bases in the Sequence is {len(seq_obj)}')
for base_name, base_letter in (
    ("Adenine", "A"),
    ("Guanine", "G"),
    ("Thymine", "T"),
    ("Cytosine", "C"),
):
    print(f'No. of {base_name} Bases in the Sequence is {seq_obj.count(base_letter)}')

Original Sequence is AGTACAGATACAGGATACGCTACATCGCAGCTT

No. of Bases in the Sequence is 33
No. of Adenine Bases in the Sequence is 11
No. of Guanine Bases in the Sequence is 7
No. of Thymine Bases in the Sequence is 7
No. of Cytosine Bases in the Sequence is 8


In [None]:
from Bio.SeqUtils import gc_fraction

# AT content: combined A and T counts as a percentage of the sequence length
adenine_count = seq_obj.count('A')
thymine_count = seq_obj.count('T')
at_percent = (adenine_count + thymine_count) / len(seq_obj) * 100
print(f'AT Content in the sequence is {at_percent:.2f}%')

# GC content: gc_fraction result scaled by 100 for display as a percentage
gc_percent = gc_fraction(seq_obj) * 100
print(f'GC content in the sequence is {gc_percent:.2f}%')

AT Content in the sequence is 54.55%
GC content in the sequence is 45.45%


In [None]:
from Bio.SeqUtils import MeltingTemp as mt

# Compute the melting temperature via MeltingTemp.Tm_GC, then report it
melting_temp = mt.Tm_GC(seq_obj)
print(f'Melting Temperature of sequence is {melting_temp:.2f}')

Melting Temperature of sequence is 60.36


In [None]:
# Derive the three related sequences from seq_obj first...
reverse_complement = seq_obj.reverse_complement()
transcription = seq_obj.transcribe()
translation = seq_obj.translate()

# ...then report them in the same order: reverse complement, transcript, protein
for label, derived_seq in (
    ("Reverse Complement", reverse_complement),
    ("Transcription", transcription),
    ("Translation", translation),
):
    print(f"{label}: {derived_seq}")

Reverse Complement: AAGCTGCGATGTAGCGTATCCTGTATCTGTACT
Transcription: AGUACAGAUACAGGAUACGCUACAUCGCAGCUU
Translation: STDTGYATSQL


In [None]:
# SeqIO was never imported in the notebook; import it here so this cell
# runs on a fresh kernel (Restart & Run All).
from Bio import SeqIO

# Parse the sequence file. SeqIO.parse yields the records of the file one at
# a time, so a single file can contain many records.
seq3 = SeqIO.parse("/content/ls_orchid_1.gbk", "genbank")

# Number of records in the sequence file (consumes the iterator)
count = sum(1 for _ in seq3)
print(count)

94


In [None]:
# SeqIO was never imported in the notebook; import it here so this cell
# runs on a fresh kernel (Restart & Run All).
from Bio import SeqIO

# Print the ID of every record in the FASTA file, then report residue counts
# for the LAST record only (the count statements sit after the loop).
# NOTE(review): the record IDs look like nucleotide database entries rather
# than proteins - confirm that this is the intended input file.
last_record = None
for record in SeqIO.parse("/content/ls_orchid.fasta", "fasta"):
    print(record.id)
    last_record = record

if last_record is None:
    # Without this guard an empty file would fall through to whatever value
    # the loop variable still held from an earlier cell (or raise NameError).
    print("No records found in /content/ls_orchid.fasta")
else:
    # Count on the underlying Seq object; this does not rely on the SeqRecord
    # wrapper offering its own count() method.
    residues = last_record.seq

    # One space-separated line with the counts for these residue letters, in order
    print(*(residues.count(letter) for letter in "MADEFIGHKLNPRSTVY"))

    # Labelled counts for M, N and A on a single line
    print("M", residues.count("M"), end="")
    print("N", residues.count("N"), end="")
    print("A", residues.count("A"))

gi|2765658|emb|Z78533.1|CIZ78533
gi|2765657|emb|Z78532.1|CCZ78532
gi|2765656|emb|Z78531.1|CFZ78531
gi|2765655|emb|Z78530.1|CMZ78530
gi|2765654|emb|Z78529.1|CLZ78529
gi|2765652|emb|Z78527.1|CYZ78527
gi|2765651|emb|Z78526.1|CGZ78526
gi|2765650|emb|Z78525.1|CAZ78525
gi|2765649|emb|Z78524.1|CFZ78524
gi|2765648|emb|Z78523.1|CHZ78523
gi|2765647|emb|Z78522.1|CMZ78522
gi|2765646|emb|Z78521.1|CCZ78521
gi|2765645|emb|Z78520.1|CSZ78520
gi|2765644|emb|Z78519.1|CPZ78519
gi|2765643|emb|Z78518.1|CRZ78518
gi|2765642|emb|Z78517.1|CFZ78517
gi|2765641|emb|Z78516.1|CPZ78516
gi|2765640|emb|Z78515.1|MXZ78515
gi|2765639|emb|Z78514.1|PSZ78514
gi|2765638|emb|Z78513.1|PBZ78513
gi|2765637|emb|Z78512.1|PWZ78512
gi|2765636|emb|Z78511.1|PEZ78511
gi|2765635|emb|Z78510.1|PCZ78510
gi|2765634|emb|Z78509.1|PPZ78509
gi|2765633|emb|Z78508.1|PLZ78508
gi|2765632|emb|Z78507.1|PLZ78507
gi|2765631|emb|Z78506.1|PLZ78506
gi|2765630|emb|Z78505.1|PSZ78505
gi|2765629|emb|Z78504.1|PKZ78504
gi|2765628|emb|Z78503.1|PCZ78503
gi|2765627

In [None]:
from Bio.SeqRecord import SeqRecord

# Wrap a new sequence in a SeqRecord carrying id, name and description metadata
seq1 = Seq("MNQPTVRAGLKIYSCDER")
rec = SeqRecord(
    seq1,
    id="22011587",
    name="Vardhini",
    description="Sastra",
)
print(f'New sequence is: \n{rec}')

New sequence is: 
ID: 22011587
Name: Vardhini
Description: Sastra
Number of features: 0
Seq('MNQPTVRAGLKIYSCDER')


In [None]:
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

# The two sequences to align with the pairwise2 module
seq1 = Seq("FPQSCVYKLIM")
seq2 = Seq("MGQSCYVLKDE")

# Global alignment: request a single alignment and show the raw result list
align = pairwise2.align.globalxx(seq1, seq2, one_alignment_only=True)
print(align)

# Pretty-print the alignment; the star unpacks its fields as positional arguments
best_global = align[0]
print(format_alignment(*best_global))

# Local alignment of the same pair, formatted the same way
align1 = pairwise2.align.localxx(seq1, seq2, one_alignment_only=True)
best_local = align1[0]
print(format_alignment(*best_local))

#Creating the dot plot
def seqdotplot(sq1,sq2):
  """Print a tab-separated dot-plot matrix comparing two sequences.

  The first row is a header: one empty corner cell followed by the
  characters of ``sq2``. Every following row starts with one character of
  ``sq1`` and then has one cell per character of ``sq2``: ``X`` where the
  two characters are equal and ``.`` where they differ.

  Args:
    sq1: Sequence whose characters label the rows.
    sq2: Sequence whose characters label the columns.

  Returns:
    None. The matrix is written to standard output.
  """
  # Header row: empty corner cell, then the column labels, then ONE newline.
  print("", *sq2, sep="\t")

  # One row per character of sq1. The comparison against sq2 has to happen
  # inside this loop, and the row's newline is emitted once per row.
  for c1 in sq1:
    cells = ["X" if c1 == c2 else "." for c2 in sq2]
    print(c1, *cells, sep="\t")

sq1="ATGC"
sq2="TGCA"
seqdotplot(sq1,sq2)

[Alignment(seqA='FP--QSCVYK-LIM---', seqB='--MGQSC-Y-VL--KDE', score=5.0, start=0, end=17)]
FP--QSCVYK-LIM---
    ||| |  |     
--MGQSC-Y-VL--KDE
  Score=5

3 QSCVYK-L
  ||| |  |
3 QSC-Y-VL
  Score=5

	T	
G	
C	
A	
A	T	G	C	.	
.	
X	.	


### ***2. Manipulating Structure Objects:***

In [None]:
from Bio import PDB

# Set up the PDB file parser
parser = PDB.PDBParser()

# Load the structure from the PDB file
structure = parser.get_structure("structure_id", "IF8UA.pdb")

# Print every atom, walking the hierarchy in model -> chain -> residue -> atom order
for atom in structure.get_atoms():
    print(atom)

<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom OD1>
<Atom OD2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD>
<Atom OE1>
<Atom OE2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD1>
<Atom CD2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD1>
<Atom CD2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG1>
<Atom CG2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom OG1>
<Atom CG2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG1>
<Atom CG2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD>
<Atom NE>
<Atom CZ>
<Atom NH1>
<Atom NH2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD>
<Atom NE>
<Atom CZ>
<Atom NH1>
<Atom NH2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<Atom CG>
<Atom CD1>
<Atom CD2>
<Atom N>
<Atom CA>
<Atom C>
<Atom O>
<Atom CB>
<A