In [1]:
from Bio import SeqIO

In [2]:
for record in SeqIO.parse("sequence.fasta","fasta"):
    print(record.id)

MT772240.1


In [3]:
for record in SeqIO.parse("sequence.fasta","fasta"):
    print(record)

ID: MT772240.1
Name: MT772240.1
Description: MT772240.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/IND/GBRC265/2020, complete genome
Number of features: 0
Seq('TCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...TGT', SingleLetterAlphabet())


In [6]:
# Doing our Analysis 
ncov_record = SeqIO.read("sequence.fasta","fasta")

In [7]:
ncov_record

SeqRecord(seq=Seq('TCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...TGT', SingleLetterAlphabet()), id='MT772240.1', name='MT772240.1', description='MT772240.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/IND/GBRC265/2020, complete genome', dbxrefs=[])

In [8]:
ncov_dna = ncov_record.seq

In [9]:
# covid DNA
ncov_dna

Seq('TCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...TGT', SingleLetterAlphabet())

In [11]:
# Len of our sequence
len(ncov_dna)

29800

In [12]:
# protein synthesis
# DNA -> mRNA -> Protein/AA

In [13]:
# Transcription (DNA to mRNA)
ncov_mrna = ncov_dna.transcribe()

In [14]:
ncov_mrna

Seq('UCGAUCUCUUGUAGAUCUGUUCUCUAAACGAACUUUAAAAUCUGUGUGGCUGUC...UGU', RNAAlphabet())

In [16]:
# Translate to Amino Acids/ Protein 
ncov_protein = ncov_mrna.translate()



In [17]:
ncov_protein

Seq('SISCRSVL*TNFKICVAVTRLHA*CTHAV*LITNYCR*QDTSNSSIFCRLLTVS...IPM', HasStopCodon(ExtendedIUPACProtein(), '*'))

In [18]:
len(ncov_protein)

9933

In [19]:
# Custom 
# Condon = 3
len(ncov_dna)/3

9933.333333333334

In [20]:
# find all AA sequence before stopcondon
ncov_protein

Seq('SISCRSVL*TNFKICVAVTRLHA*CTHAV*LITNYCR*QDTSNSSIFCRLLTVS...IPM', HasStopCodon(ExtendedIUPACProtein(), '*'))

In [21]:
ncov_aa = ncov_protein.split("*")

In [22]:
ncov_aa

[Seq('SISCRSVL', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('TNFKICVAVTRLHA', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('CTHAV', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('LITNYCR', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('QDTSNSSIFCRLLTVSSVLQPIISTSRFCPGVTER', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('DGEPCPWFQRENTRPTQFACFTGSRRARTWLWRLRGGGLIRGTSTS', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('RWHLWLSRS', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('KRRFAST', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('TALCVHQTFGCSNCTSWSCYG', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('AGSRTRRHSVRS', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('W', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('DTWCPCPSCGRNTSGLPQGSSS', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('ER', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('RSWWP', HasStopCodon(ExtendedIUPACProtein(), '*')),
 Seq('LRRRS

In [24]:
ncov_clean = [str(i) for i in ncov_aa]

In [25]:
ncov_clean

['SISCRSVL',
 'TNFKICVAVTRLHA',
 'CTHAV',
 'LITNYCR',
 'QDTSNSSIFCRLLTVSSVLQPIISTSRFCPGVTER',
 'DGEPCPWFQRENTRPTQFACFTGSRRARTWLWRLRGGGLIRGTSTS',
 'RWHLWLSRS',
 'KRRFAST',
 'TALCVHQTFGCSNCTSWSCYG',
 'AGSRTRRHSVRS',
 'W',
 'DTWCPCPSCGRNTSGLPQGSSS',
 'ER',
 '',
 'RSWWP',
 'LRRRSKVI',
 'LRRRAWH',
 'SL',
 'RFSRKLEH',
 'T',
 'QWCYP',
 'THA',
 'A',
 'RRGIHSLCR',
 'QLLWP',
 'WLPS',
 'VH',
 'RPSSTCW',
 'SFMHFVRTTGLY',
 'H',
 'EGCILLP',
 'T',
 'A',
 'NCLVHGTF',
 'KEL',
 'IADTF',
 'N',
 'IGKEI',
 'HLQWGMSKFCISLKFHNQDYSTKG',
 'KEKA',
 'WLYG',
 'NSICLSSCVTK',
 'MQPNVPFNSHEV',
 'SLW',
 'NFMADGRFC',
 'SHLRILWH',
 'EFD',
 'RRCHYLWLLTPKCCC',
 'NLLSSMSQFRSRT',
 'A',
 'SCRIP',
 '',
 'IWLENHSS',
 'GWSHYCLWRLCVLLCWLP',
 'QVCLLGSTC',
 'R',
 'HRL',
 'PYRCCWRRFRRS',
 '',
 'QPS',
 'NTPKRESQHQYCW',
 'L',
 'T',
 '',
 'RDRHYFGIFFCFHKCFCGNCERFGL',
 'SIQTNC',
 'ILW',
 'F',
 'SYKRKS',
 'KRCLEYW',
 'TEINTESSLCICIRGCSCCTINFLPHS',
 'NCSKFCACFTEGRYNNTRWNFTVFTETH',
 'CYDVHI',
 'FGY',
 'QSSCNGLHYRWCCSVDFAVAN',
 'HLWHCL',


In [26]:
# Using Pandas
import pandas as pd

In [27]:
df = pd.DataFrame({'amino_acids':ncov_clean})

In [28]:
df['count'] = df['amino_acids'].str.len()

In [29]:
df.head()

Unnamed: 0,amino_acids,count
0,SISCRSVL,8
1,TNFKICVAVTRLHA,14
2,CTHAV,5
3,LITNYCR,7
4,QDTSNSSIFCRLLTVSSVLQPIISTSRFCPGVTER,35


In [30]:
# Largest sequence before *
df.nlargest(10,"count")

Unnamed: 0,amino_acids,count
546,CTIVFKRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFAKFL...,2701
692,ASAQRSQITLHINELMDLFMRIFTIGTVTLKQGEIKDATPSDFVRA...,290
717,TNMKIILFLALITLATCELYHYQECVRGTTVLLKEPCSSGTYEGNS...,123
693,AQADEYELMYSFVSEETGTLIVNSVLLFLAFVVFLLVTLAILTALR...,83
716,QQMFHLVDFQVTIAEILLIIMRTFKVSIWNLDYIINLIIKNLSKSL...,63
5,DGEPCPWFQRENTRPTQFACFTGSRRARTWLWRLRGGGLIRGTSTS,46
463,TMLRCYFPKCSEKNNQGYTPLVVTHNFDFTFSFSPEYSMVFVLFFV,46
537,DVVYTHWYWSGNNSYTGSQYGSRILWWCIVLSVLPLPHRSSKS,43
756,LQTLAANCTICPQRFSVLRNVAHWHGSHTFGNVVDLHRCHQIG,43
769,KSHHIFTEATRSTIECTVNNARESCLYGRALMCKINFSSAIPM,43


In [31]:
# count the  frequencies of Amino Acids
from collections import Counter

In [36]:
Counter(ncov_protein).most_common(10)

[('L', 882),
 ('S', 810),
 ('*', 769),
 ('T', 677),
 ('C', 636),
 ('F', 591),
 ('R', 554),
 ('V', 549),
 ('Y', 505),
 ('N', 472)]

In [1]:
# Import our Parser 
from Bio.PDB import PDBParser

In [2]:
# Read our PDB File 
parser = PDBParser()
structure = parser.get_structure("6lu7","6lu7.pdb")



In [3]:
# structure 
structure

<Structure id=6lu7>

In [4]:
len(structure)

1

In [5]:
model = structure[0]

In [6]:
for chain in model:
    print(chain)

<Chain id=A>
<Chain id=C>


In [7]:
# 3D visualization 
import nglview as nv



In [8]:
nv.demo()

NGLWidget()

In [9]:
# Covid 3d structure 
view = nv.show_biopython(structure)

In [10]:
view

NGLWidget()

In [11]:
# using py3dmol 
import py3Dmol

In [13]:
view2 = py3Dmol.view(query = 'pdb:6lu7')

In [14]:
view2.setStyle({'cartoon':{'color':'spectrum'}})

<py3Dmol.view at 0x26e1cc3ab08>

In [15]:
view2.render_image()


<py3Dmol.view at 0x26e1cc3ab08>