In [1]:
# Working with Sequences using fasta files
from Bio import SeqIO
from Bio.Seq import Seq

# Parse one record from each FASTA file. SeqIO.read is the single-record
# reader, so each file is expected to contain exactly one sequence.
covid, mers, sars, ebola = (
    SeqIO.read(fasta_path, "fasta")
    for fasta_path in (
        "covid_sequences.fasta",
        "mers_sequence.fasta",
        "sars_sequence.fasta",
        "ebola_sequence.fasta",
    )
)

In [2]:
# Pull the sequence object (the .seq attribute) out of each parsed record
covid_seq, mers_seq, sars_seq, ebola_seq = (
    record.seq for record in (covid, mers, sars, ebola)
)
# Print the length of each sequence, one line per virus
for label, sequence in (
    ("covid_seq ::", covid_seq),
    ("mers_seq ::", mers_seq),
    ("sars_seq ::", sars_seq),
    ("ebola_seq ::", ebola_seq),
):
    print(label, len(sequence))

covid_seq :: 29903
mers_seq :: 30119
sars_seq :: 29751
ebola_seq :: 18959


In [4]:
from Bio.SeqUtils import gc_fraction

# Print the GC fraction of each sequence as returned by gc_fraction
# (the recorded outputs are values between 0 and 1, not percentages)
for label, sequence in (
    ("GC content of covid_seq ::", covid_seq),
    ("GC content of mers_seq ::", mers_seq),
    ("GC content of sars_seq ::", sars_seq),
    ("GC content of ebola_seq ::", ebola_seq),
):
    print(label, gc_fraction(sequence))

GC content of covid_seq :: 0.3797277865097147
GC content of mers_seq :: 0.4123642883229855
GC content of sars_seq :: 0.407616550704178
GC content of ebola_seq :: 0.4107284139458832


In [3]:
def pad_seq(seq):
    """Pad ``seq`` with trailing "N" so its length is a multiple of three.

    Args:
        seq: The sequence to pad. It must support ``len()`` and ``+`` with
            a ``Seq`` object.

    Returns:
        ``seq`` itself when its length is already divisible by three;
        otherwise ``seq`` with one or two "N" symbols appended.
    """
    remainder = len(seq) % 3
    if remainder == 0:
        return seq
    # One leftover symbol needs two fillers; two leftovers need one.
    filler = "NN" if remainder == 1 else "N"
    return seq + Seq(filler)

# Re-bind each sequence name to its padded form. Re-running this cell is
# harmless: pad_seq returns a sequence unchanged once its length is a
# multiple of three.
covid_seq, mers_seq, sars_seq, ebola_seq = (
    pad_seq(sequence)
    for sequence in (covid_seq, mers_seq, sars_seq, ebola_seq)
)

In [4]:
# Translate each padded sequence with translate()'s default settings.
# The result keeps stop codons as '*' (they show up in the amino-acid
# tally further down) instead of ending at the first one.
covid_protein, mers_protein, sars_protein, ebola_protein = (
    sequence.translate()
    for sequence in (covid_seq, mers_seq, sars_seq, ebola_seq)
)
# Print the length of each translated sequence, one line per virus
for label, protein in (
    ("covid_protein ::", covid_protein),
    ("mers_protein ::", mers_protein),
    ("sars_protein ::", sars_protein),
    ("ebola_protein ::", ebola_protein),
):
    print(label, len(protein))

covid_protein :: 9968
mers_protein :: 10040
sars_protein :: 9917
ebola_protein :: 6320


In [5]:
def hamming_distance(lhs, rhs, *, strict=False):
    """Count the positions at which ``lhs`` and ``rhs`` differ.

    The two inputs are compared pairwise with ``zip``. By default the
    comparison stops at the end of the shorter input, so any extra tail on
    the longer one is ignored and does not add to the distance.

    Args:
        lhs: First iterable of symbols.
        rhs: Second iterable of symbols.
        strict: When true, require both inputs to have the same length
            (they must then support ``len()``). Defaults to ``False``,
            which keeps the truncating behaviour.

    Returns:
        The number of aligned positions whose elements compare unequal.

    Raises:
        ValueError: If ``strict`` is true and the lengths differ.
    """
    if strict and len(lhs) != len(rhs):
        raise ValueError(
            f"sequences differ in length: {len(lhs)} != {len(rhs)}"
        )
    # Count lazily instead of materialising a list of mismatching pairs.
    return sum(1 for x, y in zip(lhs, rhs) if x != y)

In [6]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Method 1: count amino acids with Biopython's ProteinAnalysis,
# which takes the translated sequence as a plain string
covid_analysed, mers_analysed, sars_analysed, ebola_analysed = (
    ProteinAnalysis(str(protein))
    for protein in (covid_protein, mers_protein, sars_protein, ebola_protein)
)
# Amino-acid counts for each translated sequence
covid_freq, mers_freq, sars_freq, ebola_freq = (
    analysis.count_amino_acids()
    for analysis in (covid_analysed, mers_analysed, sars_analysed, ebola_analysed)
)

In [10]:
# Method 2
# Count how often each amino acid occurs, this time with collections.Counter
from collections import Counter

# Tally every symbol of each translated sequence. These assignments replace
# the Method 1 results stored under the same names. Counter counts whatever
# characters are present, including '*' and 'X' (both appear in the covid
# tally shown as this cell's output).
covid_freq, mers_freq, sars_freq, ebola_freq = (
    Counter(protein)
    for protein in (covid_protein, mers_protein, sars_protein, ebola_protein)
)
# Display the covid tally as the cell output
covid_freq

Counter({'L': 886,
         'S': 810,
         '*': 774,
         'T': 679,
         'C': 635,
         'F': 593,
         'R': 558,
         'V': 548,
         'Y': 505,
         'N': 472,
         'I': 436,
         'K': 413,
         'G': 394,
         'A': 375,
         'H': 332,
         'Q': 325,
         'P': 292,
         'D': 290,
         'E': 270,
         'W': 263,
         'M': 117,
         'X': 1})

In [8]:
import matplotlib.pyplot as plt

In [None]:
# Draw one amino-acid frequency bar chart per virus on a 2x2 grid.
# The earlier version reserved this layout but filled only the first slot
# (covid), leaving the other three tallies unplotted and the chart unlabeled.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
panels = (
    ("covid", covid_freq),
    ("mers", mers_freq),
    ("sars", sars_freq),
    ("ebola", ebola_freq),
)
for ax, (virus, freq) in zip(axes.flat, panels):
    # Pass plain lists rather than dict views for the categories and heights.
    ax.bar(list(freq.keys()), list(freq.values()))
    ax.set_title(f"{virus} amino acid frequency")
    ax.set_xlabel("Amino acid")
    ax.set_ylabel("Count")
# Keep titles and axis labels of neighbouring panels from overlapping.
fig.tight_layout()
plt.show()

[]

  self.kernel.do_one_iteration()
