<a href="https://colab.research.google.com/github/zeinabezz/Bioinformatics/blob/main/sequences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Checks if DNA sequence is valid. Returns True is sequence is valid , or F a l s e otherwise .

In [5]:
def validate_dna ( dna_seq ):

  seqm = dna_seq.upper()
  valid = seqm.count("A") + seqm.count("C") + seqm.count("G") + seqm. count("T")
  if valid == len (seqm): return True
  else : return False

In [6]:
validate_dna (" atagagagatctcg")

False

In [7]:
validate_dna (" ATAGAXTAGAT ")

False

# Calculates the frequency of each symbol in the sequence Returns a dictionary.

In [8]:
def frequency (seq):
   dic = {}
   for s in seq.upper():
     if s in dic: dic[s] += 1
     else : dic[s] = 1
   return dic

In [10]:
frequency ("atagataactcgcatag")

{'A': 7, 'C': 3, 'G': 3, 'T': 4}

In [12]:
frequency ("MVVMKKSHHVLHSQSLIK")

{'H': 3, 'I': 1, 'K': 3, 'L': 2, 'M': 2, 'Q': 1, 'S': 3, 'V': 3}

In [14]:
seq_aa = input(" Protein sequence:")
freq_aa = frequency( seq_aa)
list_f = sorted(freq_aa.items(), key=lambda x: x[1] , reverse = True)

for (k,v) in list_f:
   print(" Aminoacid :", k, ":", v)

 Protein sequence:qwertqwert
 Aminoacid : Q : 2
 Aminoacid : W : 2
 Aminoacid : E : 2
 Aminoacid : R : 2
 Aminoacid : T : 2


# Returns percentage of G and C nucleotides in a DNA sequence.

In [42]:
def gc_content ( dna_seq ):
   gc_count = 0
   for i in range(len(dna_seq)):
      if dna_seq[i:i+2].upper() == "GC": gc_count += 1
   return gc_count#/len( dna_seq )

In [43]:
print(gc_content("gatcgcaaatgccccatgacgcatttcgc"))

4


# Returns GC content of non−overlapping sub−sequences of size k. The result is a list. 

In [48]:
def gc_content_subseq (dna_seq, k=5):
   res = []
   for i in range (0, len(dna_seq)-k+1, k):
      subseq = dna_seq [i:i+k]
      gc = gc_content(subseq)
      res. append(gc)
   return res

In [50]:
print(gc_content_subseq("gatcgcaaatgccccatgacgcatttcgcta"))

[0, 0, 1, 0, 1, 1]


# Function that computes the RNA corresponding to the transcription of the DNA sequence provided.

In [51]:
def transcription(dna_seq):
   assert validate_dna(dna_seq), " Invalid DNA sequence"
   return dna_seq.upper().replace ("T","U")

In [53]:
transcription("AGGATCCCCTTTCTAAAAAGGTGTGTGTGTTTTGT")

'AGGAUCCCCUUUCUAAAAAGGUGUGUGUGUUUUGU'


# Computes the reverse complement of the DNA sequence. 

In [54]:
def reverse_complement ( dna_seq ):
   assert validate_dna ( dna_seq ), " Invalid DNA sequence"
   comp = ""
   for c in dna_seq.upper ():
      if c == "A":
         comp = "T" + comp
      elif c == "T":
         comp = "A" + comp
      elif c == "G":
         comp = "C" + comp
      elif c== "C":
         comp = "G" + comp
   return comp

# Translates a codon into an aminoacid using an internal dictionary with the standard genetic code.

In [55]:
def translate_codon (cod):

   tc = {"GCT":"A", "GCC":"A", "GCA":"A", "GCG":"A", 
         "TGT":"C", "TGC":"C",
         "GAT":"D", "GAC":"D",
         "GAA":"E", "GAG":"E",
         "TTT":"F", "TTC":"F",
         "GGT":"G", "GGC":"G", "GGA":"G", "GGG":"G",
         "CAT":"H", "CAC":"H",
         "ATA":"I", "ATT":"I", "ATC":"I",
         "AAA":"K", "AAG":"K",
         "TTA":"L", "TTG":"L", "CTT":"L", "CTC":"L", "CTA":"L", "CTG":"L",
         "ATG":"M", "AAT":"N", "AAC":"N",
         "CCT":"P", "CCC":"P", "CCA":"P", "CCG":"P",
         "CAA":"Q", "CAG":"Q",
         "CGT":"R", "CGC":"R", "CGA":"R", "CGG":"R", "AGA":"R", "AGG":"R",
         "TCT":"S", "TCC":"S", "TCA":"S", "TCG":"S", "AGT":"S", "AGC":"S",
         "ACT":"T", "ACC":"T", "ACA":"T", "ACG":"T",
         "GTT":"V", "GTC":"V", "GTA":"V", "GTG":"V",
         "TGG":"W",
         "TAT":"Y", "TAC":"Y",
         "TAA":"_", "TAG":"_", "TGA":"_"}
   if cod in tc: return tc[cod]
   else : return None

# Translates a DNA sequence into an aminoacid sequence.

In [56]:
def translate_seq (dna_seq , ini_pos = 0):

   assert validate_dna ( dna_seq ), " Invalid DNA sequence"
   seqm = dna_seq . upper ()
   seq_aa = ""
   for pos in range (ini_pos , len (seqm)-2, 3):
      cod = seqm[pos:pos +3]
      seq_aa += translate_codon(cod)
   return seq_aa

# Provides the frequency of each codon encoding a given aminoacid , in a DNA sequence .

In [58]:
def codon_usage (dna_seq , aa):
   assert validate_dna ( dna_seq ), " Invalid DNA sequence"
   seqm = dna_seq . upper ()
   dic = {}
   total = 0
   for i in range (0, len (seqm)-2, 3):
      cod = seqm[i:i+3]
      if translate_codon(cod) == aa:
         if cod in dic:
           dic[cod] += 1
         else : dic[cod] = 1
         total += 1
   if total >0:
      for k in dic:
          dic[k] /= total
   return dic

# Computes the six reading frames of a DNA sequence ( including the reverse complement.

In [59]:
def reading_frames ( dna_seq ):
   assert validate_dna ( dna_seq ), " Invalid DNA sequence"
   res = []
   res. append( translate_seq(dna_seq ,0))
   res. append( translate_seq(dna_seq ,1))
   res. append( translate_seq(dna_seq ,2))
   rc = reverse_complement ( dna_seq )
   res. append( translate_seq(rc ,0))
   res. append( translate_seq(rc ,1))
   res. append( translate_seq(rc ,2))
   return res

# Computes all possible proteins in an aminoacid sequence. Returns list of possible proteins. 

In [7]:
def all_proteins_rf (aa_seq):
   aa_seq = aa_seq. upper ()
   current_prot = []
   proteins = []
   for aa in aa_seq:
      if aa == "_":
         if current_prot :
            for p in current_prot :
               proteins.append(p)
            current_prot = []
      else:
        if aa == "M":
            current_prot.append("")
        for i in range ( len ( current_prot )):
            current_prot [i] += aa
   return proteins

In [8]:
print(all_proteins_rf("MAPTHYUIPMOERTIOMIOPTRYYIU_"))

['MAPTHYUIPMOERTIOMIOPTRYYIU', 'MOERTIOMIOPTRYYIU', 'MIOPTRYYIU']


# Computes all possible proteins for all open reading frames.

In [1]:
def all_orfs ( dna_seq ):
   assert validate_dna ( dna_seq ), " Invalid DNA sequence"
   rfs = reading_frames ( dna_seq )
   res = []
   for rf in rfs:
      prots = all_proteins_rf(rf)
      for p in prots: res. append(p)
   return res

# Computes all possible proteins for all open reading frames. Returns ordered list of proteins with minimum size.

In [1]:
def all_orfs_ord (dna_seq , minsize = 0):
   assert validate_dna ( dna_seq ), " Invalid DNA sequence"
   rfs = reading_frames ( dna_seq )
   res = []
   for rf in rfs:
      prots = all_proteins_rf(rf)
      for p in prots:
         if len (p) > minsize : insert_prot_ord(p, res)
   return res

def insert_prot_ord (prot , list_prots):
   i = 0
   while i < len ( list_prots) and len (prot) < len ( list_prots[i]):
     i += 1
   list_prots. insert(i, prot)