# **SAMPLE CODE:**


# **IMPORT PYTHON PACKAGES**

In [1]:
import numpy as np
import pandas as pd
import math
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime

# **READ A FASTA FILE** 

In [2]:
# Read the whole FASTA file into memory; the context manager closes the
# file handle as soon as the read finishes (the original left it open).
with open("dna.example.fasta") as dna:
  dna_contents = dna.read()

# **PARSING A FASTA FILE**

In [3]:
def gene_separation(dna_contents):
  '''
  Split the raw text of a FASTA file into one string per record.
  Only a ">" at the start of a line opens a new record, so a ">" that
  appears inside a header description no longer breaks a record apart.
  Input : full FASTA text as a single string
  Output : list of records (header line followed by the sequence lines),
           each without its leading ">"
  '''
  import re  # local import so the function does not rely on file-level imports

  separate_genes = re.split(r"^>", dna_contents, flags=re.MULTILINE)
  # The first piece is whatever precedes the first record marker
  # (normally an empty string), so it is not a record.
  return separate_genes[1:]

In [4]:
# Split the raw FASTA text into one string per record.
separate_genes = gene_separation(dna_contents)

print(f"Total number of records in dataset are: {len(separate_genes)}")

Total number of records in dataset are: 25


In [5]:
def genes_without_ids(separate_genes):
  '''
  Separate each record into its id (the first line) and its sequence
  (all remaining lines joined together with the newlines removed).
  A record that has no newline at all is treated as a header with an
  empty sequence.
  Input : list of records, each "header line + sequence lines"
  Output: list of ids and list of gene sequences, in the same order
  '''
  ids = []
  genes = []
  for record in separate_genes:
    # partition never returns -1-style sentinels: with no newline present
    # the whole record becomes the header and the body is empty.
    header, _, body = record.partition("\n")
    ids.append(header)
    genes.append(body.replace("\n", ""))
  return ids, genes

In [6]:
# ids[k] is the text before the first newline of record k; genes[k] is the
# rest of that record with its newlines removed.
ids, genes =genes_without_ids(separate_genes)

In [7]:
def sequence_length(genes):
  '''
  Measure every gene sequence.
  Input: list of gene sequences
  Output: list of their lengths, in the same order as the input
  '''
  return [len(sequence) for sequence in genes]

In [8]:
# len_genes holds the length of each gene, in the same order as genes
len_genes=sequence_length(genes)

# **LOOKING FOR ALL GENES WITH MAXIMUM OR MINIMUM LENGTH**

In [9]:
def max_len(len_genes,ids):
  '''
  Find every gene whose length equals the maximum length in the dataset.
  Input: list of gene lengths and the list of ids in the same order
  Output: list of positions (0-based indices) of the longest genes and
          the list of their ids; two empty lists when there are no genes
  '''
  # max() raises on an empty sequence, so answer the empty case directly.
  if not len_genes:
    return [], []
  longest = max(len_genes)
  max_len_position = [
      position for position, length in enumerate(len_genes) if length == longest
  ]
  ids_of_max_gene_len = [ids[position] for position in max_len_position]
  return max_len_position, ids_of_max_gene_len

In [10]:
max_len_position, ids_of_max_gene_len = max_len(len_genes, ids)
print(f"Number of maximum length genes : {len(max_len_position)}")

Number of maximum length genes : 1


In [11]:
def min_len(len_genes,ids):
  '''
  Find every gene whose length equals the minimum length in the dataset.
  Input: list of gene lengths and the list of ids in the same order
  Output: list of positions (0-based indices) of the shortest genes and
          the list of their ids; two empty lists when there are no genes
  '''
  # min() raises on an empty sequence, so answer the empty case directly.
  if not len_genes:
    return [], []
  shortest = min(len_genes)
  min_len_position = [
      position for position, length in enumerate(len_genes) if length == shortest
  ]
  ids_of_min_gene_len = [ids[position] for position in min_len_position]
  return min_len_position, ids_of_min_gene_len

In [12]:
min_len_position, ids_of_min_gene_len = min_len(len_genes, ids)
# Count the minimum-length genes; the original printed the size of
# max_len_position under this label.
print("Number of minimum length genes : " + str(len(min_len_position)))

Number of minimum length genes : 1


# **DETERMINING READING FRAMES OF INDIVIDUAL GENES**

In [13]:
def reading_frames(genes):
  '''
  Build the three forward reading frames of every gene.
  Frame k (k = 0, 1, 2) starts at index k and is cut back so that its
  length is a multiple of 3 (a whole number of codons).
  Input: list of gene sequences
  Output: list holding, for each gene, a list of its three reading frames
  '''
  list_of_genes_with_rf = []
  for gene in genes:
    frames = []
    for offset in range(3):
      # Largest multiple of 3 that fits once `offset` bases are skipped.
      codon_span = max(len(gene) - offset, 0) // 3 * 3
      frames.append(gene[offset:offset + codon_span])
    list_of_genes_with_rf.append(frames)
  return list_of_genes_with_rf

In [14]:
list_of_genes_with_rf = reading_frames(genes)
# list_of_genes_with_rf[0] holds the three frames of the first gene, so the
# labels name frames 1-3 of gene 1 (the original labels said gene 1/2/3).
print("Reading frame 1 of gene 1: " + list_of_genes_with_rf[0][0])
print("Reading frame 2 of gene 1: " + list_of_genes_with_rf[0][1])
print("Reading frame 3 of gene 1: " + list_of_genes_with_rf[0][2])

Reading frame 1 of gene 1: TCGGGCGAAGGCGGCAGCAAGTCGTCCACGCGCAGCGCGGCACCGCGGGCCTCTGCCGTGCGCTGCTTGGCCATGGCCTCCAGCGCACCGATCGGATCAAAGCCGCTGAAGCCTTCGCGCATCAGGCGGCCATAGTTGGCGCCAGTGACCGTACCAACCGCCTTGATGCGGCGCTCGGTCATCGCTGCATTGATCGAGTAGCCACCGCCGCCGCAAATGCCCAGCACGCCAATGCGTTCTTCATCCACATAGGGGAGCGTTACGAGGTAGTCGCAGACCACGCGGAAATCCTCGACGCGCAGTGTCGGGTCTTCGGTAAAACGTGGTTCGCCGCCGCTGGCACCCTGGAAGCTGGCGTCGAAGGCGATGACGACGAAACCTTCCTTGGCCAGCGCCTCGCCATACACGTTCCCCGATGTTTGCTCCTTGCAGCTGCCGATCGGATGCGCGCTGATGATGGCGGGATATTTCTTGCCTTCGTCGAAGTTCGGCGGGAAGTGGATGTCGGCTGCGATATCCCAATACACATTCTTGATCTTGACGCTTTTCATGACAGCTCCGTTCAGGGGGAGGGGGTAAGTTCGCCAGGCCGAATCGTTGGTAGCCAAGCGGCAACGACTCGAATATAGAGAGCCGATTGGAATTCCGTAAGATCGCAATCTGGACTACAGTGGTATCTTCAAATTGACAATGGCACCTACATGGATCCCTCACTGCTTCCGTCTCTCGCGTGGTTCGCCCACGTCGCACATCATCGTAGCTTCACGAAAGCGGCTGCGGAAATGGGCGTTTCTCGAGCAAACCTGTCGCAGAACGTGAAGGCGCTCGAACGCCGGTTGAACGTCAAGCTGCTGTATCGAACGACTCGCGACATGTCGCTGACCGAGGAGGGGCAGCGGCTCTACGAGGTGTGGTATCCCGCGCTGGTCGCGGTCGAGCGGACGGTCGACGCGCTGCACGAGGAGCGCGACGA

# **CHECKING FOR REPEATS OF GIVEN LENGTH IN INDIVIDUAL GENES IN THE FILE**

In [15]:
def repeat_in_entire_file(genes,len_of_repeat_from_user):
  '''
  Count every substring of the requested length, pooling the counts over
  all genes in the list (overlapping occurrences are counted separately).
  Input: Gene list and length of repeat
  Output: Dictionary mapping each substring to its total number of occurrences
  '''
  width = len_of_repeat_from_user
  dict_gene = {}
  for gene in genes:
    window_count = len(gene) - width + 1
    for start in range(0, window_count):
      window = gene[start:start + width]
      dict_gene[window] = dict_gene.get(window, 0) + 1
  return dict_gene

In [None]:
# Length of the substrings (repeats) to count.
len_of_repeat_from_user=6
print("List of repeats and their occurences")
# Prints a dict mapping each substring of that length to its count over all genes.
print(repeat_in_entire_file(genes,len_of_repeat_from_user))

# **DETERMINING SPECIFIC REPEAT IN A GENE SEQUENCE**

In [17]:
def user_defined_repeat_in_genes(genes, repeat_from_user):
  '''
  Locate a user-supplied repeat inside every gene.
  Overlapping occurrences are all reported.
  Input: Gene list and repeat from user
  Output: Dictionary keyed by gene index; the value is the list of start
          positions of the repeat in that gene, or the string "None"
          when the repeat does not occur in it
  '''
  width = len(repeat_from_user)
  dict_repeat_pos_in_genes = {}
  for gene_index, gene in enumerate(genes):
    hits = [
        start
        for start in range(len(gene) - width + 1)
        if gene.startswith(repeat_from_user, start)
    ]
    dict_repeat_pos_in_genes[gene_index] = hits if hits else "None"
  return dict_repeat_pos_in_genes

In [18]:
repeat_from_user = "AA"
dict_repeat_pos_in_genes = user_defined_repeat_in_genes(genes, repeat_from_user)
print(f"Repeat from user is : {repeat_from_user}")
# Key 0 is the first gene of the dataset.
print(f"Repeat occurences in gene 1 : {dict_repeat_pos_in_genes[0]}")

Repeat from user is : AA
Repeat occurences in gene 1 : [7, 18, 98, 99, 108, 155, 214, 215, 230, 285, 286, 317, 318, 319, 348, 360, 375, 376, 483, 495, 520, 577, 592, 606, 613, 622, 641, 649, 657, 681, 682, 689, 767, 768, 780, 781, 799, 800, 812, 818, 828, 839, 845, 859]


# **CALCULATING HAMMING DISTANCE BETWEEN TWO STRINGS**

In [19]:
def compute_hamming_distance(gene1,gene2):
  '''
  Compute the Hamming distance between two genes, i.e. the number of
  positions at which the two sequences differ.
  Input: Two genes as strings of equal length
  Output: Hamming distance between them
  Raises: ValueError if the two sequences differ in length, because the
          Hamming distance is only defined for equal-length sequences
  '''
  # Previously a shorter gene2 raised IndexError and a shorter gene1
  # silently ignored the tail of gene2; reject both cases explicitly.
  if len(gene1) != len(gene2):
    raise ValueError("Hamming distance requires sequences of equal length")
  return sum(1 for base1, base2 in zip(gene1, gene2) if base1 != base2)

In [20]:
# Two equal-length example sequences to compare position by position.
gene1 = "CTACAGCAATACGATCATATGCGGATCCGCAGTGGCCGGTAGACACACGT"
gene2 = "CTACCCCGCTGCTCAATGACCGGGACTAAAGAGGCGAAGATTATGGTGTG"
hamming_distance = compute_hamming_distance(gene1, gene2)

print(f"Hamming distance is : {hamming_distance}")

Hamming distance is : 36
