In [0]:
def pattern_count(text, pattern):
  '''Finding pattern counts in text using k-mer sliding window'''
  count = 0
  for i in range(0,(len(text)-len(pattern) + 1)):
    if text[i: i + len(pattern)] == pattern:
      count = count + 1
  return count
  
text = "ACAACTATGCATACTATCGGGAACTATCCTACTAT"
pattern = "ACTAT"

pattern_count(text, pattern)  

4

In [0]:
def frequent_words(text, k):
  '''Finding k-mer with max number of repeats in text '''
  freq_patterns = {}
  for i in range(0, len(text)-k+1):
    pattern =  text[i:i+k]
    if pattern in freq_patterns:
      freq_patterns[pattern] += 1
    else:
      freq_patterns[pattern] = 1
  maxCount = 0
  freqKmers = []
  for kmer,count in freq_patterns.items():
    if count > maxCount:
      maxCount = count
      freaqKmers = kmer
    elif count == maxCount:
      freqKmers += [kmer]
  return freqKmers


text = "atcaatgatcaacgtaagcttctaagcatgatcaaggtgctcacacagtttatccacaacctgagtggatgacatcaagataggtcgttgtatctccttcctctcgtactctcatgaccacggaaagatgatcaagagaggatgatttcttggccatatcgcaatgaatacttgtgacttgtgcttccaattgacatcttcagcgccatattgcgctggccaaggtgacggagcgggattacgaaagcatgatcatggctgttgttctgtttatcttgttttgactgagacttgttaggatagacggtttttcatcactgactagccaaagccttactctgcctgacatcgaccgtaaattgataatgaatttacatgcttccgcgacgatttacctcttgatcatcgatccgattgaagatcttcaattgttaattctcttgcctcgactcatagccatgatgagctcttgatcatgtttccttaaccctctattttttacggaagaatgatcaagctgctgctcttgatcatcgtttc"
k = 9

%time frequent_words(text, k)
  
    
  

In [0]:
def frequent_words2(text, k):
  freq_patterns = {}
  for i in range(0, len(text)-k+1):
    pattern =  text[i:i+k]
    if pattern in freq_patterns:
      freq_patterns[pattern] += 1
    else:
      freq_patterns[pattern] = 1
  max_key = max(freq_patterns, key=lambda k: freq_patterns[k])
  max_val = freq_patterns[max_key]
  print("The max frequent %s-mer is %s with %s count" % (k, max_key, max_val))
  
text = "atcaatgatcaacgtaagcttctaagcatgatcaaggtgctcacacagtttatccacaacctgagtggatgacatcaagataggtcgttgtatctccttcctctcgtactctcatgaccacggaaagatgatcaagagaggatgatttcttggccatatcgcaatgaatacttgtgacttgtgcttccaattgacatcttcagcgccatattgcgctggccaaggtgacggagcgggattacgaaagcatgatcatggctgttgttctgtttatcttgttttgactgagacttgttaggatagacggtttttcatcactgactagccaaagccttactctgcctgacatcgaccgtaaattgataatgaatttacatgcttccgcgacgatttacctcttgatcatcgatccgattgaagatcttcaattgttaattctcttgcctcgactcatagccatgatgagctcttgatcatgtttccttaaccctctattttttacggaagaatgatcaagctgctgctcttgatcatcgtttc"
k = 9

%time frequent_words2(text, k)

The max frequent 9-mer is atgatcaag with 3 count
CPU times: user 616 µs, sys: 0 ns, total: 616 µs
Wall time: 913 µs


In [0]:
reverse_dict = {
    "A":"T",
    "G":"C",
    "T":"A",
    "C":"G"
}


def reverse_complement(text):
  '''reverse complementing the DNA string'''
  rev = " "
  text = text.upper()
  for i in range(0,len(text)):
    reverse = reverse_dict[text[i]]
    rev = reverse + rev
  return rev


print(reverse_complement("tgctcacacagtttatcca"))

TGGATAAACTGTGTGAGCA 


In [0]:
def pattern_matching(text, pattern):
  '''Find all occurences of pattern in a string and return matching 
  sites, brute force naive algorithm'''
  text = text.upper()
  count = 0
  sites = []
  for i in range(0,(len(text)-len(pattern) + 1)):
    if text[i: i + len(pattern)] == pattern:
      count = count + 1
      sites.append(i)
  print( f"The pattern {pattern} appears in {len(sites)} positions: {sites}")
  
text = "atcaatgatcaacgtaagcttctaagcatgatcaaggtgctcacacagtttatccacaacctgagtggatgacatcaagataggtcgttgtatctccttcctctcgtactctcatgaccacggaaagatgatcaagagaggatgatttcttggccatatcgcaatgaatacttgtgacttgtgcttccaattgacatcttcagcgccatattgcgctggccaaggtgacggagcgggattacgaaagcatgatcatggctgttgttctgtttatcttgttttgactgagacttgttaggatagacggtttttcatcactgactagccaaagccttactctgcctgacatcgaccgtaaattgataatgaatttacatgcttccgcgacgatttacctcttgatcatcgatccgattgaagatcttcaattgttaattctcttgcctcgactcatagccatgatgagctcttgatcatgtttccttaaccctctattttttacggaagaatgatcaagctgctgctcttgatcatcgtttc"
pattern = "ATGATCAAG"

%time pattern_matching(text, pattern)

The pattern ATGATCAAG appears in 3 positions: [27, 127, 508]
CPU times: user 980 µs, sys: 0 ns, total: 980 µs
Wall time: 795 µs


In [0]:
def frequent_words2(text, k, n):
  '''Finding k-mers with n or more number of repeats in text'''
  text = text.upper()
  freq_patterns = {}
  keys = []
  values = []
  for i in range(0, len(text)-k+1):
    pattern =  text[i:i+k]
    if pattern in freq_patterns:
      freq_patterns[pattern] += 1
    else:
      freq_patterns[pattern] = 1
  dict = {k : v for k,v in freq_patterns.items() if v >= n}
  print(f"The max frequent {k}-mers ith {n} or more number of repeats is {dict}")
  
text = "aactctatacctcctttttgtcgaatttgtgtgatttatagagaaaatcttattaactgaaactaaaatggtaggtttggtggtaggttttgtgtacattttgtagtatctgatttttaattacataccgtatattgtattaaattgacgaacaattgcatggaattgaatatatgcaaaacaaacctaccaccaaactctgtattgaccattttaggacaacttcagggtggtaggtttctgaagctctcatcaatagactattttagtctttacaaacaatattaccgttcagattcaagattctacaacgctgttttaatgggcgttgcagaaaacttaccacctaaaatccagtatccaagccgatttcagagaaacctaccacttacctaccacttacctaccacccgggtggtaagttgcagacattattaaaaacctcatcagaagcttgttcaaaaatttcaatactcgaaacctaccacctgcgtcccctattatttactactactaataatagcagtataattgatctga"
k = 9

frequent_words2(text, 6, 3)


The max frequent 6-mers ith 3 or more number of repeats is {'TTTTGT': 3, 'TATTAA': 3, 'TGGTAG': 3, 'GGTAGG': 3, 'GTAGGT': 3, 'TAGGTT': 3, 'AGGTTT': 3, 'GGTGGT': 3, 'GTGGTA': 3, 'AATTGA': 3, 'AAACCT': 4, 'AACCTA': 3, 'ACCTAC': 5, 'CCTACC': 5, 'CTACCA': 5, 'TACCAC': 6, 'ACCACC': 4, 'ACTTAC': 3, 'CTTACC': 3}


In [0]:
def clump_finding(text, ksize, Len, times):
  text = text.upper()
  chunks = len(text)
  clumps = [text[i:i+Len] for i in range(0, chunks, Len)]
  patterns = {frequent_words2(text, ksize, times) for c in clumps}
  return patterns

text = "aactctatacctcctttttgtcgaatttgtgtgatttatagagaaaatcttattaactgaaactaaaatggtaggtttggtggtaggttttgtgtacattttgtagtatctgatttttaattacataccgtatattgtattaaattgacgaacaattgcatggaattgaatatatgcaaaacaaacctaccaccaaactctgtattgaccattttaggacaacttcagggtggtaggtttctgaagctctcatcaatagactattttagtctttacaaacaatattaccgttcagattcaagattctacaacgctgttttaatgggcgttgcagaaaacttaccacctaaaatccagtatccaagccgatttcagagaaacctaccacttacctaccacttacctaccacccgggtggtaagttgcagacattattaaaaacctcatcagaagcttgttcaaaaatttcaatactcgaaacctaccacctgcgtcccctattatttactactactaataatagcagtataattgatctga"
k = 9

clump_finding(text, 3, 100, 4)


The max frequent 3-mers ith 4 or more number of repeats is {'AAC': 14, 'ACT': 13, 'CTC': 7, 'TCT': 9, 'CTA': 14, 'TAT': 17, 'ATA': 12, 'TAC': 19, 'ACC': 20, 'CCT': 11, 'TCC': 4, 'CTT': 8, 'TTT': 24, 'TTG': 14, 'TGT': 10, 'CGA': 4, 'GAA': 11, 'AAT': 17, 'ATT': 24, 'GTG': 6, 'TGA': 9, 'GAT': 6, 'TTA': 18, 'TAG': 9, 'AGA': 10, 'AAA': 23, 'ATC': 7, 'TAA': 11, 'CTG': 7, 'ATG': 4, 'TGG': 7, 'GGT': 10, 'GTA': 12, 'AGG': 5, 'GTT': 8, 'ACA': 9, 'CAT': 7, 'AGT': 5, 'CCG': 4, 'CGT': 4, 'GAC': 5, 'CAA': 13, 'TGC': 5, 'GCA': 5, 'CCA': 10, 'CAC': 6, 'TTC': 8, 'TCA': 10, 'CAG': 8, 'AAG': 5, 'AGC': 4}
The max frequent 3-mers ith 4 or more number of repeats is {'AAC': 14, 'ACT': 13, 'CTC': 7, 'TCT': 9, 'CTA': 14, 'TAT': 17, 'ATA': 12, 'TAC': 19, 'ACC': 20, 'CCT': 11, 'TCC': 4, 'CTT': 8, 'TTT': 24, 'TTG': 14, 'TGT': 10, 'CGA': 4, 'GAA': 11, 'AAT': 17, 'ATT': 24, 'GTG': 6, 'TGA': 9, 'GAT': 6, 'TTA': 18, 'TAG': 9, 'AGA': 10, 'AAA': 23, 'ATC': 7, 'TAA': 11, 'CTG': 7, 'ATG': 4, 'TGG': 7, 'GGT': 10, 'GTA': 1

NameError: ignored

In [0]:
def clump_finding(text, ksize, Len, times):
  '''Find patterns forming (L,t)-clumps in a string (text) such way that 
  there is an interval of L length in which this k-mer appears at least t times'''
  text = text.upper()
  chunks = len(text)
  clumps = [text[i:i+Len] for i in range(0, chunks, Len) ]
  for c in clumps:
    if len(c) < Len:
      break
    else:
      freq_patterns = {}
      keys = []
      values = []
      for i in range(0, len(c)-ksize+1):
        pattern =  text[i:i+ksize]
        if pattern in freq_patterns:
          freq_patterns[pattern] += 1
        else:
          freq_patterns[pattern] = 1
      dict = {k : v for k,v in freq_patterns.items() if v >= times}
      print(f"For interval there are {ksize}-mers: {dict.keys()} that forming ({Len,times})-clumps {dict}")

text = "aactctatacctcctttttgtcgaatttgtgtgatttatagagaaaatcttattaactgaaactaaaatggtaggtttggtggtaggttttgtgtacattttgtagtatctgatttttaattacataccgtatattgtattaaattgacgaacaattgcatggaattgaatatatgcaaaacaaacctaccaccaaactctgtattgaccattttaggacaacttcagggtggtaggtttctgaagctctcatcaatagactattttagtctttacaaacaatattaccgttcagattcaagattctacaacgctgttttaatgggcgttgcagaaaacttaccacctaaaatccagtatccaagccgatttcagagaaacctaccacttacctaccacttacctaccacccgggtggtaagttgcagacattattaaaaacctcatcagaagcttgttcaaaaatttcaatactcgaaacctaccacctgcgtcccctattatttactactactaataatagcagtataattgatctga"
k = 9

clump_finding(text, 9, 127,3)
  

For interval there are 9-mers: dict_keys([]) that forming ((127, 3))-clumps {}
For interval there are 9-mers: dict_keys([]) that forming ((127, 3))-clumps {}
For interval there are 9-mers: dict_keys([]) that forming ((127, 3))-clumps {}
For interval there are 9-mers: dict_keys([]) that forming ((127, 3))-clumps {}


In [0]:
text = "aactctatacctcctttttgtcgaatttgtgtgatttatagagaaaatcttattaactgaaactaaaatggtaggtttggtggtaggttttgtgtacattttgtagtatctgatttttaattacataccgtatattgtattaaattgacgaacaattgcatggaattgaatatatgcaaaacaaacctaccaccaaactctgtattgaccattttaggacaacttcagggtggtaggtttctgaagctctcatcaatagactattttagtctttacaaacaatattaccgttcagattcaagattctacaacgctgttttaatgggcgttgcagaaaacttaccacctaaaatccagtatccaagccgatttcagagaaacctaccacttacctaccacttacctaccacccgggtggtaagttgcagacattattaaaaacctcatcagaagcttgttcaaaaatttcaatactcgaaacctaccacctgcgtcccctattatttactactactaataatagcagtataattgatctga"
Len = 90
chunks, chunk_size = len(text), len(text)//Len
clumps = [text[i:i+Len] for i in range(0, chunks, Len) ]
print(clumps)

len(text)//Len

['aactctatacctcctttttgtcgaatttgtgtgatttatagagaaaatcttattaactgaaactaaaatggtaggtttggtggtaggttt', 'tgtgtacattttgtagtatctgatttttaattacataccgtatattgtattaaattgacgaacaattgcatggaattgaatatatgcaaa', 'acaaacctaccaccaaactctgtattgaccattttaggacaacttcagggtggtaggtttctgaagctctcatcaatagactattttagt', 'ctttacaaacaatattaccgttcagattcaagattctacaacgctgttttaatgggcgttgcagaaaacttaccacctaaaatccagtat', 'ccaagccgatttcagagaaacctaccacttacctaccacttacctaccacccgggtggtaagttgcagacattattaaaaacctcatcag', 'aagcttgttcaaaaatttcaatactcgaaacctaccacctgcgtcccctattatttactactactaataatagcagtataattgatctga']


6

In [0]:
def minimum_skew(seq):
  seq = seq.upper()
  x = 0
  y = []
  for i in range(len(seq)):
    if seq[i] == "G":
      x = x + 1
      y.append(x)
    elif seq[i] =="C":
      x = x - 1
      y.append(x)
    else:
      y.append(x)
  return y


def minimum_skew_gen(seq):
  seq = seq.upper()
  x = 0
  for i in range(len(seq)):
    if seq[i] == "G":
      x = x + 1
    elif seq[i] =="C":
      x = x - 1
    else:
      x = x
    yield x



text = "aactctatacctcctttttgtcgaatttgtgtgatttatagagaaaatcttattaactgaaactaaaatggtaggtttggtggtaggttttgtgtacattttgtagtatctgatttttaattacataccgtatattgtattaaattgacgaacaattgcatggaattgaatatatgcaaaacaaacctaccaccaaactctgtattgaccattttaggacaacttcagggtggtaggtttctgaagctctcatcaatagactattttagtctttacaaacaatattaccgttcagattcaagattctacaacgctgttttaatgggcgttgcagaaaacttaccacctaaaatccagtatccaagccgatttcagagaaacctaccacttacctaccacttacctaccacccgggtggtaagttgcagacattattaaaaacctcatcagaagcttgttcaaaaatttcaatactcgaaacctaccacctgcgtcccctattatttactactactaataatagcagtataattgatctga"
t1 = minimum_skew(text)
t2 = minimum_skew_gen(text)

print(t1)
print(list(t2))
    

[0, 0, -1, -1, -2, -2, -2, -2, -2, -3, -4, -4, -5, -6, -6, -6, -6, -6, -6, -5, -5, -6, -5, -5, -5, -5, -5, -5, -4, -4, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -1, 0, 0, 0, 1, 2, 2, 2, 2, 3, 4, 4, 5, 6, 6, 6, 7, 8, 8, 8, 8, 8, 9, 9, 10, 10, 10, 9, 9, 9, 9, 9, 9, 10, 10, 10, 11, 11, 11, 11, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 9, 8, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 10, 11, 11, 11, 10, 10, 10, 10, 10, 11, 10, 10, 10, 11, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 14, 13, 13, 13, 13, 13, 12, 12, 12, 12, 11, 10, 10, 10, 9, 8, 8, 7, 6, 6, 6, 6, 5, 5, 4, 4, 5, 5, 5, 5, 5, 6, 6, 5, 4, 4, 4, 4, 4, 4, 4, 5, 6, 6, 5, 5, 5, 4, 4, 4, 3, 3, 4, 5, 6, 6, 7, 8, 8, 8, 9, 10, 10, 10, 10, 9, 9, 10, 10, 10, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 

In [0]:
def hamming_distance(seq1, seq2):
  '''Finding Hamming distance between two sequences'''
  assert len(seq1) == len(seq2), "Two sequences not the same length!"
  mismatches = 0
  for i in range(len(seq1)):
    if seq1[i] == seq2[i]:
      continue
    else:
      mismatches = mismatches +1
  print(f'Between two sequences the Hamming distance is {mismatches}')
  
seq1 = 'AATTGGTTGG'
seq2 = 'AATTAATTGC'

hamming_distance(seq1, seq2)

AssertionError: ignored

In [0]:
len(seq1) == len(seq2)

True

In [0]:
def approximate_pattern_matching(text, pattern, d):
  '''Find all approximate occurrences of a pattern in a string'''
  positions = []
  count = 0
  for i in range(0,(len(text)-len(pattern) + 1)):
    for j in range(i,len(pattern)+1):
      if text[j] != pattern[j]:
        count += 1
        if count <= d:
          positions.append(i)
  return positions
  
  
def pattern_count(text, pattern):
  '''Finding pattern counts in text using k-mer sliding window'''
  count = 0
  for i in range(0,(len(text)-len(pattern) + 1)):
    if text[i: i + len(pattern)] == pattern:
      count = count + 1
  return count
  
  
  
  
  
  
  
  
  
  
  
seq1 ='AATTGGTTGG'
seq2 ='AAG'

approximate_pattern_matching(seq1, seq2, 1)  

IndexError: ignored