In [32]:
from collections import defaultdict
from typing import List, Dict, Set

### Pattern Count

In [3]:
def PatternCount(text: str, pattern: str) -> int:
    ''' 
    Returns the number of times that a k-mer Pattern appears as a substring of Text
    '''

    count = 0
    
    for i in range(len(text) - len(pattern)):
        if text[i:i + len(pattern)] == pattern:
            count += 1
            
    return count

In [4]:
PatternCount("GACCATCAAAACTGATAAACTACTTAAAAATCAGT", "AAA")

6

### Frequent Words Problem

In [11]:
def FrequencyTable(text: str, k: int) -> Dict[str, int]:
    ''' Returns a frequency table for all k-mers in text '''

    table = defaultdict(int)
    for i in range(len(text) - k):
        pattern = text[i:i + k]
        table[pattern] += 1
        
    return table

In [12]:
text = "ACGTTGCATGTCGCATGATGCATGAGAGCT"
k = 4

FrequencyTable(text, k)

defaultdict(int,
            {'ACGT': 1,
             'CGTT': 1,
             'GTTG': 1,
             'TTGC': 1,
             'TGCA': 2,
             'GCAT': 3,
             'CATG': 3,
             'ATGT': 1,
             'TGTC': 1,
             'GTCG': 1,
             'TCGC': 1,
             'CGCA': 1,
             'ATGA': 2,
             'TGAT': 1,
             'GATG': 1,
             'ATGC': 1,
             'TGAG': 1,
             'GAGA': 1,
             'AGAG': 1,
             'GAGC': 1})

In [13]:
def BetterFrequentWords(text: str, k: int) -> str:
    ''' Returns the most frequent k-mers in text '''

    table = FrequencyTable(text, k)

    max_count = max(table.values())

    ans = [key for key, value in table.items() if value == max_count]
    return "  ".join(ans)

In [15]:
text = "ACGTTGCATGTCGCATGATGCATGAGAGCT"
k = 4

BetterFrequentWords(text, k)

'GCAT  CATG'

### Reverse complement

In [17]:
def get_compliment(original: str) -> str:
    ''' Returns a reverse compiment for the original string'''
    
    original = original.upper()
    
    d = {"A": "T", "T": "A", "G": "C", "C": "G"}

    compliment = []

    for item in original:
        compliment.append(d[item])
        
    ans = "".join(compliment[::-1])
    return ans

In [20]:
original = "AAAACCCGGT"
right_ans = "ACCGGGTTTT"

ans = get_compliment(original)
print(ans)
print(ans == right_ans)

ACCGGGTTTT
True


### Pattern Matching Problem

In [22]:
def findStartingPositions(pattern: str, genome: str) -> List[int]:
    ''' Returns indices list of all occurrences of a pattern in a string '''

    indices = []
    
    for i in range(len(genome) - len(pattern) + 1):
        if genome[i:i + len(pattern)] == pattern:
            indices.append(str(i))
            
    return " ".join(indices)

In [24]:
pattern = "ATAT"
genome = "GATATATGCATATACTT"

starting_positions = findStartingPositions(pattern, genome)
print(starting_positions)
# right answer: 1, 3, 9

1 3 9


### The Clump Finding Problem

In [28]:
def FindClumps(text: str, k: int, L: int, t: int) -> Set[str]:
    ''' Finds k-mers forming (L, t) clumps in text
    
    Parameters:
        text: gemone in string format
        k: k-mer length
        L: length of an interval of the genome
        t: how many times a k-mer should appear within L interval
        
    Returns:    
        clumps: a set with k-mers that form (L, t) clumps
    
    '''
    
    clumps = set()
    table = FrequencyTable(text[0:L], k)
    clumps.update([key for key, value in table.items() if value == t])    
    
    for i in range(1, len(text) - L):        
        pattern_1 = text[i - 1:i - 1 + k]
        pattern_2 = text[i + L - k:i + L]
        
        table[pattern_1] -= 1
        table[pattern_2] += 1
        
        if table[pattern_1] == t:
            clumps.add(pattern_1)
        if table[pattern_2] == t:
            clumps.add(pattern_2)
        
    return clumps 

In [31]:
text = "CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA"
k, L, t = 5, 50, 4

ans = FindClumps(text, k, L, t)
print(" ".join(ans))

# right answer: GAAGA CGACA

GAAGA CGACA
