## Chapter 1

#### BA1A | Compute the Number of Times a Pattern Appears in a Text

In [1]:
def PatternCount(text, pattern):
    """Count how many times ``pattern`` occurs in ``text``.

    Every start index is tested, so overlapping occurrences are each
    counted.

    Args:
        text: String to search in.
        pattern: Substring to look for.

    Returns:
        The number of start positions at which ``pattern`` matches.
    """
    width = len(pattern)
    last_start = len(text) - width
    return sum(
        1
        for start in range(last_start + 1)
        if text[start:start + width] == pattern
    )

In [2]:
# Sample check: 'GCG' starts at indices 0 and 2 of 'GCGCG', so the count is 2.
PatternCount('GCGCG','GCG')

2

#### BA1B | Find the Most Frequent Words in a String

In [3]:
def MostFreqKmer(text, k):
    """Print every most-frequent k-mer of ``text``, one per line.

    Overlapping occurrences are counted. When several k-mers tie for the
    highest count, all of them are printed. Nothing is printed when
    ``text`` is shorter than ``k``, because there are no k-mers to count.

    Args:
        text: String to scan.
        k: Length of the substrings to count.

    Returns:
        None; the result is written to standard output.
    """
    counts = {}
    for i in range(len(text) - k + 1):
        kmer = text[i:i + k]
        counts[kmer] = counts.get(kmer, 0) + 1
    # Guard: with no k-mers there is no maximum to look up.
    if not counts:
        return
    max_count = max(counts.values())
    for kmer in counts:
        if counts[kmer] == max_count:
            # Parenthesised single-argument form runs on Python 2 and 3.
            print(kmer)
    

In [4]:
# Sample run: prints the 4-mers that occur most often in the text.
MostFreqKmer('ACGTTGCATGTCGCATGATGCATGAGAGCT', 4)

CATG
GCAT


#### BA1C | Find the Reverse Complement of a String

In [5]:
def ReverseComplement(dna):
    """Return the reverse complement of a DNA string.

    A pairs with T and G pairs with C. Any symbol other than the four
    upper-case letters A, T, G, C is left out of the result.

    Args:
        dna: Sequence of nucleotide characters.

    Returns:
        The complemented sequence, read back to front.
    """
    pairing = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    complemented = ''.join(pairing[nt] for nt in dna if nt in pairing)
    return complemented[::-1]

In [6]:
# Sample check: the reverse complement of 'AAAACCCGGT' is 'ACCGGGTTTT'.
ReverseComplement('AAAACCCGGT')

'ACCGGGTTTT'

#### BA1D | Find All Occurrences of a Pattern in a String

In [7]:
def AllOccurrences(pattern, genome):
    """Print the 0-based start index of every occurrence of ``pattern``.

    Overlapping occurrences are reported. The indices are written on a
    single line separated by spaces; nothing is printed when there is no
    match.

    Args:
        pattern: Substring to look for.
        genome: String to search in.

    Returns:
        None; the result is written to standard output.
    """
    width = len(pattern)
    starts = [
        str(i)
        for i in range(len(genome) - width + 1)
        if genome[i:i + width] == pattern
    ]
    if starts:
        # One print call with a single argument works on Python 2 and 3,
        # unlike the Python 2 only "print i," statement.
        print(' '.join(starts))

In [8]:
# Sample run; note that the argument order is (pattern, genome).
AllOccurrences('ATAT', 'GATATATGCATATACTT')

1 3 9


#### BA1E | Find Patterns Forming Clumps in a String

In [9]:
def FindClumps(genome, k, L, t):
    """Find k-mers that form an (L, t)-clump somewhere in ``genome``.

    A k-mer qualifies when at least one window of ``L`` consecutive
    characters contains it ``t`` or more times (overlaps counted). Each
    qualifying k-mer is printed on its own line and the whole collection
    is returned.

    Args:
        genome: String to scan.
        k: Length of the k-mers.
        L: Length of the sliding window.
        t: Minimum number of occurrences inside one window.

    Returns:
        A set with every qualifying k-mer. It is empty when the genome is
        shorter than ``L`` or when no window can hold a k-mer (``k > L``).
    """
    clumps = set()
    # Slide a window of size L across the genome.
    for i in range(len(genome) - L + 1):
        window = genome[i:i + L]
        counts = {}
        # Count every k-mer inside the current window.
        for j in range(len(window) - k + 1):
            kmer = window[j:j + k]
            counts[kmer] = counts.get(kmer, 0) + 1
        # Filtering directly needs no sorting and is safe when the window
        # produced no k-mers at all.
        for kmer, count in counts.items():
            if count >= t:
                clumps.add(kmer)
    for c in clumps:
        print(c)
    return clumps

In [10]:
# Sample run with k=5, L=75, t=4: 5-mers seen at least 4 times inside some 75-character window.
FindClumps('CGGACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAATGTACGAATGTAATGTGCCTATGGC', 5, 75, 4)

CGACA
GAAGA
AATGT


{'AATGT', 'CGACA', 'GAAGA'}

#### BA1F | Find a Position in a Genome Minimizing the Skew

In [11]:
# Original author's note: this ran really slowly inside Jupyter Notebook
# but was fine when run from bash.
def MinimumSkew(genome):
    """Return every position at which the G-C skew of ``genome`` is minimal.

    The skew starts at 0 and, reading left to right, goes up by one for
    each 'G' and down by one for each 'C'; other symbols leave it
    unchanged. Position ``i`` refers to the skew after the first ``i``
    symbols, so valid positions run from 0 (nothing read yet) to
    ``len(genome)``.

    Args:
        genome: Sequence of nucleotide characters.

    Returns:
        Ascending list of positions whose skew equals the overall minimum.
        An empty genome yields ``[0]``.
    """
    skew = 0
    # Position 0 (the empty prefix, skew 0) is a legitimate candidate, so
    # the search starts from it instead of from an arbitrary sentinel.
    min_skew = 0
    positions = [0]
    for position, nt in enumerate(genome, 1):
        if nt == 'G':
            skew += 1
        elif nt == 'C':
            skew -= 1
        if skew < min_skew:
            # New minimum: earlier positions no longer qualify.
            min_skew = skew
            positions = [position]
        elif skew == min_skew:
            positions.append(position)
    return positions

In [12]:
# Sample run: positions at which the running G-C skew of this genome is lowest.
MinimumSkew('CCTATCGGTGGATTAGCATGTCCCTGTACGTTTCGCCGCGAACTAGTTCACACGGCTTGATGGCAAATGGTTTTTCCGGCGACCGTAATCGTCCACCGAG')

[53, 97]

#### BA1G | Compute the Hamming Distance Between Two Strings

In [13]:
def HammingDist(string1, string2):
    """Count the positions at which two strings differ.

    Each character of ``string1`` is compared with the character at the
    same index of ``string2``; characters of ``string2`` beyond the length
    of ``string1`` are not examined.

    Args:
        string1: First string; its length sets how many positions are compared.
        string2: Second string, indexed at every position of ``string1``.

    Returns:
        Number of mismatching positions.

    Raises:
        IndexError: If ``string2`` is shorter than ``string1``.
    """
    return sum(
        1 for index, symbol in enumerate(string1) if symbol != string2[index]
    )

In [14]:
# Sample check: the two strings differ at 3 positions.
HammingDist('GGGCCGTTGGT','GGACCGTTGAC')

3

#### BA1H | Find All Approximate Occurrences of a Pattern in a String

In [15]:
def HammingDist(string1, string2):
    """Count the positions at which two strings differ.

    Each character of ``string1`` is compared with the character at the
    same index of ``string2``; characters of ``string2`` beyond the length
    of ``string1`` are not examined.

    Args:
        string1: First string; its length sets how many positions are compared.
        string2: Second string, indexed at every position of ``string1``.

    Returns:
        Number of mismatching positions.

    Raises:
        IndexError: If ``string2`` is shorter than ``string1``.
    """
    return sum(
        1 for index, symbol in enumerate(string1) if symbol != string2[index]
    )

In [16]:
def ApproxPatternMatching(pattern, text, d):
    """Print the start indices where ``pattern`` approximately matches ``text``.

    A window of ``len(pattern)`` characters matches when ``HammingDist``
    reports at most ``d`` mismatches against ``pattern``. Indices are
    0-based and written on one line separated by spaces; nothing is
    printed when no window matches.

    Args:
        pattern: String to look for.
        text: String to search in.
        d: Maximum number of mismatches allowed.

    Returns:
        None; the result is written to standard output.
    """
    width = len(pattern)
    hits = [
        str(i)
        for i in range(len(text) - width + 1)
        if HammingDist(pattern, text[i:i + width]) <= d
    ]
    if hits:
        # One print call with a single argument works on Python 2 and 3,
        # unlike the Python 2 only "print i," statement.
        print(' '.join(hits))

In [17]:
# Sample run: start indices where 'ATTCTGGA' matches the text with at most 3 mismatches.
ApproxPatternMatching('ATTCTGGA','CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAATGCCTAGCGGCTTGTGGTTTCTCCTACGCTCC', 3)

6 7 26 27 78


#### BA1I | Find the Most Frequent Words with Mismatches in a String

In [18]:
def HammingDist(string1, string2):
    """Count the positions at which two strings differ.

    Each character of ``string1`` is compared with the character at the
    same index of ``string2``; characters of ``string2`` beyond the length
    of ``string1`` are not examined.

    Args:
        string1: First string; its length sets how many positions are compared.
        string2: Second string, indexed at every position of ``string1``.

    Returns:
        Number of mismatching positions.

    Raises:
        IndexError: If ``string2`` is shorter than ``string1``.
    """
    return sum(
        1 for index, symbol in enumerate(string1) if symbol != string2[index]
    )

In [19]:
def Neighbors(pattern, d):
    """Return all strings within Hamming distance ``d`` of ``pattern``.

    Candidates are built over the alphabet A, C, G, T and have the same
    length as ``pattern``. A symbol of ``pattern`` outside that alphabet is
    also allowed at its own position, so the pattern itself is always part
    of the result. An empty pattern yields ``{''}`` rather than recursing
    without end.

    Args:
        pattern: String whose neighborhood is wanted.
        d: Maximum number of mismatching positions.

    Returns:
        A set of strings, each differing from ``pattern`` in at most ``d``
        positions.
    """
    if d == 0:
        return {pattern}
    alphabet = ('A', 'C', 'G', 'T')
    # Maps each candidate suffix to its mismatch count against the matching
    # suffix of ``pattern``; tracking the count avoids recomputing a Hamming
    # distance for every candidate.
    partial = {'': 0}
    for symbol in reversed(pattern):
        choices = alphabet if symbol in alphabet else alphabet + (symbol,)
        extended = {}
        for text, mismatches in partial.items():
            for base in choices:
                cost = mismatches + (base != symbol)
                # Drop candidates that already exceed the mismatch budget.
                if cost <= d:
                    extended[base + text] = cost
        partial = extended
    return set(partial)

In [20]:
def MostFreqWordsWithMismatch(text, k, d):
    """Print the k-mers occurring most often in ``text`` with up to ``d`` mismatches.

    Every k-length window of ``text`` adds one to the tally of each string
    returned by ``Neighbors(window, d)``. All strings tied for the highest
    tally are printed, one per line. Nothing is printed when ``text`` is
    shorter than ``k``.

    Args:
        text: String to scan.
        k: Length of the k-mers.
        d: Maximum number of mismatches per occurrence.

    Returns:
        None; the result is written to standard output.
    """
    counts = {}
    for i in range(len(text) - k + 1):
        for neighbor in Neighbors(text[i:i + k], d):
            counts[neighbor] = counts.get(neighbor, 0) + 1
    # Guard: with no windows there is no maximum to look up.
    if not counts:
        return
    max_count = max(counts.values())
    for kmer in counts:
        if counts[kmer] == max_count:
            # Parenthesised single-argument form runs on Python 2 and 3.
            print(kmer)

In [21]:
# Sample run: most frequent 4-mers when each occurrence may contain up to 1 mismatch.
MostFreqWordsWithMismatch('ACGTTGCATGTCGCATGATGCATGAGAGCT', 4, 1)

GATG
ATGC
ATGT


#### BA1J | Find Frequent Words with Mismatches and Reverse Complements

In [22]:
def ReverseComplement(dna):
    """Return the reverse complement of a DNA string.

    A pairs with T and G pairs with C. Any symbol other than the four
    upper-case letters A, T, G, C is left out of the result.

    Args:
        dna: Sequence of nucleotide characters.

    Returns:
        The complemented sequence, read back to front.
    """
    pairing = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    complemented = ''.join(pairing[nt] for nt in dna if nt in pairing)
    return complemented[::-1]

In [23]:
def HammingDist(string1, string2):
    """Count the positions at which two strings differ.

    Each character of ``string1`` is compared with the character at the
    same index of ``string2``; characters of ``string2`` beyond the length
    of ``string1`` are not examined.

    Args:
        string1: First string; its length sets how many positions are compared.
        string2: Second string, indexed at every position of ``string1``.

    Returns:
        Number of mismatching positions.

    Raises:
        IndexError: If ``string2`` is shorter than ``string1``.
    """
    return sum(
        1 for index, symbol in enumerate(string1) if symbol != string2[index]
    )

In [24]:
def Neighbors(pattern, d):
    """Return all strings within Hamming distance ``d`` of ``pattern``.

    Candidates are built over the alphabet A, C, G, T and have the same
    length as ``pattern``. A symbol of ``pattern`` outside that alphabet is
    also allowed at its own position, so the pattern itself is always part
    of the result. An empty pattern yields ``{''}`` rather than recursing
    without end.

    Args:
        pattern: String whose neighborhood is wanted.
        d: Maximum number of mismatching positions.

    Returns:
        A set of strings, each differing from ``pattern`` in at most ``d``
        positions.
    """
    if d == 0:
        return {pattern}
    alphabet = ('A', 'C', 'G', 'T')
    # Maps each candidate suffix to its mismatch count against the matching
    # suffix of ``pattern``; tracking the count avoids recomputing a Hamming
    # distance for every candidate.
    partial = {'': 0}
    for symbol in reversed(pattern):
        choices = alphabet if symbol in alphabet else alphabet + (symbol,)
        extended = {}
        for text, mismatches in partial.items():
            for base in choices:
                cost = mismatches + (base != symbol)
                # Drop candidates that already exceed the mismatch budget.
                if cost <= d:
                    extended[base + text] = cost
        partial = extended
    return set(partial)

In [25]:
def MostFreqWordsWithMismatchAndReverseComplements(text, k, d):
    """Print the most frequent k-mers, counting mismatches and reverse complements.

    Every k-length window of ``text`` adds one to the tally of each string
    in ``Neighbors(window, d)`` and one to the tally of each string in
    ``Neighbors(ReverseComplement(window), d)``. All strings tied for the
    highest tally are printed, one per line. Nothing is printed when
    ``text`` is shorter than ``k``.

    Args:
        text: String to scan.
        k: Length of the k-mers.
        d: Maximum number of mismatches per occurrence.

    Returns:
        None; the result is written to standard output.
    """
    counts = {}
    for i in range(len(text) - k + 1):
        window = text[i:i + k]
        # The window and its reverse complement are tallied the same way.
        for source in (window, ReverseComplement(window)):
            for neighbor in Neighbors(source, d):
                counts[neighbor] = counts.get(neighbor, 0) + 1
    # Guard: with no windows there is no maximum to look up.
    if not counts:
        return
    max_count = max(counts.values())
    for kmer in counts:
        if counts[kmer] == max_count:
            # Parenthesised single-argument form runs on Python 2 and 3.
            print(kmer)

In [26]:
# Sample run: most frequent 4-mers with up to 1 mismatch, counting reverse-complement occurrences too.
MostFreqWordsWithMismatchAndReverseComplements('ACGTTGCATGTCGCATGATGCATGAGAGCT', 4, 1)

ATGT
ACAT


#### BA1K | Generate the Frequency Array of a String

In [27]:
def PatternToNum(pattern):
    """Convert a DNA string to its base-4 number.

    The symbols are read as base-4 digits with A=0, C=1, G=2, T=3, the
    first symbol being the most significant. Any other symbol contributes
    0 at its position.

    Args:
        pattern: String of nucleotide characters.

    Returns:
        The integer encoded by ``pattern``; 0 for an empty string.
    """
    digit_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    num = 0
    # Horner's scheme: shift the digits read so far one base-4 place left,
    # then add the new digit.
    for symbol in pattern:
        num = 4 * num + digit_of.get(symbol, 0)
    return num

In [28]:
def FrequencyArray(text, k):
    """Build the frequency array of all k-mers in ``text``.

    The array has ``4**k`` entries. The entry at index
    ``PatternToNum(kmer)`` holds how many times that k-mer occurs in
    ``text``, overlaps included.

    Args:
        text: String to scan.
        k: Length of the k-mers.

    Returns:
        List of ``4**k`` integer counts.
    """
    tallies = [0] * (4 ** k)
    window_count = len(text) - k + 1
    for start in range(window_count):
        slot = PatternToNum(text[start:start + k])
        tallies[slot] += 1
    return tallies

In [29]:
# Sample run: counts for all 16 possible 2-mers, indexed by PatternToNum.
FrequencyArray('ACGCGGCTCTGAAA', 2)

[2, 1, 0, 0, 0, 0, 2, 2, 1, 2, 1, 0, 0, 1, 1, 0]

#### BA1L | Implement PatternToNumber

In [30]:
def PatternToNum(pattern):
    """Convert a DNA string to its base-4 number.

    The symbols are read as base-4 digits with A=0, C=1, G=2, T=3, the
    first symbol being the most significant. Any other symbol contributes
    0 at its position.

    Args:
        pattern: String of nucleotide characters.

    Returns:
        The integer encoded by ``pattern``; 0 for an empty string.
    """
    digit_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    num = 0
    # Horner's scheme: shift the digits read so far one base-4 place left,
    # then add the new digit.
    for symbol in pattern:
        num = 4 * num + digit_of.get(symbol, 0)
    return num

In [31]:
# Sample check: 'AGT' -> 0*16 + 2*4 + 3 = 11.
PatternToNum('AGT')

11

#### BA1M | Implement NumberToPattern

In [32]:
def NumToPattern(integer,k):
    """Convert a number to the DNA string of length ``k`` that encodes it.

    ``integer`` is read as a base-4 number whose digits map to A=0, C=1,
    G=2, T=3, most significant digit first. Only the lowest ``k`` base-4
    digits are used; missing high digits come out as 'A'.

    Args:
        integer: Integer to decode.
        k: Length of the resulting string.

    Returns:
        String of ``k`` nucleotide characters.
    """
    symbols = 'ACGT'
    num_str = ""
    for i in range(k):
        # Floor division keeps the digit an integer on Python 3 as well;
        # with "/" the digit becomes a float there and matches no symbol.
        digit = (integer // (4 ** (k - i - 1))) % 4
        num_str += symbols[digit]
    return num_str

In [33]:
# Sample check: 45 = 0*64 + 2*16 + 3*4 + 1, i.e. the digits A, G, T, C.
NumToPattern(45, 4)

'AGTC'

#### BA1N | Generate the d-Neighborhood of a String

In [34]:
def HammingDist(string1, string2):
    """Count the positions at which two strings differ.

    Each character of ``string1`` is compared with the character at the
    same index of ``string2``; characters of ``string2`` beyond the length
    of ``string1`` are not examined.

    Args:
        string1: First string; its length sets how many positions are compared.
        string2: Second string, indexed at every position of ``string1``.

    Returns:
        Number of mismatching positions.

    Raises:
        IndexError: If ``string2`` is shorter than ``string1``.
    """
    return sum(
        1 for index, symbol in enumerate(string1) if symbol != string2[index]
    )

In [35]:
def Neighbors(pattern, d):
    """Return all strings within Hamming distance ``d`` of ``pattern``.

    Candidates are built over the alphabet A, C, G, T and have the same
    length as ``pattern``. A symbol of ``pattern`` outside that alphabet is
    also allowed at its own position, so the pattern itself is always part
    of the result. An empty pattern yields ``{''}`` rather than recursing
    without end.

    Args:
        pattern: String whose neighborhood is wanted.
        d: Maximum number of mismatching positions.

    Returns:
        A set of strings, each differing from ``pattern`` in at most ``d``
        positions.
    """
    if d == 0:
        return {pattern}
    alphabet = ('A', 'C', 'G', 'T')
    # Maps each candidate suffix to its mismatch count against the matching
    # suffix of ``pattern``; tracking the count avoids recomputing a Hamming
    # distance for every candidate.
    partial = {'': 0}
    for symbol in reversed(pattern):
        choices = alphabet if symbol in alphabet else alphabet + (symbol,)
        extended = {}
        for text, mismatches in partial.items():
            for base in choices:
                cost = mismatches + (base != symbol)
                # Drop candidates that already exceed the mismatch budget.
                if cost <= d:
                    extended[base + text] = cost
        partial = extended
    return set(partial)

In [36]:
# Show the 1-neighborhood of 'ACG' on a single line (set order is arbitrary).
# A single-argument print call runs on Python 2 and 3, unlike "print n,".
print(' '.join(Neighbors('ACG', 1)))

ACC ATG AAG ACG GCG AGG ACA ACT TCG CCG
