# Where in the Genome Does DNA Replication Begin?

[Here](https://stepic.org/s/BrqnabRZ) is a link to the interactive text.

`genome` stores the genome of *Vibrio cholerae*, which is 1,108,250 nucleotides long.

In [39]:
# Read the delimited file and keep the cell at row 1, column 1 as the genome,
# then evaluate its length.
# NOTE(review): on Julia >= 0.7 `readdlm` lives in the DelimitedFiles stdlib
# and needs `using DelimitedFiles` first.
genome = readdlm("data/Vibrio_cholerae.txt")[1,1]
length(genome)

1108250

## Code Challenge (1.2 - Step 6)
* **Input**: Strings `text` and `pattern`.
* **Output**: How many times `pattern` appears in `text`.

In [40]:
"""
    pattern_count(text, pattern)

Return how many times `pattern` occurs in `text`, counting overlapping occurrences.

Every start position from 1 to `length(text) - length(pattern) + 1` is tried: the
slice of `text` that begins there and is as long as `pattern` is compared with
`pattern`. When `pattern` is longer than `text` the range of start positions is
empty and the result is 0.
"""
function pattern_count(text, pattern)
    width = length(pattern)
    starts = 1:(length(text) - width + 1)
    return count(start -> text[start:(start + width - 1)] == pattern, starts)
end

pattern_count (generic function with 1 method)

In [41]:
# The first two cells of the dataset are passed as `text` and `pattern`.
data = readdlm("data/dataset_2_6.txt")
pattern_count(data[1], data[2])

32

## Code Challenge (1.2 - Step 9)
* **Input**: A string `text` and an integer $k$.
* **Output**: All of the most frequent $k$-mers in `text`.

In [42]:
"""
    frequent_words(text, k)

Return the set of all most frequent `k`-mers in `text`.

Each window of length `k` in `text` is counted with `pattern_count`; every window
whose count equals the maximum is added to the result. When `text` is too short to
contain a single `k`-mer, an empty set is returned.
"""
function frequent_words(text, k)
    frequentPatterns = Set()
    lastStart = length(text) - k + 1
    # No window of length k fits: there is nothing to count, and `maximum`
    # would fail on an empty collection.
    if lastStart < 1
        return frequentPatterns
    end
    # count[i] is the number of occurrences of the k-mer starting at position i.
    count = [pattern_count(text, text[i:(i+k-1)]) for i in 1:lastStart]
    maxCount = maximum(count)
    for i in 1:lastStart
        if count[i] == maxCount
            push!(frequentPatterns, text[i:(i+k-1)])
        end
    end
    return frequentPatterns
end

frequent_words (generic function with 1 method)

In [43]:
# The first two cells of the dataset are passed as `text` and `k`.
data = readdlm("data/dataset_2_9.txt")
frequent_words(data[1], data[2])

Set(Any["CAGCATAGACAT"])

## Charging Station - Pattern to Number and Back
* **Input**: A DNA string `pattern`.
* **Output**: The 0-indexed position of `pattern` in the lexicographically ordered list of all DNA strings of the same length as `pattern`. Equivalent to reading `pattern` as a base-4 number (with digits 'A' = 0, 'C' = 1, 'G' = 2, 'T' = 3) and converting it to decimal.

In [44]:
"""
    pattern_to_number(pattern)

Return the integer obtained by reading `pattern` as a base-4 number, where each
symbol is turned into a digit by `symbol_to_number`.

The empty pattern maps to 0.
"""
function pattern_to_number(pattern)
    number = 0
    # Shift the digits seen so far one base-4 place to the left and append the
    # digit of the current symbol.
    for symbol in pattern
        number = 4 * number + symbol_to_number(symbol)
    end
    return number
end

pattern_to_number (generic function with 1 method)

* **Input**: A DNA char `symbol`.
* **Output**: 0, 1, 2, or 3 for 'A','C','G', or 'T', respectively.

In [45]:
"""
    symbol_to_number(symbol)

Return 0, 1, 2 or 3 for the characters 'A', 'C', 'G' or 'T', respectively.

Any other argument raises a `KeyError` carrying that argument.
"""
function symbol_to_number(symbol)
    if symbol === 'A'
        return 0
    elseif symbol === 'C'
        return 1
    elseif symbol === 'G'
        return 2
    elseif symbol === 'T'
        return 3
    end
    throw(KeyError(symbol))
end

symbol_to_number (generic function with 1 method)

In [46]:
# The first cell of the dataset is used as the DNA pattern to convert.
data = readdlm("data/dataset_3010_2.txt")[1]
pattern_to_number(data)

253165471356

* **Input**: Integers `index` and $k$.
* **Output**: The DNA $k$-mer corresponding to `index`.

In [47]:
"""
    number_to_pattern(index, k)

Return, as a `String`, the DNA `k`-mer whose base-4 value is `index`.

The last symbol comes from `number_to_symbol(rem(index, 4))`; the first `k - 1`
symbols come from the recursive call on `div(index, 4)`.

Throws an `ArgumentError` when `k < 1`. If `index` does not fit in `k` base-4
digits, the leftover value reaches `number_to_symbol` and its error propagates.
"""
function number_to_pattern(index, k)
    # Without this guard a `k` below 1 never meets the base case and the
    # recursion does not terminate.
    k >= 1 || throw(ArgumentError("k must be at least 1, got $k"))
    if k == 1
        # Wrap the Char in a String so every `k` yields the same return type.
        return string(number_to_symbol(index))
    end
    symbol = number_to_symbol(rem(index, 4))
    prefixPattern = number_to_pattern(div(index, 4), k - 1)
    return string(prefixPattern, symbol)
end

number_to_pattern (generic function with 1 method)

* **Input**: An integer $k$ in $\{0, 1, 2, 3\}$.
* **Output**: The DNA nucleotide corresponding to $k$: 'A', 'C', 'G', or 'T', respectively.

In [48]:
"""
    number_to_symbol(k)

Return 'A', 'C', 'G' or 'T' for `k` equal to 0, 1, 2 or 3, respectively.

Any other argument raises a `KeyError` carrying that argument.
"""
function number_to_symbol(k)
    # `enumerate` counts from 1, so the digit of each nucleotide is `position - 1`.
    for (position, nucleotide) in enumerate("ACGT")
        if isequal(k, position - 1)
            return nucleotide
        end
    end
    throw(KeyError(k))
end

number_to_symbol (generic function with 1 method)

In [49]:
# The first two cells of the dataset are passed as `index` and `k`.
data = readdlm("data/dataset_3010_4.txt")
number_to_pattern(data[1], data[2])

"AAACCGTCTT"

## Charging Station - The Frequency Array
* **Input**: A DNA string `text` and an integer $k$ representing the length of the patterns in `text`.
* **Output**: The [frequency array](https://stepic.org/s/r436d31P).

In [50]:
"""
    computing_frequencies(text, k)

Return the frequency array of `text` for `k`-mers: a vector of `4^k` counts in which
entry `j + 1` is the number of length-`k` windows of `text` whose
`pattern_to_number` value is `j`.
"""
function computing_frequencies(text, k)
    # `zeros` allocates and zero-fills in one step.
    frequencyArray = zeros(Int64, 4^k)
    for i = 1:(length(text) - k + 1)
        j = pattern_to_number(text[i:(i+k-1)])
        # Pattern numbers start at 0 while array indices start at 1.
        frequencyArray[j+1] += 1
    end
    return frequencyArray
end

computing_frequencies (generic function with 1 method)

In [51]:
# The first cell is passed as `text`; the second is converted to an Int and
# passed as `k`. The resulting frequency array is written to a file.
data = readdlm("data/dataset_2994_5.txt")
output = computing_frequencies(data[1], Int(data[2]))
writedlm("data/output.txt", output)

* **Input**: A string `text` and an integer $k$.
* **Output**: All of the most frequent $k$-mers in `text`.

In [52]:
"""
    faster_frequent_words(text, k)

Return the set of all most frequent `k`-mers in `text`, using the frequency array
built by `computing_frequencies`.

Every index of the frequency array whose count equals the maximum is converted back
to its `k`-mer with `number_to_pattern`. When no `k`-mer occurs in `text` at all
(the maximum count is 0), an empty set is returned.
"""
function faster_frequent_words(text, k)
    frequentPatterns = Set()
    frequencyArray = computing_frequencies(text, k)
    maxCount = maximum(frequencyArray)
    # A maximum of 0 means `text` holds no k-mer; without this guard every one
    # of the 4^k patterns would tie at 0 and be reported.
    if maxCount == 0
        return frequentPatterns
    end
    for (i, frequency) in enumerate(frequencyArray)
        if frequency == maxCount
            # Array indices start at 1 while pattern numbers start at 0.
            push!(frequentPatterns, number_to_pattern(i - 1, k))
        end
    end
    return frequentPatterns
end

faster_frequent_words (generic function with 1 method)

## Application
Now we can finally use `faster_frequent_words` to find common $k$-mers in oriC. Maybe one of these is the *DnaA* Box.

In [53]:
# The first cell of the file is used as the oriC sequence; list its most
# frequent 9-mers.
oriC = readdlm("data/oriC.txt")[1]
faster_frequent_words(oriC, 9)

Set(Any["CTCTTGATC","TCTTGATCA","ATGATCAAG","CTTGATCAT"])

## Code Challenge (1.3 - Step 2)
* **Input**: A DNA string, $\text{pattern}$.
* **Output**: $\overline{\text{pattern}}$, the reverse complement of $\text{pattern}$.

In [54]:
"""
    reverse_complement(pattern)

Return the reverse complement of the DNA string `pattern` as a `String`.

The symbols of `pattern` are visited from last to first and each is replaced by its
complement ('A' <-> 'T', 'C' <-> 'G'). The empty string maps to the empty string.
A symbol other than 'A', 'C', 'G' or 'T' raises a `KeyError`.
"""
function reverse_complement(pattern)
    complement = Dict('A' => 'T', 'T' => 'A', 'C' => 'G', 'G' => 'C')
    # Iterating instead of recursing keeps the call depth constant regardless
    # of the pattern length and builds the lookup table only once.
    return join(complement[symbol] for symbol in reverse(pattern))
end

reverse_complement (generic function with 1 method)

In [55]:
# long output; uncomment to view
# data = readdlm("data/dataset_3_2.txt")[1]
# reverse_complement(data)

## Code Challenge (1.3 - Step 5)
* **Input**: Two strings, `pattern` and `genome`.
* **Output**: A collection of integers specifying all starting positions where `pattern` appears as a substring of `genome` (0-indexed).

In [56]:
"""
    pattern_match(pattern, genome)

Return a `Vector{Int}` with the 0-indexed start positions of every occurrence of
`pattern` in `genome`, in increasing order. Overlapping occurrences are included.
"""
function pattern_match(pattern, genome)
    # A typed accumulator keeps the result a Vector{Int} rather than a Vector{Any}.
    positions = Int[]
    width = length(pattern)
    for i = 1:(length(genome) - width + 1)
        if genome[i:(i + width - 1)] == pattern
            # Julia indices start at 1; the reported positions start at 0.
            push!(positions, i - 1)
        end
    end
    return positions
end

pattern_match (generic function with 1 method)

In [57]:
# The first two cells of the dataset are passed as `pattern` and `genome`;
# the resulting positions are written to a file.
data = readdlm("data/dataset_3_5.txt")
output = pattern_match(data[1], data[2])
writedlm("data/output_3_5.txt", output)

## Application
Let's try running `pattern_match` on the *Vibrio cholerae* genome. Specifically, we'll be looking for instances of the 9-mer "CTTGATCAT" that we found earlier.

In [58]:
# `genome` is the global assigned from data/Vibrio_cholerae.txt in an earlier cell.
# The match positions are written to a file rather than displayed.
output = pattern_match("CTTGATCAT", genome)
writedlm("data/output_Vcholerae.txt", output)

## Code Challenge (1.4 - Step 5)
* **Input**: A string `genome`, and integers $k$, $L$, and $t$.
* **Output**: All distinct $k$-mers forming $(L,t)$-clumps in `genome`.

In [66]:
"""
    find_clumps(genome, k, L, t)

Return the set of distinct `k`-mers that occur at least `t` times inside some window
of length `L` of `genome`.

The frequency array of the first window is built with `computing_frequencies`. The
window is then moved one position at a time: the count of the `k`-mer that leaves on
the left is decremented and the count of the `k`-mer that enters on the right is
incremented, so only the entering `k`-mer needs to be re-checked against `t`.

When `genome` is shorter than `L`, or `k` is larger than `L`, no `k`-mer fits in any
window and an empty set is returned.
"""
function find_clumps(genome, k, L, t)
    frequentPatterns = Set()
    # Without this guard the slices below would reach outside `genome`.
    if L > length(genome) || k > L
        return frequentPatterns
    end
    frequencyArray = computing_frequencies(genome[1:L], k)
    # clump[j + 1] is true once the k-mer with pattern number j has reached `t`
    # occurrences in at least one window.
    clump = frequencyArray .>= t
    for i = 2:(length(genome) - L + 1)
        # k-mer starting at i - 1: it was in the previous window but not this one.
        firstPattern = genome[(i-1):(i+k-2)]
        index = pattern_to_number(firstPattern)
        frequencyArray[index+1] -= 1
        # k-mer ending at the last position of the current window: newly included.
        lastPattern = genome[(i+L-k):(i+L-1)]
        index = pattern_to_number(lastPattern)
        frequencyArray[index+1] += 1
        if frequencyArray[index+1] >= t
            clump[index+1] = true
        end
    end
    for i = 1:4^k
        if clump[i]
            # Array indices start at 1 while pattern numbers start at 0.
            push!(frequentPatterns, number_to_pattern(i - 1, k))
        end
    end
    return frequentPatterns
end

find_clumps (generic function with 1 method)

In [81]:
# Cells 1, 2, 4 and 6 of `data` (linear, column-major indexing) are passed as
# `genome`, `k`, `L` and `t`.
# NOTE(review): presumably the file has the genome on its first row and the three
# integers on its second, so cells 3 and 5 are padding -- verify against the dataset.
data = readdlm("data/dataset_4_5.txt")
find_clumps(data[1], data[2], data[4], data[6])

Set(Any["AGAACTCCA","ACGAAGAGC","AGGGACTGG","CAGATGCGG","CAGAATGAG","GAGGGACTG","ATAGTGCGA","GTCCCACGC","GGAGGGACT","AGTGTGCTT","TTCGGTAGT"])

## Application
Let's try using `find_clumps` to find 9-mers that form (500,3)-clumps in the *E. coli* genome. We print the number of different 9-mers.

In [84]:
# The first cell of the file is used as the genome; only the number of distinct
# 9-mers returned by `find_clumps` is evaluated.
data = readdlm("data/E-coli.txt")[1]
length(find_clumps(data, 9, 500, 3))

1904