### Algorithms for DNA Sequencing: Programming Homework 4

In a practical, we saw a function for finding the longest exact overlap (suffix/prefix match) between two strings. The function is copied below.

In [None]:

def overlap(a, b, min_length=3):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists,
        return 0. """
    if len(b) < min_length:
        # b[:min_length] would silently shrink to all of b and let an
        # overlap shorter than min_length be reported.
        return 0
    seed = b[:min_length]  # every qualifying overlap starts with this prefix of b
    start = a.find(seed)  # leftmost hit corresponds to the longest suffix of a
    while start != -1:
        # found occurrence; check for full suffix/prefix match
        if b.startswith(a[start:]):
            return len(a) - start
        start = a.find(seed, start + 1)  # resume just past the failed hit
    return 0  # no (more) occurrences of the seed in a

import itertools

def scs(ss):
    """ Returns shortest common superstring of given
        strings, which must be the same length.

        Every ordering of the strings is tried; within an ordering each
        string is glued to its predecessor using their longest
        suffix/prefix overlap.  The first shortest result found is
        returned.  An empty input yields ''. """
    ss = list(ss)  # accept any iterable, not only sized sequences
    if not ss:
        return ''  # nothing to cover; indexing ssperm[0] would fail
    shortest_sup = None
    for ssperm in itertools.permutations(ss):
        pieces = [ssperm[0]]  # superstring starts as first string
        for left, right in zip(ssperm, ssperm[1:]):
            # overlap adjacent strings in the permutation
            olen = overlap(left, right, min_length=1)
            # keep only the non-overlapping portion of the right string
            pieces.append(right[olen:])
        sup = ''.join(pieces)
        # strict '<' keeps the first superstring among equally short ones
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup
    return shortest_sup


It's possible for there to be multiple different shortest common superstrings for the same set of input strings. Consider the input strings ABC, BCA, CAB. One shortest common superstring is ABCAB but another is BCABC and another is CABCA.



Q1: What is the length of the shortest common superstring of the following strings?
CCT, CTT, TGC, TGG, GAT, ATT

In [None]:
def overlap(a, b, min_length=1):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists,
        return 0. """
    if len(b) < min_length:
        # b[:min_length] would silently shrink to all of b and let an
        # overlap shorter than min_length be reported.
        return 0
    seed = b[:min_length]  # every qualifying overlap starts with this prefix of b
    start = a.find(seed)  # leftmost hit corresponds to the longest suffix of a
    while start != -1:
        # found occurrence; check for full suffix/prefix match
        if b.startswith(a[start:]):
            return len(a) - start
        start = a.find(seed, start + 1)  # resume just past the failed hit
    return 0  # no (more) occurrences of the seed in a

import itertools

def scs(ss):
    """ Returns shortest common superstring of given
        strings, which must be the same length.

        Every ordering of the strings is tried; within an ordering each
        string is glued to its predecessor using their longest
        suffix/prefix overlap.  The first shortest result found is
        returned.  An empty input yields ''. """
    ss = list(ss)  # accept any iterable, not only sized sequences
    if not ss:
        return ''  # nothing to cover; indexing ssperm[0] would fail
    shortest_sup = None
    for ssperm in itertools.permutations(ss):
        pieces = [ssperm[0]]  # superstring starts as first string
        for left, right in zip(ssperm, ssperm[1:]):
            # overlap adjacent strings in the permutation
            olen = overlap(left, right, min_length=1)
            # keep only the non-overlapping portion of the right string
            pieces.append(right[olen:])
        sup = ''.join(pieces)
        # strict '<' keeps the first superstring among equally short ones
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup
    return shortest_sup

In [None]:
# Shortest common superstring example
scs(['ABC', 'BCA', 'CAB'] )

'ABCAB'

In [None]:
# problem 1: Shortest Common super string 
scs(['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT'])

'CCTTGGATTGC'

Q2: How many different shortest common superstrings are there for the input strings given in the previous question?

Hint 1: You can modify the scs function to keep track of this. 

Hint 2: You can look at these examples to double-check that your modified scs is working as expected. 

Examples for Q2:

In [None]:
# This function is given in the assignment
from scs import scs

# You have to implement this function, or something like it
from scs_list import scs_list

Example 1:

In [None]:
strings = ['ABC', 'BCA', 'CAB']

In [None]:
# Returns just one shortest superstring
scs(strings)

Expected output: 'ABCAB'

In [None]:
# Returns list of all superstrings that are tied for shortest
scs_list(strings)

Expected output: ['ABCAB', 'BCABC', 'CABCA']

Example 2:

In [None]:
strings = ['GAT', 'TAG', 'TCG', 'TGC', 'AAT', 'ATA']

In [None]:
# Returns just one shortest superstring
scs(strings)

Expected output: 'TCGATGCAATAG'

In [None]:
# Returns list of all superstrings that are tied for shortest
scs_list(strings)

Expected output: ['AATAGATCGTGC',
 'AATAGATGCTCG',
 'AATAGTCGATGC',
 'AATCGATAGTGC',
 'AATGCTCGATAG',
 'TCGAATAGATGC',
 'TCGATAGAATGC',
 'TCGATGCAATAG',
 'TGCAATAGATCG',
 'TGCAATCGATAG']

Solution for Q2:

In [None]:
def pick_maximal_overlap(reads, k):
    """Find the ordered pair of reads with the longest suffix/prefix overlap.

    reads -- collection of read strings
    k     -- minimum overlap length that counts

    Returns (read_a, read_b, overlap_length).  Among equally long overlaps
    the pair met first wins; (None, None, 0) means no pair overlaps by at
    least k characters.
    """
    best_pair = (None, None)
    best_len = 0
    # permutations(reads, 2) yields both (x, y) and (y, x), so each direction is scored
    for pair in itertools.permutations(reads, 2):
        pair_len = overlap(pair[0], pair[1], min_length=k)
        if pair_len <= best_len:
            continue  # not an improvement; keep the earlier winner
        best_pair, best_len = pair, pair_len
    return best_pair[0], best_pair[1], best_len

In [None]:
def greedy_scs(reads, k):
    """Assemble 'reads' with the greedy shortest-common-superstring heuristic.

    The pair of reads with the longest overlap (at least k characters) is
    merged repeatedly until no such pair remains; whatever is left is
    concatenated and returned.  The intermediate read lists are printed
    as a trace of each merge step.

    reads -- iterable of read strings; it is not modified
    k     -- minimum overlap length for two reads to be merged
    """
    # Work on a private copy: removing/appending on the caller's list
    # left it collapsed, so a second call on the same list did nothing.
    reads = list(reads)
    read_a, read_b, olen = pick_maximal_overlap(reads, k)  # best pair to merge first
    while olen > 0:  # stop once no pair overlaps by at least k
        print('reads', reads)
        reads.remove(read_a)  # take both reads out ...
        reads.remove(read_b)
        print('reads1', reads)  # ... leaving the reads minus a and b
        reads.append(read_a + read_b[olen:])  # read_a followed by the unmatched tail of read_b
        print('reads2', reads)
        read_a, read_b, olen = pick_maximal_overlap(reads, k)  # next best pair
        print('reads3', reads)
    return ''.join(reads)  # concatenate whatever could not be merged further

In [None]:
greedy_scs(['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT'], 1)

reads ['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT']
reads1 ['TGC', 'TGG', 'GAT', 'ATT']
reads2 ['TGC', 'TGG', 'GAT', 'ATT', 'CCTT']
reads3 ['TGC', 'TGG', 'GAT', 'ATT', 'CCTT']
reads ['TGC', 'TGG', 'GAT', 'ATT', 'CCTT']
reads1 ['TGC', 'TGG', 'CCTT']
reads2 ['TGC', 'TGG', 'CCTT', 'GATT']
reads3 ['TGC', 'TGG', 'CCTT', 'GATT']
reads ['TGC', 'TGG', 'CCTT', 'GATT']
reads1 ['TGG', 'GATT']
reads2 ['TGG', 'GATT', 'TGCCTT']
reads3 ['TGG', 'GATT', 'TGCCTT']
reads ['TGG', 'GATT', 'TGCCTT']
reads1 ['TGCCTT']
reads2 ['TGCCTT', 'TGGATT']
reads3 ['TGCCTT', 'TGGATT']
reads ['TGCCTT', 'TGGATT']
reads1 []
reads2 ['TGCCTTGGATT']
reads3 ['TGCCTTGGATT']


'TGCCTTGGATT'

In [None]:
# Question 2: first attempt, using the greedy heuristic
def count_sup(reads, k):
    """Collect the distinct superstrings greedy assembly can produce.

    greedy_scs is run once for every ordering of 'reads' and each new
    result is recorded.  The greedy method is a heuristic, so the strings
    returned are not guaranteed to be shortest.

    reads -- collection of read strings; it is not modified
    k     -- minimum overlap length handed to greedy_scs

    Returns the distinct superstrings in the order first produced.
    """
    sup_list = []
    for ordering in itertools.permutations(reads):
        # A fresh list per run keeps runs independent even if greedy_scs
        # consumes its argument.
        new_sup = greedy_scs(list(ordering), k)
        if new_sup not in sup_list:
            sup_list.append(new_sup)
    return sup_list

In [None]:
new_sup_list = count_sup(['ABC', 'BCA', 'CAB'], 1)

reads ['ABC', 'BCA', 'CAB']
reads1 ['CAB']
reads2 ['CAB', 'ABCA']
reads3 ['CAB', 'ABCA']
reads ['CAB', 'ABCA']
reads1 []
reads2 ['CABCA']
reads3 ['CABCA']


In [None]:
p =(['ABC', 'BCA', 'CAB'])

In [None]:
from itertools import permutations
perm_list = (list(permutations(p)))

In [None]:
perm_list

[('ABC', 'BCA', 'CAB'),
 ('ABC', 'CAB', 'BCA'),
 ('BCA', 'ABC', 'CAB'),
 ('BCA', 'CAB', 'ABC'),
 ('CAB', 'ABC', 'BCA'),
 ('CAB', 'BCA', 'ABC')]

In [None]:
from itertools import permutations

def count_sup(reads):
    """Return every distinct shortest common superstring of 'reads'.

    Each ordering of the reads is merged left to right using the longest
    suffix/prefix overlap of neighbouring reads; all distinct results that
    tie for the minimum length are kept, in the order first encountered.
    One pass over the orderings is enough, so the full scs search is not
    repeated for every ordering.
    """
    best_len = None
    sup_list = []
    for ordering in permutations(reads):
        pieces = list(ordering[:1])  # superstring starts as the first read
        for left, right in zip(ordering, ordering[1:]):
            # append only the part of the right read not covered by the overlap
            pieces.append(right[overlap(left, right, min_length=1):])
        sup = ''.join(pieces)
        if best_len is None or len(sup) < best_len:
            # strictly shorter: everything collected so far is obsolete
            best_len = len(sup)
            sup_list = [sup]
        elif len(sup) == best_len and sup not in sup_list:
            sup_list.append(sup)
    return sup_list

In [None]:
#Example 1
sup_list = count_sup(['ABC', 'BCA', 'CAB'])

[('ABC', 'BCA', 'CAB'), ('ABC', 'CAB', 'BCA'), ('BCA', 'ABC', 'CAB'), ('BCA', 'CAB', 'ABC'), ('CAB', 'ABC', 'BCA'), ('CAB', 'BCA', 'ABC')]
['ABC', 'BCA', 'CAB']
ABCAB
['ABC', 'CAB', 'BCA']
ABCAB
['BCA', 'ABC', 'CAB']
BCABC
['BCA', 'CAB', 'ABC']
BCABC
['CAB', 'ABC', 'BCA']
CABCA
['CAB', 'BCA', 'ABC']
CABCA


In [None]:
#Example 2
sup_list = count_sup(['GAT', 'TAG', 'TCG', 'TGC', 'AAT', 'ATA'])

In [None]:
# Answer Prob 2
sup_list = count_sup(['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT'])

In [None]:
sup_list

['CCTTGGATTGC', 'TGCCTTGGATT', 'TGGATTGCCTT', 'GATTGCCTTGG']

Question 3: Download this FASTQ file containing synthetic sequencing reads from a mystery virus:

https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ads1_week4_reads.fq

All the reads are the same length (100 bases) and are exact copies of substrings from the forward strand of the virus genome.  You don't have to worry about sequencing errors, ploidy, or reads coming from the reverse strand.

Assemble these reads using one of the approaches discussed, such as greedy shortest common superstring.  Since there are many reads, you might consider ways to make the algorithm faster, such as the one discussed in the programming assignment in the previous module.

How many As are there in the full, assembled genome?

Hint: the virus genome you are assembling is exactly 15,894 bases long

In [None]:
# Read FASTQ function#
######################

def readFastq(filename):
    """Parse a FASTQ file into parallel lists of sequences and qualities.

    The file is consumed four lines at a time (name, bases, placeholder,
    qualities).  Only the second and fourth line of each group are kept,
    with trailing whitespace stripped.  Reading stops at the first group
    whose base line is empty, which is what end of file produces.

    Returns the tuple (sequences, qualities).
    """
    sequences, qualities = [], []
    with open(filename) as fastq:
        while True:
            # always consume a whole 4-line record before inspecting it
            _name, bases, _plus, quals = (fastq.readline() for _ in range(4))
            bases = bases.rstrip()
            if not bases:
                break
            sequences.append(bases)
            qualities.append(quals.rstrip())
    return sequences, qualities

In [None]:
seq_reads, _ = readFastq('ads1_week4_reads.fq')

In [None]:
len(seq_reads)

1881

HW Questions 3 and 4 using the overlap function and the original maximal-overlap function

In [None]:
# Answer 3 & 4
#Build dictionary of kmers
from collections import defaultdict

def kmer_dictionary(reads, k):
    """Index reads by the k-mers they contain.

    reads -- iterable of read strings
    k     -- k-mer length

    Returns a mapping from each substring of length exactly k to the set
    of reads in which it occurs.
    """
    # Created without a default factory, so looking up a missing k-mer
    # still raises KeyError.
    index = defaultdict()
    for read in reads:
        for offset in range(len(read)):
            window = read[offset:offset + k]
            if len(window) != k:
                continue  # slice ran off the end of the read; too short
            index.setdefault(window, set()).add(read)
    return index

In [None]:
def overlap(a, b, min_length=3):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists,
        return 0. """
    if len(b) < min_length:
        # b[:min_length] would silently shrink to all of b and let an
        # overlap shorter than min_length be reported.
        return 0
    seed = b[:min_length]  # every qualifying overlap starts with this prefix of b
    start = a.find(seed)  # leftmost hit corresponds to the longest suffix of a
    while start != -1:
        # found occurrence; check for full suffix/prefix match
        if b.startswith(a[start:]):
            return len(a) - start
        start = a.find(seed, start + 1)  # resume just past the failed hit
    return 0  # no (more) occurrences of the seed in a


In [None]:
import itertools  

def pick_maximal_overlap(reads, k):
    """Find the ordered pair of reads with the longest suffix/prefix overlap.

    reads -- collection of read strings
    k     -- minimum overlap length that counts

    Returns (read_a, read_b, overlap_length).  Among equally long overlaps
    the pair met first wins; (None, None, 0) means no pair overlaps by at
    least k characters.
    """
    best_pair = (None, None)
    best_len = 0
    # permutations(reads, 2) yields both (x, y) and (y, x), so each direction is scored
    for pair in itertools.permutations(reads, 2):
        pair_len = overlap(pair[0], pair[1], k)
        if pair_len <= best_len:
            continue  # not an improvement; keep the earlier winner
        best_pair, best_len = pair, pair_len
    return best_pair[0], best_pair[1], best_len

In [None]:
def greedy_scs(reads, k):
    """Assemble 'reads' with the greedy shortest-common-superstring heuristic.

    The pair of reads with the longest overlap (at least k characters) is
    merged repeatedly until no such pair remains; whatever is left is
    concatenated and returned.

    reads -- iterable of read strings; it is not modified
    k     -- minimum overlap length for two reads to be merged
    """
    # Work on a private copy: removing/appending on the caller's list
    # left it collapsed, so a second call on the same list did nothing.
    reads = list(reads)
    # Use the caller's k for the first pick too (it was hard-coded to 2).
    read_a, read_b, olen = pick_maximal_overlap(reads, k)
    while olen > 0:  # stop once no pair overlaps by at least k
        reads.remove(read_a)
        reads.remove(read_b)
        reads.append(read_a + read_b[olen:])  # read_a followed by the unmatched tail of read_b
        read_a, read_b, olen = pick_maximal_overlap(reads, k)  # next best pair
    return ''.join(reads)  # concatenate whatever could not be merged further

In [None]:
genome = greedy_scs(seq_reads, 30)

In [None]:
# Check length of genome (number of bases)
len(genome)

15894

In [None]:
type(genome)

str

In [None]:
# Question 3
# str.count tallies the base in one pass over the assembled genome string
count_a = genome.count('A')
        

In [None]:
count_a

4633

In [None]:
# Question 4
# str.count tallies the base in one pass over the assembled genome string
count_t = genome.count('T')

In [None]:
count_t

3723

HW Questions 3 and 4 using a modified maximal-overlap function that first checks for a possible suffix/prefix match between reads

In [None]:
def overlap(a, b, min_length=1):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists,
        return 0. """
    if len(b) < min_length:
        # b[:min_length] would silently shrink to all of b and let an
        # overlap shorter than min_length be reported.
        return 0
    seed = b[:min_length]  # every qualifying overlap starts with this prefix of b
    start = a.find(seed)  # leftmost hit corresponds to the longest suffix of a
    while start != -1:
        # found occurrence; check for full suffix/prefix match
        if b.startswith(a[start:]):
            return len(a) - start
        start = a.find(seed, start + 1)  # resume just past the failed hit
    return 0  # no (more) occurrences of the seed in a

import itertools


In [None]:
def pick_maximal_overlap(reads, k):
    """Find the ordered pair of reads with the longest suffix/prefix overlap.

    Before the full overlap computation, pairs are screened: read b can
    only overlap read a by k or more characters if the last k characters
    of a occur somewhere in b.

    reads -- collection of read strings
    k     -- minimum overlap length that counts

    Returns (read_a, read_b, overlap_length); (None, None, 0) means no
    pair overlaps by at least k characters.
    """
    reada, readb = None, None
    best_olen = 0
    for a, b in itertools.permutations(reads, 2):  # every ordered pair of reads
        # str.find returns -1 (truthy) on a miss and 0 (falsy) on a hit at
        # the start, so its result cannot be used as a boolean; use 'in'.
        # a[-k:] is only the k-suffix for k > 0, so skip the screen otherwise.
        if k > 0 and a[-k:] not in b:
            continue
        olen = overlap(a, b, min_length=k)
        if olen > best_olen:
            reada, readb = a, b  # best overlap length so far
            best_olen = olen
    return reada, readb, best_olen

In [None]:
def greedy_scs(reads, k):
    """Assemble 'reads' with the greedy shortest-common-superstring heuristic.

    The pair of reads with the longest overlap (at least k characters) is
    merged repeatedly until no such pair remains; whatever is left is
    concatenated and returned.

    reads -- iterable of read strings; it is not modified
    k     -- minimum overlap length for two reads to be merged
    """
    # Work on a private copy: removing/appending on the caller's list
    # left it collapsed, so a second call on the same list did nothing.
    reads = list(reads)
    # Use the caller's k for the first pick too (it was hard-coded to 2).
    read_a, read_b, olen = pick_maximal_overlap(reads, k)
    while olen > 0:  # stop once no pair overlaps by at least k
        reads.remove(read_a)
        reads.remove(read_b)
        reads.append(read_a + read_b[olen:])  # read_a followed by the unmatched tail of read_b
        read_a, read_b, olen = pick_maximal_overlap(reads, k)  # next best pair
    return ''.join(reads)  # concatenate whatever could not be merged further

In [None]:
genome = greedy_scs(seq_reads, 30)

In [None]:
len(genome)

15894

In [None]:
# str.count tallies the base in one pass over the assembled genome string
count_a = genome.count('A')

In [None]:
# Answer Prob 3 
count_a

4633

Expected output: 4633

Q4: How many Ts are there in the full, assembled genome from the previous question?

In [None]:
# Question 4
# str.count tallies the base in one pass over the assembled genome string
count_t = genome.count('T')

In [None]:
# Answer Prob 4
count_t

3723

Expected output: 3723