In [238]:
import itertools

In [239]:
def overlap(a, b, min_length=1):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists,
        return 0.

        a, b       -- strings to compare (suffix of a vs. prefix of b)
        min_length -- smallest overlap length that counts as a match
    """
    if len(b) < min_length:
        # b[:min_length] would silently shrink to all of b, which could
        # report an overlap shorter than min_length; b cannot qualify.
        return 0
    seed = b[:min_length]  # every valid overlap must begin with this prefix of b
    pos = a.find(seed)  # leftmost occurrence == longest candidate suffix of a
    while pos != -1:
        # found occurrence; check for full suffix/prefix match
        if b.startswith(a[pos:]):
            return len(a) - pos
        pos = a.find(seed, pos + 1)  # move just past previous match
    return 0  # no more occurrences to the right


def scs(ss):
    """ Returns shortest common superstring of given
        strings, which must be the same length.

        Brute force: every ordering of 'ss' is glued together by
        maximally overlapping adjacent strings.

        Returns a tuple (shortest_sup, ss_list):
          shortest_sup -- the first minimum-length superstring found
          ss_list      -- the superstring of every ordering that reaches
                          that minimum length, in permutation order
        For empty input the result is ('', ['']). """
    ss = list(ss)
    if not ss:
        # permutations([]) yields one empty tuple, so ssperm[0] would fail
        return '', ['']

    shortest_sup = None
    ss_list = []  # all superstrings tied for the current minimum length

    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # superstring starts as first string
        for left, right in zip(ssperm, ssperm[1:]):
            # overlap adjacent strings A and B in the permutation
            olen = overlap(left, right, min_length=1)
            # add non-overlapping portion of B to superstring
            sup += right[olen:]
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup  # found shorter superstring
            ss_list = [sup]     # earlier ties are no longer minimal
        elif len(sup) == len(shortest_sup):
            ss_list.append(sup)

    return shortest_sup, ss_list

In [256]:
""" Question 1. What is the length of the shortest common superstring of the following strings?
    ['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT'] ? """

# scs tries every ordering of the six strings (6! = 720 permutations) and
# returns a pair: one shortest superstring and a list of superstrings.
shortest, ss_list = scs(['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT'])

In [241]:
# Show the superstring and, on the next line, its length.
print(shortest, len(shortest), sep='\n')

CCTTGGATTGC
11


In [257]:
""" Question 2. How many different shortest common superstrings are 
    there for the input strings given in the previous question? """

# Show the collected superstrings and, on the next line, how many there are.
print(ss_list, len(ss_list), sep='\n')

['CCTTGGATTGC', 'TGCCTTGGATT', 'TGGATTGCCTT', 'GATTGCCTTGG']
4


In [262]:
""" Question 3. Assemble these reads using one of the approaches discussed, such as greedy shortest common superstring.  
    Since there are many reads, you might consider ways to make the algorithm faster, such as the one discussed in the 
    programming assignment in the previous module. How many As are there in the full, assembled genome? 
    
    Question 4. How many Ts are there in the full, assembled genome from the previous question? """

def readFastq(filename):
    """ Read a FASTQ file and return (sequences, qualities), two
        parallel lists of stripped base strings and quality strings.
        Reading stops at the first record whose sequence line is empty. """
    seqs, quals = [], []
    with open(filename) as handle:
        while True:
            # one record = name line, bases, placeholder line, qualities
            _name, bases, _placeholder, quality = [handle.readline() for _ in range(4)]
            bases = bases.rstrip()
            if not bases:
                break  # end of file (or blank sequence line)
            seqs.append(bases)
            quals.append(quality.rstrip())
    return seqs, quals

In [259]:
# Keep only the base sequences; the quality strings returned by readFastq are discarded.
reads, _ = readFastq('ads1_week4_reads.fq')

In [260]:
def get_kmers(read, k):
    """ Return the set of distinct length-k substrings of 'read' """
    last_start = len(read) - k  # final index at which a full k-mer still fits
    return {read[pos:pos + k] for pos in range(last_start + 1)}


def overlap_all_pairs(reads, k, dict=None):
    """ Find all pairs of reads that overlap by at least k bases.

        reads -- list of read strings
        k     -- minimum suffix/prefix overlap length
        dict  -- optional existing k-mer index (k-mer -> set of reads);
                 it is extended in place.  A fresh index is built when
                 omitted.

        Returns a list of (head, tail) tuples where a suffix of 'head'
        matches a prefix of 'tail'; identical reads are never paired. """
    # A literal {} default is created once and shared by every call, so
    # reads from earlier calls would leak into later results.
    index = {} if dict is None else dict
    for read in reads:
        for kmer in get_kmers(read, k):
            index.setdefault(kmer, set()).add(read)

    pairs = []
    for head in reads:
        # Only reads containing head's length-k suffix can overlap it.
        # .get() covers reads shorter than k, whose suffix is never indexed.
        for tail in index.get(head[-k:], ()):
            if head != tail and overlap(head, tail, k):
                pairs.append((head, tail))

    return pairs


def pick_max_overlap(reads, k):
    """ Scan every ordered pair of reads and return (a, b, length)
        for the pair with the longest suffix/prefix overlap of at
        least k; (None, None, 0) when no pair overlaps. """
    best = (None, None, 0)
    for left, right in itertools.permutations(reads, 2):
        length = overlap(left, right, min_length=k)
        if length > best[2]:  # strict: the earliest best pair is kept on ties
            best = (left, right, length)
    return best


def pick_max_overlap_fast(reads, k):
    """ Like pick_max_overlap, but only compares each read against the
        reads that contain its length-k suffix, found through a k-mer index.

        reads -- list of read strings
        k     -- minimum overlap length

        Returns (reada, readb, best_olen); (None, None, 0) when no two
        distinct reads overlap by at least k. """
    reada, readb = None, None
    best_olen = 0

    # k-mer -> reads containing it.  The inner dict is used as an
    # insertion-ordered set: iterating a set of strings follows the
    # per-process hash seed, which made tie-breaking vary between runs.
    kmer_dict = {}
    for read in reads:
        for i in range(len(read) - k + 1):
            kmer_dict.setdefault(read[i:i + k], {})[read] = None

    for read in reads:
        # .get(): a read shorter than k has no indexed suffix and simply
        # has no candidates (previously a KeyError).
        for kmer_read in kmer_dict.get(read[-k:], ()):
            if read != kmer_read:
                olen = overlap(read, kmer_read, min_length=k)
                if olen > best_olen:
                    reada, readb = read, kmer_read
                    best_olen = olen

    return reada, readb, best_olen


def greedy_scs(reads, k):
    """ Returns shortest common superstring of given
        strings, using greedy approach.

        reads -- iterable of read strings (left unmodified)
        k     -- minimum overlap length required to merge two reads

        Repeatedly merges the pair with the longest overlap until no
        pair overlaps by at least k, then concatenates what remains. """
    # Work on a copy: merging removes and appends entries, and doing that
    # to the caller's list would make a second call start from the
    # already-collapsed reads.
    reads = list(reads)
    reada, readb, olen = pick_max_overlap_fast(reads, k)
    while olen > 0:
        reads.remove(reada)
        reads.remove(readb)
        reads.append(reada + readb[olen:])  # merged read, overlap kept once
        reada, readb, olen = pick_max_overlap_fast(reads, k)
    return ''.join(reads)

In [261]:
# Greedy assembly; 10 is the minimum overlap length passed to greedy_scs.
genome = greedy_scs(reads, 10)

In [263]:
# Number of characters in the assembled string.
len(genome)

15894

In [264]:
# Number of 'A' characters in the assembled genome.
As = genome.count('A')
As

4633

In [265]:
# Number of 'T' characters in the assembled genome.
Ts = genome.count('T')
Ts

3723