In [1]:
def overlap(a, b, min_length=3):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists,
        return 0.

        a, b       -- the two strings to compare
        min_length -- smallest overlap length that counts """
    # A prefix of 'b' can never be longer than 'b' itself, so no
    # overlap of at least 'min_length' characters can exist.  Without
    # this guard b[:min_length] silently shrinks to all of 'b' and an
    # overlap shorter than 'min_length' could be reported.
    if len(b) < min_length:
        return 0
    start = 0  # start all the way at the left
    while True:
        start = a.find(b[:min_length], start)  # look for b's prefix in a
        if start == -1:  # no more occurrences to right
            return 0
        # found occurrence; check for full suffix/prefix match
        # (scanning left to right means the first hit is the longest)
        if b.startswith(a[start:]):
            return len(a)-start
        start += 1  # move just past previous match

import itertools

def scs(ss):
    """ Brute-force shortest common superstring.

        Every ordering of the strings in 'ss' is tried; within an
        ordering each string is glued onto the previous one using
        their longest suffix/prefix overlap (at least 1 character).
        The shortest result is returned; among equally short results
        the one produced by the earliest ordering wins.

        The strings must be the same length. """
    best = None
    for ordering in itertools.permutations(ss):
        candidate = ordering[0]  # begin with the first string as-is
        for pos in range(1, len(ss)):
            left, right = ordering[pos-1], ordering[pos]
            shared = overlap(left, right, min_length=1)
            # only the part of 'right' not already covered is appended
            candidate = candidate + right[shared:]
        if best is not None and len(best) <= len(candidate):
            continue  # not an improvement
        best = candidate
    return best

In [2]:
# Six 3-mers; print the length of the superstring that scs returns for them.
ss = ["CCT", "CTT", "TGC", "TGG", "GAT", "ATT"]
print(len(scs(ss)))

11


In [3]:
def scs_list(ss):
    """ Returns a list of all the shortest superstrings found by
        trying every ordering of the given strings, which must be
        the same length.

        Each ordering is collapsed by overlapping adjacent strings
        (overlap of at least 1 character).  Results are listed in the
        order the orderings are generated; an ordering that ties the
        current best length is added to the list, a strictly shorter
        one restarts it. """
    winners = []
    best_len = None
    for ordering in itertools.permutations(ss):
        candidate = ordering[0]  # begin with the first string as-is
        for pos in range(1, len(ss)):
            left, right = ordering[pos-1], ordering[pos]
            shared = overlap(left, right, min_length=1)
            # only the part of 'right' not already covered is appended
            candidate = candidate + right[shared:]
        if best_len is None or len(candidate) < best_len:
            # strictly shorter: forget everything collected so far
            best_len = len(candidate)
            winners = [candidate]
        elif len(candidate) == best_len:
            winners.append(candidate)  # ties the current best length
    return winners

In [4]:
# The three rotations of 'ABC'; scs returns a single superstring for them.
strings = ['ABC', 'BCA', 'CAB']
scs(strings)

'ABCAB'

In [5]:
# Same input through scs_list, which returns a list of superstrings.
print(scs_list(strings))

['ABCAB', 'BCABC', 'CABCA']


In [6]:
# A second set of six 3-mers, run through scs.
strings = ['GAT', 'TAG', 'TCG', 'TGC', 'AAT', 'ATA']
scs(strings)

'TCGATGCAATAG'

In [7]:
# The list scs_list returns for the same six 3-mers.
scs_list(strings)

['TCGATGCAATAG',
 'TCGATAGAATGC',
 'TCGAATAGATGC',
 'TGCAATCGATAG',
 'TGCAATAGATCG',
 'AATCGATAGTGC',
 'AATGCTCGATAG',
 'AATAGATCGTGC',
 'AATAGATGCTCG',
 'AATAGTCGATGC']

In [8]:
# Count how many superstrings scs_list returns for this set of 3-mers.
strings = ["CCT", "CTT", "TGC", "TGG", "GAT", "ATT"]
len(scs_list(strings))

4

In [9]:
def readFastq(filename):
    """ Read a FASTQ file in groups of four lines.

        From each group the second line (bases) and fourth line
        (qualities) are kept with trailing whitespace stripped; the
        first and third lines are ignored.  Reading stops at the first
        group whose base line is empty after stripping, which is what
        happens at end of file.

        Returns (sequences, qualities), two parallel lists of strings. """
    sequences, qualities = [], []
    with open(filename) as handle:
        while True:
            # pull one whole record: name, bases, placeholder, qualities
            record = [handle.readline() for _ in range(4)]
            bases = record[1].rstrip()
            if not bases:
                break  # ran out of records
            sequences.append(bases)
            qualities.append(record[3].rstrip())
    return sequences, qualities

In [10]:
# Load the base sequences from the FASTQ file; the quality strings are discarded.
reads, _ = readFastq("ads1_week4_reads.fq")

In [11]:
# Number of reads loaded.
len(reads)

1881

In [33]:
from collections import defaultdict

def create_kmers_from_reads(reads, k):
    """ Build an index from each length-k substring to the reads
        that contain it.

        reads -- iterable of read strings
        k     -- substring (k-mer) length

        Returns a defaultdict(set): key is a k-mer, value is the set
        of reads in which that k-mer occurs.  A read shorter than k
        contributes nothing. """
    index = defaultdict(set)
    for read in reads:
        # slide a window of width k across the read
        for offset in range(len(read) - k + 1):
            index[read[offset:offset + k]].add(read)
    return index

In [23]:
def pick_maximal_overlap(reads, k):
    """ Return a pair of reads from the list with a
        maximal suffix/prefix overlap >= k.  Returns
        overlap length 0 if there are no such overlaps.

        reads -- list of read strings
        k     -- minimum overlap length; also the k-mer size of the index

        Returns (reada, readb, best_olen), where a suffix of reada
        matches a prefix of readb over best_olen characters.  When no
        pair overlaps, the result is (None, None, 0). """

    reada, readb = None, None
    best_olen = 0

    # index: k-mer -> set of reads containing that k-mer
    kmer_dict = create_kmers_from_reads(reads, k)

    for read in reads:
        # Any read whose prefix overlaps this read's suffix by at least
        # k characters must contain this read's last k characters, so
        # only reads indexed under that k-mer need to be examined.
        read_suffix = read[-k:]

        # .get() instead of indexing: a read shorter than k has no entry
        # in the index and cannot take part in an overlap of length >= k,
        # so it is skipped rather than treated as an error.
        candidates = kmer_dict.get(read_suffix, ())

        for compar_read in candidates:
            # Skip the read itself.  The set is deliberately NOT modified:
            # it is shared by every read ending in this k-mer, and removing
            # entries would hide valid partners from reads handled later.
            if compar_read == read:
                continue
            olen = overlap(read, compar_read, min_length=k)
            if olen > best_olen:
                reada, readb = read, compar_read
                best_olen = olen
    return reada, readb, best_olen

In [24]:
def greedy_scs(reads, k):
    """ Greedy shortest-common-superstring merge.
        Repeat until no edges (overlaps of length >= k)
        remain.

        reads -- iterable of read strings; it is not modified
        k     -- minimum overlap length for two reads to be merged

        Returns the concatenation of whatever strings are left once
        no pair overlaps by at least k characters. """
    # Work on a private copy: the merge loop removes and appends
    # entries, which would otherwise destroy the caller's list.
    reads = list(reads)
    read_a, read_b, olen = pick_maximal_overlap(reads, k)
    while olen > 0:
        # replace the best-overlapping pair with their merged string
        reads.remove(read_a)
        reads.remove(read_b)
        reads.append(read_a + read_b[olen:])
        read_a, read_b, olen = pick_maximal_overlap(reads, k)
    return ''.join(reads)

In [25]:
# Greedy merge of three 3-mers with a minimum overlap of 2.
greedy_scs(['ABC', 'BCA', 'CAB'], 2)

'CABCA'

In [26]:
# Same three 3-mers with a minimum overlap of 1.
greedy_scs(['ABC', 'BCA', 'CAB'], 1)

'CABCA'

In [27]:
# Three longer reads with a minimum overlap of 4.
greedy_scs(['ABCABAA', 'BCABCAAA', 'CABCAAA'], 4)

'ABCABAABCABCAAA'

In [28]:
# Slice off the first two reads as a small input for a trial run.
small_reads = reads[:2]

In [29]:
# Greedy merge of the two-read subset with a minimum overlap of 10; print the result's length.
genome = greedy_scs(small_reads, k=10)
print(len(genome))

200


In [31]:
# Greedy merge of `reads` with a minimum overlap of 30.
genome = greedy_scs(reads, k=30)

In [32]:
# Length of the assembled string.
len(genome)

15894

In [35]:
# Tally how often each character occurs in the assembled string.
from collections import Counter
print(Counter(genome))

Counter({'A': 4633, 'C': 3789, 'G': 3749, 'T': 3723})
