In [6]:
def overlap(a, b, min_length=1):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists,
        return 0. """
    start = 0  # start all the way at the left
    while True:
        start = a.find(b[:min_length], start)  # look for b's suffx in a
        if start == -1:  # no more occurrences to right
            return 0
        # found occurrence; check for full suffix/prefix match
        if b.startswith(a[start:]):
            return len(a)-start
        start += 1  # move just past previous match

import itertools

def scs(ss):
    """ Returns shortest common superstring of given
        strings, which must be the same length """
    shortest_sup = None
    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # superstring starts as first string
        for i in range(len(ss)-1):
            # overlap adjacent strings A and B in the permutation
            olen = overlap(ssperm[i], ssperm[i+1], min_length=1)
            # add non-overlapping portion of B to superstring
            sup += ssperm[i+1][olen:]
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup  # found shorter superstring
    return shortest_sup  # return shortest

In [21]:
scs(['ABC', 'BCA', 'CAB'] )

'ABCAB'

In [20]:
scs(['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT'])

'CCTTGGATTGC'

In [4]:
def pick_maximal_overlap(reads, k): # Input set of reads and minimum overlap k
    """Given a set of reads and minimum overlap k the function returns the pair of reads with maximal overlap along with that overlap"""
    reada, readb = None, None # Defien reads a and b
    best_olen = 0
    for a,b in itertools.permutations(reads, 2): # 2nd argument we will compare a pair of reads
        olen = overlap(a, b, min_length=k)
        if olen > best_olen:
            reada, readb = a, b # If this is the best overlap length so far 
            best_olen = olen
    return reada, readb, best_olen    

In [5]:
def greedy_scs(reads, k): # Input is set of reads and minimum overlap k
    """Given a set of reads and minimum overlap k the function returns the sgreedy shortest superstring"""
    read_a, read_b, olen = pick_maximal_overlap(reads, k) # Calculate the maximal overlap
    while olen > 0: # while the length of overlap is greater than 0
        print('reads', reads)
        reads.remove(read_a) # remove reads a and b from reads 
        reads.remove(read_b) #  eg.AAAB and AAB with AAAB
        print('reads1', reads) # left with redas minus a and b
        reads.append(read_a + read_b[olen:]) # append to reads the combination of the reads, read_a + suffix of read_b
        print('reads2', reads)
        read_a, read_b, olen = pick_maximal_overlap(reads, k) # Get the new reads a and b with max overlap
        print('reads3', reads)
    return ''.join(reads) # after joining all reads with overlap concatenate all the remaining reads and return   

In [7]:
greedy_scs(['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT'], 1)

reads ['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT']
reads1 ['TGC', 'TGG', 'GAT', 'ATT']
reads2 ['TGC', 'TGG', 'GAT', 'ATT', 'CCTT']
reads3 ['TGC', 'TGG', 'GAT', 'ATT', 'CCTT']
reads ['TGC', 'TGG', 'GAT', 'ATT', 'CCTT']
reads1 ['TGC', 'TGG', 'CCTT']
reads2 ['TGC', 'TGG', 'CCTT', 'GATT']
reads3 ['TGC', 'TGG', 'CCTT', 'GATT']
reads ['TGC', 'TGG', 'CCTT', 'GATT']
reads1 ['TGG', 'GATT']
reads2 ['TGG', 'GATT', 'TGCCTT']
reads3 ['TGG', 'GATT', 'TGCCTT']
reads ['TGG', 'GATT', 'TGCCTT']
reads1 ['TGCCTT']
reads2 ['TGCCTT', 'TGGATT']
reads3 ['TGCCTT', 'TGGATT']
reads ['TGCCTT', 'TGGATT']
reads1 []
reads2 ['TGCCTTGGATT']
reads3 ['TGCCTTGGATT']


'TGCCTTGGATT'

In [37]:
def count_sup(reads, k):
    sup_list =[]
    for a,b in itertools.permutations(reads, 2):
        print('here')
        new_sup = greedy_scs(reads, 2)
        if new_sup not in sup_list:
            sup_list.append(new_sup)
    return sup_list    

In [38]:
new_sup_list = count_sup(['ABC', 'BCA', 'CAB'], 1)

here
reads ['ABC', 'BCA', 'CAB']
reads1 ['CAB']
reads2 ['CAB', 'ABCA']
reads3 ['CAB', 'ABCA']
reads ['CAB', 'ABCA']
reads1 []
reads2 ['CABCA']
reads3 ['CABCA']
here
here
here
here
here


In [30]:
print(new_sup_list)

['CABCA']


In [41]:
p =(['ABC', 'BCA', 'CAB'])

In [48]:
from itertools import permutations

perm_list = (list(permutations(p)))

[('ABC', 'BCA', 'CAB'), ('ABC', 'CAB', 'BCA'), ('BCA', 'ABC', 'CAB'), ('BCA', 'CAB', 'ABC'), ('CAB', 'ABC', 'BCA'), ('CAB', 'BCA', 'ABC')]


In [63]:
#Count using greedy scs
from itertools import permutations


def count_sup(reads):
        
    perm_list = (list(permutations(reads)))
    print(perm_list)
    sup_list =[]
    for element in perm_list:
        element = list(element)
        print(element)
        new_sup = greedy_scs(element, 1)
        print(new_sup)
        if new_sup not in sup_list:
            sup_list.append(new_sup)
    return sup_list    

In [64]:
count_sup(['ABC', 'BCA', 'CAB'])

[('ABC', 'BCA', 'CAB'), ('ABC', 'CAB', 'BCA'), ('BCA', 'ABC', 'CAB'), ('BCA', 'CAB', 'ABC'), ('CAB', 'ABC', 'BCA'), ('CAB', 'BCA', 'ABC')]
['ABC', 'BCA', 'CAB']
reads ['ABC', 'BCA', 'CAB']
reads1 ['CAB']
reads2 ['CAB', 'ABCA']
reads3 ['CAB', 'ABCA']
reads ['CAB', 'ABCA']
reads1 []
reads2 ['CABCA']
reads3 ['CABCA']
CABCA
['ABC', 'CAB', 'BCA']
reads ['ABC', 'CAB', 'BCA']
reads1 ['CAB']
reads2 ['CAB', 'ABCA']
reads3 ['CAB', 'ABCA']
reads ['CAB', 'ABCA']
reads1 []
reads2 ['CABCA']
reads3 ['CABCA']
CABCA
['BCA', 'ABC', 'CAB']
reads ['BCA', 'ABC', 'CAB']
reads1 ['ABC']
reads2 ['ABC', 'BCAB']
reads3 ['ABC', 'BCAB']
reads ['ABC', 'BCAB']
reads1 []
reads2 ['ABCAB']
reads3 ['ABCAB']
ABCAB
['BCA', 'CAB', 'ABC']
reads ['BCA', 'CAB', 'ABC']
reads1 ['ABC']
reads2 ['ABC', 'BCAB']
reads3 ['ABC', 'BCAB']
reads ['ABC', 'BCAB']
reads1 []
reads2 ['ABCAB']
reads3 ['ABCAB']
ABCAB
['CAB', 'ABC', 'BCA']
reads ['CAB', 'ABC', 'BCA']
reads1 ['BCA']
reads2 ['BCA', 'CABC']
reads3 ['BCA', 'CABC']
reads ['BCA', 'CAB

In [65]:
print(sup_list)

['CABCA', 'ABCAB', 'BCABC']


In [66]:
sup_list = count_sup(['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT'])

[('CTT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT'), ('CTT', 'CTT', 'TGC', 'TGG', 'ATT', 'GAT'), ('CTT', 'CTT', 'TGC', 'GAT', 'TGG', 'ATT'), ('CTT', 'CTT', 'TGC', 'GAT', 'ATT', 'TGG'), ('CTT', 'CTT', 'TGC', 'ATT', 'TGG', 'GAT'), ('CTT', 'CTT', 'TGC', 'ATT', 'GAT', 'TGG'), ('CTT', 'CTT', 'TGG', 'TGC', 'GAT', 'ATT'), ('CTT', 'CTT', 'TGG', 'TGC', 'ATT', 'GAT'), ('CTT', 'CTT', 'TGG', 'GAT', 'TGC', 'ATT'), ('CTT', 'CTT', 'TGG', 'GAT', 'ATT', 'TGC'), ('CTT', 'CTT', 'TGG', 'ATT', 'TGC', 'GAT'), ('CTT', 'CTT', 'TGG', 'ATT', 'GAT', 'TGC'), ('CTT', 'CTT', 'GAT', 'TGC', 'TGG', 'ATT'), ('CTT', 'CTT', 'GAT', 'TGC', 'ATT', 'TGG'), ('CTT', 'CTT', 'GAT', 'TGG', 'TGC', 'ATT'), ('CTT', 'CTT', 'GAT', 'TGG', 'ATT', 'TGC'), ('CTT', 'CTT', 'GAT', 'ATT', 'TGC', 'TGG'), ('CTT', 'CTT', 'GAT', 'ATT', 'TGG', 'TGC'), ('CTT', 'CTT', 'ATT', 'TGC', 'TGG', 'GAT'), ('CTT', 'CTT', 'ATT', 'TGC', 'GAT', 'TGG'), ('CTT', 'CTT', 'ATT', 'TGG', 'TGC', 'GAT'), ('CTT', 'CTT', 'ATT', 'TGG', 'GAT', 'TGC'), ('CTT', 'CTT', 'ATT', 'GAT', 'T

In [67]:
print(sup_list)

['TGCTTGGATT', 'TGGATTGCTT']


In [68]:
len(sup_list[0])

10

In [69]:
#Example 1
sup_list = count_sup(['ABC', 'BCA', 'CAB'])

[('ABC', 'BCA', 'CAB'), ('ABC', 'CAB', 'BCA'), ('BCA', 'ABC', 'CAB'), ('BCA', 'CAB', 'ABC'), ('CAB', 'ABC', 'BCA'), ('CAB', 'BCA', 'ABC')]
['ABC', 'BCA', 'CAB']
reads ['ABC', 'BCA', 'CAB']
reads1 ['CAB']
reads2 ['CAB', 'ABCA']
reads3 ['CAB', 'ABCA']
reads ['CAB', 'ABCA']
reads1 []
reads2 ['CABCA']
reads3 ['CABCA']
CABCA
['ABC', 'CAB', 'BCA']
reads ['ABC', 'CAB', 'BCA']
reads1 ['CAB']
reads2 ['CAB', 'ABCA']
reads3 ['CAB', 'ABCA']
reads ['CAB', 'ABCA']
reads1 []
reads2 ['CABCA']
reads3 ['CABCA']
CABCA
['BCA', 'ABC', 'CAB']
reads ['BCA', 'ABC', 'CAB']
reads1 ['ABC']
reads2 ['ABC', 'BCAB']
reads3 ['ABC', 'BCAB']
reads ['ABC', 'BCAB']
reads1 []
reads2 ['ABCAB']
reads3 ['ABCAB']
ABCAB
['BCA', 'CAB', 'ABC']
reads ['BCA', 'CAB', 'ABC']
reads1 ['ABC']
reads2 ['ABC', 'BCAB']
reads3 ['ABC', 'BCAB']
reads ['ABC', 'BCAB']
reads1 []
reads2 ['ABCAB']
reads3 ['ABCAB']
ABCAB
['CAB', 'ABC', 'BCA']
reads ['CAB', 'ABC', 'BCA']
reads1 ['BCA']
reads2 ['BCA', 'CABC']
reads3 ['BCA', 'CABC']
reads ['BCA', 'CAB

In [70]:
sup_list

['CABCA', 'ABCAB', 'BCABC']

In [71]:
sup_list = count_sup(['GAT', 'TAG', 'TCG', 'TGC', 'AAT', 'ATA'])

[('GAT', 'TAG', 'TCG', 'TGC', 'AAT', 'ATA'), ('GAT', 'TAG', 'TCG', 'TGC', 'ATA', 'AAT'), ('GAT', 'TAG', 'TCG', 'AAT', 'TGC', 'ATA'), ('GAT', 'TAG', 'TCG', 'AAT', 'ATA', 'TGC'), ('GAT', 'TAG', 'TCG', 'ATA', 'TGC', 'AAT'), ('GAT', 'TAG', 'TCG', 'ATA', 'AAT', 'TGC'), ('GAT', 'TAG', 'TGC', 'TCG', 'AAT', 'ATA'), ('GAT', 'TAG', 'TGC', 'TCG', 'ATA', 'AAT'), ('GAT', 'TAG', 'TGC', 'AAT', 'TCG', 'ATA'), ('GAT', 'TAG', 'TGC', 'AAT', 'ATA', 'TCG'), ('GAT', 'TAG', 'TGC', 'ATA', 'TCG', 'AAT'), ('GAT', 'TAG', 'TGC', 'ATA', 'AAT', 'TCG'), ('GAT', 'TAG', 'AAT', 'TCG', 'TGC', 'ATA'), ('GAT', 'TAG', 'AAT', 'TCG', 'ATA', 'TGC'), ('GAT', 'TAG', 'AAT', 'TGC', 'TCG', 'ATA'), ('GAT', 'TAG', 'AAT', 'TGC', 'ATA', 'TCG'), ('GAT', 'TAG', 'AAT', 'ATA', 'TCG', 'TGC'), ('GAT', 'TAG', 'AAT', 'ATA', 'TGC', 'TCG'), ('GAT', 'TAG', 'ATA', 'TCG', 'TGC', 'AAT'), ('GAT', 'TAG', 'ATA', 'TCG', 'AAT', 'TGC'), ('GAT', 'TAG', 'ATA', 'TGC', 'TCG', 'AAT'), ('GAT', 'TAG', 'ATA', 'TGC', 'AAT', 'TCG'), ('GAT', 'TAG', 'ATA', 'AAT', 'T

In [72]:
sup_list

['TCGATAGAATGC',
 'AATGCTCGATAG',
 'TGCAATCGATAG',
 'AATAGTCGATGC',
 'TGCAATAGATCG']

In [75]:
from itertools import permutations


def count_sup(reads):
        
    perm_list = (list(permutations(reads)))
    print(perm_list)
    sup_list =[]
    for element in perm_list:
        element = list(element)
        print(element)
        new_sup = scs(element)
        print(new_sup)
        if new_sup not in sup_list:
            sup_list.append(new_sup)
    return sup_list    

In [78]:
#Example 1
sup_list = count_sup(['ABC', 'BCA', 'CAB'])

[('ABC', 'BCA', 'CAB'), ('ABC', 'CAB', 'BCA'), ('BCA', 'ABC', 'CAB'), ('BCA', 'CAB', 'ABC'), ('CAB', 'ABC', 'BCA'), ('CAB', 'BCA', 'ABC')]
['ABC', 'BCA', 'CAB']
ABCAB
['ABC', 'CAB', 'BCA']
ABCAB
['BCA', 'ABC', 'CAB']
BCABC
['BCA', 'CAB', 'ABC']
BCABC
['CAB', 'ABC', 'BCA']
CABCA
['CAB', 'BCA', 'ABC']
CABCA


In [79]:
sup_list

['ABCAB', 'BCABC', 'CABCA']

In [76]:
#Example 2
sup_list = count_sup(['GAT', 'TAG', 'TCG', 'TGC', 'AAT', 'ATA'])

[('GAT', 'TAG', 'TCG', 'TGC', 'AAT', 'ATA'), ('GAT', 'TAG', 'TCG', 'TGC', 'ATA', 'AAT'), ('GAT', 'TAG', 'TCG', 'AAT', 'TGC', 'ATA'), ('GAT', 'TAG', 'TCG', 'AAT', 'ATA', 'TGC'), ('GAT', 'TAG', 'TCG', 'ATA', 'TGC', 'AAT'), ('GAT', 'TAG', 'TCG', 'ATA', 'AAT', 'TGC'), ('GAT', 'TAG', 'TGC', 'TCG', 'AAT', 'ATA'), ('GAT', 'TAG', 'TGC', 'TCG', 'ATA', 'AAT'), ('GAT', 'TAG', 'TGC', 'AAT', 'TCG', 'ATA'), ('GAT', 'TAG', 'TGC', 'AAT', 'ATA', 'TCG'), ('GAT', 'TAG', 'TGC', 'ATA', 'TCG', 'AAT'), ('GAT', 'TAG', 'TGC', 'ATA', 'AAT', 'TCG'), ('GAT', 'TAG', 'AAT', 'TCG', 'TGC', 'ATA'), ('GAT', 'TAG', 'AAT', 'TCG', 'ATA', 'TGC'), ('GAT', 'TAG', 'AAT', 'TGC', 'TCG', 'ATA'), ('GAT', 'TAG', 'AAT', 'TGC', 'ATA', 'TCG'), ('GAT', 'TAG', 'AAT', 'ATA', 'TCG', 'TGC'), ('GAT', 'TAG', 'AAT', 'ATA', 'TGC', 'TCG'), ('GAT', 'TAG', 'ATA', 'TCG', 'TGC', 'AAT'), ('GAT', 'TAG', 'ATA', 'TCG', 'AAT', 'TGC'), ('GAT', 'TAG', 'ATA', 'TGC', 'TCG', 'AAT'), ('GAT', 'TAG', 'ATA', 'TGC', 'AAT', 'TCG'), ('GAT', 'TAG', 'ATA', 'AAT', 'T

In [77]:
sup_list

['TCGATGCAATAG',
 'TCGATAGAATGC',
 'TGCAATCGATAG',
 'TGCAATAGATCG',
 'AATCGATAGTGC',
 'AATGCTCGATAG',
 'AATAGATCGTGC',
 'AATAGATGCTCG',
 'TCGAATAGATGC',
 'AATAGTCGATGC']

In [80]:
#HW 2
sup_list = count_sup(['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT'])

[('CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT'), ('CCT', 'CTT', 'TGC', 'TGG', 'ATT', 'GAT'), ('CCT', 'CTT', 'TGC', 'GAT', 'TGG', 'ATT'), ('CCT', 'CTT', 'TGC', 'GAT', 'ATT', 'TGG'), ('CCT', 'CTT', 'TGC', 'ATT', 'TGG', 'GAT'), ('CCT', 'CTT', 'TGC', 'ATT', 'GAT', 'TGG'), ('CCT', 'CTT', 'TGG', 'TGC', 'GAT', 'ATT'), ('CCT', 'CTT', 'TGG', 'TGC', 'ATT', 'GAT'), ('CCT', 'CTT', 'TGG', 'GAT', 'TGC', 'ATT'), ('CCT', 'CTT', 'TGG', 'GAT', 'ATT', 'TGC'), ('CCT', 'CTT', 'TGG', 'ATT', 'TGC', 'GAT'), ('CCT', 'CTT', 'TGG', 'ATT', 'GAT', 'TGC'), ('CCT', 'CTT', 'GAT', 'TGC', 'TGG', 'ATT'), ('CCT', 'CTT', 'GAT', 'TGC', 'ATT', 'TGG'), ('CCT', 'CTT', 'GAT', 'TGG', 'TGC', 'ATT'), ('CCT', 'CTT', 'GAT', 'TGG', 'ATT', 'TGC'), ('CCT', 'CTT', 'GAT', 'ATT', 'TGC', 'TGG'), ('CCT', 'CTT', 'GAT', 'ATT', 'TGG', 'TGC'), ('CCT', 'CTT', 'ATT', 'TGC', 'TGG', 'GAT'), ('CCT', 'CTT', 'ATT', 'TGC', 'GAT', 'TGG'), ('CCT', 'CTT', 'ATT', 'TGG', 'TGC', 'GAT'), ('CCT', 'CTT', 'ATT', 'TGG', 'GAT', 'TGC'), ('CCT', 'CTT', 'ATT', 'GAT', 'T

In [81]:
sup_list

['CCTTGGATTGC', 'TGCCTTGGATT', 'TGGATTGCCTT', 'GATTGCCTTGG']

In [189]:
# problem 3 & 4
# Read FASTQ function#
######################

def readFastq(filename):
    sequences = []
    qualities = []
    with open(filename) as fh:
        while True:
            fh.readline() # skip name line
            seq = fh.readline().rstrip() # read base sequence
            fh.readline() # skip placeholder line
            qual = fh.readline().rstrip() #base quality line
            if len(seq) == 0:
                break
            sequences.append(seq)
            qualities.append(qual)
    return sequences, qualities

In [190]:
seq_reads, _ = readFastq('ads1_week4_reads.fq')

In [191]:
len(seq_reads)

1881

In [231]:
small_seqs = seq_reads[:10]

In [232]:
small_seqs

['GTCCAGCAGAGCAAGTGATGCGAGAGCTGCCCATCCTCCAACCAGCATGCCCCTAGACATTGACACTGCATCGGAGTCAGGCCAAGATCCGCAGGACAGT',
 'AGGAGATACTAAAACTAAACAAGTGCTTCTATAATAGTGGGGTTTCCGCCAATTCTAGATCTGGTCAAAGGGAATTAGCACCCTATCCCTCCGAAGTTGG',
 'TTGGTGAATTTAGATTGGAGAGAAAATGGTTGGATGTGGTGAGGAACAGGATTGCCGAGGACCTCTCTTTACGCCGATTCATGGTGGCTCTAATCCTGGA',
 'TTTGAGCTATGCGCTTGGAGGAGATATCAATAAGGTATTAGAAAAGCTCGGATACAGTGGAGGTGATTTACTGGGCATCTTAGAGAGCAGAGGAATAAAG',
 'TGTCAACCAGATCTTATACCCTGAAGTTCACCTAGATAGCCCGATAGTTACCAATAAGATAGTAGCTATCCTGGAGTATGCTCGAGTCCCTCACGCTTAC',
 'AGTGTGAAATAGACATCAGAATTAAGAAAAACGTAGGGTCCAAGTGGTTTCCCGTTATGGACTCGCTATCTGTCAACCAGATCTTATACCCTGAAGTTCA',
 'CTGGATGATATTGACAAGGAAACATCATCCTTGAGAGTCCCATATATTGGTTCTACCACTGATGAGAGAACAGACATGAAGCTCGCCTTCGTAAGAGCCC',
 'AAAACAGCCTTGGCCAAAATACACGAGGATAATCAGAAGATAATCTCCAAGCTAGAATCATTGCTGTTATTGAAGGGAGAAGTTGAGTCAATTAAGAAGC',
 'AGGGCAGGAGATGATATTGGCTGTTCAGGGTGTCCAAGACTACATCAATAATGAGCTGATACCGTCTATGAACCAACTATCTTGTGATTTAATCGGCCAG',
 'TGTAATCCGCTCCATTATAAAATCCAGCCGGCTAGAGGAGGATCGGAAGCGTT

In [217]:
#Build dictionary of kmers
from collections import defaultdict

def kmer_dictionary(reads,k):    
    kmer_dict = defaultdict()
    for read in reads:
        for i in range(len(read)): # go through list of reads
            kmer = read[i:i+k] # get the k-mers
            if len(kmer)== k:  # if k-mer is of length k
                if kmer not in kmer_dict:  #Add k-mer as key of dict
                    kmer_dict[kmer] = set() # initialize all values as empty set()
                kmer_dict[kmer].add(read)  # Add the read that the kmer is found in to the set of values  
    return kmer_dict   

In [1]:
def overlap(a, b, min_length=3):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists,
        return 0. """
    start = 0  # start all the way at the left
    while True:
        start = a.find(b[:min_length], start)  # look for b's suffx in a
        if start == -1:  # no more occurrences to right
            return 0
        # found occurrence; check for full suffix/prefix match
        if b.startswith(a[start:]):
            return len(a)-start
        start += 1  # move just past previous match


In [372]:
def overlap_all_reads(reads, k):
    overlapping_pairs = []
    kmer_dict = kmer_dictionary(reads, k)
    for read in reads:
        suffix = read[-k:]
        reads_with_kmer = kmer_dict[suffix]
        for r in reads_with_kmer:
            if r != read: 
                olen = overlap(read, r, k)
                if overlaps:
                    overlapping_pairs.append(overlaps) 
                    
    return olen, overlapping_pairs

In [2]:
def pick_maximal_overlap(reads, k): # Input set of reads and minimum overlap k
    """Given a set of reads and minimum overlap k the function returns the pair of reads with maximal overlap along with that overlap"""
    reada, readb = None, None # Defien reads a and b
    best_olen = 0
    for a,b in itertools.permutations(reads, 2): # 2nd argument we will compare a pair of reads
        ovlen = overlap(a, b, k)
        if olen > best_olen:
            reada, readb = a, b # If this is the best overlap length so far 
            best_olen = olen
    return reada, readb, best_olen    

In [3]:
def greedy_scs(reads, k): # Input is set of reads and minimum overlap k
    """Given a set of reads and minimum overlap k the function returns the greedy shortest superstring"""
    read_a, read_b, olen = pick_maximal_overlap(reads, 2) # Calculate the maximal overlap
    while olen > 0: # while the length of overlap is greater than 0
        reads.remove(read_a) # remove reads a and b from reads 
        reads.remove(read_b) #  eg.AAAB and AAB with AAAB
        reads.append(read_a + read_b[olen:]) # append to reads the combination of the reads, read_a + suffix of read_b
        read_a, read_b, olen = pick_maximal_overlap(reads, k) # Get the new reads a and b with max overlap
    return ''.join(reads) # after joining all reads with overlap concatenate all the remaining reads and return   

In [366]:
genome = greedy_scs(seq_reads, 30)

TypeError: cannot unpack non-iterable int object