In [23]:
def genOverlap(kmers):
    '''
    Given a collection of kmers genOverlap creates an overlap graph in the form of an adjacency list.
    There is a directed edge kmer -> otherKmer whenever the suffix of kmer (everything but its
    first character) is identical to the prefix of otherKmer (everything but its last character).
    A kmer is never linked to an entry equal to itself, so the graph contains no self-loops.
    :param kmers: iterable of strings, each string represents a kmer
    :return: a dictionary that maps every kmer to the list of kmers it overlaps (in input order);
             a kmer that overlaps nothing maps to an empty list
    '''
    # Materialise once: the input is scanned twice, which would silently
    # truncate the result for one-shot iterables such as generators.
    kmers = list(kmers)

    # Index the kmers by prefix so that each suffix lookup is a single dict
    # access instead of a scan over every other kmer.
    byPrefix = {}
    for kmer in kmers:
        byPrefix.setdefault(kmer[:-1], []).append(kmer)

    overlapGraph = {}
    for kmer in kmers:
        # Entries equal to kmer are skipped so a kmer never points at itself.
        overlapGraph[kmer] = [otherKmer
                              for otherKmer in byPrefix.get(kmer[1:], ())
                              if otherKmer != kmer]
    return overlapGraph

def main(fName):
    '''
    Reads input file and runs genOverlap on the given data, then prints the
    resulting edges as sorted lines of the form "kmer -> otherKmer".
    :param fName: path of a text file holding one kmer per line; an empty
                  string makes the function return without doing anything
    :return: None
    '''
    if fName == '':
        return
    with open(fName) as inFile:
        # Blank lines (e.g. a trailing newline at the end of the file) are not
        # kmers; keeping them would add an empty-string node to the graph.
        kmers = [kmer for kmer in (line.strip() for line in inFile) if kmer]
    graph = genOverlap(kmers)
    # Flatten the adjacency list into one text line per edge, sorted so the
    # output does not depend on the order of the input file.
    toPrint = sorted('{0} -> {1}'.format(key, value)
                     for key, values in graph.items()
                     for value in values)
    for item in toPrint:
        print(item)
# Script entry point. main returns immediately when given an empty file name,
# so running this module directly reads no file and prints nothing.
if __name__ == '__main__':
    main(fName='')

In [24]:
# Build and print the sorted overlap-graph edges for the kmers listed in 'rosalind_ba3c.txt'.
main('rosalind_ba3c.txt')

AAAACCCTGCCCTCACACAA -> AAACCCTGCCCTCACACAAT
AAAACCTTCCCTGTACCGGG -> AAACCTTCCCTGTACCGGGA
AAAAGGTCTTTTTCTCCGTC -> AAAGGTCTTTTTCTCCGTCA
AAAATATGACCGCGCTAAAG -> AAATATGACCGCGCTAAAGA
AAACAGCTCTTACTGTCCAA -> AACAGCTCTTACTGTCCAAA
AAACCCTGCCCTCACACAAT -> AACCCTGCCCTCACACAATA
AAACCTTCCCTGTACCGGGA -> AACCTTCCCTGTACCGGGAC
AAACGCCGACATTTATACTA -> AACGCCGACATTTATACTAA
AAAGACTAACAGTATCAAGC -> AAGACTAACAGTATCAAGCA
AAAGGTCTTTTTCTCCGTCA -> AAGGTCTTTTTCTCCGTCAC
AAAGTTTATCTGACGCGGTA -> AAGTTTATCTGACGCGGTAA
AAATAATCATAACTCCCGGC -> AATAATCATAACTCCCGGCT
AAATATCAGAGGTTGAAGGA -> AATATCAGAGGTTGAAGGAG
AAATATGACCGCGCTAAAGA -> AATATGACCGCGCTAAAGAC
AAATGCACTCGCTTTGACAT -> AATGCACTCGCTTTGACATC
AAATGTATCGGGACTCCCAA -> AATGTATCGGGACTCCCAAC
AACAGAGAATAGACCTGTAC -> ACAGAGAATAGACCTGTACG
AACAGCTCTTACTGTCCAAA -> ACAGCTCTTACTGTCCAAAA
AACAGTATCAAGCATGTGGG -> ACAGTATCAAGCATGTGGGC
AACATACATCAGTGGGTCCG -> ACATACATCAGTGGGTCCGT
AACCAATAAGGCTCGACAAT -> ACCAATAAGGCTCGACAATG
AACCCTGCCCTCACACAATA -> ACCCTGCCCTCACACAATAA
AACCGCGCTT