In [13]:
def read_fasta(fname):
    headers, records = [], []
    record = ""
    with open(fname, "rb") as fin:
        for line in fin:
            if not line[0] == '>':
                record += line.strip()
            else:
                headers.append(line.strip())
                if record:
                    records.append(record)
                    record = ""
    if record:
        records.append(record)
    return headers, records

In [195]:
def get_frame(genome, start):
    # frame 1: ATG ATG
    # frame 2: A TGA TG
    # frame 3: AT GAT G
    frame, n = [], len(genome)
    for i in range(start, n, 3):
        if (i + 3) <= n:
            frame.append(genome[i:i + 3])
    return frame

def get_orfs(genome, start):
    START_CODON = "ATG"
    END_CODONS = ["TAA", "TAG", "TGA"]
    orfs = []
    frame = get_frame(genome, start)
    for i in range(len(frame)):
        if frame[i] == START_CODON:
            for j in range(i + 1, len(frame)):
                if frame[j] in END_CODONS:
                    orfs.append("".join(frame[i:j+1]))
    return orfs

print get_orfs("ATGAAATAG", 0)

def get_ngram_positions(genome, n):
    """
    n - length of the repeats
    """
    ngrams = {}
    for i in range(len(genome) - n + 1):
        ngram = genome[i:i+n]
        if ngram not in ngrams:
            ngrams[ngram] = set([i])
        else:
            ngrams[ngram].add(i)
    return ngrams

['ATGAAATAG']


In [202]:
headers, records = read_fasta("dna2.fasta")
print 'record count:', len(headers)
print 'longest sequence', max([len(r) for r in records])
print 'sortest sequence', min([len(r) for r in records])
print
for i in range(len(headers)):
    print headers[i]
    print 'record length:', len(records[i])
    orfs = get_orfs(records[i], 2)
    ngrams = get_ngram_positions(records[i], 10)
    repeats = dict((key,value) for key, value in ngrams.iteritems() if len(value) > 1)
#     mlen, mstart = 0, 0
#     for start, orf in orfs:
#         if len(orf) > mlen:
#             mstart = start
#             mlen = len(orf)
    print 'max orf length:', max([len(orf) for orf in orfs] or [0])
#     print 'max orf start:', mstart
    print 'repeats of len 7:', len(repeats)
    print '*****************'

record count: 18
longest sequence 4894
sortest sequence 115

>gi|142022655|gb|EQ086233.1|91 marine metagenome JCVI_SCAF_1096627390048 genomic scaffold, whole genome shotgun sequence
record length: 4635
max orf length: 3678
repeats of len 7: 68
*****************
>gi|142022655|gb|EQ086233.1|304 marine metagenome JCVI_SCAF_1096627390048 genomic scaffold, whole genome shotgun sequence
record length: 1151
max orf length: 519
repeats of len 7: 2
*****************
>gi|142022655|gb|EQ086233.1|255 marine metagenome JCVI_SCAF_1096627390048 genomic scaffold, whole genome shotgun sequence
record length: 4894
max orf length: 3609
repeats of len 7: 73
*****************
>gi|142022655|gb|EQ086233.1|45 marine metagenome JCVI_SCAF_1096627390048 genomic scaffold, whole genome shotgun sequence
record length: 3511
max orf length: 3207
repeats of len 7: 71
*****************
>gi|142022655|gb|EQ086233.1|396 marine metagenome JCVI_SCAF_1096627390048 genomic scaffold, whole genome shotgun sequence
record length

In [184]:
genome = "".join(records)
ngrams = get_ngram_positions(genome, 12)
repeats = [(key,len(value)) for key, value in ngrams.iteritems() if len(value) > 1]
print sorted(repeats, lambda x, y: y[1]-x[1])

[('ATTCGCCATTCG', 10), ('CATTCGCCATTC', 10), ('TTCGCCATTCGC', 10), ('TCGCCATTCGCC', 10), ('CCATTCGCCATT', 9), ('CGCCATTCGCCA', 9), ('GCCATTCGCCAT', 9), ('TCGGGCTGCCGT', 3), ('CGCGGTCGATGC', 3), ('CTCGCGCAGCGC', 3), ('GATCACCGCGAC', 3), ('CGCCGCGCGACG', 3), ('AGCGTCGCGAGC', 3), ('GCGCGCCGTCGC', 3), ('CAGGTCGCGCCG', 3), ('CGCGTTCAGCGC', 3), ('GCGGCGTCCGGC', 3), ('CGAGCACCGCGA', 3), ('GCGCGCCGCGTG', 3), ('CGCGCGCCGCGT', 3), ('GCCGCGCAGATC', 3), ('CGTGCGGATCAC', 3), ('CCGGCGCGGCCG', 3), ('CGCGACGCTCGC', 3), ('GCGCTGGCCGCG', 3), ('CGCTCGCGCAGC', 3), ('CGACGAGCTGGT', 3), ('GTGCGGATCACG', 3), ('GCGCAGATCGCG', 3), ('GCTCGCGCAGCG', 3), ('CGATCCTCGCCG', 3), ('GGTCGATGCGAT', 3), ('GCGCGCTGATCG', 3), ('GCGAAGGCCGCG', 3), ('GCCGCGCGACGC', 3), ('TGCTGCGCGACC', 3), ('CGCGCAGATCGC', 3), ('CCAGGTCGCGCC', 3), ('ATCGCGATGCGC', 3), ('TCGCGCCGAGCG', 3), ('CTGCTGCGCGCG', 3), ('GCGATCACCTGT', 2), ('CGCCGATCGACG', 2), ('TACGCGAGCGTG', 2), ('GCGCTCGCGCAG', 2), ('CGGCGAGATCGT', 2), ('GCCGTACGTCGA', 2), ('TGGCGC