In [13]:
def read_fasta(fname):
    headers, records = [], []
    record = ""
    with open(fname, "rb") as fin:
        for line in fin:
            if not line[0] == '>':
                record += line.strip()
            else:
                headers.append(line.strip())
                if record:
                    records.append(record)
                    record = ""
    if record:
        records.append(record)
    return headers, records

In [150]:
def get_frames(genome):
    frames, n = [], len(genome)
    for start in range(3):
        frame = []
        for i in range(start, n, 3):
            if (i + 3) <= n:
                frame.append(genome[i:i + 3])
        frames.append(frame)
    return frames

def get_orfs(genome):
    START_CODON = "ATG"
    END_CODONS = ["TAA", "TAG", "TGA"]
    orfs = []
    start = -1
    while start < len(genome):
        start = genome.find(START_CODON, start + 1)
        if start < 0:
            break
        for i in range(start + 3, len(genome), 3):
            if genome[i:i+3] in END_CODONS:
                orfs.append((start, genome[start:i+3]))
                break
    return orfs

def get_repeats(genome, n):
    """
    n - length of the repeats
    """
    start = -1
    repeats = {}
    for i in range(len(genome) - n + 1):
        pos = i
        repeats[genome[i:i+n]].add() = repeats.get(genome[i:i+n], 0) + 1
        while pos < len(genome):
            pos = genome.find(genome[i:i + n], pos + 1)
            if pos >= 0:
                repeats[genome[i:i+n]] += 1
            else:
                break
    print repeats
print get_repeats("ACACA", 3)

{'CAC': 1, 'ACA': 3}
None


In [139]:
headers, records = read_fasta("dna.example.fasta")
for i in range(len(headers)):
    print headers[i]
    print 'record length:', len(records[i])
    orfs = get_orfs(records[i])
    mlen, mstart = 0, 0
    for start, orf in orfs:
        if len(orf) > mlen:
            mstart = start
            mlen = len(orf)
    print 'max orf length:', mlen
    print 'max orf start:', mstart
    print '*****************'

>gi|142022655|gb|EQ086233.1|43 marine metagenome JCVI_SCAF_1096627390048 genomic scaffold, whole genome shotgun sequence
record length: 990
max orf length: 213
max orf start: 366
*****************
>gi|142022655|gb|EQ086233.1|160 marine metagenome JCVI_SCAF_1096627390048 genomic scaffold, whole genome shotgun sequence
record length: 724
max orf length: 363
max orf start: 106
*****************
>gi|142022655|gb|EQ086233.1|41 marine metagenome JCVI_SCAF_1096627390048 genomic scaffold, whole genome shotgun sequence
record length: 3080
max orf length: 918
max orf start: 1193
*****************
>gi|142022655|gb|EQ086233.1|221 marine metagenome JCVI_SCAF_1096627390048 genomic scaffold, whole genome shotgun sequence
record length: 2863
max orf length: 594
max orf start: 1771
*****************
>gi|142022655|gb|EQ086233.1|294 marine metagenome JCVI_SCAF_1096627390048 genomic scaffold, whole genome shotgun sequence
record length: 3832
max orf length: 1608
max orf start: 140
*****************
>gi|14