In [24]:
from Bio import SeqIO
# Parse every record up front so that `sequences` is a reusable list instead of
# a one-shot iterator, and let the `with` block close the file even if parsing
# fails.  `f` stays bound to the (now closed) handle, so a later `f.close()`
# is a harmless no-op.
with open('dna.example.fasta') as f:
    sequences = list(SeqIO.parse(f, 'fasta'))

### Part 1
How many records are in the file? A record in a FASTA file is defined as a single-line header, followed by lines of sequence data. The header line is distinguished from the sequence data by a greater-than (">") symbol in the first column. The word following the ">" symbol is the identifier of the sequence, and the rest of the line is an optional description of the entry. There should be no space between the ">" and the first letter of the identifier. 

What are the lengths of the sequences in the file? What is the longest sequence and what is the shortest sequence? Is there more than one longest or shortest sequence? What are their identifiers? 

In [25]:
# Materialise the parsed records once: `sequences` may be a one-shot iterator,
# in which case it can only be walked a single time.
records = list(sequences)

# Everything needed is now in memory, so release the file handle right away
# (closing an already-closed handle is a no-op).
f.close()

if not records:
    # Typical cause: this cell was re-run after the iterator was exhausted.
    raise ValueError(
        'No FASTA records to analyse: `sequences` is empty or was already '
        'consumed. Re-run the cell that parses the file first.'
    )

# Parallel lists, in file order: seqlen[k] is the length of identifier[k].
seqlen = [len(record.seq) for record in records]
identifier = [record.id for record in records]
nseqs = len(records)

minlen = min(seqlen)
maxlen = max(seqlen)

print(seqlen)
print('Minimum length',minlen)
print('Maximum length',maxlen)

# Report *every* sequence tied for the extreme length, not only the first one.
minlen_inds = [idx for idx, length in enumerate(seqlen) if length == minlen]
minlen_ids = [identifier[idx] for idx in minlen_inds]
print("Minimum length sequences:", minlen_ids)

maxlen_inds = [idx for idx, length in enumerate(seqlen) if length == maxlen]
maxlen_ids = [identifier[idx] for idx in maxlen_inds]
print("Maximum length sequences:", maxlen_ids)

[990, 724, 3080, 2863, 3832, 4805, 1663, 512, 691, 3072, 1801, 3603, 2478, 1608, 4745, 1810, 3424, 1451, 3276, 2124, 1712, 1325, 1189, 555, 2449]
Minimum length 512
Maximum length 4805
Minimum length sequences: ['gi|142022655|gb|EQ086233.1|521']
Maximum length sequences: ['gi|142022655|gb|EQ086233.1|323']


In [32]:
# One identifier per line, in file order.  The loop variable is deliberately
# not named `id`: at notebook scope that would shadow the builtin `id()` for
# every cell executed afterwards.
for seq_id in identifier:
    print(seq_id)

gi|142022655|gb|EQ086233.1|43
gi|142022655|gb|EQ086233.1|160
gi|142022655|gb|EQ086233.1|41
gi|142022655|gb|EQ086233.1|221
gi|142022655|gb|EQ086233.1|294
gi|142022655|gb|EQ086233.1|323
gi|142022655|gb|EQ086233.1|564
gi|142022655|gb|EQ086233.1|521
gi|142022655|gb|EQ086233.1|455
gi|142022655|gb|EQ086233.1|229
gi|142022655|gb|EQ086233.1|422
gi|142022655|gb|EQ086233.1|384
gi|142022655|gb|EQ086233.1|280
gi|142022655|gb|EQ086233.1|158
gi|142022655|gb|EQ086233.1|59
gi|142022655|gb|EQ086233.1|319
gi|142022655|gb|EQ086233.1|438
gi|142022655|gb|EQ086233.1|210
gi|142022655|gb|EQ086233.1|237
gi|142022655|gb|EQ086233.1|507
gi|142022655|gb|EQ086233.1|350
gi|142022655|gb|EQ086233.1|245
gi|142022655|gb|EQ086233.1|279
gi|142022655|gb|EQ086233.1|378
gi|142022655|gb|EQ086233.1|101


### Part 2
Given an input reading frame on the forward strand (1, 2, or 3), your program should be able to identify all ORFs present in each sequence of the FASTA file, and answer the following questions: what is the length of the longest ORF in the file? What is the identifier of the sequence containing the longest ORF? For a given sequence identifier, what is the longest ORF contained in the sequence represented by that identifier? What is the starting position of the longest ORF in the sequence that contains it? The position should indicate the character number in the sequence.

In [33]:
# Build a lookup of records keyed by identifier.  Leaving the `with` block
# closes the file, so no explicit close() is needed.
with open('dna.example.fasta') as f:
    fasta_records = SeqIO.parse(f, 'fasta')
    seq_dict = SeqIO.to_dict(fasta_records)

In [28]:
def orf(seq, start):
    """Find the longest open reading frame (ORF) in one forward reading frame.

    An ORF runs from an ``ATG`` codon up to and including the first in-frame
    stop codon (``TAA``, ``TAG`` or ``TGA``) that follows it.

    Parameters
    ----------
    seq : str or any object whose ``str()`` is the nucleotide string
        Sequence to scan.  Matching is case-insensitive, so lower-case or
        soft-masked input is handled.
    start : int
        0-based offset of the reading frame (0, 1 or 2).

    Returns
    -------
    tuple
        ``(length, position)``: ``length`` is the ORF length in nucleotides
        including the stop codon, and ``position`` is the 1-based position of
        the first base of its ``ATG``.  ``(None, None)`` when the frame holds
        no complete ORF.  If several ORFs tie for longest, the one that
        starts first is returned.
    """
    stop_codons = {'TAA', 'TAG', 'TGA'}
    bases = str(seq).upper()

    best_len, best_pos = None, None
    # Index of the earliest ATG seen since the last stop codon.  Only that one
    # matters: any later ATG before the same stop gives a strictly shorter ORF.
    open_start = None

    # Single pass over the complete codons of this frame.
    for ind in range(start, len(bases) - 2, 3):
        codon = bases[ind:ind + 3]
        if codon == 'ATG':
            if open_start is None:
                open_start = ind
        elif codon in stop_codons and open_start is not None:
            length = ind + 3 - open_start      # stop codon included
            # Strict '>' keeps the earliest ORF when lengths tie.
            if best_len is None or length > best_len:
                best_len, best_pos = length, open_start + 1   # index 0 is position 1
            open_start = None

    return best_len, best_pos
    
# Longest ORF of each sequence across the three forward reading frames:
# {identifier: (orf_length, start_position)}.  Sequences in which no frame
# contains an ORF are left out.
longest_orf_by_id = {}
for key, value in seq_dict.items():
    # One (length, position) tuple per frame; (None, None) means "no ORF".
    max_len_start = [orf(value.seq, start) for start in range(3)]
    print('Max lengths per frame: ', max_len_start)
    lens_not_none = [pair for pair in max_len_start if pair != (None, None)]
    if lens_not_none:
        # Tuples compare by length first, so max() picks the longest ORF
        # (equal lengths resolve toward the larger start position).
        longest_orf_by_id[key] = max(lens_not_none)

if longest_orf_by_id:
    # Derive ids, lengths and positions from the SAME dict so they stay
    # aligned even when some sequences were skipped for having no ORF.
    orf_ids = list(longest_orf_by_id)
    maxlens, maxlen_pos = zip(*longest_orf_by_id.values())

    max_of_all = max(maxlens)
    best_ind = maxlens.index(max_of_all)   # first sequence wins a tie
    max_of_all_pos = maxlen_pos[best_ind]
    max_of_all_id = orf_ids[best_ind]

    print('Sequence with longest ORF: ', max_of_all_id)
    print('Starting position of longest ORF: ', max_of_all_pos)
    print('Nucleotide length of longest ORF: ', max_of_all)
else:
    # Keep every result name defined so later cells still run.
    maxlens, maxlen_pos = (), ()
    max_of_all = max_of_all_pos = max_of_all_id = None
    print('No ORF found in any sequence')


Max lengths per frame:  [(213, 367), (39, 416), (186, 444)]
Max lengths per frame:  [(309, 136), (363, 107), (165, 75)]
Max lengths per frame:  [(306, 559), (762, 2009), (918, 1194)]
Max lengths per frame:  [(264, 1246), (594, 1772), (324, 2388)]
Max lengths per frame:  [(420, 2422), (690, 2099), (1608, 141)]
Max lengths per frame:  [(1686, 2824), (1371, 65), (1596, 1476)]
Max lengths per frame:  [(282, 724), (None, None), (507, 825)]
Max lengths per frame:  [(None, None), (None, None), (159, 126)]
Max lengths per frame:  [(39, 241), (552, 86), (54, 153)]
Max lengths per frame:  [(726, 1960), (1311, 125), (306, 804)]
Max lengths per frame:  [(84, 1066), (159, 1112), (639, 216)]
Max lengths per frame:  [(585, 2137), (489, 1154), (720, 1347)]
Max lengths per frame:  [(267, 1177), (291, 1427), (204, 2007)]
Max lengths per frame:  [(129, 700), (486, 1040), (1218, 150)]
Max lengths per frame:  [(1107, 3592), (951, 3605), (942, 972)]
Max lengths per frame:  [(156, 481), (324, 971), (537, 219

In [29]:
# Longest ORF length for each sequence that has at least one ORF.
print(maxlens)

(213, 363, 918, 594, 1608, 1686, 507, 159, 552, 1311, 639, 720, 291, 1218, 1107, 537, 477, 564, 219, 741, 678, 636, 153, 534, 639)


In [30]:
# 1-based start position of each of those longest ORFs, in the same order as `maxlens`.
print(maxlen_pos)

(367, 107, 1194, 1772, 141, 2824, 825, 126, 86, 125, 216, 1347, 1427, 150, 3592, 219, 987, 410, 794, 930, 365, 606, 413, 2, 1633)


### Part 3
We will only consider repeats on the forward strand here. Also we will allow repeats to overlap themselves. For example, the sequence ACACA contains two copies of the sequence ACA - once at position 1 (index 0 in Python), and once at position 3. Given a length n, your program should be able to identify all repeats of length n in all sequences in the FASTA file. Your program should also determine how many times each repeat occurs in the file, and which is the most frequent repeat of a given length.

In [None]:
def find_repeats(seq, n):
    """Count every length-``n`` substring of ``seq``, allowing overlaps.

    Only the forward strand is considered.  For example ``ACACA`` with
    ``n=3`` contains ``ACA`` twice (positions 1 and 3) and ``CAC`` once.

    Substrings that occur only once are included as well, so that the counts
    of several sequences can be summed before deciding which substrings are
    repeated in the file as a whole.

    Parameters
    ----------
    seq : str or any object whose ``str()`` is the nucleotide string
        Sequence to scan.
    n : int
        Substring length; must be at least 1.

    Returns
    -------
    dict
        Maps each length-``n`` substring to its number of occurrences, in
        order of first appearance.  Empty when ``seq`` is shorter than ``n``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer')

    bases = str(seq)
    counts = {}
    # Slide a window one base at a time so overlapping copies are all counted.
    for ind in range(len(bases) - n + 1):
        window = bases[ind:ind + n]
        counts[window] = counts.get(window, 0) + 1
    return counts