In [4]:
def naive(p, t):
    occurrences = []
    for i in range(len(t) - len(p) + 1):  # loop over alignments
        match = True
        for j in range(len(p)):  # loop over characters
            if t[i+j] != p[j]:  # compare characters
                match = False
                break
        if match:
            occurrences.append(i)  # all chars matched; record
    return occurrences

In [5]:
def reverseComplement(s):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    t = ''
    for base in s:
        t = complement[base] + t
    return t

In [6]:
def readGenome(filename):
    genome = ''
    with open(filename, 'r') as f:
        for line in f:
            # ignore header line with genome information
            if not line[0] == '>':
                genome += line.rstrip()
    return genome

In [7]:
def readFastq(filename):
    sequences = []
    qualities = []
    with open(filename) as fh:
        while True:
            fh.readline()  # skip name line
            seq = fh.readline().rstrip()  # read base sequence
            fh.readline()  # skip placeholder line
            qual = fh.readline().rstrip() # base quality line
            if len(seq) == 0:
                break
            sequences.append(seq)
            qualities.append(qual)
    return sequences, qualities

In [8]:
def naiveStrand(p, t):
    r = reverseComplement(p) # reverse comp sequence
    occurrences = [] # initiate the list of match index
    for i in range(len(t) - len(p) + 1):  # loop over alignments
        matchP = True # forward stand matched
        matchR = True # reverse strand matched
        for j in range(len(p)):  # loop over characters
            if t[i+j] != p[j]:  # compare characters
                matchP = False  # loop over characters
            if t[i+j] != r[j]:  # compare characters
                matchR = False  
                break
        if matchR or matchP:
            occurrences.append(i)  # all chars matched; record
    return occurrences
    

In [9]:
naiveStrand('ACT', 'CAGTAGT') # Test the new naiveStrand

[1, 4]

In [10]:
virusGenome = readGenome('lambda_virus.fa')

In [11]:
len(naiveStrand('AGGT', virusGenome))

2888

In [12]:
len(naive('TTAA', virusGenome))

195

In [13]:
naiveStrand('ACTAAGT', virusGenome)[0]

430

In [14]:
naiveStrand('AGTCGA', virusGenome)[0]

8

In [15]:
def naive_2mm(p, t):
    occurences = []
    pLen = len(p)
    readRange = len(t) - pLen +1
    
    for i in range(readRange):
        matched = True
        errors = 0
        for j in range(pLen):
            if t[i+j] != p[j]:
                errors += 1
            if errors > 2: # if there are more than 2 errors
                matched = False # it doesnt match
                break
        if matched:
            occurences.append(i)
    return occurences

In [16]:
p = 'CTGT'
ten_as = 'AAAAAAAAAA'
t = ten_as + 'CTGT' + ten_as + 'CTTT' + ten_as + 'CGGG' + ten_as
occurrences = naive_2mm(p, t)
print(occurrences)

[10, 24, 38]


In [17]:
!curl http://d396qusza40orc.cloudfront.net/ads1/data/phix.fa > phix.fa

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  5528  100  5528    0     0   8291      0 --:--:-- --:--:-- --:--:--  8287


In [18]:
# read phix_genome
phix_genome = readGenome('phix.fa')

In [19]:
occurrences = naive_2mm('GATTACA', phix_genome)

In [20]:
print('offset of leftmost occurrence: %d' % min(occurrences))

offset of leftmost occurrence: 10


In [21]:
print('offset of leftmost occurrence: %d' % min(occurrences))

offset of leftmost occurrence: 10


In [22]:
len(naive_2mm('TTCAAGCC', virusGenome))

191

In [23]:
min(naive_2mm('AGGAGGTT', virusGenome))

49

In [24]:
!curl https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR037900_1.first1000.fastq > dna.fastq

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  235k  100  235k    0     0   380k      0 --:--:-- --:--:-- --:--:--  380k


In [25]:
naive_2mm('ACTTTA', 'ACTTACTTGATAAAGT')

[0, 4]

In [27]:
len(naive_2mm('TTCAAGCC', virusGenome))

191

In [29]:
def readFastq(fileName):
    seqs = []
    quals = []
    with open(fileName) as fh:
        while True:
            fh.readline()
            seq = fh.readline().rstrip()
            if len(seq) == 0 : 
                break
            fh.readline()
            qual = fh.readline().rstrip()
            seqs.append(seq)
            quals.append(qual)
    return seqs, quals