In [1]:
def reverseComplement(s):
    """Return the reverse complement of the DNA string ``s``.

    Every base is swapped for its partner (A<->T, C<->G, N stays N) and the
    order of the bases is reversed.

    Args:
        s: Sequence of the characters 'A', 'C', 'G', 'T' and 'N'.

    Returns:
        A string of the same length holding the reverse complement.

    Raises:
        KeyError: If ``s`` contains any other character (lowercase included).
    """
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    # Complement in one pass, join once, then reverse with a slice. Prepending
    # to a string inside the loop re-copies the partial result on every step,
    # which is quadratic in len(s) for genome-sized inputs.
    return ''.join(complement[base] for base in s)[::-1]

In [2]:
def readGenome(filename):
    """Read a FASTA file and return its sequence text as a single string.

    Lines starting with '>' (record headers) are skipped. Every other line has
    its trailing whitespace removed and is appended in file order, so a file
    with several records yields their sequences concatenated together.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        The concatenated sequence ('' when the file has no sequence lines).
    """
    with open(filename, 'r') as fasta:
        # Keep only non-header lines, without their line endings.
        sequence_lines = [row.rstrip() for row in fasta if not row.startswith('>')]
    return ''.join(sequence_lines)


In [3]:
def readFastq(filename):
    """Read a FASTQ file and return its base sequences and quality strings.

    The file is consumed four lines at a time: name line, base sequence,
    placeholder line, quality line. Only the second and fourth lines are kept,
    with trailing whitespace removed. Reading stops at the first group whose
    sequence line is empty after stripping, which is also how end-of-file is
    detected.

    Args:
        filename: Path of the FASTQ file to read.

    Returns:
        A tuple ``(sequences, qualities)`` of two parallel lists of strings.
    """
    sequences, qualities = [], []
    with open(filename) as handle:
        while True:
            # Pull one four-line record; readline() returns '' past end-of-file.
            _name, bases, _placeholder, quals = [handle.readline() for _ in range(4)]
            bases = bases.rstrip()
            if not bases:
                break
            sequences.append(bases)
            qualities.append(quals.rstrip())
    return sequences, qualities

In [5]:
def naive(p, t):
    """Find every exact occurrence of pattern ``p`` in text ``t``.

    Each possible alignment of ``p`` against ``t`` is checked in turn.

    Args:
        p: Pattern string to search for.
        t: Text string to search in.

    Returns:
        List of 0-based offsets in ``t``, in increasing order, at which ``p``
        matches exactly. Empty when ``p`` is longer than ``t``.
    """
    width = len(p)
    # A window matches when the slice of t at that offset equals the pattern.
    return [start for start in range(len(t) - width + 1)
            if t[start:start + width] == p]


In [7]:
def naive_with_rc(p, t):
    """Find exact occurrences of ``p`` or of its reverse complement in ``t``.

    When ``p`` equals its own reverse complement the search is handed to
    ``naive`` so that only the pattern itself is looked for.

    Args:
        p: Pattern string; must be accepted by ``reverseComplement``.
        t: Text string to search in.

    Returns:
        List of 0-based offsets in ``t``, in increasing order, at which either
        ``p`` or its reverse complement matches exactly.
    """
    reverse_strand = reverseComplement(p)
    if reverse_strand == p:
        return naive(p, t)
    width = len(p)
    hits = []
    for start in range(len(t) - width + 1):  # every candidate alignment
        window = t[start:start + width]
        # Accept the offset when the window equals either strand's pattern.
        if window == p or window == reverse_strand:
            hits.append(start)
    return hits


In [8]:
# Sanity check: 'CCC' sits at offset 10 of t and its reverse complement 'GGG'
# at offset 23, so both offsets should be reported.
p = 'CCC'
ten_as = 'AAAAAAAAAA'
t = ten_as + 'CCC' + ten_as + 'GGG' + ten_as
occurrences = naive_with_rc(p, t)
print(occurrences)

[10, 23]


In [9]:
# 'CGCG' is its own reverse complement, so each of its two occurrences
# (offsets 10 and 24) should be listed exactly once.
# Relies on ten_as defined in the previous cell.
p = 'CGCG'
t = ten_as + 'CGCG' + ten_as + 'CGCG' + ten_as
occurrences = naive_with_rc(p, t)
print(occurrences)

[10, 24]


In [12]:
# Phi-X genome
# Shell escape: downloads the FASTA file into the working directory as phix.fa.
!wget http://d396qusza40orc.cloudfront.net/ads1/data/phix.fa

--2021-02-06 15:18:11--  http://d396qusza40orc.cloudfront.net/ads1/data/phix.fa
Resolving d396qusza40orc.cloudfront.net (d396qusza40orc.cloudfront.net)... 99.84.39.161, 99.84.39.16, 99.84.39.64, ...
Connecting to d396qusza40orc.cloudfront.net (d396qusza40orc.cloudfront.net)|99.84.39.161|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5528 (5.4K) [application/octet-stream]
Saving to: ‘phix.fa’


2021-02-06 15:18:11 (2.86 MB/s) - ‘phix.fa’ saved [5528/5528]



In [13]:
def readGenome(filename):
    """Read a FASTA file and return its sequence text as a single string.

    Lines starting with '>' (record headers) are skipped. Every other line has
    its trailing whitespace removed and is appended in file order, so a file
    with several records yields their sequences concatenated together.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        The concatenated sequence ('' when the file has no sequence lines).
    """
    # NOTE(review): readGenome is already defined in an earlier cell of this
    # notebook; this cell redefines it with the same behavior.
    with open(filename, 'r') as fasta:
        # Keep only non-header lines, without their line endings.
        sequence_lines = [row.rstrip() for row in fasta if not row.startswith('>')]
    return ''.join(sequence_lines)

In [14]:
# Load the sequence from the downloaded phix.fa file into one string.
phix_genome = readGenome('phix.fa')

In [15]:
# Offsets in phix_genome where 'ATTA' or its reverse complement 'TAAT' occurs.
occurrences = naive_with_rc('ATTA', phix_genome)

In [16]:
# Smallest matching offset; min() raises ValueError if occurrences is empty.
print('offset of leftmost occurrence: %d' % min(occurrences))

offset of leftmost occurrence: 62


In [17]:
# Number of matching offsets found by the search above.
print('# occurrences: %d' % len(occurrences))

# occurrences: 60


In [18]:
# Shell escape: downloads the FASTA file into the working directory as lambda_virus.fa.
!wget https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa

--2021-02-06 15:19:33--  https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa
Resolving d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)... 143.204.148.164, 143.204.148.89, 143.204.148.139, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)|143.204.148.164|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 49270 (48K) [application/octet-stream]
Saving to: ‘lambda_virus.fa’


2021-02-06 15:19:35 (2.58 MB/s) - ‘lambda_virus.fa’ saved [49270/49270]



In [19]:
# Load the sequence from the downloaded lambda_virus.fa file into one string.
lambda_virus = readGenome('lambda_virus.fa')

In [21]:
# Count offsets where 'AGGT' or its reverse complement 'ACCT' occurs.
occurrences = naive_with_rc('AGGT', lambda_virus) 
print('# occurrences: %d' % len(occurrences))

# occurrences: 306


In [22]:
# 'TTAA' equals its own reverse complement, so naive_with_rc delegates to
# naive and each occurrence is counted once.
occurrences = naive_with_rc('TTAA', lambda_virus) 
print('# occurrences: %d' % len(occurrences))

# occurrences: 195


In [23]:
# Leftmost offset of 'ACTAAGT' or its reverse complement 'ACTTAGT';
# min() raises ValueError if there is no match.
occurrences = naive_with_rc('ACTAAGT', lambda_virus) 
print('offset of leftmost occurrence: %d' % min(occurrences))

offset of leftmost occurrence: 26028


In [24]:
# Leftmost offset of 'AGTCGA' or its reverse complement 'TCGACT';
# min() raises ValueError if there is no match.
occurrences = naive_with_rc('AGTCGA', lambda_virus) 
print('offset of leftmost occurrence: %d' % min(occurrences))

offset of leftmost occurrence: 450


In [5]:
def naive_2mm(p, t, total_mismatch=2):
    """Find occurrences of ``p`` in ``t`` allowing a limited number of mismatches.

    Only substitutions are tolerated (the window always has length ``len(p)``)
    and only ``p`` itself is searched for, not its reverse complement.

    Args:
        p: Pattern string to search for.
        t: Text string to search in.
        total_mismatch: Largest number of mismatching positions an alignment
            may have and still be reported (default 2).

    Returns:
        List of 0-based offsets in ``t``, in increasing order, at which ``p``
        aligns with no more than ``total_mismatch`` mismatches.
    """
    hits = []
    for start in range(len(t) - len(p) + 1):  # every candidate alignment
        remaining = total_mismatch  # mismatches this alignment may still spend
        for offset, expected in enumerate(p):
            if t[start + offset] != expected:
                remaining -= 1
                if remaining < 0:
                    break  # budget exceeded; give up on this alignment
        else:
            # Loop ran to completion without exceeding the budget.
            hits.append(start)
    return hits


In [28]:
# Sanity check for naive_2mm: t holds an exact copy of p at offset 10, 'CTTT'
# (1 mismatch) at offset 24 and 'CGGG' (2 mismatches) at offset 38; all three
# are within the default budget of 2.
p = 'CTGT'
ten_as = 'AAAAAAAAAA'
t = ten_as + 'CTGT' + ten_as + 'CTTT' + ten_as + 'CGGG' + ten_as
occurrences = naive_2mm(p, t)
print(occurrences)


[10, 24, 38]


In [29]:
# Offsets in phix_genome where 'GATTACA' aligns with at most 2 mismatches
# (forward strand only; naive_2mm does not check the reverse complement).
occurrences = naive_2mm('GATTACA', phix_genome)

In [30]:
# Report the smallest matching offset and the number of matches;
# min() raises ValueError if occurrences is empty.
print('offset of leftmost occurrence: %d' % min(occurrences))
print('# occurrences: %d' % len(occurrences))

offset of leftmost occurrence: 10
# occurrences: 79


In [31]:
# Count alignments of 'TTCAAGCC' in lambda_virus with at most 2 mismatches.
occurrences = naive_2mm('TTCAAGCC', lambda_virus)
print('# occurrences: %d' % len(occurrences))

# occurrences: 191


In [34]:
# Leftmost alignment of 'AGGAGGTT' in lambda_virus with at most 2 mismatches;
# min() raises ValueError if there is no such alignment.
occurrences = naive_2mm('AGGAGGTT', lambda_virus)
print('offset of leftmost occurrence: %d' % min(occurrences))

offset of leftmost occurrence: 49


In [1]:
#!wget https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR037900_1.first1000.fastq
# Fetch the chr1 excerpt that readGenome("chr1.GRCh38.excerpt.fasta") loads
# further down; without this download a fresh run has no such file to read.
# -nc (no-clobber) skips the download when the file already exists, so
# re-running the cell does not create 'chr1.GRCh38.excerpt.fasta.1' copies.
!wget -nc https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/chr1.GRCh38.excerpt.fasta

--2021-02-19 19:52:48--  https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/chr1.GRCh38.excerpt.fasta
Resolving d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)... 143.204.148.132, 143.204.148.164, 143.204.148.139, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)|143.204.148.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 810105 (791K) [application/octet-stream]
Saving to: ‘chr1.GRCh38.excerpt.fasta.1’


2021-02-19 19:52:48 (6.94 MB/s) - ‘chr1.GRCh38.excerpt.fasta.1’ saved [810105/810105]



In [3]:
# Load the chr1 excerpt from the working directory into one string.
# NOTE(review): this reads the file name without a '.1' suffix, so the file
# must already be present under exactly this name.
human_1 = readGenome("chr1.GRCh38.excerpt.fasta")

In [8]:
# Count alignments of the 24-base pattern in human_1 with at most 2 mismatches
# (forward strand only; naive_2mm does not check the reverse complement).
occurrences = naive_2mm('GGCGCGGTGGCTCACGCCTGTAAT', human_1)
print(len(occurrences))

19
