In [2]:
# the module that deals with regular expressions is called re, so if we want to write a program that uses the regular expression tools we must include the line:

import re

In [23]:
# to use the regular expression search function (which we'll discuss later in this chapter) we have to write:
# re.search(pattern, string)

In [6]:
# Putting the letter r immediately before the opening quotation mark makes a
# raw string: backslash sequences such as \t and \n are not interpreted as
# tab/newline but kept as the literal characters backslash-t, backslash-n.
raw_text = r"\t\n"
print(raw_text)

\t\n


In [7]:
# Test whether a DNA sequence contains an EcoRI restriction site (GAATTC).
dna = "ATCGCGAATTCAC"
ecori_site = re.compile(r"GAATTC")
# search() returns a match object on success and None otherwise.
if ecori_site.search(dna) is not None:
    print("restriction site found!")


restriction site found!


In [8]:
# Naive approach: run one search per fixed sequence (GGACC and GGTCC) and
# report a hit as soon as either of them is present.
dna = "ATCGCGAATTCAC"
if any(re.search(site, dna) for site in (r"GGACC", r"GGTCC")):
    print("restriction site found!")

In [9]:
# A better way is to capture the variation in the AvaII site with a single
# regular expression: (A|T) accepts either base in the middle position.
dna = "ATCGCGAATTCAC"
avaii_hit = re.search(r"GG(A|T)CC", dna)
if avaii_hit is not None:
    print("restriction site found!")


In [10]:
# Alternation over all four bases: GC, then any one of A, T, G or C, then GC.
dna = "ATCGCGAATTCAC"
any_base_site = re.compile(r"GC(A|T|G|C)GC")
if any_base_site.search(dna):
    print("restriction site found!")

In [11]:
# The same check written with a character group: [ATGC] matches exactly one
# character that is A, T, G or C.
dna = "ATCGCGAATTCAC"
if re.search(r"GC[ATGC]GC", dna) is not None:
    print("restriction site found!")


In [22]:
#here's a complex pattern to identify full-length eukaryotic messenger RNA sequences:
# ^ATG[ATGC]{30,1000}A{5,10}$

In [13]:
# The match object returned by re.search has a group method; calling it gives
# the portion of the input string that matched the pattern.
dna = "ATGACGTACGTACGACTG"
# m is the match object for: GA, then any three bases, then AC
m = re.compile(r"GA[ATGC]{3}AC").search(dna)
# group(0) is the whole match, the same value group() returns
print(m.group(0))

GACGTAC


In [21]:
# The pattern below matches GA, followed by three bases, followed by AC, followed by two bases, followed by AC again. We can surround the parts of the pattern that we want to extract with parentheses – this is called capturing:
# GA([ATGC]{3})AC([ATGC]{2})AC

In [15]:
# Captured parts of the pattern are retrieved by passing a number to group():
# group(1) is the text matched by the first set of parentheses, group(2) the
# text matched by the second, and so on. group(0) is the entire match.
dna = "ATGACGTACGTACGACTG"
# keep the match object in m
m = re.compile(r"GA([ATGC]{3})AC([ATGC]{2})AC").search(dna)
# sep="" so the label and the value are printed back to back
print("entire match: ", m.group(0), sep="")
print("first bit: ", m.group(1), sep="")
print("second bit: ", m.group(2), sep="")

entire match: GACGTACGTAC
first bit: CGT
second bit: GT


In [16]:
# start() and end() give the positions on the sequence where the matched
# pattern begins and ends; span() returns the same pair as a tuple.
dna = "ATGACGTACGTACGACTG"
m = re.compile(r"GA([ATGC]{3})AC([ATGC]{2})AC").search(dna)
print("start: ", m.span()[0], sep="")
print("end: ", m.span()[1], sep="")


start: 2
end: 13


In [17]:
# Passing a group number to start()/end() (or span()) gives the positions of
# that individual captured group instead of the whole match.
dna = "ATGACGTACGTACGACTG"
m = re.compile(r"GA([ATGC]{3})AC([ATGC]{2})AC").search(dna)
# whole match
print("start: ", m.span()[0], sep="")
print("end: ", m.span()[1], sep="")
# first captured group
print("group one start: ", m.span(1)[0], sep="")
print("group one end: ", m.span(1)[1], sep="")
# second captured group
print("group two start: ", m.span(2)[0], sep="")
print("group two end: ", m.span(2)[1], sep="")

start: 2
end: 13
group one start: 4
group one end: 7
group two start: 9
group two end: 11


In [18]:
# Imagine we have a consensus DNA sequence that contains ambiguity codes, and
# we want to extract all runs of contiguous unambiguous bases. We split the
# DNA string wherever we see bases that aren't A, T, G or C.
dna = "ACTNGCATRGCTACGTYACGATSCGAWTCG"
# "+" makes a stretch of adjacent ambiguity codes (e.g. "NN") act as a single
# separator, and the filter drops the empty string re.split returns when the
# sequence starts or ends with an ambiguous base; without both, runs would
# contain '' entries that are not runs of bases.
runs = [run for run in re.split(r"[^ATGC]+", dna) if run]
print(runs)

['ACT', 'GCAT', 'GCTACGT', 'ACGAT', 'CGA', 'TCG']


In [19]:
# re.findall returns a list of all matches of a pattern in a string. The first
# argument is the pattern, and the second argument is the string. Say we want
# to find all runs of A and T in a DNA sequence longer than five bases:
dna = "ACTGCATTATATCGTACGAAATTATACGCGCG"
# {6,} means "six or more": the minimum matches the "longer than five" rule
# above, and leaving out an upper bound returns a long run whole instead of
# cutting it into pieces.
runs = re.findall(r"[AT]{6,}", dna)
print(runs)

['ATTATAT', 'AAATTATA']


In [20]:
# finditer returns a sequence of match objects, so to do anything useful with
# it, we need to use the return value in a loop:
dna = "ACTGCATTATATCGTACGAAATTATACGCGCG"
# {3,} has no upper bound, so a very long A/T run is reported as one region
# rather than being broken into several adjacent regions.
runs = re.finditer(r"[AT]{3,}", dna)
for match in runs:
    # span() is the (start, end) pair of the current match
    run_start, run_end = match.span()
    print("AT rich region from " + str(run_start) + " to " + str(run_end))

AT rich region from 5 to 12
AT rich region from 18 to 26
