Regular expressions (regexes) are, when you understand them, one of the most fun things you can work with in programming. They are a mini-language for matching text.

The first thing to know is that non-special characters match themselves in text.

In [1]:
sentence = ("A symmetry of a pattern is, loosely speaking, a way of transforming "
            "the pattern so that the pattern looks exactly the same after the "
            "transformation.")

In [2]:
sentence

'A symmetry of a pattern is, loosely speaking, a way of transforming the pattern so that the pattern looks exactly the same after the transformation.'

In [3]:
sentence.index("symmetry")

2

In [4]:
import re

In [5]:
help(re.search)

Help on function search in module re:

search(pattern, string, flags=0)
    Scan through string looking for a match to the pattern, returning
    a match object, or None if no match was found.



In [6]:
"e" in 'Hello'

True

In [7]:
re.search(r'e', "hello")

<_sre.SRE_Match object; span=(1, 2), match='e'>

In [8]:
re.search(r'pattern', sentence)

<_sre.SRE_Match object; span=(16, 23), match='pattern'>

In [9]:
re.findall(r"pattern", sentence)

['pattern', 'pattern', 'pattern']

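# Matching Anything

A `.` matches any single character except a newline. To match a literal period, escape it with a backslash: `\.`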

In [10]:
re.search(r"oo.", sentence)

<_sre.SRE_Match object; span=(29, 32), match='oos'>

In [11]:
re.findall(r"oo.", sentence)

['oos', 'ook']

In [12]:
re.search(r".oo.", sentence)

<_sre.SRE_Match object; span=(28, 32), match='loos'>

In [13]:
re.search(r".oo..", sentence)

<_sre.SRE_Match object; span=(28, 33), match='loose'>

In [15]:
re.search(r"\.", sentence)

<_sre.SRE_Match object; span=(147, 148), match='.'>

In [16]:
print(re.findall(r"h", "Hello there! How may I help you?"))

['h', 'h']


In [17]:
print(re.findall(r"h", "Hello there! How may I help you?", re.IGNORECASE))

['H', 'h', 'H', 'h']


# What can I do with a match object?

In [18]:
match = re.search("pattern", sentence)

In [19]:
help(match)

Help on SRE_Match object:

class SRE_Match(builtins.object)
 |  The result of re.match() and re.search().
 |  Match objects always have a boolean value of True.
 |  
 |  Methods defined here:
 |  
 |  __copy__(self, /)
 |  
 |  __deepcopy__(self, /, memo)
 |  
 |  __repr__(self, /)
 |      Return repr(self).
 |  
 |  end(self, group=0, /)
 |      Return index of the end of the substring matched by group.
 |  
 |  expand(self, /, template)
 |      Return the string obtained by doing backslash substitution on the string template, as done by the sub() method.
 |  
 |  group(...)
 |      group([group1, ...]) -> str or tuple.
 |      Return subgroup(s) of the match by indices or names.
 |      For 0 returns the entire match.
 |  
 |  groupdict(self, /, default=None)
 |      Return a dictionary containing all the named subgroups of the match, keyed by the subgroup name.
 |      
 |      default
 |        Is used for groups that did not participate in the match.
 |  
 |  groups(self, /, defau

In [25]:
match.group()

'pattern'

# Start and end
^ matches the beginning of the string

$ matches the end

In [26]:
re.search(r"^A ", sentence)

<_sre.SRE_Match object; span=(0, 2), match='A '>

In [27]:
re.search(r"^way", sentence)

In [28]:
re.search(r".\.$", sentence)

<_sre.SRE_Match object; span=(146, 148), match='n.'>

In [29]:
my_hello = "How are you? I am fine. How are you?"
how = "How are you?"

In [30]:
re.search(r"How are you\?", my_hello)

<_sre.SRE_Match object; span=(0, 12), match='How are you?'>

In [31]:
re.search(r"How are you\?$", my_hello)

<_sre.SRE_Match object; span=(24, 36), match='How are you?'>

# Character class

Spelling out every character by hand gets pretty wordy. Luckily, we have something called character classes for commonly used groups of characters.

* \d matches digits.
* \D matches non-digits.
* \w matches "word characters": basically [a-zA-Z0-9_], plus all other valid Unicode characters that can be in words.
* \W matches non-word-characters.
* \s matches space characters -- [ \t\n\r\f\v].
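* \S matches non-space characters.

The examples below also use quantifiers, which say how many times the preceding item may repeat: `*` is zero or more, `+` is one or more, `?` is zero or one, `{m}` is exactly m, and `{m,n}` is between m and n (either bound may be left out).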

In [32]:
re.search(r'.oo....', sentence)

<_sre.SRE_Match object; span=(28, 35), match='loosely'>

In [38]:
re.findall(r'o+', sentence)

['o', 'oo', 'o', 'o', 'o', 'oo', 'o', 'o']

In [37]:
re.findall(r'oo*', sentence)

['o', 'oo', 'o', 'o', 'o', 'oo', 'o', 'o']

In [39]:
re.findall(r"ng?", sentence)

['n', 'ng', 'n', 'ng', 'n', 'n', 'n', 'n']

In [40]:
re.findall(r"sym?", sentence)

['sym']

In [41]:
re.findall(r"sym+", sentence)

['symm']

In [42]:
re.findall(r"oo*", "Sooooooooon")

['ooooooooo']

In [43]:
no_a = "b"
one_a = "ab"
lots_of_a = "aaaaaaaaaab"

In [44]:
print(re.search(r"a*b", no_a))
print(re.search(r"a*b", one_a))
print(re.search(r"a*b", lots_of_a))

<_sre.SRE_Match object; span=(0, 1), match='b'>
<_sre.SRE_Match object; span=(0, 2), match='ab'>
<_sre.SRE_Match object; span=(0, 11), match='aaaaaaaaaab'>


In [45]:
print(re.search(r"a+b", no_a))
print(re.search(r"a+b", one_a))
print(re.search(r"a+b", lots_of_a))

None
<_sre.SRE_Match object; span=(0, 2), match='ab'>
<_sre.SRE_Match object; span=(0, 11), match='aaaaaaaaaab'>


In [46]:
print(re.search(r"a{2}b", no_a))
print(re.search(r"a{2}b", one_a))
print(re.search(r"a{2}b", lots_of_a))

None
None
<_sre.SRE_Match object; span=(8, 11), match='aab'>


In [47]:
print(re.search(r"a{1,2}b", no_a))
print(re.search(r"a{1,2}b", one_a))
print(re.search(r"a{1,2}b", lots_of_a))

None
<_sre.SRE_Match object; span=(0, 2), match='ab'>
<_sre.SRE_Match object; span=(8, 11), match='aab'>


In [48]:
print(re.search(r"a{1,}b", no_a))
print(re.search(r"a{1,}b", one_a))
print(re.search(r"a{1,}b", lots_of_a))

None
<_sre.SRE_Match object; span=(0, 2), match='ab'>
<_sre.SRE_Match object; span=(0, 11), match='aaaaaaaaaab'>


In [49]:
print(re.search(r"a{,2}b", no_a))
print(re.search(r"a{,2}b", one_a))
print(re.search(r"a{,2}b", lots_of_a))

<_sre.SRE_Match object; span=(0, 1), match='b'>
<_sre.SRE_Match object; span=(0, 2), match='ab'>
<_sre.SRE_Match object; span=(8, 11), match='aab'>


# Matching sets of things
* [abz] will match on a, b, or z
* [A-Z] will match a range between A and Z
* [^A-Z] matches anything that isn't A to Z

In [50]:
re.findall(r" [A-Za-z]{3,5} ", sentence)

[' way ', ' the ', ' that ', ' looks ', ' the ', ' after ']

In [51]:
re.findall(r"[tze]", "The quick brown fox jumped over the lazy dog")

['e', 'e', 'e', 't', 'e', 'z']

In [52]:
re.findall(r"[0-9]+", "I ate 100 ghost peppers in 1998")

['100', '1998']

In [54]:
re.findall(r"[\.,;\?!]", sentence)

[',', ',', '.']

In [55]:
re.search(r"[0-9]{3}-[0-9]{3}-[0-9]{4}", '111-222-3333')

<_sre.SRE_Match object; span=(0, 12), match='111-222-3333'>

# Character Classes
* \d matches digits.
* \D matches non-digits.
* \w matches "word characters": basically [a-zA-Z0-9_], plus all other valid Unicode characters that can be in words.
* \W matches non-word-characters.
* \s matches space characters -- [ \t\n\r\f\v].
* \S matches non-space characters.

In [56]:
re.search(r"\d{3}-\d{3}-\d{4}", '111-222-3333')

<_sre.SRE_Match object; span=(0, 12), match='111-222-3333'>

In [57]:
re.findall(r"[^\w\s]", sentence)

[',', ',', '.']

In [None]:
re.findall(r"[\W\S]", sentence)

In [59]:
re.search(r"\D*", "My phone number is 111-222-3333")

<_sre.SRE_Match object; span=(0, 19), match='My phone number is '>

In [60]:
re.findall(r"\D*", "My phone number is 111-222-3333")

['My phone number is ', '', '', '', '-', '', '', '', '-', '', '', '', '', '']


There are a few odder ones:
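* \A matches only at the very beginning of the string. This is a lot like ^, but with re.MULTILINE ^ also matches at the start of every line, while \A does not.
* \Z matches only at the very end of the string. This is a lot like $, but with re.MULTILINE $ also matches at the end of every line, while \Z does not.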
* \b matches a word boundary. This means it matches an empty string at the beginning or end of a word.

In [61]:
re.findall(r"\b\w{3,5}\b", sentence)

['way', 'the', 'that', 'the', 'looks', 'the', 'same', 'after', 'the']

In [62]:
text = """This is a multi-line string.
It has newlines in it."""

In [63]:
print(re.findall(r"\w\.$", text, re.MULTILINE))
print(re.findall(r"\w\.\Z", text, re.MULTILINE))

['g.', 't.']
['t.']


In [64]:
possible_emails = ["clinton", "jeff.newburn@theironyard.com", "beanguy@example.org", 
                   "Email help@example.org for more information",
                   "terry@example.org", "@carmen", "what@what", "hi@example.org"]

In [65]:
email = "help@example.com"
re.search(r"\A\w+@\w+\.\w{2,3}\Z", email)

<_sre.SRE_Match object; span=(0, 16), match='help@example.com'>

In [66]:
email = "help@example"
re.search(r"\A\w+@\w+\.\w{2,3}\Z", email)

In [67]:
# Keep only the entries that are, from start to end, a simple address.
# The pattern is a raw string so \A, \w, \. and \Z reach the regex engine
# untouched instead of being read as (invalid) Python string escapes.
[possibility
    for possibility in possible_emails
    if re.search(r"\A\w+@\w+\.\w{2,3}\Z", possibility)]

['beanguy@example.org', 'terry@example.org', 'hi@example.org']


Note that a regex for emails is more complex than this. It's not that hard, though:

```[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?```

# Capturing Matches
We often want to capture part of a match for later use. You can use parentheses to mark part of your regex as something you will capture.

In [79]:
foo = "I live at 1112 S Casino Center, Las Vegas NV  89104"
bar = "I live at 1 Cupertino, Las Vegas NV  89104"

In [76]:
match = re.search("I live at (.*), Las Vegas", foo)

In [77]:
match.group()

'I live at 1112 S Casino Center, Las Vegas'

In [78]:
match.groups()

('1112 S Casino Center',)

In [80]:
match = re.search("I live at (.*), Las Vegas", bar)

In [81]:
match.groups()

('1 Cupertino',)

In [82]:
# Pull the city and the two-letter state out of each "City, ST" string;
# entries that do not have that shape are skipped.
possibilities = [
    "Decatur, GA",
    "Wilkesboro, NC",
    "Seattle",
    "Wichita Falls, TX",
    "DC",
]
city_state = re.compile(r"^([\w\s]+), ([A-Z]{2})")
for possibility in possibilities:
    match = city_state.search(possibility)
    if match is None:
        continue
    captured = match.groups()
    city, state = captured
    print(captured)
    print("City:{}|State:{}".format(city, state))


('Decatur', 'GA')
City:Decatur|State:GA
('Wilkesboro', 'NC')
City:Wilkesboro|State:NC
('Wichita Falls', 'TX')
City:Wichita Falls|State:TX


In [83]:
phone_nums = ["999-555-1212", "(703) 555-9999", "800.555.7341", "3145558286"]

In [100]:
# Rewrite every phone number as NNN-NNN-NNNN, whichever separators it used.
phone_nums = ["999-555-1212", "(703) 555-9999", "800.555.7341", "3145558286"]
phone_pattern = re.compile(r"\(?(\d{3})\)?[\.\-\s]?(\d{3})[\.\-]?(\d{4})")
cleaned = []
for raw_number in phone_nums:
    match = phone_pattern.search(raw_number)
    if match is None:
        continue
    # The three captured groups are area code, exchange and line number.
    cleaned.append("-".join(match.groups()))
print(cleaned)

['999-555-1212', '703-555-9999', '800-555-7341', '314-555-8286']


In [90]:
match = re.search(r"\(?(\d{3})\)?[-\. ]?(\d{3})[-\.]?(\d{4})", "3145558286")

In [92]:
# groups() returns every captured group as a tuple. Its only argument is the
# default used for groups that did not take part in the match, so nothing
# needs to be passed here.
match.groups()

('314', '555', '8286')

In [93]:
print("{}-{}-{}".format(match.groups()[0], match.groups()[1], match.groups()[2]))

314-555-8286


# Non-capturing group

Use (?:...) to make a group but not capture it.

In [96]:
match = re.search(r"(?:\(?(\d{3})\)?[-\. ]?)?(\d{3})[-\.]?(\d{4})", "5558286")

In [97]:
match.groups()

(None, '555', '8286')

In [99]:
phone_num_with_possible_area_code = r"(?:\(?(\d{3})\)?[\-\.]?\s*)?(\d{3})[\-\.]?(\d{4})"
match = re.search(phone_num_with_possible_area_code, "(702) 111-2222")
print(match.groups())

('702', '111', '2222')


In [98]:
bar = "hello.  How are you?"
# The optional greeting sits in a non-capturing group, so only the rest of
# the line is captured. The pattern is a raw string so that \s reaches the
# regex engine instead of being read as an invalid Python string escape.
match = re.search(r"(?:hello.\s*)?(.*)$", bar)
match.groups()

('How are you?',)

# Scratching the surface
This is just the beginning with regular expressions. You can go really deep down this hole.
* [Python regex docs](https://docs.python.org/3/library/re.html)
* [Regexr](http://www.regexr.com/)
* [Regex One](http://regexone.com/)
* [Regular-Expressions.info](http://www.regular-expressions.info/)