Regular expressions (regexes) are, when you understand them, one of the most fun things you can work with in programming. They are a mini-language for matching text.

The first thing to know is that non-special characters match themselves in text.

In [1]:
import re
help(re.search)

Help on function search in module re:

search(pattern, string, flags=0)
    Scan through string looking for a match to the pattern, returning
    a match object, or None if no match was found.



In [2]:
re.search(r"e", "hello")

<_sre.SRE_Match object; span=(1, 2), match='e'>

In [3]:
# search returns only the first match: the "l" at index 2, not the one at index 3.
re.search(r"l", "hello")

<_sre.SRE_Match object; span=(2, 3), match='l'>

In [4]:
sentence = ("A symmetry of a pattern is, loosely speaking, a way of transforming "
            "the pattern so that the pattern looks exactly the same after the "
            "transformation.")

In [5]:
re.search(r"pattern", sentence)

<_sre.SRE_Match object; span=(16, 23), match='pattern'>

In [6]:
# findall returns every non-overlapping match as a list of strings, not just the first one.
re.findall(r"pattern", sentence)

['pattern', 'pattern', 'pattern']

In [7]:
# Matches are plain substrings, so "at" is also found inside longer words such as "pattern" and "that".
re.findall(r"at", sentence)

['at', 'at', 'at', 'at', 'at']

# Matching Anything
The . (period) character matches any single character (except a newline). We can use this to find strings that match wildcards, like "a double-o followed by any character."

In [8]:
re.search(r"oo.", sentence)

<_sre.SRE_Match object; span=(29, 32), match='oos'>

See how the match is "oos".

In [9]:
re.search(r".oo.", sentence)

<_sre.SRE_Match object; span=(28, 32), match='loos'>

In [11]:
re.search(r".oo..", sentence)

<_sre.SRE_Match object; span=(28, 33), match='loose'>

In [12]:
# The backslash escapes the period, so it matches a literal "." instead of any character.
re.search(r"\.", sentence)

<_sre.SRE_Match object; span=(147, 148), match='.'>

In [14]:
# No match: re.search returns None, so the notebook shows no output for this cell.
re.search(r"xjeiorajgfioeajthi", sentence)

In [15]:
# Matching is case-sensitive by default, so only the lowercase "h"s are found.
print(re.findall(r"h", "Hello there! How may I help you?"))
# The re.IGNORECASE flag makes "h" match "H" as well.
print(re.findall(r"h", "Hello there! How may I help you?", re.IGNORECASE))

['h', 'h']
['H', 'h', 'H', 'h']


# What can I do with a match object?

In [16]:
match = re.search("pattern", sentence)
help(match)

Help on SRE_Match object:

class SRE_Match(builtins.object)
 |  The result of re.match() and re.search().
 |  Match objects always have a boolean value of True.
 |  
 |  Methods defined here:
 |  
 |  __copy__(...)
 |  
 |  __deepcopy__(...)
 |  
 |  __repr__(self, /)
 |      Return repr(self).
 |  
 |  end(...)
 |      end([group=0]) -> int.
 |      Return index of the end of the substring matched by group.
 |  
 |  expand(...)
 |      expand(template) -> str.
 |      Return the string obtained by doing backslash substitution
 |      on the string template, as done by the sub() method.
 |  
 |  group(...)
 |      group([group1, ...]) -> str or tuple.
 |      Return subgroup(s) of the match by indices or names.
 |      For 0 returns the entire match.
 |  
 |  groupdict(...)
 |      groupdict([default=None]) -> dict.
 |      Return a dictionary containing all the named subgroups of the match,
 |      keyed by the subgroup name. The default argument is used for groups
 |      that did no

In [17]:
import antigravity

# Start and end matches
You often want to match something if and only if it is at the beginning or end of a string.

^ matches the beginning of a string.

$ matches the end of a string.

In [18]:
re.search(r"^A ", sentence)

<_sre.SRE_Match object; span=(0, 2), match='A '>

In [19]:
# No match: "pattern" is in the sentence, but ^ only allows it at the very start.
re.search(r"^pattern", sentence)

In [20]:
# Careful: the unescaped . matches any character, not just a period.
re.search(r"n.$", sentence)

<_sre.SRE_Match object; span=(146, 148), match='n.'>

In [47]:
re.search(r"n\.$", sentence)

<_sre.SRE_Match object; span=(146, 148), match='n.'>

In [48]:
# No match: the string ends in "ng" and the escaped \. only accepts a literal period.
re.search(r"n\.$", "I like singing")

# Matching multiples
Often, you want to match something repeated a certain number of times. Whether it's 0 or more, 1 or more, 0 or 1, or something else, we've got you covered.
* \* matches 0 or more.
* \+ matches 1 or more.
* ? matches 0 or 1.
* {n} matches n repetitions.
* {m,n} matches m to n repetitions. You can leave out m or n to match 0 to n, or m to infinity.

In [21]:
re.findall(r"o+", sentence)

['o', 'oo', 'o', 'o', 'o', 'oo', 'o', 'o']

In [22]:
# "oo*" is one "o" followed by zero or more further "o"s -- the same as "o+".
re.findall(r"oo*", sentence)

['o', 'oo', 'o', 'o', 'o', 'oo', 'o', 'o']

In [23]:
re.findall(r"oo*", "Sooooooooon")

['ooooooooo']

In [24]:
# ? makes the "g" optional, so both "n " and "ng " match.
re.findall(r"ng? ", sentence)

['n ', 'ng ', 'n ', 'n ']

In [25]:
no_a = "b"
one_a = "ab"
lots_of_a = "aaaaaaaaaab"

In [27]:
# * allows zero repetitions, so even the string without any "a" matches.
print(re.search(r"a*b", no_a))
print(re.search(r"a*b", one_a))
print(re.search(r"a*b", lots_of_a))

<_sre.SRE_Match object; span=(0, 1), match='b'>
<_sre.SRE_Match object; span=(0, 2), match='ab'>
<_sre.SRE_Match object; span=(0, 11), match='aaaaaaaaaab'>


In [28]:
# + needs at least one "a", so the bare "b" does not match (None).
print(re.search(r"a+b", no_a))
print(re.search(r"a+b", one_a))
print(re.search(r"a+b", lots_of_a))

None
<_sre.SRE_Match object; span=(0, 2), match='ab'>
<_sre.SRE_Match object; span=(0, 11), match='aaaaaaaaaab'>


In [29]:
# {2} needs two "a"s immediately followed by "b"; in the long string that
# only happens at the end, so the match starts at index 8.
print(re.search(r"a{2}b", no_a))
print(re.search(r"a{2}b", one_a))
print(re.search(r"a{2}b", lots_of_a))

None
None
<_sre.SRE_Match object; span=(8, 11), match='aab'>


In [30]:
# {1,2} accepts one or two "a"s before the "b"; at most two are taken,
# so the long string yields only "aab".
print(re.search(r"a{1,2}b", no_a))
print(re.search(r"a{1,2}b", one_a))
print(re.search(r"a{1,2}b", lots_of_a))

None
<_sre.SRE_Match object; span=(0, 2), match='ab'>
<_sre.SRE_Match object; span=(8, 11), match='aab'>


In [31]:
# {1,} has no upper bound -- it gives the same results as + here.
print(re.search(r"a{1,}b", no_a))
print(re.search(r"a{1,}b", one_a))
print(re.search(r"a{1,}b", lots_of_a))

None
<_sre.SRE_Match object; span=(0, 2), match='ab'>
<_sre.SRE_Match object; span=(0, 11), match='aaaaaaaaaab'>


In [32]:
# {,2} means zero to two "a"s, so the bare "b" matches as well.
print(re.search(r"a{,2}b", no_a))
print(re.search(r"a{,2}b", one_a))
print(re.search(r"a{,2}b", lots_of_a))

<_sre.SRE_Match object; span=(0, 1), match='b'>
<_sre.SRE_Match object; span=(0, 2), match='ab'>
<_sre.SRE_Match object; span=(8, 11), match='aab'>


In [49]:
# {2} repeats the whole parenthesised group: "ab" followed by "aaaab".
re.search(r"(a+b){2}", "abaaaabaab")

<_sre.SRE_Match object; span=(0, 7), match='abaaaab'>

# Matching sets of things
All the above is good, but not that useful by itself. Being able to match any one of a set of characters is super-useful.

We use square brackets to do this.

* [abz] will match an a, b, or z.
* [A-Z] matches a range of letters from A to Z.
* [^A-Z] matches anything that isn't A to Z.

In [34]:
# Get words that are 3 to 5 letters long.
# Caveat: each match consumes the spaces around it, so a word that directly
# follows a match is missed (e.g. "same" after " the ").
re.findall(r" [A-Za-z]{3,5} ", sentence)

[' way ', ' the ', ' that ', ' looks ', ' the ', ' after ']

In [35]:
# Find the first number in a string
re.search(r"[0-9]+", "I ate 130 ghost peppers")

<_sre.SRE_Match object; span=(6, 9), match='130'>

In [36]:
# Find the first punctuation mark (re.search stops at the first match)
re.search(r"[\.,;?!]", sentence)

<_sre.SRE_Match object; span=(26, 27), match=','>

In [50]:
# Or find all punctuation: every character that is not a letter, digit or space
re.findall(r"[^A-Za-z0-9 ]", sentence)

[',', ',', '.']

In [51]:
# Find a phone number
re.search(r"[0-9]{3}-[0-9]{3}-[0-9]{4}", "My phone number is 919-555-1212.")

<_sre.SRE_Match object; span=(19, 31), match='919-555-1212'>

# Character classes

That last match was pretty wordy. Luckily, we have something called character classes for commonly used groups of characters.

* \d matches digits.
* \D matches non-digits.
* \w matches "word characters": basically [a-zA-Z0-9_], plus all other valid Unicode characters that can be in words.
* \W matches non-word-characters.
* \s matches space characters -- [ \t\n\r\f\v].
* \S matches non-space characters.

In [37]:
# Find a phone number
re.search(r"\d{3}-\d{3}-\d{4}", "My phone number is 702-555-1212")

<_sre.SRE_Match object; span=(19, 31), match='702-555-1212'>

In [38]:
# Find all punctuation
re.findall(r"[^\w\s]", sentence)

[',', ',', '.']

In [39]:
# Grab the run of non-digits at the start of the string, i.e. everything before the phone number
re.search(r"\D*", "My phone number is 702-555-1212")

<_sre.SRE_Match object; span=(0, 19), match='My phone number is '>

In [52]:
# Find all punctuation -- why doesn't this work?
# Answer: a set is a union, so [\W\S] accepts anything that is a non-word
# character OR a non-space character -- and every character is at least one of those.
re.findall(r"[\W\S]", sentence)

['A',
 ' ',
 's',
 'y',
 'm',
 'm',
 'e',
 't',
 'r',
 'y',
 ' ',
 'o',
 'f',
 ' ',
 'a',
 ' ',
 'p',
 'a',
 't',
 't',
 'e',
 'r',
 'n',
 ' ',
 'i',
 's',
 ',',
 ' ',
 'l',
 'o',
 'o',
 's',
 'e',
 'l',
 'y',
 ' ',
 's',
 'p',
 'e',
 'a',
 'k',
 'i',
 'n',
 'g',
 ',',
 ' ',
 'a',
 ' ',
 'w',
 'a',
 'y',
 ' ',
 'o',
 'f',
 ' ',
 't',
 'r',
 'a',
 'n',
 's',
 'f',
 'o',
 'r',
 'm',
 'i',
 'n',
 'g',
 ' ',
 't',
 'h',
 'e',
 ' ',
 'p',
 'a',
 't',
 't',
 'e',
 'r',
 'n',
 ' ',
 's',
 'o',
 ' ',
 't',
 'h',
 'a',
 't',
 ' ',
 't',
 'h',
 'e',
 ' ',
 'p',
 'a',
 't',
 't',
 'e',
 'r',
 'n',
 ' ',
 'l',
 'o',
 'o',
 'k',
 's',
 ' ',
 'e',
 'x',
 'a',
 'c',
 't',
 'l',
 'y',
 ' ',
 't',
 'h',
 'e',
 ' ',
 's',
 'a',
 'm',
 'e',
 ' ',
 'a',
 'f',
 't',
 'e',
 'r',
 ' ',
 't',
 'h',
 'e',
 ' ',
 't',
 'r',
 'a',
 'n',
 's',
 'f',
 'o',
 'r',
 'm',
 'a',
 't',
 'i',
 'o',
 'n',
 '.']

In [41]:
re.findall(r"\S", sentence)

['A',
 's',
 'y',
 'm',
 'm',
 'e',
 't',
 'r',
 'y',
 'o',
 'f',
 'a',
 'p',
 'a',
 't',
 't',
 'e',
 'r',
 'n',
 'i',
 's',
 ',',
 'l',
 'o',
 'o',
 's',
 'e',
 'l',
 'y',
 's',
 'p',
 'e',
 'a',
 'k',
 'i',
 'n',
 'g',
 ',',
 'a',
 'w',
 'a',
 'y',
 'o',
 'f',
 't',
 'r',
 'a',
 'n',
 's',
 'f',
 'o',
 'r',
 'm',
 'i',
 'n',
 'g',
 't',
 'h',
 'e',
 'p',
 'a',
 't',
 't',
 'e',
 'r',
 'n',
 's',
 'o',
 't',
 'h',
 'a',
 't',
 't',
 'h',
 'e',
 'p',
 'a',
 't',
 't',
 'e',
 'r',
 'n',
 'l',
 'o',
 'o',
 'k',
 's',
 'e',
 'x',
 'a',
 'c',
 't',
 'l',
 'y',
 't',
 'h',
 'e',
 's',
 'a',
 'm',
 'e',
 'a',
 'f',
 't',
 'e',
 'r',
 't',
 'h',
 'e',
 't',
 'r',
 'a',
 'n',
 's',
 'f',
 'o',
 'r',
 'm',
 'a',
 't',
 'i',
 'o',
 'n',
 '.']

There are a few odder ones:
* \A matches the beginning of the string. This is a lot like ^, but different for multi-line strings.
* \Z matches the end of the string. This is a lot like $, but different for multi-line strings.
* \b matches a word boundary. This means it matches an empty string at the beginning or end of a word.

In [53]:
# Get words three to five letters long.
# \b is zero-width, so no surrounding characters are consumed and words that
# directly follow one another are all found.
re.findall(r"\b\w{3,5}\b", sentence)

['way', 'the', 'that', 'the', 'looks', 'the', 'same', 'after', 'the']

In [54]:
text = """This is a multi-line string.
It has newlines in it."""

# With re.MULTILINE, $ matches at the end of every line, so both sentence endings are found.
print(re.findall(r"\w\.$", text, re.MULTILINE))
# \Z matches only at the very end of the whole string, even with re.MULTILINE.
print(re.findall(r"\w\.\Z", text, re.MULTILINE))

['g.', 't.']
['t.']


In [56]:
# Pick out the strings that consist of exactly one (very simple) email address:
# word characters, "@", word characters, ".", and a 2-3 character suffix.
# The pattern is a raw string so that \A, \w, \. and \Z reach the regex engine
# untouched; in a normal string literal they are invalid escape sequences,
# which newer Python versions warn about.
possible_emails = ["clinton", "jeff.newburn@theironyard.com", "beanguy@example.org",
                   "Email help@example.org for more information",
                   "terry@example.org", "@carmen", "what@what", "hi@example.org"]
valid_emails = [possibility
                for possibility in possible_emails
                if re.search(r"\A\w+@\w+\.\w{2,3}\Z", possibility)]
valid_emails

['beanguy@example.org', 'terry@example.org', 'hi@example.org']

Note that a regex for emails is more complex than this. It's not that hard, though:

```[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?```

# Capturing Matches
We often want to capture part of a match for later use. You can use parentheses to mark part of your regex as something you will capture.

In [43]:
foo = "I live at 1112 S Casino Center, Las Vegas NV 90104"
# The parentheses capture whatever lies between "I live at " and ", Las Vegas".
match = re.search("I live at (.*), Las Vegas", foo)
# groups() returns the captured text as a tuple, one item per group.
match.groups()

('1112 S Casino Center',)

In [57]:
# Split "City, ST" strings into their parts. Entries that lack a
# ", <two capital letters>" part do not match and are skipped silently.
possibilities = ["Decatur, GA", "Wilkesboro, NC", "Seattle", "Wichita Falls, TX", "DC"]
for possibility in possibilities:
    match = re.search(r"^([\w\s]+), ([A-Z]{2})", possibility)
    if match is None:
        continue
    # Group 1 is the city name, group 2 the two-letter state code.
    city = match.group(1)
    state = match.group(2)
    print("City:", city, "| State:", state)

City: Decatur | State: GA
City: Wilkesboro | State: NC
City: Wichita Falls | State: TX


In [45]:
# Normalise differently formatted ten-digit phone numbers to NNN-NNN-NNNN.
phone_nums = ["999-555-1212", "(703) 555-9999", "800.555.7341", "3145558286"]
cleaned = []
for num in phone_nums:
    match = re.search(r"\(?(\d{3})\)?[\-\.]?\s*(\d{3})[\-\.]?(\d{4})", num)
    # All three groups are mandatory, so each one is a string of digits.
    area_code, exchange, line_number = match.groups()
    cleaned.append("-".join((area_code, exchange, line_number)))
cleaned

['999-555-1212', '703-555-9999', '800-555-7341', '314-555-8286']

# Non-capturing group
Use (?:...) to make a group but not capture it.

In [58]:
# The area-code part sits inside an optional non-capturing group, so
# seven-digit numbers match too.
phone_num_with_possible_area_code = r"(?:\(?(\d{3})\)?[\-\.]?\s*)?(\d{3})[\-\.]?(\d{4})"
phone_nums = ["999-555-1212", "(703) 555-9999", "800.555.7341", "3145558286", "555-1212"]
cleaned = []
for num in phone_nums:
    match = re.search(phone_num_with_possible_area_code, num)
    # When the area code is absent its group is None, which ends up in the
    # result as the text "None".
    cleaned.append("-".join(str(part) for part in match.groups()))
print(cleaned)

['999-555-1212', '703-555-9999', '800-555-7341', '314-555-8286', 'None-555-1212']


In [46]:
# Show the raw capture groups for each number: the first group (area code)
# is None when the optional area-code part did not take part in the match.
phone_num_with_possible_area_code = r"(?:\(?(\d{3})\)?[\-\.]?\s*)?(\d{3})[\-\.]?(\d{4})"
phone_nums = ["999-555-1212", "(703) 555-9999", "800.555.7341", "3145558286", "555-1212"]
for num in phone_nums:
    match = re.search(phone_num_with_possible_area_code, num)
    captured = match.groups()
    print(captured)

('999', '555', '1212')
('703', '555', '9999')
('800', '555', '7341')
('314', '555', '8286')
(None, '555', '1212')


In [59]:
# The non-capturing group makes + repeat the whole "ab" pair; search returns the first such run.
re.search(r"(?:ab)+", "ccccababababcccccab")

<_sre.SRE_Match object; span=(4, 12), match='abababab'>

# Scratching the surface
This is just the beginning with regular expressions. You can go really deep down this hole.
* [Python regex docs](https://docs.python.org/3/library/re.html)
* [Regexr](http://www.regexr.com/)
* [Regex One](http://regexone.com/)
* [Regular-Expressions.info](http://www.regular-expressions.info/)