In [1]:
import re

## regular expressions themselves

### what is a regular expression?

A regular expression is a formal description of a set of strings and a well-known collection of rules to know if any given string is *in* that set.  If a string is in the set, the regular expression is said to *match* that string.  The formal description is often referred to as the *pattern*.  The `re` module has a function to test if a regular expression matches a string: `re.match(pattern, string)`.

This document is about regular expressions (sometimes called regexes) and the `re` module from Python 3.7.  Most of it applies to earlier versions of Python but your mileage may vary.

In [2]:
from termcolor import colored

def demo(pattern, match_against, fn=None, show_location=False):
    """Try one pattern against several strings and print a colorized report.

    The pattern is printed first (bold blue on yellow).  Then, for each
    candidate string, one line says whether it matched: a match is printed
    in green with the matched span in bold, a failure is printed in red.
    A blank line ends the report.

    pattern        -- the regular expression; it is passed through re.compile
    match_against  -- a list of strings to test; a single string also works
    fn             -- the matching function, called as
                      fn(pattern=..., string=...); defaults to re.match.
                      When given, its name is printed and show_location is
                      forced on
    show_location  -- if true, also print the start-end indices of each match
    """
    regex = re.compile(pattern)  # not strictly required, but it's good form
    candidates = [match_against] if isinstance(match_against, str) else match_against

    if fn is not None:
        # An explicit function is part of the lesson: name it and show where it matched.
        show_location = True
        print(f'using re.{fn.__name__}, the pattern ', end='')
    matcher = re.match if fn is None else fn

    print(colored(f'{pattern}', 'blue', 'on_yellow', attrs=['bold']))
    for candidate in candidates:
        found = matcher(pattern=regex, string=candidate)
        if not found:
            print(f'  does *not* match ', end='')
            print(colored(f'{candidate}', 'red'))
            continue

        start, stop = found.span()
        where = f'at {start}-{stop} of ' if show_location else ''
        print(f'  matches {where}', end='')

        # three segments: text before the match, the match itself (bold), text after
        print(colored(f'{candidate[:start]}', 'green'), end='')
        print(colored(f'{candidate[start:stop]}', 'green', attrs=['bold']), end='')
        print(colored(f'{candidate[stop:]}', 'green'))
    print()

As we progress, the patterns will look more and more complicated.  Don't be intimidated!

In [3]:
# Ordinary characters such as 'c', 'a', and 't' simply match themselves.
# The pattern is written as a raw string (r'...'); make that a habit for yourself.
if re.match(r'cat', 'cat') is not None:
    print('The pattern "cat" matches the string "cat"')

demo(r'cat', 'dog')

The pattern "cat" matches the string "cat"
[1m[43m[34mcat[0m
  does *not* match [31mdog[0m



`re.match` actually only tests that the string *starts* with a match.  Later we'll talk about `re.search` and `re.fullmatch`.

In [4]:
# 'catch' is longer than the pattern, but it *starts* with 'cat', so re.match succeeds
demo(pattern=r'cat', match_against='catch')

[1m[43m[34mcat[0m
  matches [32m[0m[1m[32mcat[0m[32mch[0m



#### character sets 

In [5]:
# Using a simple set: [aou] matches exactly one character, which must be 'a', 'o', or 'u'
# ('city' fails because 'i' is not in the set)
demo(pattern=r'c[aou]t', match_against=['cat', 'cote', 'cute', 'city'])

[1m[43m[34mc[aou]t[0m
  matches [32m[0m[1m[32mcat[0m[32m[0m
  matches [32m[0m[1m[32mcot[0m[32me[0m
  matches [32m[0m[1m[32mcut[0m[32me[0m
  does *not* match [31mcity[0m



In [6]:
# Inverting a set: [^y] matches any single character *except* 'y'
# (so the second 'c' in 'cctv' is accepted, while the 'y' in 'cytoplasm' is not)
demo(r'c[^y]t', ['cat', 'cote', 'cytoplasm', 'cctv'])

[1m[43m[34mc[^y]t[0m
  matches [32m[0m[1m[32mcat[0m[32m[0m
  matches [32m[0m[1m[32mcot[0m[32me[0m
  does *not* match [31mcytoplasm[0m
  matches [32m[0m[1m[32mcct[0m[32mv[0m



In [7]:
# Using ranges: [a-f0-9] matches one character from 'a' through 'f' or '0' through '9'
# ('q' is outside both ranges, so '0xq' fails)
demo(r'0x[a-f0-9]', ['0x2', '0xc', '0xq'])

[1m[43m[34m0x[a-f0-9][0m
  matches [32m[0m[1m[32m0x2[0m[32m[0m
  matches [32m[0m[1m[32m0xc[0m[32m[0m
  does *not* match [31m0xq[0m



In [8]:
# Using shortcuts: \d == [0-9] (at least for ASCII); . == [^\n] (any character except a newline)
# The two patterns differ only in the next-to-last position: \d rejects the 'q' in '555-12q2', . accepts it
demo(r'\d\d\d-\d\d\d\d', ['555-12q2', '411', '867-5309'])
demo(r'\d\d\d-\d\d.\d', ['555-12q2', '411', '867-5309'])

[1m[43m[34m\d\d\d-\d\d\d\d[0m
  does *not* match [31m555-12q2[0m
  does *not* match [31m411[0m
  matches [32m[0m[1m[32m867-5309[0m[32m[0m

[1m[43m[34m\d\d\d-\d\d.\d[0m
  matches [32m[0m[1m[32m555-12q2[0m[32m[0m
  does *not* match [31m411[0m
  matches [32m[0m[1m[32m867-5309[0m[32m[0m



#### repetition

In [9]:
# {n} repeats the preceding item exactly n times; {min,max} repeats it between min and max times
# note that \w == [a-zA-Z0-9_] (at least for ASCII)
demo(r'\d{3}-\d{4}', ['555-12q2', '411', '867-5309'])
demo(r'=\w{3,5}=', ['=cat=', '=catch=', '=dogs=', '=catchy=', '=my='])

[1m[43m[34m\d{3}-\d{4}[0m
  does *not* match [31m555-12q2[0m
  does *not* match [31m411[0m
  matches [32m[0m[1m[32m867-5309[0m[32m[0m

[1m[43m[34m=\w{3,5}=[0m
  matches [32m[0m[1m[32m=cat=[0m[32m[0m
  matches [32m[0m[1m[32m=catch=[0m[32m[0m
  matches [32m[0m[1m[32m=dogs=[0m[32m[0m
  does *not* match [31m=catchy=[0m
  does *not* match [31m=my=[0m



In [10]:
# ?, +, *, Note that \s is whitespace
# ? == {0,1}, + == {1,}, * == {0,}
# In '12345-abc' only '12345-' is matched: the trailing \d* is satisfied by zero digits
demo(r'\d+\s*-?\s*\d*', ['555-\t1212', '411', '12345-abc'])

[1m[43m[34m\d+\s*-?\s*\d*[0m
  matches [32m[0m[1m[32m555-	1212[0m[32m[0m
  matches [32m[0m[1m[32m411[0m[32m[0m
  matches [32m[0m[1m[32m12345-[0m[32mabc[0m



#### alternation

In [11]:
# | binds more loosely than every other operator, so cat|dog does what it looks like:
# it means 'cat' or 'dog', not 'ca' followed by 't' or 'd' followed by 'og'
demo(r'cat|dog', ['catchy', 'dog-lover', 'apple pie'])

[1m[43m[34mcat|dog[0m
  matches [32m[0m[1m[32mcat[0m[32mchy[0m
  matches [32m[0m[1m[32mdog[0m[32m-lover[0m
  does *not* match [31mapple pie[0m



#### groups

In [12]:
# the default kind of group, capturing, is specified with simple parens
# here the ? after the group makes the whole '-dddd' suffix optional
demo(r'\d{3}(-\d{4})?', ['555-1212', '411'])

# When you've captured a group, you can refer back to it with a backslash and its number:
# \1 matches the same text that group 1 matched (so 'abc def' fails, 'abc abc' succeeds)
demo(r'(\w+) \1', ['abc def', 'abc abc'])
# group 1 captures the opening quote; \1 then requires the same kind of closing quote
demo(r'href=([\'\"]).*?\1', ['href="hello"', 'href=\'goodbye\'', 'href=\'abc"'])

[1m[43m[34m\d{3}(-\d{4})?[0m
  matches [32m[0m[1m[32m555-1212[0m[32m[0m
  matches [32m[0m[1m[32m411[0m[32m[0m

[1m[43m[34m(\w+) \1[0m
  does *not* match [31mabc def[0m
  matches [32m[0m[1m[32mabc abc[0m[32m[0m

[1m[43m[34mhref=([\'\"]).*?\1[0m
  matches [32m[0m[1m[32mhref="hello"[0m[32m[0m
  matches [32m[0m[1m[32mhref='goodbye'[0m[32m[0m
  does *not* match [31mhref='abc"[0m



In [13]:
# special groups start with a '?' just inside the left paren
# 'x' is the verbose flag, so (?x) is a flag-setting group that turns verbose mode on
# verbose mode lets me break apart the regex, even onto multiple lines:
# the spaces and the '#' comments inside the pattern below are not part of what gets matched
demo(r'''(?x)  # verbose mode
    \d{3}          # prefix
    ( - \d{4} )?   # optional''', ['555-1212', '411'])

[1m[43m[34m(?x)  # verbose mode
    \d{3}          # prefix
    ( - \d{4} )?   # optional[0m
  matches [32m[0m[1m[32m555-1212[0m[32m[0m
  matches [32m[0m[1m[32m411[0m[32m[0m



In [14]:
# setting flags for the whole pattern, or for just a portion of the pattern
# the 'i' flag makes the pattern case-insensitive
# (?i) applies to everything; (?i:...) applies only inside the group, so XYZ stays case-sensitive
demo(r'(?i)abcXYZ', ['abcxyz', 'AbCXYz'])
demo(r'(?i:abc)XYZ', ['abcxyz', 'abcXYZ', 'ABCXYZ'])

[1m[43m[34m(?i)abcXYZ[0m
  matches [32m[0m[1m[32mabcxyz[0m[32m[0m
  matches [32m[0m[1m[32mAbCXYz[0m[32m[0m

[1m[43m[34m(?i:abc)XYZ[0m
  does *not* match [31mabcxyz[0m
  matches [32m[0m[1m[32mabcXYZ[0m[32m[0m
  matches [32m[0m[1m[32mABCXYZ[0m[32m[0m



In [15]:
# assertions: positive and negative look-ahead and look-behind (specialized groups)
#   (?=...)  positive look-ahead      (?!...)  negative look-ahead
#   (?<=...) positive look-behind     (?<!...) negative look-behind
# The asserted text is not part of the match itself (note the reported spans).
# The look-behind demos use `re.search` because '-physics' does not start the string:
# `re.match`: the pattern must match at the beginning of the string;
# `re.search`: the pattern can match anywhere inside the string
demo(r'Isaac(?=\s+Asimov)', ['Isaac Newton', 'Isaac Asimov'], show_location=True)
demo(r'Isaac(?!\s+Newton)', ['Isaac Newton', 'Isaac Asimov'], show_location=True)
demo(r'(?<!particle)-physics', ['nuclear-physics', 'particle-physics'], fn=re.search)
demo(r'(?<=nuclear)-physics', ['nuclear-physics', 'particle-physics'], fn=re.search)

[1m[43m[34mIsaac(?=\s+Asimov)[0m
  does *not* match [31mIsaac Newton[0m
  matches at 0-5 of [32m[0m[1m[32mIsaac[0m[32m Asimov[0m

[1m[43m[34mIsaac(?!\s+Newton)[0m
  does *not* match [31mIsaac Newton[0m
  matches at 0-5 of [32m[0m[1m[32mIsaac[0m[32m Asimov[0m

using re.search, the pattern [1m[43m[34m(?<!particle)-physics[0m
  matches at 7-15 of [32mnuclear[0m[1m[32m-physics[0m[32m[0m
  does *not* match [31mparticle-physics[0m

using re.search, the pattern [1m[43m[34m(?<=nuclear)-physics[0m
  matches at 7-15 of [32mnuclear[0m[1m[32m-physics[0m[32m[0m
  does *not* match [31mparticle-physics[0m



#### special matches

Look-ahead and look-behind are a special form of match instruction called an 'assertion'.  They match zero characters, but still influence where the actual match can happen.  Here are a couple of other zero-length assertions:

In [16]:
# \b asserts the beginning or end of a word; \B asserts that we are *not* at one (both zero-length)
demo(r'cat\b', ['cat', 'catchy', 'cat-call'], show_location=True)
demo(r'cat\B', ['cat', 'catchy', 'cat-call'], show_location=True)

# ^ asserts the beginning of the string, $ the end of the string (both zero-length)
# `re.search` is used so that only the anchors decide where the match may occur
demo(r'^abc', ['abcb', 'bbabc'], fn=re.search)
demo(r'abc$', ['abcb', 'bbabc'], fn=re.search)

[1m[43m[34mcat\b[0m
  matches at 0-3 of [32m[0m[1m[32mcat[0m[32m[0m
  does *not* match [31mcatchy[0m
  matches at 0-3 of [32m[0m[1m[32mcat[0m[32m-call[0m

[1m[43m[34mcat\B[0m
  does *not* match [31mcat[0m
  matches at 0-3 of [32m[0m[1m[32mcat[0m[32mchy[0m
  does *not* match [31mcat-call[0m

using re.search, the pattern [1m[43m[34m^abc[0m
  matches at 0-3 of [32m[0m[1m[32mabc[0m[32mb[0m
  does *not* match [31mbbabc[0m

using re.search, the pattern [1m[43m[34mabc$[0m
  does *not* match [31mabcb[0m
  matches at 2-5 of [32mbb[0m[1m[32mabc[0m[32m[0m



## the `re` module

We've already discussed pattern strings and the strings against which we are matching.  Let's talk about the other nouns in the `re` world: compiled patterns, match objects, and the `re` module itself.  To get a compiled pattern, call `re.compile(pattern_string, flags=0)`.  A compiled pattern has method calls that are mirrored in the `re` module:

* `match`, `search`, `fullmatch`
* `split`
* `findall`, `finditer`
* `sub`, `subn`

The `re` versions take a compiled pattern *or* a pattern-string; the compiled pattern versions take (optionally) string indices.  So for example:

`re.match(pattern, string, flags=0)` [doc](https://docs.python.org/3.7/library/re.html#re.match) vs.<br/>
`pattern.match(string[, pos[, endpos]])` [doc](https://docs.python.org/3.7/library/re.html#re.Pattern.match)

In [17]:
# `match` vs. `search` vs. `fullmatch`
# `match` and `search` you know; `fullmatch` must match the entire string
# Same pattern and same strings each time; only the matching function changes
demo(r'cat', ['catchy', 'cat', 'alley-cat'], fn=re.match)
demo(r'cat', ['catchy', 'cat', 'alley-cat'], fn=re.search)
demo(r'cat', ['catchy', 'cat', 'alley-cat'], fn=re.fullmatch)

using re.match, the pattern [1m[43m[34mcat[0m
  matches at 0-3 of [32m[0m[1m[32mcat[0m[32mchy[0m
  matches at 0-3 of [32m[0m[1m[32mcat[0m[32m[0m
  does *not* match [31malley-cat[0m

using re.search, the pattern [1m[43m[34mcat[0m
  matches at 0-3 of [32m[0m[1m[32mcat[0m[32mchy[0m
  matches at 0-3 of [32m[0m[1m[32mcat[0m[32m[0m
  matches at 6-9 of [32malley-[0m[1m[32mcat[0m[32m[0m

using re.fullmatch, the pattern [1m[43m[34mcat[0m
  does *not* match [31mcatchy[0m
  matches at 0-3 of [32m[0m[1m[32mcat[0m[32m[0m
  does *not* match [31malley-cat[0m



#### match objects

Matching functions will return a match object (success) or `None` (failure).  A simple `if` is all you need to test for success.  Save the match object to access some details about the match.  For instance, remember we talked about "capturing" groups.  This is where the data in a captured group is saved.

In [18]:
# A successful match yields a match object; its capturing groups are numbered from 1.
match = re.match(pattern=r'(\d{3})-(\d{4})', string='867-5309')
if match is not None:
    # three ways to read captured groups: .group(n), indexing, and .group(a, b)
    print(f'group 1: {match.group(1)}')
    print(f'group 2: {match[2]}')
    print(f'try a tuple: {match.group(1, 2)}')

group 1: 867
group 2: 5309
try a tuple: ('867', '5309')


* other interesting functions
    * `re.escape`
    * `match.expand`

## tips and tricks

* verbose mode
* to compile or not
* greedy or not
* using a function argument in `sub` and `subn`
* setting flags within the regex
* when to use alternatives (e.g., `os.path`, `glob`, `regex`, `str.startswith`, `in`)

## common pitfalls

* failure to use a raw string
* not understanding `re.MULTILINE`, `re.DOTALL`
* mixing `str` with `bytes`

## documentation

* This presentation: [https://github.com/wolf/re-presentation](https://github.com/wolf/re-presentation)
* Online regex tester and debugger [regular expressions 101](https://regex101.com)
* [re module documentation](https://docs.python.org/3.7/library/re.html)
* third party [regex module homepage](https://github.com/mrabarnett/mrab-regex)
* by Jeffrey Friedl: ["Mastering Regular Expressions" First Edition](https://www.amazon.com/Mastering-Regular-Expressions-Techniques-Handbooks/dp/1565922573) (covers Python) [Third Edition](https://www.amazon.com/Mastering-Regular-Expressions-Jeffrey-Friedl/dp/0596528124) (does *not* cover Python)

In [19]:
# Verbose-mode pattern that splits an http(s) URL into named groups:
# protocol and host are required; port, path, and query are optional and are
# None in the match object when absent.
# The host must also stop at '?' and '#': otherwise a URL with a query but no
# path (e.g. 'https://google.com?q=1') would swallow the query into the host.
url_pattern_str = r'''
    (?P<protocol>  # URL protocol, required and captured
      https?       # the 's' in 'https' is optional
    )
    ://            # required, but not captured
    (?P<host>      # host, required _and_ captured
      [^/:?#]+     # ...stops at the first slash, colon, '?', or '#'
    )
    (?:            # an optional group for the port
      :            # ...so we don't capture the colon
      (?P<port>    # optional (because of the containing group), but captured
          \d+      # the port is all digits
      )
    )?
    (?P<path>      # path, optional but captured
      /[^?]*       # ...stops at the first question mark
    )?
    (?:            # an optional group for the query
      \?           # ...so we don't capture the '?' that starts a query
      (?P<query>
          .+       # everything _after_ the question mark is the query itself
      )
    )?
'''
url_pattern = re.compile(url_pattern_str, re.VERBOSE)

In [20]:
# Sample URLs that between them exercise the optional port, path, and query groups.
urls = [
    r'https://google.com',
    r'https://google.com/',
    r'https://www.learninga-z.com/main/Activity/reading',
    r'http://learninga-z.com:8088/main/Activity/reading?module=razkids',
    r'https://github.com/wolf/re-presentation.git',
    r'https://www.amazon.com/Mastering-Regular-Expressions-Techniques-Handbooks/dp/1565922573',
]

for url in urls:
    match = url_pattern.match(url)
    if match is None:
        # Pattern.match returns None on failure; indexing None below would
        # raise TypeError, so report the URL and move on to the next one.
        print(f'no match:  {url}')
        print()
        continue

    # groupdict() maps each named group to its text, or to None for an optional
    # group that did not take part in the match; only present parts are shown.
    for group_name, text in match.groupdict().items():
        if text:
            print(f'{group_name:>8}:  {text}')
    print()

protocol:  https
    host:  google.com

protocol:  https
    host:  google.com
    path:  /

protocol:  https
    host:  www.learninga-z.com
    path:  /main/Activity/reading

protocol:  http
    host:  learninga-z.com
    port:  8088
    path:  /main/Activity/reading
   query:  module=razkids

protocol:  https
    host:  github.com
    path:  /wolf/re-presentation.git

protocol:  https
    host:  www.amazon.com
    path:  /Mastering-Regular-Expressions-Techniques-Handbooks/dp/1565922573

