In [1]:
import re

* regular expressions themselves
    * simple characters
    * character sets
        * inverted sets
        * ranges
        * shortcuts
    * repetition
    * alternation
    * groups
        * non-capturing
        * capturing, and capturing with a name
        * back-references
* the `re` module
    * regular expression strings (already discussed)
    * compiled patterns
        * `match` vs. `search`
    * strings to be searched
    * match objects
    * other interesting functions
* tips and tricks
    * verbose mode
    * to compile or not
    * greedy or not
    * using a function argument in `sub` and `subn`
    * setting flags within the regex
    * when to use alternatives (e.g., `os.path`, `glob`, `regex`)
* real-world examples
* common pitfalls
    * failure to use a raw string
    * not understanding `re.MULTILINE` and `re.DOTALL`
    * mixing `str` and `bytes`


In [2]:
# Verbose-mode regular expression that splits an http(s) URL into five named
# groups: protocol, host, port, path, and query.  Only protocol and host are
# required; the other groups are None on the match object when absent.
#
# The host character class excludes '?' as well as '/' and ':' so that a query
# which follows the host directly (no path, e.g. 'https://example.com?q=1')
# lands in the 'query' group instead of being swallowed by 'host'.
url_pattern_str = r'''(?P<protocol>    # URL protocol, required and captured
                          https?       # the 's' in 'https' is optional
                      )
                      ://              # required, but not captured
                      (?P<host>        # host, required _and_ captured
                          [^/:?]+      # ...stops at the first slash, colon, or question mark
                      )
                      (?:              # an optional group for the port
                          :            # ...so we don't capture the colon
                          (?P<port>    # optional (because of the containing group), but captured
                              \d+      # the port is all digits
                          )
                      )?
                      (?P<path>        # path, optional but captured
                          /[^?]*       # ...stops at the first question mark
                      )?
                      (?:              # an optional group for the query
                          \?           # ...so we don't capture the '?' that queries start with
                          (?P<query>
                              .+       # everything _after_ the question mark is the query
                          )
                      )?
                   '''
# re.VERBOSE makes the engine ignore the layout whitespace and the '#' comments
# embedded in the pattern string above.
url_pattern = re.compile(url_pattern_str, re.VERBOSE)

In [4]:
# Sample URLs exercising the optional pieces of the pattern: bare host,
# trailing slash, a path, a port plus query, and a path ending in '.git'.
urls = [
    r'https://google.com',
    r'https://google.com/',
    r'https://www.learninga-z.com/main/Activity/reading',
    r'http://learninga-z.com:8088/main/Activity/reading?module=razkids',
    r'https://git.zv.cx/wolf/re-presentation.git',
]

for url in urls:
    match = url_pattern.match(url)
    if match is None:
        # Pattern.match returns None when the string does not fit the pattern;
        # subscripting None would raise TypeError, so report it and move on.
        print(f"no match:  {url}")
        print()
        continue

    # groupindex maps each named group to its number; iterating it yields the
    # group names, which are then looked up on the match object.
    for group_name in url_pattern.groupindex:
        value = match[group_name]
        if value:  # groups that did not participate are None and are skipped
            print(f"{group_name:>8}:  {value}")
    print()

protocol:  https
    host:  google.com

protocol:  https
    host:  google.com
    path:  /

protocol:  https
    host:  www.learninga-z.com
    path:  /main/Activity/reading

protocol:  http
    host:  learninga-z.com
    port:  8088
    path:  /main/Activity/reading
   query:  module=razkids

protocol:  https
    host:  git.zv.cx
    path:  /wolf/re-presentation.git

