In [2]:
import re

* regular expressions themselves
    * simple characters
    * character sets
        * inverted sets
        * ranges
        * shortcuts
    * repetition
    * alternation
    * groups
        * non-capturing
        * capturing, and capturing with a name
        * back-references
* the `re` module
    * regular expression strings (already discussed)
    * compiled patterns
        * `match` vs. `search`
        * TODO: fill in the remaining compiled-pattern topics
    * strings to be searched
    * match objects
    * other interesting functions
* tips and tricks
    * verbose mode
    * to compile or not
    * greedy or not
    * passing a function as the replacement argument to `sub` and `subn`
    * setting flags within the regex
    * when to use alternatives (e.g., `os.path`, `glob`, `regex`)
* real-world examples
* common pitfalls
    * failure to use a raw string
    * not understanding `re.MULTILINE` and `re.DOTALL`
    * mixing `str` and `bytes`


In [15]:
import io

# Build an in-memory text stream holding two newline-terminated lines.
# Both stream.write() and print(..., file=stream) append to the same buffer;
# print() supplies the trailing newline itself.
output = io.StringIO()
output.write('First line.\n')
print('Second line.', file=output)

# With re.MULTILINE, ^ and $ anchor at every line boundary instead of only at
# the ends of the whole string, so each buffered line is matched separately.
# findall() returns one (group 1, group 2) tuple per matching line.
pattern = re.compile(r'^(.*)(line)\.$', flags=re.M)
buffer_text = output.getvalue()
m = pattern.findall(buffer_text)
print(m)
print(buffer_text)

[('First ', 'line'), ('Second ', 'line')]
First line.
Second line.



In [5]:
# The leading (?i) is an inline flag: it switches on case-insensitive matching
# from inside the pattern text, so no flags argument is passed to re.compile().
pattern_test_str = r'(?i)goodbye'
pattern_test = re.compile(pattern=pattern_test_str)

In [25]:
# Verbose-mode pattern that splits an http(s) URL into five named groups, in
# this order: protocol, host, port, path, query.  Only protocol and host are
# required; a group that is absent from the URL is reported as None.
#
# The host must stop at '?' as well as '/' and ':'.  Otherwise a URL that has
# a query but no path (e.g. 'http://example.com?x=1') would have the query
# text swallowed into the host group.
pattern_str = r'''(?P<protocol>    # URL protocol, required and captured
                      https?       # the 's' is optional
                  )
                  ://              # required, but not captured
                  (?P<host>        # host, required and captured
                      [^/:?]+      # ...stops at the first slash, colon, or question mark
                  )
                  (?:              # an optional group for the port
                      :            # ...so we don't capture the colon
                      (?P<port>    # optional (because of the containing group), but captured
                          \d+      # the port is all digits
                      )
                  )?
                  (?P<path>        # path, optional but captured
                      /[^?]*       # ...stops at the first question mark
                  )?
                  (?:              # an optional group for the query
                      \?           # ...so we don't capture the '?'
                      (?P<query>
                          .+       # everything _after_ the question mark is the query
                      )
                  )?
               '''
# re.VERBOSE (alias re.X) makes the engine ignore the layout whitespace and
# the '#' comments embedded in the pattern text above.
pattern = re.compile(pattern_str, re.VERBOSE)

In [26]:
# Run the compiled `pattern` against a sample URL.  match() anchors at the
# start of the string.  groups() returns every capture group in pattern order,
# with None for any group that did not take part in the match.
sample_url = r'http://www.learninga-z.com/main/landing?module=razkids'
match = pattern.match(sample_url)
captured = match.groups()
print(captured)

('http', 'www.learninga-z.com', None, '/main/landing', 'module=razkids')


In [33]:
# List the named groups alphabetically and print only those that captured
# non-empty text.  Groups that did not participate in the match are None and
# are skipped.
named_groups = sorted(pattern.groupindex)
for group_name in named_groups:
    if not match[group_name]:
        continue
    print(f"{group_name}: {match[group_name]}")

host: www.learninga-z.com
path: /main/landing
protocol: http
query: module=razkids
