In [1]:
import re

In [2]:
# re.search() scans from left to right and reports only the first (left-most) occurrence,
# so the second '123' in the string below is never returned.
s = 'foo123bar123'
re.search(pattern='123', string=s)

<re.Match object; span=(3, 6), match='123'>

## Character Class
In a regex, a set of characters specified in square brackets (`[]`) makes up a character class. You can also use pre-defined **metacharacter** sequences (such as `\d` or `\w`) to match whole categories of characters. See [this list](https://realpython.com/regex-python/#metacharacters-supported-by-the-re-module) for all metacharacters supported by the `re` module.

In [3]:
# A bracketed range such as [0-9] stands for exactly one character from '0' through '9'.
# Repeating it three times therefore requires three adjacent decimal digits.
three_digits = '[0-9]' * 3
re.search(three_digits, s)

<re.Match object; span=(3, 6), match='123'>

In [4]:
# The subject never contains three digits in a row, so re.search() returns None.
digit_run = '[0-9]' * 3
print(re.search(digit_run, '12foo34'))

None


In [5]:
# A class that lists individual characters matches any one of them: 'a', 'r', 't' or 'z'.
for text in ('foobarqux', 'foobazqux'):
    print(re.search('ba[artz]', text))

<re.Match object; span=(3, 6), match='bar'>
<re.Match object; span=(3, 6), match='baz'>


In [6]:
# [0-9a-fA-F] matches any single hexadecimal digit character.
# The upper-case range must end at 'F': 'A-f' would span every ASCII character from 'A'
# through 'f', letting non-hex characters such as 'G', 'Z' and '_' match as well.
hex_digit = '[0-9a-fA-F]'
re.search(hex_digit, '--- a0 ---')

<re.Match object; span=(4, 5), match='a'>

In [7]:
# A leading ^ negates the class: [^0-9] matches any single character that is not a digit.
# The ^ only has this meaning when it is the very first character inside the brackets.
non_digit = '[^0-9]'
re.search(non_digit, '12345foo')

<re.Match object; span=(5, 6), match='f'>

In [8]:
# Anywhere other than the first position, ^ is just an ordinary member of the class.
punctuation = '[#:^]'
re.search(punctuation, 'foo^bar:baz#qux')

<re.Match object; span=(3, 4), match='^'>

In [9]:
# A literal '-' in a class must come first, come last, or be escaped with a backslash.
# The escaped form is written as a raw string: in a plain literal '\-' is an invalid
# string escape, which newer Python versions warn about when the code is compiled.
for hyphen_class in ('[-abc]', '[abc-]', r'[ab\-c]'):
    print(re.search(hyphen_class, '123-456'))

<re.Match object; span=(3, 4), match='-'>
<re.Match object; span=(3, 4), match='-'>
<re.Match object; span=(3, 4), match='-'>


In [10]:
# A ']' placed first in a class is taken literally; '[' can be matched by escaping it.
# The escaped pattern is a raw string so the backslash reaches the regex engine untouched
# ('\[' is not a valid escape sequence in an ordinary string literal).
print(re.search('[]]', 'foo[1]'))
print(re.search(r'[\[]', 'foo[1]'))

<re.Match object; span=(5, 6), match=']'>
<re.Match object; span=(3, 4), match='['>


In [11]:
# Inside brackets, characters such as ), *, + and | are plain literals rather than operators.
for text in ('123*456', '123+456'):
    print(re.search('[)*+|]', text))

<re.Match object; span=(3, 4), match='*'>
<re.Match object; span=(3, 4), match='+'>


In [12]:
# \w matches a single word character: a letter, a digit or an underscore.
# The pattern is a raw string so the backslash is not read as an (invalid) string escape.
print(re.search(r'\w', '#(.a$@&'))
# For ASCII text this is the same as the explicit class below. With str patterns \w also
# accepts non-ASCII word characters unless the re.ASCII flag is given.
print(re.search('[a-zA-Z0-9_]', '#(.a$@&'))

<re.Match object; span=(3, 4), match='a'>
<re.Match object; span=(3, 4), match='a'>


In [13]:
# \W matches a single character that is NOT a word character.
# The pattern is a raw string so the backslash is not read as an (invalid) string escape.
print(re.search(r'\W', 'a_1*3Qb'))
# For ASCII text this is the same as the negated class below. With str patterns \W treats
# non-ASCII letters and digits as word characters too, unless the re.ASCII flag is given.
print(re.search('[^a-zA-Z0-9_]', 'a_1*3Qb'))

<re.Match object; span=(3, 4), match='*'>
<re.Match object; span=(3, 4), match='*'>


In [14]:
# \d matches a decimal digit; \D matches any character that is not one.
# Both patterns are raw strings: '\d' and '\D' are invalid escapes in ordinary literals.
for digit_pattern, text in ((r'\d', 'abc4def'), (r'\D', '234Q678')):
    print(re.search(digit_pattern, text))

<re.Match object; span=(3, 4), match='4'>
<re.Match object; span=(3, 4), match='Q'>


In [16]:
# \s matches a whitespace character (space, tab, newline, ...); \S matches anything else.
# Only the patterns are raw strings: the '\n' in each subject must stay a real newline.
print(re.search(r'\s', 'foo\nbar baz'))
print(re.search(r'\S', '  \n foo  \n  '))

<re.Match object; span=(3, 4), match='\n'>
<re.Match object; span=(4, 5), match='f'>


In [17]:
# The shorthand classes \w, \W, \d, \D, \s and \S may also be used inside square brackets.
# The pattern is a raw string so the backslashes are passed to the regex engine unchanged.
for text in ('---3---', '---a---', '--- ---'):
    print(re.search(r'[\d\w\s]', text))

<re.Match object; span=(3, 4), match='3'>
<re.Match object; span=(3, 4), match='a'>
<re.Match object; span=(3, 4), match=' '>


### Escaping metacharacters

In [18]:
# An unescaped '.' matches any single character except a newline (whitespace included)
print(re.search('.', 'foo.bar'))
# A backslash-escaped '.' matches only a literal dot (raw string so the backslash survives)
print(re.search(r'\.', 'foo.bar'))

<re.Match object; span=(0, 1), match='f'>
<re.Match object; span=(3, 4), match='.'>


In [22]:
# A literal backslash has to be escaped once for the regex engine. In an ordinary string
# literal each of those two backslashes must be doubled again, giving four in total ...
subject = r'foo\bar'
print(re.search('\\\\', subject))
# ... whereas a raw string only needs the regex-level escape.
print(re.search(r'\\', subject))

<re.Match object; span=(3, 4), match='\\'>
<re.Match object; span=(3, 4), match='\\'>


### Anchors

In [24]:
# ^ and \A both anchor a match to the start of the string
# (they differ only under re.MULTILINE, where ^ also matches right after each newline).
print(re.search('^foo', 'foobar'))
# 'foo' occurs here but not at the start, so the anchored search returns None.
# Raw string: '\A' is not a valid escape sequence in an ordinary string literal.
print(re.search(r'\Afoo', 'barfoo'))

<re.Match object; span=(0, 3), match='foo'>
None


In [26]:
# $ and \Z both anchor a match to the end of the string.
print(re.search('bar$', 'foobar'))
# 'bar' occurs here but not at the end, so the anchored search returns None.
# Raw string: '\Z' is not a valid escape sequence in an ordinary string literal.
print(re.search(r'bar\Z', 'barfoo'))

<re.Match object; span=(3, 6), match='bar'>
None


In [27]:
# Special case: $ (unlike \Z) also matches just before a single newline that ends the string.
# Only the \Z pattern is a raw string; the '\n' in the subjects must stay a real newline.
print(re.search('bar$', 'foobar\n'))
print(re.search(r'bar\Z', 'foobar\n'))

<re.Match object; span=(3, 6), match='bar'>
None


In [31]:
# \b asserts a word boundary. Each pattern is tried against the same three subjects:
# a space and a '.' both create a boundary next to the word, a neighbouring letter does not.
subjects = ('foo bar', 'foo.bar', 'foobar')
for boundary_pattern in (r'\bbar', r'foo\b'):
    for text in subjects:
        print(re.search(boundary_pattern, text))

<re.Match object; span=(4, 7), match='bar'>
<re.Match object; span=(4, 7), match='bar'>
None
<re.Match object; span=(0, 3), match='foo'>
<re.Match object; span=(0, 3), match='foo'>
None


In [32]:
# Putting \b on both sides matches 'bar' only when it stands alone as a whole word.
# The raw string matters here: in an ordinary literal '\b' would be a backspace character.
whole_word = r'\bbar\b'
for text in ('foo bar baz', 'foo(bar)baz', 'foobarbaz'):
    print(re.search(whole_word, text))

<re.Match object; span=(4, 7), match='bar'>
<re.Match object; span=(4, 7), match='bar'>
None


In [34]:
# \B is the opposite of \b: it matches only at a position that is NOT a word boundary,
# so 'foo' is found only when it has word characters on both sides.
embedded_foo = r'\Bfoo\B'
for text in ('foo', '.foo.', 'barfoobaz'):
    print(re.search(embedded_foo, text))

None
None
<re.Match object; span=(3, 6), match='foo'>


### Quantifiers
A **quantifier** metacharacter immediately follows a portion of a `<regex>` and indicates how many times that portion must occur for the match to succeed.

In [37]:
# * allows the preceding element (here the dash) to appear any number of times, including zero.
for text in ('foobar', 'foo-bar', 'foo--bar'):
    print(re.search('foo-*bar', text))

<re.Match object; span=(0, 6), match='foobar'>
<re.Match object; span=(0, 7), match='foo-bar'>
<re.Match object; span=(0, 8), match='foo--bar'>


In [39]:
# + requires the preceding element (here the dash) to appear at least once.
for text in ('foobar', 'foo-bar', 'foo--bar'):
    print(re.search('foo-+bar', text))

None
<re.Match object; span=(0, 7), match='foo-bar'>
<re.Match object; span=(0, 8), match='foo--bar'>


In [40]:
# ? makes the preceding element (here the dash) optional: it may appear once or not at all.
for text in ('foobar', 'foo-bar', 'foo--bar'):
    print(re.search('foo-?bar', text))

<re.Match object; span=(0, 6), match='foobar'>
<re.Match object; span=(0, 7), match='foo-bar'>
None


In [42]:
'''
*?
+?
??

The non-greedy (or lazy) versions of the *, +, and ? quantifiers.
'''
tagged = '%<foo> <bar> <baz>%'

# Greedy: .* consumes as much as it can, so the match runs from the first < to the last >
print(re.search('<.*>', tagged))

# Lazy: .*? stops at the earliest possible >, yielding only the first tag
print(re.search('<.*?>', tagged))

<re.Match object; span=(1, 18), match='<foo> <bar> <baz>'>
<re.Match object; span=(1, 6), match='<foo>'>


In [43]:
# {m} demands exactly m copies of the preceding element - here exactly three dashes.
for text in ('x--x', 'x---x'):
    print(re.search('x-{3}x', text))

None
<re.Match object; span=(0, 5), match='x---x'>


In [46]:
# {m,n}: Matches any number of repetitions of the preceding <regex> from m to n, inclusive.
# {,n}:  Any number of repetitions of <regex> less than or equal to n, same as <regex>{0,n}
# {m,}:  Any number of repetitions of <regex> greater than or equal to m
# {,}:   Any number of repetitions of <regex>, same as <regex>{0,} or <regex>*
#
# Try x-{2,4}x against subjects holding one to five dashes: only two to four should match.
for i in range(1, 6):
    # A dedicated name is used on purpose: rebinding `s` here would overwrite the sample
    # string defined in an earlier cell and change that cell's result on a re-run.
    dashed = f"x{'-' * i}x"
    print(f'{i}  {dashed:10}', re.search('x-{2,4}x', dashed))

1  x-x        None
2  x--x       <re.Match object; span=(0, 4), match='x--x'>
3  x---x      <re.Match object; span=(0, 5), match='x---x'>
4  x----x     <re.Match object; span=(0, 6), match='x----x'>
5  x-----x    None


In [45]:
# {m,n} is greedy and takes as many repetitions as allowed (five here);
# adding ? makes it lazy, so it settles for the minimum (three).
run_of_a = 'a' * 8
for quantified in ('a{3,5}', 'a{3,5}?'):
    print(re.search(quantified, run_of_a))

<re.Match object; span=(0, 5), match='aaaaa'>
<re.Match object; span=(0, 3), match='aaa'>
