### <font color="brown">Regular Expressions Continued</font>

In [2]:
import re

#### <font color="brown">Special regular expression sequences to match predefined sets of characters</font>
1. Whitespace: \\s, \\S
2. Word (alphanumeric, plus underscore) characters: \\w, \\W
3. Digits: \\d, \\D
4. Word Boundary: \\b

---

**Whitespace**
- \\s : matches any whitespace character (including tab and newline)
- \\S : matches any non-whitespace character 

In [6]:
# Pattern: two or more of '.', '?' or '!', then at least one whitespace character.
# Here the dots run straight into 'What', so there is nothing to match.
pattern = r'[.?!]{2,}\s+'
text = '...What the?'
res = re.search(pattern, text)
print(res)

None


In [70]:
# Same rule (2+ of '.', '?', '!' followed by whitespace); this time two blanks
# follow the dots, and \s+ swallows both of them.
punct_then_ws = re.compile(r'[.?!]{2,}\s+')
res = punct_then_ws.search('...  What the?')
print(res)

<re.Match object; span=(0, 5), match='...  '>


In [69]:
# Variation: ' +' means one or more literal space characters only
# (unlike \s, a plain space in the pattern does not cover tabs or newlines).
pattern = r'[.?!]{2,} +'
text = '...  What the?'
res = re.search(pattern, text)
print(res)

<re.Match object; span=(0, 5), match='...  '>


In [7]:
# Whitespace can also be spelled out as a character class: [ \t\n] lists
# space, tab and newline explicitly.
punct_then_class = re.compile(r'[.?!]{2,}[ \t\n]+')
res = punct_then_class.search('... What the?')
print(res)

<re.Match object; span=(0, 4), match='... '>


In [24]:
# search() scans the whole string, so the first qualifying run ('?!!' plus the
# following space) is found even though it is not at the start.
pattern = r'[.?!]{2,}\s+'
text = 'What the?!! Next...'
res = re.search(pattern, text)
print(res)

<re.Match object; span=(8, 12), match='?!! '>


In [5]:
# \S{4,} = four or more non-whitespace characters; \s+ = the whitespace after them.
# 'The' is too short, so 'quick ' is the first hit.
long_token = re.compile(r'\S{4,}\s+')
res = long_token.search('The quick brown fox...')
print(res)

<re.Match object; span=(4, 10), match='quick '>


---

**"Word" characters (alphanumeric, plus underscore)**

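- \\w : matches any "word" character (letter, digit, or underscore) => [a-zA-Z0-9_] for ASCII text. (In Python 3, for str patterns, \\w also matches Unicode letters and digits unless the re.ASCII flag is used.)
- \\W : matches any non-word character => [^a-zA-Z0-9_] for ASCII text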

In [8]:
# \w{4,} = four or more word characters, which must be followed by whitespace.
# 'What' is followed by an apostrophe rather than whitespace, so nothing matches.
pattern = r'\w{4,}\s+'
text = "Hey! What's up?"
res = re.search(pattern, text)
print(res)

None


In [9]:
# Same pattern on a longer sentence: 'with' is four word characters followed
# by a space, so it is the first (and only) hit.
four_word_chars = re.compile(r'\w{4,}\s+')
res = four_word_chars.search("Hey! What's up with you?")
print(res)

<re.Match object; span=(15, 20), match='with '>


---

**Digits**

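- \\d : matches any digit character => [0-9] for ASCII text. (In Python 3, for str patterns, \\d also matches other Unicode decimal digits unless the re.ASCII flag is used.)
- \\D : matches any non-digit character => [^0-9] for ASCII text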

#### Exercise:
Write a regular expression to determine if a given string is a legit phone number.
Here are the legit phone number variations:
- dddddddddd
- ddd-ddd-dddd
- (ddd)ddddddd
- (ddd)ddd-dddd

In [11]:
# first, let's handle the last two variants that have ()
# '(' and ')' are metacharacters, so they are escaped to match literally;
# -? makes the dash between the last two digit groups optional
astr = input("phone number? ('quit' to stop) ")
while astr != 'quit':
    res = re.search(r'^\(\d{3}\)\d{3}-?\d{4}$',astr)
    print('match' if res else 'no match')
    astr = input("phone number? ('quit' to stop) ")

phone number? ('quit' to stop)  (848)445-2590


match


phone number? ('quit' to stop)  (848)4452590


match


phone number? ('quit' to stop)  8484452590


no match


phone number? ('quit' to stop)  84812


no match


phone number? ('quit' to stop)  quit


In [13]:
# next, let's strengthen the above with the ability to handle leading/trailing whitespaces
# \s* right after ^ and right before $ absorbs any surrounding whitespace
while True:
    astr = input("phone number? ('quit' to stop) ")
    if astr == 'quit':
        break
    res = re.search(r'^\s*\(\d{3}\)\d{3}-?\d{4}\s*$',astr)
    if res:
        print('match')
    else:
        print('no match')

phone number? ('quit' to stop)    (848)123-4567  


match


phone number? ('quit' to stop)  quit


##### For the non ( ) variants, we can't have a single pattern using -? for each of the - positions because it will match even if only one dash is present, and the other is not

In [14]:
# With both dashes optional, a number that has both dashes is accepted, as intended.
optional_dashes = re.compile(r'^\s*\d{3}-?\d{3}-?\d{4}\s*$')
res = optional_dashes.search('  848-445-2790')
print(res)

<re.Match object; span=(0, 14), match='  848-445-2790'>


In [15]:
# The same pattern also accepts a number with only ONE dash, which is not
# one of the acceptable variants: each -? is optional independently of the other.
pattern = r'^\s*\d{3}-?\d{3}-?\d{4}\s*$'
text = '  848-4452790'
res = re.search(pattern, text)
print(res)

<re.Match object; span=(0, 13), match='  848-4452790'>


##### So let's do one pattern to catch both dashes

In [73]:
# both dashes required: ddd-ddd-dddd, with optional surrounding whitespace
both_dashes = re.compile(r'^\s*\d{3}-\d{3}-\d{4}\s*$')
print(both_dashes.search('  848-445-2790   '))

<re.Match object; span=(0, 17), match='  848-445-2790   '>


##### And another pattern to catch a straight sequence of 10 digits

In [74]:
# ten digits in a row and nothing else, apart from optional surrounding whitespace
ten_digits = re.compile(r'^\s*\d{10}\s*$')
print(ten_digits.search('  8484452790   '))

<re.Match object; span=(0, 15), match='  8484452790   '>


##### Final solution, single regexp to catch all variants

In [18]:
# final solution: one anchored pattern that accepts every legit variant,
# with optional leading/trailing whitespace:
#   (ddd)ddd-dddd or (ddd)ddddddd  |  ddd-ddd-dddd  |  dddddddddd
# The leading ^ is essential: without it search() may start the match anywhere,
# so junk in front of the number (e.g. 'abc8484452790') would be accepted.
while True:
    astr = input("phone number? ('quit' to stop) ")
    if astr == 'quit':
        break
    res = re.search(r'^\s*(\(\d{3}\)\d{3}-?\d{4}|\d{3}-\d{3}-\d{4}|\d{10})\s*$',astr)
    print('match' if res else 'no match')

phone number? ('quit' to stop)   8484452790 


match


phone number? ('quit' to stop)   848-445-2790 


match


phone number? ('quit' to stop)   (848)445-2790 


match


phone number? ('quit' to stop)   (848)4452790 


match


phone number? ('quit' to stop)   (848)-445-2790 


no match


phone number? ('quit' to stop)  quit


---

**Word boundary**
- \\b : matches only at a word boundary. It doesn't consume any character; it only asserts a position: between a word character and a non-word character (or the start/end of the string).
(A word is a sequence of alphanumeric characters plus underscore.)

In [19]:
# check if a string contains the standalone word 'part' (or 'Part')
# \b on both sides keeps 'part' from matching inside a longer word such as 'party'
for sentence in ("I'm going to a party tomorrow",
                 "This is the best part of the movie."):
    res = re.search(r'\b[pP]art\b', sentence)
    print(res)

None
<re.Match object; span=(17, 21), match='part'>


In [20]:
# 'Episode3' is one word because digits count as word characters, so there is
# no word boundary between 'e' and '3' and the trailing \b cannot succeed.
title = "Episode3 has a high rating."
res = re.search(r'\b[eE]pisode\b', title)
print(res)

None


---

#### <font color="brown">Using the match function</font>
#### Matching always starts at beginning of string

In [7]:
# search() finds 'ar' anywhere in 'barbaric'; match() only looks at the very
# beginning, and 'barbaric' doesn't begin with 'ar', so it reports None.
for finder in (re.search, re.match):
    print(finder('ar','barbaric'))

<re.Match object; span=(1, 3), match='ar'>
None


In [25]:
# Goal: accept strings that begin with 'ar', end with 't', and have one or
# more lowercase letters in between.
word = 'arrest'

# version 1: search() needs an explicit ^ to pin the match to the start
res = re.search('^ar[a-z]+t$', word)
print(res)

# version 2: match() is already pinned to the start, so ^ is unnecessary
res = re.match('ar[a-z]+t$', word)
print(res)

<re.Match object; span=(0, 6), match='arrest'>
<re.Match object; span=(0, 6), match='arrest'>


---

#### <font color="brown">Using the Match object returned by search/match</font>
**Applying the methods group(), span(), start(), end()**

In [26]:
# keep the Match object that search() returns so its methods can be queried
res = re.search('at', 'catch')

# group(): matched text; span(): (start, end) indices into the original string;
# start() / end(): the two halves of that span
for accessor in (res.group, res.span, res.start, res.end):
    print(accessor())

# the span is an ordinary tuple, so it can be unpacked
start, end = res.span()
print(start, ',', end)

at
(1, 3)
1
3
1 , 3


In [29]:
# match() only matches at index 0, so any span it reports starts at 0
html = '<span>This is within a span tag in html</span>'
res = re.match(r'<.*?>', html)   # .*? is non-greedy: stop at the first '>'
for value in (res.group(), res.span(), res.start(), res.end()):
    print(value)

<span>
(0, 6)
0
6


In [30]:
# be careful to check for existence of returned match object before applying methods!
# 'sandbar' does not START with 'bar', so match() returns None, and calling
# .group() on None raises AttributeError. The error is caught and displayed
# so this demonstration does not halt a Restart & Run All.
res = re.match('bar','sandbar')
try:
    print(res.group())
except AttributeError as err:
    print(f'{type(err).__name__}: {err}')

AttributeError: 'NoneType' object has no attribute 'group'

In [31]:
# defend: only call group() once we know a Match object actually came back
res = re.match('bar','sandbar')
if res:
    print(res.group())
else:
    print('No match')

No match


**Typical usage is to store the returned Match object, check that it exists (is not None), and then get the matched string with group()**

In [34]:
def substr(astr):
    """Report whether astr contains a substring that starts with 'ar', ends
    with 't', and has at least one lowercase letter in between.

    Prints 'Match: <text>' with the text found by re.search, or 'No match'.
    """
    found = re.search('ar[a-z]+t', astr)
    if found:
        print('Match:', found.group())
    else:
        print('No match')

for word in ('parasite', 'artist', 'part'):
    substr(word)

Match: arasit
Match: artist
No match


---

#### <font color="brown">Using findall and finditer functions to get all matches</font>
- findall constructs the entire list before returning it
- finditer returns one at a time, on demand, in a Match object

In [32]:
# extracting words
res = re.findall("[a-zA-Z]+",'Some of these words have##$ trailing junk')
print(res)

['Some', 'of', 'these', 'words', 'have', 'trailing', 'junk']


In [39]:
# Does a trailing \b reject 'have##$'? No: there IS a word boundary between
# 'e' and '#', so 'have' is still extracted.
letters_then_boundary = re.compile(r'[a-zA-Z]+\b')
res = letters_then_boundary.findall('Some of these words have##$ trailing junk')
print(res)

['Some', 'of', 'these', 'words', 'have', 'trailing', 'junk']


In [41]:
# Requiring whitespace right after the letters drops 'have' (followed by '#')
# and also 'junk' (followed by the end of the string, not whitespace).
# The matched whitespace is part of each returned string.
sentence = 'Some of these words have##$ trailing junk'
res = re.findall(r'[a-zA-Z]+\s+', sentence)
print(res)

['Some ', 'of ', 'these ', 'words ', 'trailing ']


**We can eliminate the included trailing whitespace from the extracted strings by grouping, see next section**

In [35]:
# getting words, one at a time with finditer
iterator = re.finditer("[a-zA-Z]+",'Some of these words have##$ trailing junk')
print(iterator)   # the iterator object itself, not the matches
# each item handed out by the iterator is a Match object
for match in iterator:
    print(f'{match.group()} @ {match.span()}')

<callable_iterator object at 0x104d6a650>
Some @ (0, 4)
of @ (5, 7)
these @ (8, 13)
words @ (14, 19)
have @ (20, 24)
trailing @ (28, 36)
junk @ (37, 41)


---

#### <font color="brown">Grouping/Capturing</font>

In [63]:
# Capture the area code separately from the rest of the phone number.
# Assumes the standard format (ddd)ddd-dddd: \( and \) match literal
# parentheses, while the unescaped ( ) pairs mark the two captured groups.
area_and_number = r'\s*\((\d{3})\)(\d{3}-\d{4})'
res = re.match(area_and_number, '(848)445-2790')

**Notice the grouping/capturing with parentheses around the area code part, as in (\d{3})
and likewise for the entire non-area code part**

In [64]:
print(res.group())   # no argument: the whole match
print(res.groups())  # tuple of everything captured with ( )
# group(0) is the whole match again; group(1) and group(2) are the captures
for group_index in (0, 1, 2):
    print(res.group(group_index))

(848)445-2790
('848', '445-2790')
(848)445-2790
848
445-2790


In [54]:
# search() works just as well as match() here, provided the pattern begins
# with ^ so the match is pinned to the start of the string
anchored = re.compile(r'^\s*\((\d{3})\)(\d{3}-\d{4})')
res = anchored.search('(848)445-2790')

In [55]:
whole_match = res.group()   # the whole thing
captures = res.groups()     # all parts grouped with ( )
print(whole_match)
print(captures)
print(res.group(0))         # entire thing, same as group()
print(res.group(1))         # first ( ) grouping
print(res.group(2))         # second ( ) grouping

(848)445-2790
('848', '445-2790')
(848)445-2790
848
445-2790


In [60]:
# groups() returns a plain tuple, so its items can also be reached by index
captured = res.groups()
print(captured[0])
print(captured[1])

848
445-2790


In [62]:
# putting it together: match, confirm a Match came back, then print each
# captured group on its own line
res = re.match(r'\s*\((\d{3})\)(\d{3}-\d{4})', '(848)445-2790')
if res is not None:
    print(*res.groups(), sep='\n')

848
445-2790


---

#### <font color="brown">Pre-compiling a regular expression</font>

**Sometimes it's easier to "compile" a regular expression and use it several times later**

In [71]:
pattrn = re.compile(r'\s*\((\d{3})\)(\d{3}-\d{4})')
res = pattrn.match('(848)445-2790')
print(res.groups())

('848', '445-2790')


In [72]:
patt = re.compile(r'\s*#?\s*(\d+)')
res = patt.match('#25 Infinite Loop,Cupertino,CA 12345')
print(res.groups())
res = patt.match(' # 25 Infinite Loop,Cupertino,CA 12345')
print(res.groups())
res = patt.match(' 25 Infinite Loop,Cupertino,CA 12345')
print(res.groups())

('25',)
('25',)
('25',)
