## Regular Expression

- import re
- re.compile(r'abc')
- re.compile(r'abc').finditer('text to search')
- finditer returns match objects whose span gives zero-based (start, end) index locations; the end index is exclusive
- Special characters
    -  . any character except newline
    - \s - whitespace (includes newline) | \S - not whitespace
    - \d digit(0-9) | \D not a digit
    - \w - word (a-z, A-Z, 0-9, _) | \W - not a word    
    - \b - word boundary | \B - not a word boundary (example: r'Hello\b')
    - ^  - beginning of the string | $  - end of string
    - [] - matches any one character listed in the brackets | \[^ \] - matches any one character not listed in the brackets (to include a literal hyphen, put - at the end of the character set)
    - |  - logical or 
    - () - logical group

- Quantifiers
    - \* 0 or more
    - \+ 1 or more
    - ? 0 or one    
    - {3,4} - between 3 and 4 repetitions

In [1]:
import re

In [74]:
# Sample text searched by the regex examples below. Its content must stay exactly
# as is: the match spans printed in the outputs are character offsets into it.
# The r prefix keeps the backslash on the metacharacter line literal; in a
# non-raw literal, "\ " is an invalid escape sequence that Python warns about.
text_to_search = r'''
abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ

1234567890
123abc

Hello HelloHello

MetaCharacters (Need to be escaped):
. ^ $ * ? { } [ ]  \ | ( )

utexas.edu

321-555-4321
123.555.1234

daniel-mitchell@utexas.edu
daniel-mitchell@bbc.co.uk

Mr. Johnson
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
'''

In [3]:
# r'...' is a raw string literal: Python does not interpret backslashes in it,
# so the regex engine receives the pattern exactly as written.
pattern = re.compile(r'abc')
matches = re.finditer(pattern, text_to_search)
for found in matches:
    print(found)

<re.Match object; span=(1, 4), match='abc'>
<re.Match object; span=(70, 73), match='abc'>


In [4]:
print(text_to_search[1:4]) # slice indices are zero-based and end-exclusive; 1:4 is the span of the first 'abc' match

abc


In [7]:
print(text_to_search[70:73]) # span of the second 'abc' match is (70, 73); the end index is exclusive

abc



## Searching special characters

To match a special character literally, escape it with a backslash (for example, `\.` matches a literal dot).

In [18]:
# Two-character matches: a digit followed by one word character (a-z, A-Z, 0-9 or _)
re.findall(r'\d\w', text_to_search)

['12',
 '34',
 '56',
 '78',
 '90',
 '12',
 '3a',
 '32',
 '55',
 '43',
 '21',
 '12',
 '55',
 '12',
 '34']

In [17]:
# A digit followed by zero or more word characters
re.findall(r'\d\w*', text_to_search)

['1234567890', '123abc', '321', '555', '4321', '123', '555', '1234']

In [20]:
# Word boundary - baseline without \b: every occurrence of 'Hello' is found,
# including both halves of 'HelloHello'
re.findall(r'Hello', text_to_search)


['Hello', 'Hello', 'Hello']

In [22]:
# In 'Hello HelloHello' the middle Hello is skipped: it is followed by 'H',
# so there is no word boundary after it
re.findall(r'Hello\b', text_to_search)

['Hello', 'Hello']

In [24]:
# In 'Hello HelloHello' only the first Hello has a word boundary on both sides;
# each half of 'HelloHello' is missing one
re.findall(r'\bHello\b', text_to_search)

['Hello']

In [28]:
# ^ anchors at the start of the string; the sample text begins with a newline, which \s matches
re.findall(r'^\s', text_to_search)

['\n']

In [30]:
# Second and third whitespace matches in the text
re.findall(r'\s', text_to_search)[1:3]

['\n', '\n']

## Character set

In [31]:
# One of the characters 1, 2 or 3 followed by one word character
re.findall(r'[123]\w', text_to_search)


['12', '34', '12', '3a', '32', '32', '12', '12', '34']

In [33]:
# First five pairs of consecutive lowercase letters
re.findall(r'[a-z][a-z]', text_to_search)[:5]


['ab', 'cd', 'ef', 'gh', 'ij']

In [37]:
# A trailing - inside [...] is a literal hyphen rather than part of a range
re.findall(r'[a-zA-Z0-9][a-zA-Z-]', text_to_search)[:5]

['ab', 'cd', 'ef', 'gh', 'ij']

In [39]:
# Inside [...] parentheses are literal characters, not groups (and a class such as
# [(-)] is the range from '(' to ')', so it does not match a hyphen).
# Grouping is done outside the brackets; (?:...) is non-capturing, so findall
# still returns the whole two-character match.
re.compile(r'(?:[a-z]|[A-Z]|[0-9])(?:[a-z]|[A-Z]|-)').findall(text_to_search)[0:5]


['ab', 'cd', 'ef', 'gh', 'ij']

In [75]:
# Email: local part, exactly one @, then two or more dot-separated domain labels.
# Inside [...] a $ is a literal dollar sign, not an end anchor, so it is not used here.
re.compile(r'[a-zA-Z0-9-]+@[a-zA-Z]+(?:\.[a-zA-Z]+)+').findall(text_to_search)

['daniel-mitchell@utexas.edu', 'daniel-mitchell@bbc.co.uk']

In [72]:
# Mr. Johnson Mr Smith Ms Davis Mrs. Robinson Mr. T
# Title is Mr, Mrs or Ms with an optional dot. Alternation needs a group:
# inside [...] the | is just a literal character.
re.compile(r'M(?:rs?|s)\.? [a-zA-Z]*').findall(text_to_search)

['Mr. Johnson', 'Mr Smith', 'Ms Davis', 'Mrs. Robinson', 'Mr. T']

In [73]:
# Phone numbers such as 321-555-4321 and 123.555.1234:
# 3, 3 and 4 digits separated by either . or -
re.findall(r'\d{3}[.-]\d{3}[.-]\d{4}', text_to_search)

['321-555-4321', '123.555.1234']

## Accessing from match object

In [77]:
# Print the match object (span and matched text) for every phone number found
pattern = re.compile(r'\d{3}[.-]\d{3}[.-]\d{4}')
matches = re.finditer(pattern, text_to_search)
for phone in matches:
    print(phone)

<re.Match object; span=(170, 182), match='321-555-4321'>
<re.Match object; span=(183, 195), match='123.555.1234'>


In [83]:
# Three views of each match: its (start, end) span, the matched substring,
# and the same substring obtained by slicing the text with that span
pattern = re.compile(r'\d{3}[.-]\d{3}[.-]\d{4}')
matches = pattern.finditer(text_to_search)
for mat in matches:
    begin, stop = mat.span()
    print((begin, stop))
    print(mat.group())
    print(text_to_search[begin:stop])

(170, 182)
321-555-4321
321-555-4321
(183, 195)
123.555.1234
123.555.1234


In [85]:
# Sample URLs searched by the capture-group examples that follow
urls = r'''
https://www.google.com
http://yahoo.com
https://www.whitehouse.gov
https://craigslist.org
'''

In [100]:
# group(0) is the whole match; group(1)..group(n) are the parenthesised groups:
# 1 = optional "www.", 2 = domain name, 3 = suffix such as ".com"
matches = re.compile(r'https?://(www\.)?(\w+)(\.\w+)').finditer(urls)
for mat in matches:
    domain, suffix = mat.group(2, 3)
    print(domain + suffix)
    

google.com
yahoo.com
whitehouse.gov
craigslist.org


In [103]:
# Recover group 2 (the domain name) by slicing urls with that group's span
matches = re.compile(r'https?://(www\.)?(\w+)(\.\w+)').finditer(urls)
for mat in matches:
    lo, hi = mat.span(2)
    print(urls[lo:hi])

google
yahoo
whitehouse
craigslist


In [105]:
# Rebuild each URL (without the scheme) from the spans of groups 1 to 3
matches = re.compile(r'https?://(www\.)?(\w+)(\.\w+)').finditer(urls)
for mat in matches:
    pieces = []
    for group_no in (1, 2, 3):
        lo, hi = mat.span(group_no)
        # span() is (-1, -1) when the optional "www." group did not take part
        # in the match; the slice urls[-1:-1] is then the empty string
        pieces.append(urls[lo:hi])
    print(''.join(pieces))

www.google.com
yahoo.com
www.whitehouse.gov
craigslist.org
