# [Regular expressions](https://www.regular-expressions.info/tutorial.html)

"a regular expression is a pattern describing a certain amount of text"

#### There are special characters:

. + * ? ^ $ ( ) [ ] { } | \

Each one of them has a specific meaning. If you want to search for those characters you need to use a backshash

#### Import regex library

In [2]:
import re

#### Raw strings
_An 'r' before a string tells the Python interpreter to treat backslashes as a literal (raw) character_

In [2]:
print('This is a line \nand this is a new one')
print(r'This is a line \nand this is a new one')

This is a line 
and this is a new one
This is a line \nand this is a new one


#### How are regex useful?
For example, imagine that you have a long list of dates one hunderd times larger that the above. How one can search for all dates in August? 

In [3]:
dates = """
08-12-2012
06-07-2015
08/08/08
04.08.08
09.09.2019
9.9.2019
8.9.2019
"""

In [4]:
results = re.findall(r'[\d]?8[-\/\.][\d]{1,2}[-/\.][\d]{2,4}', dates)
print(results)


['08-12-2012', '08/08/08', '8.9.2019']


#### Text string

In [8]:
with open(faketext) as f:
    faketext = f.read()

NameError: name 'faketext' is not defined

In [6]:
faketext = """A

Alex
Georgios
Shama
Suleiman
Liam
Olivia
Noah
Emma
Oliver
Charlotte
Elijah
Amelia
James
Ava
William
Sophia
Benjamin
Isabella
Lucas
Mia
Henry
Evelyn
Theodore
Harper

08-12-2012
06-07-2015
08/08/08
04.08.08
09.09.2019
9.9.2019
8.9.2019

202-555-0166
201*555*0177
(202) 555-0128
(201)555-0178
202 555 0198
900-555-0166
800*555*0177
(900) 555-0128
(800)555-0178
900 555 0198
800.555.0199
800.555.0152
201.555.0199
201.555.0152


Mr. Darcy
Dr. Tsolakis
Prof. Cartledge
Mr. T
Mrs Robinson
Mr. Bean
Miss Piggy 


JohnDoe@gmail.com
John_Doe@gmail.com
JohnDoe@facebook.net
john.doe@uchicago.edu
sincere.jakubowski@gutkowski.com
kerluke.cierra@bradtke.com
nromaguera@yahoo.com
john-doe@gmail.com
johndoe1990@gmail.com

Z"""
# Using 3 quotation marks allows you to use more than one line

#### finditer

### Word Characters

#### . 
Any character except a new line

In [52]:
mypattern = re.compile(r'.e')
matches = mypattern.finditer(faketext)
for match in matches:
    print(match)

<re.Match object; span=(4, 6), match='le'>
<re.Match object; span=(8, 10), match='Ge'>
<re.Match object; span=(25, 27), match='le'>
<re.Match object; span=(57, 59), match='ve'>
<re.Match object; span=(68, 70), match='te'>
<re.Match object; span=(79, 81), match='me'>
<re.Match object; span=(87, 89), match='me'>
<re.Match object; span=(110, 112), match='Be'>
<re.Match object; span=(122, 124), match='be'>
<re.Match object; span=(138, 140), match='He'>
<re.Match object; span=(145, 147), match='ve'>
<re.Match object; span=(152, 154), match='he'>
<re.Match object; span=(157, 159), match='re'>
<re.Match object; span=(163, 165), match='pe'>
<re.Match object; span=(461, 463), match='le'>
<re.Match object; span=(464, 466), match='ge'>
<re.Match object; span=(490, 492), match='Be'>
<re.Match object; span=(514, 516), match='oe'>
<re.Match object; span=(533, 535), match='oe'>
<re.Match object; span=(551, 553), match='oe'>
<re.Match object; span=(556, 558), match='ce'>
<re.Match object; span=(563, 5

#### \d
Digits

In [53]:
mypattern = re.compile(r'\d\d')
matches = mypattern.finditer(faketext)
for match in matches:
    print(match)

<re.Match object; span=(168, 170), match='08'>
<re.Match object; span=(171, 173), match='12'>
<re.Match object; span=(174, 176), match='20'>
<re.Match object; span=(176, 178), match='12'>
<re.Match object; span=(179, 181), match='06'>
<re.Match object; span=(182, 184), match='07'>
<re.Match object; span=(185, 187), match='20'>
<re.Match object; span=(187, 189), match='15'>
<re.Match object; span=(190, 192), match='08'>
<re.Match object; span=(193, 195), match='08'>
<re.Match object; span=(196, 198), match='08'>
<re.Match object; span=(199, 201), match='04'>
<re.Match object; span=(202, 204), match='08'>
<re.Match object; span=(205, 207), match='08'>
<re.Match object; span=(208, 210), match='09'>
<re.Match object; span=(211, 213), match='09'>
<re.Match object; span=(214, 216), match='20'>
<re.Match object; span=(216, 218), match='19'>
<re.Match object; span=(223, 225), match='20'>
<re.Match object; span=(225, 227), match='19'>
<re.Match object; span=(232, 234), match='20'>
<re.Match obj

#### \D
Not a digit

In [54]:
mypattern = re.compile(r'\D\D')
matches = mypattern.finditer(faketext)
for match in matches:
    print(match)

<re.Match object; span=(0, 2), match='A\n'>
<re.Match object; span=(2, 4), match='\nA'>
<re.Match object; span=(4, 6), match='le'>
<re.Match object; span=(6, 8), match='x\n'>
<re.Match object; span=(8, 10), match='Ge'>
<re.Match object; span=(10, 12), match='or'>
<re.Match object; span=(12, 14), match='gi'>
<re.Match object; span=(14, 16), match='os'>
<re.Match object; span=(16, 18), match='\nS'>
<re.Match object; span=(18, 20), match='ha'>
<re.Match object; span=(20, 22), match='ma'>
<re.Match object; span=(22, 24), match='\nS'>
<re.Match object; span=(24, 26), match='ul'>
<re.Match object; span=(26, 28), match='ei'>
<re.Match object; span=(28, 30), match='ma'>
<re.Match object; span=(30, 32), match='n\n'>
<re.Match object; span=(32, 34), match='Li'>
<re.Match object; span=(34, 36), match='am'>
<re.Match object; span=(36, 38), match='\nO'>
<re.Match object; span=(38, 40), match='li'>
<re.Match object; span=(40, 42), match='vi'>
<re.Match object; span=(42, 44), match='a\n'>
<re.Match o

#### \w
Word character (lowercase and uppercase letters, digits, or underscore)

In [55]:
mypattern = re.compile(r'\w\w\w')
matches = mypattern.finditer(faketext)
for match in matches:
    print(match)

<re.Match object; span=(3, 6), match='Ale'>
<re.Match object; span=(8, 11), match='Geo'>
<re.Match object; span=(11, 14), match='rgi'>
<re.Match object; span=(17, 20), match='Sha'>
<re.Match object; span=(23, 26), match='Sul'>
<re.Match object; span=(26, 29), match='eim'>
<re.Match object; span=(32, 35), match='Lia'>
<re.Match object; span=(37, 40), match='Oli'>
<re.Match object; span=(40, 43), match='via'>
<re.Match object; span=(44, 47), match='Noa'>
<re.Match object; span=(49, 52), match='Emm'>
<re.Match object; span=(54, 57), match='Oli'>
<re.Match object; span=(57, 60), match='ver'>
<re.Match object; span=(61, 64), match='Cha'>
<re.Match object; span=(64, 67), match='rlo'>
<re.Match object; span=(67, 70), match='tte'>
<re.Match object; span=(71, 74), match='Eli'>
<re.Match object; span=(74, 77), match='jah'>
<re.Match object; span=(78, 81), match='Ame'>
<re.Match object; span=(81, 84), match='lia'>
<re.Match object; span=(85, 88), match='Jam'>
<re.Match object; span=(91, 94), matc

#### \W
Not a word character 

In [56]:
mypattern = re.compile(r'\W')
matches = mypattern.finditer(faketext)
for match in matches:
    print(match)

<re.Match object; span=(1, 2), match='\n'>
<re.Match object; span=(2, 3), match='\n'>
<re.Match object; span=(7, 8), match='\n'>
<re.Match object; span=(16, 17), match='\n'>
<re.Match object; span=(22, 23), match='\n'>
<re.Match object; span=(31, 32), match='\n'>
<re.Match object; span=(36, 37), match='\n'>
<re.Match object; span=(43, 44), match='\n'>
<re.Match object; span=(48, 49), match='\n'>
<re.Match object; span=(53, 54), match='\n'>
<re.Match object; span=(60, 61), match='\n'>
<re.Match object; span=(70, 71), match='\n'>
<re.Match object; span=(77, 78), match='\n'>
<re.Match object; span=(84, 85), match='\n'>
<re.Match object; span=(90, 91), match='\n'>
<re.Match object; span=(94, 95), match='\n'>
<re.Match object; span=(102, 103), match='\n'>
<re.Match object; span=(109, 110), match='\n'>
<re.Match object; span=(118, 119), match='\n'>
<re.Match object; span=(127, 128), match='\n'>
<re.Match object; span=(133, 134), match='\n'>
<re.Match object; span=(137, 138), match='\n'>
<re.

#### \s
Whitespace (space, tab, newline)

In [57]:
mypattern = re.compile(r'\s')
matches = mypattern.finditer(faketext)
for match in matches:
    print(match)

<re.Match object; span=(1, 2), match='\n'>
<re.Match object; span=(2, 3), match='\n'>
<re.Match object; span=(7, 8), match='\n'>
<re.Match object; span=(16, 17), match='\n'>
<re.Match object; span=(22, 23), match='\n'>
<re.Match object; span=(31, 32), match='\n'>
<re.Match object; span=(36, 37), match='\n'>
<re.Match object; span=(43, 44), match='\n'>
<re.Match object; span=(48, 49), match='\n'>
<re.Match object; span=(53, 54), match='\n'>
<re.Match object; span=(60, 61), match='\n'>
<re.Match object; span=(70, 71), match='\n'>
<re.Match object; span=(77, 78), match='\n'>
<re.Match object; span=(84, 85), match='\n'>
<re.Match object; span=(90, 91), match='\n'>
<re.Match object; span=(94, 95), match='\n'>
<re.Match object; span=(102, 103), match='\n'>
<re.Match object; span=(109, 110), match='\n'>
<re.Match object; span=(118, 119), match='\n'>
<re.Match object; span=(127, 128), match='\n'>
<re.Match object; span=(133, 134), match='\n'>
<re.Match object; span=(137, 138), match='\n'>
<re.

#### \S
Not whitespace

In [58]:
mypattern = re.compile(r'\S')
matches = mypattern.finditer(faketext)
for match in matches:
    print(match)

<re.Match object; span=(0, 1), match='A'>
<re.Match object; span=(3, 4), match='A'>
<re.Match object; span=(4, 5), match='l'>
<re.Match object; span=(5, 6), match='e'>
<re.Match object; span=(6, 7), match='x'>
<re.Match object; span=(8, 9), match='G'>
<re.Match object; span=(9, 10), match='e'>
<re.Match object; span=(10, 11), match='o'>
<re.Match object; span=(11, 12), match='r'>
<re.Match object; span=(12, 13), match='g'>
<re.Match object; span=(13, 14), match='i'>
<re.Match object; span=(14, 15), match='o'>
<re.Match object; span=(15, 16), match='s'>
<re.Match object; span=(17, 18), match='S'>
<re.Match object; span=(18, 19), match='h'>
<re.Match object; span=(19, 20), match='a'>
<re.Match object; span=(20, 21), match='m'>
<re.Match object; span=(21, 22), match='a'>
<re.Match object; span=(23, 24), match='S'>
<re.Match object; span=(24, 25), match='u'>
<re.Match object; span=(25, 26), match='l'>
<re.Match object; span=(26, 27), match='e'>
<re.Match object; span=(27, 28), match='i'>
<

### Anchors and Boundaries
They do not match a character but positions

#### \b
Word boundary 

In [59]:
mypattern = re.compile(r'\bA..')
matches = mypattern.finditer(faketext)
for match in matches:
    print(match)

<re.Match object; span=(3, 6), match='Ale'>
<re.Match object; span=(78, 81), match='Ame'>
<re.Match object; span=(91, 94), match='Ava'>


#### \B
Not a word boundary 

#### ^
Beginning of the string

In [60]:
mypattern = re.compile(r'^')
matches = mypattern.finditer(faketext)
for match in matches:
    print(match)

<re.Match object; span=(0, 0), match=''>


#### $
End of the string

#### Phone numbers

In [61]:
mypattern = re.compile(r'\d\d\d.\d\d\d.\d\d\d\d')
matches = mypattern.finditer(faketext)
for match in matches:
    print(match)

<re.Match object; span=(238, 250), match='202-555-0166'>
<re.Match object; span=(251, 263), match='201*555*0177'>
<re.Match object; span=(280, 292), match='201)555-0178'>
<re.Match object; span=(293, 305), match='202 555 0198'>
<re.Match object; span=(306, 318), match='900-555-0166'>
<re.Match object; span=(319, 331), match='800*555*0177'>
<re.Match object; span=(348, 360), match='800)555-0178'>
<re.Match object; span=(361, 373), match='900 555 0198'>
<re.Match object; span=(374, 386), match='800.555.0199'>
<re.Match object; span=(387, 399), match='800.555.0152'>
<re.Match object; span=(400, 412), match='201.555.0199'>
<re.Match object; span=(413, 425), match='201.555.0152'>


#### Quantifiers

| Quantifier | Meaning |
| ----------- | ----------- 
| a? | Zero or one of a |
| a* | Zero or more of a |
| a+ | One or more of a |
| [0-9]+ | One or more of 0-9 |
| a{3} | Exactly 3 of a |
| a{3,} | 3 or more of a |
| a{3,6} | Between 3 and 6 of a |
| a* | Greedy quantifier |
| a*? | Lazy quantifier |
| a*+ | Possessive quantifier |

In [62]:
sentence = """
coffe coffee please sir I need some more coffee please"
"""

In [63]:
[a-zA-Z]{1,3}

SyntaxError: invalid syntax (3364947919.py, line 1)

In [None]:
quote = re.findall('co.*ee', sentence)
print(quote)

In [None]:
quote = re.findall('co.*?ee', sentence)
print(quote)

#### Simplify the above regex

In [65]:
mypattern = re.compile(r'\800.\d{3}.\d{4}.')
matches = mypattern.finditer(faketext)
for match in matches:
    print(match)

error: invalid group reference 80 at position 1

#### Get all phone numbers

#### Get all phone numbers that begin with 800

In [None]:
mypattern = re.compile(r'800.\d{3}.\d{4}.')
matches = mypattern.finditer(faketext)
for match in matches:
    print(match)

#### -
Range e.g., [1-5] or [a-z] or [A-Z]

#### ^-
Not the range e.g., [^1-5] or [^a-z] or [^A-Z]

#### Get all phone numbers that begin with 800 or 900

#### Get all the prefixes with the attached names

In [68]:
mypattern = re.compile(r'(Mr|Ms|Mrs|Dr|Prof). [A-Z]\w*\b')
matches = mypattern.finditer(faketext)
for match in matches:
    print(match)

<re.Match object; span=(428, 437), match='Mr. Darcy'>
<re.Match object; span=(438, 450), match='Dr. Tsolakis'>
<re.Match object; span=(451, 466), match='Prof. Cartledge'>
<re.Match object; span=(467, 472), match='Mr. T'>
<re.Match object; span=(473, 485), match='Mrs Robinson'>
<re.Match object; span=(486, 494), match='Mr. Bean'>


#### Match the emails

In [71]:
mypattern = re.compile(r'[a-zA-Z0-9\-]+@[a-zA-Z0-9\-]+.[a-zA-Z0-9\-]+')
matches = mypattern.finditer(faketext)
for match in matches:
    print(match)

<re.Match object; span=(509, 526), match='JohnDoe@gmail.com'>
<re.Match object; span=(532, 545), match='Doe@gmail.com'>
<re.Match object; span=(546, 566), match='JohnDoe@facebook.net'>
<re.Match object; span=(572, 588), match='doe@uchicago.edu'>
<re.Match object; span=(597, 621), match='jakubowski@gutkowski.com'>
<re.Match object; span=(630, 648), match='cierra@bradtke.com'>
<re.Match object; span=(649, 669), match='nromaguera@yahoo.com'>
<re.Match object; span=(670, 688), match='john-doe@gmail.com'>
<re.Match object; span=(689, 710), match='johndoe1990@gmail.com'>


#### Replace

#### sub

In [None]:
dates = """08-12-2012
06-07-2015
08/08/08
04.08.08
09.09.2019
9.9.2019
8.9.2019"""

#### findall

#### search
Returns the first match otherwise it returns None

#### Flags

#### Write a regex that finds all roman numberals and print only how many roman numerals exist in the string.
| Roman | Value |
| --- | --- |
| I | 1 |
| II | 2 |
| III | 3 |
| IV | 4 |
| IX | 9 |
| X | 10 |
| L | 50 |
| C | 100 |
| D | 500 |
| M | 1000 |

In [5]:
numbers = """
DCXXXIV
CXXXI
CCXXXI
DCXX
DXCIX
DCIV
DCLVII
CLXXXV
XVI
CV
MCCLXXIV
CMIX
DXXXI
DL
DXCI
CDLXXIX
DLXII
CMXLII
CDX
CLXXXVIII
CDV
CXCI
XLII
LXVI
DCCXLVI
CDLXXXVII
"""

In [15]:
numbers2 = "DCXXXIV, CXXXI, CCXXXI, DCXX, DXCIX, DCIV, DCLVII, CLXXXV, XVI, CV, MCCLXXIV, CMIX, DXXXI, DL, DXCI, CDLXXIX, DLXII, CMXLII, CDX, CLXXXVIII, CDV, CXCI, XLII, LXVI, DCCXLVI, CDLXXXVII"

In [24]:
numberpattern = re.findall(r'^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3}),$', numbers)
print(numberpattern)

[]


In [22]:
mypattern = re.compile(r'^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$')
matches = mypattern.finditer(numbers)
for match in matches:
    print(match)

In [21]:
resultnumbers = re.finditer(r'^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3}),$', numbers2)
print(resultnumbers)

<callable_iterator object at 0x7fd300affcd0>


----