# Advanced Regex

## Regular Expression review

- A powerful way to match text

In [1]:
import pandas as pd
import numpy as np

import re

https://regexper.com/#%28%5E%7C%5C%28%29%3F

In [5]:
'andre Andre aguiar'.replace('andre', 'joao')

'joao Andre aguiar'

In [6]:
text = "That person wears marvelous trousers."

### `Literal strings` vs `sets`

In [8]:
# literal strings: find the pattern 'person'
pattern = 'person'
re.findall(pattern, text)

['person']

In [10]:
pattern = 'persona'
re.findall(pattern, text)

[]

In [11]:
pattern = 'person'
re.sub(pattern,'man', text)

'That man wears marvelous trousers.'

In [12]:
# sets: Finding the pattern `p` or `e` or `r` or ...
pattern = '[person]'
print(re.findall(pattern, text))

['p', 'e', 'r', 's', 'o', 'n', 'e', 'r', 's', 'r', 'e', 'o', 's', 'r', 'o', 's', 'e', 'r', 's']


In [13]:
text = 'andre Andre aguiar'

pattern = '[Aa]ndre'
re.findall(pattern, text)

['andre', 'Andre']

In [14]:
text = 'São Paulo Sao Paulo Sáo Paulo Sun Paulo seu paulo san paolo sao paulo são paolo sAo Paolo sao_paulo'

pattern = '[Ss][ãaáàâAÃÁÀâeu][oun][ _][Pp]a[uo]lo'
print(re.sub(pattern, 'São Paulo\n', text))

São Paulo
 São Paulo
 São Paulo
 São Paulo
 São Paulo
 São Paulo
 São Paulo
 São Paulo
 São Paulo
 São Paulo



In [15]:
text = "Is it spelled gray or grey?"

pattern = 'gr[ae]y'
re.findall(pattern, text)

['gray', 'grey']

> Anything inside brackets `[ ]` is treated as a `set` in RegEx: a set of characters, any one of which may match at that position.

## Since it is a set, you can look for complete sets

For example: The set of upper-case letters from A to C.

In [16]:
# Range inside a set: `[A-C]` is shorthand for `[ABC]`.
text = "This is an A and B conversation, so C your way out of it, or Even F."
pattern = '[A-C]'

re.compile(pattern).findall(text)

['A', 'B', 'C']

In [17]:
pattern = '[A-Z]'
re.findall(pattern, text)

['T', 'A', 'B', 'C', 'E', 'F']

In [18]:
text = "I'm not going to 0A the party because 1) Karen is going, 2) I don't like her, and 3) 3B I already have a headache."

pattern = '[1-3]'
re.findall(pattern, text)

['1', '2', '3', '3']

In [19]:
pattern = '[0-9]'
re.findall(pattern, text)

['0', '1', '2', '3', '3']

In [None]:
pattern = '[0-9A-Z]'
re.findall(pattern, text)

In [None]:
# pattern = '[0123456789ABCDEFG..]'
# re.findall(pattern, text)

In [20]:
pattern = '[0-9][A-Z]'
re.findall(pattern, text)

['0A', '3B']

Some useful sets: 

* [a-z]: Any lowercase letter between a and z.
* [A-Z]: Any uppercase letter between A and Z.
* [0-9]: Any numeric character between 0 and 9.

In [None]:
pattern = '[^0-9 a-z]'
re.findall(pattern, text)

# Meta characters - They mean something different from the literal character they represent.

* `.` : Match **any character** except newline (`\n`)
* `^` : If used within a `set`, negates the condition (similar to `~` in python)
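> Careful, this character also has a second meaning: if used <u>outside a set</u>, it means `match only at the beginning of the string` (or at the beginning of each line when the `re.MULTILINE` flag is passed)
* `$` : Match only at the end of the string (or at the end of each line when the `re.MULTILINE` flag is passed)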
* `|` : "OR" operator

## OR

In [21]:
text = 'Andre andre'

In [22]:
pattern = '[Aa]'
re.findall(pattern, text)

['A', 'a']

In [23]:
pattern = 'A|a'
re.findall(pattern, text)

['A', 'a']

In [24]:
# Alternation: `|` matches either the whole word on its left or the one on its right.
text = '''
I like penguins
I like lions
I like penguins and lions
'''
pattern = 'penguins|lions'

re.compile(pattern).findall(text)

['penguins', 'lions', 'penguins', 'lions']

In [27]:
print(re.sub(pattern, 'dogs', text))


I like dogs
I like dogs
I like dogs and dogs



## Match any character

In [28]:
text = """My boss asked me to turn in my TPS reports. 
I told him they were done, but they are not."""

pattern = '.'
print(re.findall(pattern, text))

['M', 'y', ' ', 'b', 'o', 's', 's', ' ', 'a', 's', 'k', 'e', 'd', ' ', 'm', 'e', ' ', 't', 'o', ' ', 't', 'u', 'r', 'n', ' ', 'i', 'n', ' ', 'm', 'y', ' ', 'T', 'P', 'S', ' ', 'r', 'e', 'p', 'o', 'r', 't', 's', '.', ' ', 'I', ' ', 't', 'o', 'l', 'd', ' ', 'h', 'i', 'm', ' ', 't', 'h', 'e', 'y', ' ', 'w', 'e', 'r', 'e', ' ', 'd', 'o', 'n', 'e', ',', ' ', 'b', 'u', 't', ' ', 't', 'h', 'e', 'y', ' ', 'a', 'r', 'e', ' ', 'n', 'o', 't', '.']


## Match everything not in specific set

In [29]:
text = """My boss asked me to turn in my TPS reports. 
I told him they were done, but they are not."""

In [30]:
pattern = '[^a-m]'
print(re.findall(pattern, text))

['M', 'y', ' ', 'o', 's', 's', ' ', 's', ' ', ' ', 't', 'o', ' ', 't', 'u', 'r', 'n', ' ', 'n', ' ', 'y', ' ', 'T', 'P', 'S', ' ', 'r', 'p', 'o', 'r', 't', 's', '.', ' ', '\n', 'I', ' ', 't', 'o', ' ', ' ', 't', 'y', ' ', 'w', 'r', ' ', 'o', 'n', ',', ' ', 'u', 't', ' ', 't', 'y', ' ', 'r', ' ', 'n', 'o', 't', '.']


## Match sentences `beginning with pattern`

In [40]:
text = '''My boss asked me to turn in my TPS reports. 
The boss told him they were done, but they are not.'''

In [42]:
pattern = '^My.boss'
print(re.findall(pattern, text))

['My boss']


In [32]:
# NOTE: the brackets make this a *set*, not the phrase "The boss": it asks for
# ONE character out of 'T', 'h', 'e', ' ', 'b', 'o', 's' at the very start of
# the string. The text defined above starts with 'M', so nothing matches.
pattern = '^[The boss]'
print(re.findall(pattern, text))

[]


In [33]:
pattern = '^turn'
print(re.findall(pattern, text))

[]


In [34]:
# Without the re.MULTILINE flag, `$` only matches at the end of the whole
# string (or just before a final newline), not at the end of every line, so a
# sentence that ends an earlier line of a multi-line text is not found.
pattern = 'reports.$'
print(re.findall(pattern, text))

[]


In [35]:
pattern = 'are not.$'
print(re.findall(pattern, text))

['are not.']


In [36]:
text = 'Andre andre aoijo (  $ p io x -o = 3232 13 ™¡¡™£¡Ωå'

pattern = '.'
print(re.findall(pattern, text))

['A', 'n', 'd', 'r', 'e', ' ', 'a', 'n', 'd', 'r', 'e', ' ', 'a', 'o', 'i', 'j', 'o', ' ', '(', ' ', ' ', '$', ' ', 'p', ' ', 'i', 'o', ' ', 'x', ' ', '-', 'o', ' ', '=', ' ', '3', '2', '3', '2', ' ', '1', '3', ' ', '™', '¡', '¡', '™', '£', '¡', 'Ω', 'å']


## Character classes

* `\d`: numeric characters
* `\w`: word characters (letters, digits and the underscore `_`)
* `\s`: whitespace characters (space, tab, newline, ...)
* `\D`: not numeric characters

In [43]:
text = 'Andre andre aoijo (  $ p io x -o = 3232 13 ™¡¡™£¡Ωå 3.1 áéóãà'

# Raw string (r'...'): hands the backslash to the regex engine untouched and
# avoids Python's "invalid escape sequence" warning for '\d'.
pattern = r'\d'
print(re.findall(pattern, text))

['3', '2', '3', '2', '1', '3', '3', '1']


In [44]:
# Negated set: every character that is NOT a digit; r'\D' is the shorthand.
# Raw strings keep '\d' / '\D' from being read as (invalid) Python escapes.
pattern = r'[^\d]'
# pattern = r'\D'

print(re.findall(pattern, text))

['A', 'n', 'd', 'r', 'e', ' ', 'a', 'n', 'd', 'r', 'e', ' ', 'a', 'o', 'i', 'j', 'o', ' ', '(', ' ', ' ', '$', ' ', 'p', ' ', 'i', 'o', ' ', 'x', ' ', '-', 'o', ' ', '=', ' ', ' ', ' ', '™', '¡', '¡', '™', '£', '¡', 'Ω', 'å', ' ', '.', ' ', 'á', 'é', 'ó', 'ã', 'à']


In [45]:
# Raw string so '\w' reaches the regex engine without an invalid-escape warning.
pattern = r'\w'

print(re.findall(pattern, text))

['A', 'n', 'd', 'r', 'e', 'a', 'n', 'd', 'r', 'e', 'a', 'o', 'i', 'j', 'o', 'p', 'i', 'o', 'x', 'o', '3', '2', '3', '2', '1', '3', 'Ω', 'å', '3', '1', 'á', 'é', 'ó', 'ã', 'à']


# Quantifiers 

* `*`: Matches the previous token (character, set or group) 0 or more times
* `+`: Matches the previous token 1 or more times
* `?`: Matches the previous token 0 or 1 times (optional)
* `{}`: Matches the previous token the number of times specified inside the braces:
* `{n}` : Exactly n times
* `{n,}` : At least n times
* `{n,m}` : Between n and m times

## \d* --> Matches any numeric character that appears 0 or more times.

In [49]:
text = 'Andre andre aoijo (  $ p io x -o = 3232 13 ™¡¡™£¡Ωå 3.1 áéóãà'

# `*` allows zero repetitions, so the pattern also "matches" the empty string
# at every position that is not a digit; hence all the '' entries in the output.
# Raw string avoids the invalid-escape warning for '\d'.
pattern = r'\d*'
print(re.findall(pattern, text))

['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '3232', '', '13', '', '', '', '', '', '', '', '', '', '', '3', '', '1', '', '', '', '', '', '', '']


In [None]:
## \d+ --> Matches any numeric character that appears 1 or more times.

In [50]:
text = 'Andre andre aoijo (  $ p io x -o = 3232 13 ™¡¡™£¡Ωå 3.1 áéóãà'

# `+` needs at least one digit, so no empty matches are produced.
# Raw string avoids the invalid-escape warning for '\d'.
pattern = r'\d+'
print(re.findall(pattern, text))

['3232', '13', '3', '1']


In [51]:
text = 'Andre andre aoijo (  $ p io x -o = 3232 13 ™¡¡™£¡Ωå 3.1 áéóãà'

# digits, an optional literal dot, then digits again. Both `\d+` parts are
# mandatory, so this needs at least two digits in total.
# Raw string avoids the invalid-escape warnings for '\d' and '\.'.
pattern = r'\d+\.?\d+'
print(re.findall(pattern, text))

['3232', '13', '3.1']


In [58]:
text = 'Oi, meu nome é Andre, meu telefone é 93333-3333 ou 933333333, meu e-mail é andre.aguiar@ironhack.com'

In [53]:
# exactly four digits per match; raw string avoids the invalid '\d' escape warning
re.findall(r'\d{4}', text)

['9333', '3333', '9333', '3333']

In [54]:
# exactly five digits per match; raw string avoids the invalid '\d' escape warning
re.findall(r'\d{5}', text)

['93333', '93333']

In [55]:
# every single digit is replaced on its own; raw string avoids the invalid '\d' escape warning
re.sub(r'\d', '100', text)

'Oi, meu nome é Andre, meu telefone é 100100100100100-100100100100 ou 100100100100100100100100100, meu e-mail é andre.aguiar@ironhack.com'

In [60]:
# each run of exactly four digits is replaced; leftover digits stay in place
# (raw string avoids the invalid '\d' escape warning)
re.sub(r'\d{4}', '100', text)

'Oi, meu nome é Andre, meu telefone é 1003-100 ou 1001003, meu e-mail é andre.aguiar@ironhack.com'

In [61]:
# digits, an optional hyphen, digits: replaces each whole phone number at once
# (raw string avoids the invalid '\d' escape warning)
re.sub(r'\d+-?\d+', '100', text)

'Oi, meu nome é Andre, meu telefone é 100 ou 100, meu e-mail é andre.aguiar@ironhack.com'

## Application of previous example of `$` using one of the most useful quantifiers `*`

In [73]:
text = '''My boss asked me to turn in my TPS reports. My boss told him they were done, but they are not.'''

In [74]:
# `\.` is a literal full stop (a bare `.` would match any character).
# Raw string avoids the invalid-escape warning for '\.'.
pattern = r'are not\.$'
print(re.findall(pattern, text))

['are not.']


In [77]:
# `.*` pulls in everything on the same line that precedes "are not.".
# Raw string avoids the invalid-escape warning for '\.'.
pattern = r'.*are not\.$'
print(re.findall(pattern, text))

['My boss asked me to turn in my TPS reports. My boss told him they were done, but they are not.']


In [None]:
# `.` never crosses a newline, so the line break is spelled out with `\n`.
# This only matches when the text actually contains a newline before the
# final sentence; a single-line text yields an empty list.
# Raw string: the regex engine interprets `\n` itself, and '\.' no longer
# triggers an invalid-escape warning.
pattern = r'.*\n.*are not\.$'
print(re.findall(pattern, text))

In [None]:
text

In [None]:
pattern = ',.*are not.$'
print(re.findall(pattern, text))

In [None]:
text = '''My boss asked, me to turn in my TPS reports. 
My boss told, him they were done, but they, are not.'''

In [None]:
pattern = ',.*are not.$'
print(re.findall(pattern, text))

In [None]:
text = '''My boss asked, me to turn in my TPS reports. 
My boss (told him they) were done (but they) are not.'''

In [None]:

# `\(` and `\)` are literal parentheses; `.*?` is lazy, so each match stops at
# the first closing parenthesis. Raw string avoids invalid-escape warnings.
pattern = r'\(.*?\)'
print(re.findall(pattern, text))

In [None]:
re.findall('coisas?','coisa coisas')

# Capturing group

What if I wanted to capture only things up until the comma (`,`), however, not include the comma?

I would have to use a capturing group to specify what specifically I want to capture.

In [83]:
text = '''My boss asked, me to turn in my TPS reports. 
My boss (told him they) were done (but they) are not.'''

In [84]:
# Escaped `\(` `\)` match the literal parentheses; the unescaped inner `( )`
# is the capturing group, so findall returns only the text between them.
# Raw string avoids invalid-escape warnings.
pattern = r'\((.*?)\)'
print(re.findall(pattern, text))

['told him they', 'but they']


In [81]:
text = '''My boss asked, me to turn in my TPS reports. 
My boss -told him they- were done -but they- are not.'''

In [85]:
pattern = '-(.*?)-'
print(re.findall(pattern, text))

['told him they', 'but they']


In [86]:
pattern = '-(.*?)-'
print(re.findall(pattern, text))

['told him they', 'but they']


In [None]:
text = '''My boss asked, me to turn in my TPS reports. 
My boss told, him they were done, but they, are ,not.'''

In [None]:
pattern = ',(.*?),'
print(re.findall(pattern, text))

In [87]:
text = "TerraPower, a nuclear-energy company founded by Bill Gates, is unlikely to follow through on building a demonstration reactor in China, due largely to the Trump administration’s crackdown on the country."

pattern = '[A-Z][a-zA-Z]+'
print(re.findall(pattern, text))

['TerraPower', 'Bill', 'Gates', 'China', 'Trump']


In [88]:
pattern = '([A-Z][a-zA-Z]+ ?[A-Z][a-zA-Z]+)|([A-Z][a-z]+)'

In [90]:
print(re.findall(pattern, text))

[('TerraPower', ''), ('Bill Gates', ''), ('', 'China'), ('', 'Trump')]


In [91]:
# With the two-group pattern defined above, findall returns one
# (combined, simple) tuple per match, with '' for the alternative that did not
# take part. Scan the text once and split the tuples by group.
name_matches = re.findall(pattern, text)
simple_names = [simple for combined, simple in name_matches if simple != '']
combined_names = [combined for combined, simple in name_matches if combined != '']

In [92]:
combined_names

['TerraPower', 'Bill Gates']

In [None]:
pattern = '([A-Z][a-z]+)|([A-Z][a-zA-Z]+ ?[A-Z][a-zA-Z]+)'

In [None]:
print(re.findall(pattern, text))

## Using quantifiers again

> `*` matches **0 or more** times

In [None]:
text = "The cacat complicit cat interacted with the other cats exactly as we expected caaaat."

# "c", an optional "a", any run of word characters, then "t".
# Raw string avoids the invalid-escape warning for '\w'.
pattern = r"ca?\w*t"
print(re.findall(pattern, text))

In [None]:
pattern = "ca+t"
print(re.findall(pattern, text))

> `?` matches previous pattern 0 or 1 time

In [None]:
text = "The colonel likes the color blue"

pattern = "colou?r"
print(re.findall(pattern, text))

How does the Regex engine work?

In [None]:
text = "Is the correct spelling color, colour, or colr?"

pattern = "colou?r"
print(re.findall(pattern, text))

# Important Regex Concept: Greediness


What will this match?

In [93]:
# Greedy quantifier: `.*` grabs as much as it can, so the match runs up to the
# LAST exclamation point in the text.
text = 'You are yelling! So I will yell too! Let me yell!.'
pattern = ".*!"

greedy_matches = re.compile(pattern).findall(text)
print(greedy_matches)

['You are yelling! So I will yell too! Let me yell!']


In [94]:
pattern = ".*?!"
re.findall(pattern, text)

['You are yelling!', ' So I will yell too!', ' Let me yell!']

In [95]:
text = "Let's see how we can match the following: aw, aww, awww, awwww, awwwww"

pattern = "aw{2}"
print(re.findall(pattern, text))

['aww', 'aww', 'aww', 'aww']


In [96]:
text = "Let's see how we can match the following: aw, aww, awww, awwww, awwwww"

pattern = "aw{2,}"
print(re.findall(pattern, text))

['aww', 'awww', 'awwww', 'awwwww']


In [97]:
text = "Let's see how we can match the following: aw, aww, awww, awwww, awwwww"

pattern = "aw{2,3}"
print(re.findall(pattern, text))

['aww', 'awww', 'awww', 'awww']


In [98]:
text = "Ooooooiiiii gente"

# `{1,}` and `+` both mean "one or more". The long-hand form used to be
# assigned to `pattern` and immediately overwritten, so it never ran; the
# assert below makes the equivalence explicit instead.
pattern = "[Oo]+i+"
assert re.findall("[Oo]{1,}i{1,}", text) == re.findall(pattern, text)
print(re.findall(pattern, text))

['Ooooooiiiii']


In [99]:
text = "If you tell the truth 1 time, you don't have to remember anything 2 times."

# Runs of word characters; the apostrophe is not a word character, so
# "don't" is split in two. Raw string avoids the invalid '\w' escape warning.
pattern = r'\w+'
print(re.findall(pattern, text))

['If', 'you', 'tell', 'the', 'truth', '1', 'time', 'you', 'don', 't', 'have', 'to', 'remember', 'anything', '2', 'times']


In [100]:
## word length: runs of four or more word characters
# Raw string avoids the invalid-escape warning for '\w'.
pattern = r'\w{4,}'
print(re.findall(pattern, text))

['tell', 'truth', 'time', 'have', 'remember', 'anything', 'times']


In [101]:
text = "TerraPower, a nuclear-energy company founded by Bill Gates, is unlikely to follow through on building a demonstration reactor in China, due largely to the Trump administration’s crackdown on the country."

pattern = '[A-Z][a-z]+'
print(re.findall(pattern, text))

['Terra', 'Power', 'Bill', 'Gates', 'China', 'Trump']


https://phoneregex.com/