# REGular EXpression - INTRO TO REGEX

In [2]:
# your code here!
import re

1. `re.search`: Returns a match object for the first occurrence of a pattern in a string, or `None` if there is no match.
2. `re.findall`: Finds all non-overlapping occurrences of a pattern in a string and returns them as a list.
3. `re.split`: Splits a string wherever the pattern matches (the pattern acts as the delimiter).
4. `re.sub`: Replaces every match of a pattern in a string with another string.

Website to visually see what your regular expressions look like: https://regexper.com/

In [5]:
# A pattern without special characters matches that exact text.
text = 'My neighbor, Mr. Rogers, has 5 dogs.'
pattern = 'neighbor'

# findall returns a list with every occurrence of the pattern in text.
re.findall(pattern, text)

['neighbor']

In [7]:
# The pattern matches substrings, not whole words: 'dog' is found inside
# each 'dogs'.
text = 'My neighbor, Mr. Rogers, has 5 dogs. I love those dogs.'
pattern = 'dog'

re.findall(pattern, text)

['dog', 'dog']

In [8]:
# When nothing matches, findall returns an empty list.
text = 'My neighbor, Mr. Rogers, has 5 dogs. I love those dogs.'
pattern = 'cat'

re.findall(pattern, text)

[]

In [17]:
text = 'My neighbor, Mr. Rogers, has 5 dogs. I love those dogs.'
# An unescaped '.' means "any character"; '\.' matches a literal period.
# The raw string (r'...') hands the backslash to the regex engine untouched
# and avoids Python's invalid-escape-sequence warning for '\.'.
pattern = r'\.'

re.findall(pattern, text)

['.', '.', '.']

## Introducing Sets

In [22]:
# Each [...] set matches exactly one character out of the listed options,
# so this pattern accepts 'my' in any mix of upper and lower case.
text = 'My neighbor, Mr. Rogers, has 5 dogs.'
pattern = '[Mm][Yy]'

re.findall(pattern, text)

['My']

In [26]:
# Only the first letter is a set: matches both 'Rogers' and 'rogers'.
text = 'My neighbor, Mr. Rogers, has 5 rogers.'
pattern = '[Rr]ogers'

re.findall(pattern, text)

['Rogers', 'rogers']

In [30]:
# search stops at the first match and returns a Match object (position and
# matched text) instead of a list. Relies on `pattern` and `text` already
# being defined by an earlier cell.
re.search(pattern, text)

<re.Match object; span=(17, 23), match='Rogers'>

In [40]:
text = 'My neighbor, Mr. Rogers, has 5 rogers.'

# '|' is alternation: match 'Rogers'/'rogers', or 'My', or '5'.
pattern = '[Rr]ogers|My|5'

# sub returns a new string with every match replaced by 'Madu'.
re.sub(pattern, 'Madu', text)

'Madu neighbor, Mr. Madu, has Madu Madu.'

In [53]:
# Many spellings of the same city name; the goal is one pattern that
# accepts the variants using only sets (one character per set).
text = 'Sáo Paulo São Paulo Sao Paulo Sao Paolo San Pablo sao paulo sao Paulo são Paulo sao-paulo são paulo São Paulo Saon Paulo'

# NOTE(review): in '[n -]' the hyphen is last, so it is a literal '-'.
# In '[ -Pp]' the hyphen sits between ' ' and 'P', so it defines the
# character range from space to 'P' (plus 'p'), not a literal hyphen.
# Presumably '[Pp]' was intended -- confirm before reusing this pattern.
pattern = '[Ss][aáã][on][n -][ -Pp][aáã][buo]lo'

re.findall(pattern, text)

['Sáo Paulo',
 'São Paulo',
 'Sao Paulo',
 'Sao Paolo',
 'San Pablo',
 'sao paulo',
 'sao Paulo',
 'são Paulo',
 'sao-paulo',
 'são paulo',
 'São Paulo']

# Pattern sets:

Range

1. [a-z]: Any lowercase letter between a and z.
2. [A-Z]: Any uppercase letter between A and Z.
3. [0-9]: Any numeric character between 0 and 9.

In [62]:
text = 'My neighbor, Mr. Rogers, has 5 rogers. [A-Z]'
# One uppercase letter, one lowercase letter, then a lowercase letter or a
# literal period. Raw string so that '\.' reaches the regex engine without
# triggering Python's invalid-escape-sequence warning.
pattern = r'[A-Z][a-z][a-z\.]'

re.findall(pattern, text)

['Mr.', 'Rog']

In [67]:
# A range does not have to span the whole class: [4-9] matches a single
# digit from 4 to 9.
text = 'My neighbor, Mr. Rogers, has 5 rogers. [A-Z]'
pattern = '[4-9]'

re.findall(pattern, text)

['5']

In [75]:
# Ranges can be concatenated inside one set: uppercase, lowercase and digits.

text = 'My neighbor, Mr. Rogers, has 5 rogers. [A-Z] π'
pattern = '[A-Za-z0-9]'

# Alternative: list every match instead of only the first one.
# print(re.findall(pattern, text))

# search returns only the first match, as a Match object.
print(re.search(pattern, text))

<re.Match object; span=(0, 1), match='M'>


The opposite: 
- `^` as the first character inside a set negates it: `[^...]` matches any single character that is NOT listed in the set.

In [76]:
# Leading '^' negates the set: match any character that is not an ASCII
# letter or digit. Reuses `text` from an earlier cell.
pattern = '[^A-Za-z0-9]'

re.findall(pattern, text)

[' ',
 ',',
 ' ',
 '.',
 ' ',
 ',',
 ' ',
 ' ',
 ' ',
 '.',
 ' ',
 '[',
 '-',
 ']',
 ' ',
 'π']

# Meta Characters:

Characters that have a special meaning instead of matching themselves literally.

1. `\w`: Any word character: letters, digits and underscore (Unicode letters such as `α` included).
2. `\d`: Any digit.
3. `.` : Any character except newline (`\n`).

In [87]:
# Sample text for the meta-character examples. It ends with a literal
# backslash followed by 'd'; the backslash is written as '\\' because '\d'
# is not a valid escape sequence in a normal (non-raw) string literal.
text = 'My neighbor, Mr. Rogers, ] has 5 - dogs 10. α π \\d'

In [83]:
# Explicit ranges cover ASCII letters and digits only. Reuses `text` from
# an earlier cell.
pattern = '[A-Za-z0-9]'
print(re.findall(pattern, text))

['M', 'y', 'n', 'e', 'i', 'g', 'h', 'b', 'o', 'r', 'M', 'r', 'R', 'o', 'g', 'e', 'r', 's', 'h', 'a', 's', '5', 'd', 'o', 'g', 's', '1', '0']


In [84]:
# '\w' matches any word character; unlike '[A-Za-z0-9]' it also accepts
# non-ASCII letters and the underscore. Raw string so the backslash reaches
# the regex engine without an invalid-escape-sequence warning.
pattern = r'\w'
print(re.findall(pattern, text))

['M', 'y', 'n', 'e', 'i', 'g', 'h', 'b', 'o', 'r', 'M', 'r', 'R', 'o', 'g', 'e', 'r', 's', 'h', 'a', 's', '5', 'd', 'o', 'g', 's', '1', '0', 'α', 'π']


In [101]:
# In a raw string, '\\d' is the regex "escaped backslash, then d": it
# matches the literal two-character text \d, not a digit.
pattern = r'\\d' # not equivalent to [0-9]; an unescaped \d would be
print(re.findall(pattern, text)[0])

\d


In [99]:
# An unescaped '.' matches any single character except a newline, so every
# character of `text` comes back as its own match.
pattern = '.'
print(re.findall(pattern, text))

['M', 'y', ' ', 'n', 'e', 'i', 'g', 'h', 'b', 'o', 'r', ',', ' ', 'M', 'r', '.', ' ', 'R', 'o', 'g', 'e', 'r', 's', ',', ' ', ']', ' ', 'h', 'a', 's', ' ', '5', ' ', '-', ' ', 'd', 'o', 'g', 's', ' ', '1', '0', '.', ' ', 'α', ' ', 'π', ' ', '\\', 'd']


## Quantifiers

1. `*`: 0 or more
2. `?`: 0 or 1
3. `+`: 1 or more
4. `{n}`: exactly n

In [147]:
# Sample text shared by the quantifier examples that follow.
text = '''do My neighbor, Mr. Rogers, has 5 -m dogs and 100 cats and β sheeps. doc1s'''

In [115]:
# '\d+' matches a run of one or more digits, so multi-digit numbers come
# back as a single match. Raw string avoids the invalid-escape-sequence
# warning that a plain '\d' literal triggers.
re.findall(r'\d+', text)

['5', '100']

In [129]:
# A literal '-' followed by '.+': one or more of any character except a
# newline. '+' is greedy, so the match runs to the end of the line.
re.findall('-.+', text)

['-m dogs and 100 cats and β sheeps.']

In [157]:
# 'do' followed by exactly three word characters ({3} is an exact-count
# quantifier). Raw string so that '\w' is passed to the regex engine
# without an invalid-escape-sequence warning.
re.findall(r'do\w{3}', text)

['doc1s']

In [158]:
# Same city-name exercise, now with quantifiers.
text = 'Sáo Paulo São Paulo Sao Paulo Sao Paolo San Pablo sao paulo sao Paulo são Paulo sao-paulo são paulo São SãoPaulo Saon Paulo'

# 'n?' allows an optional extra 'n' ('Saon') and ' ?' makes the space
# optional ('SãoPaulo'). No hyphen is accepted between the two words, so
# 'sao-paulo' is not matched by this pattern.
pattern = '[Ss][ãaáàâä][on]n? ?[Pp]a[buo]lo'

re.findall(pattern, text)

['Sáo Paulo',
 'São Paulo',
 'Sao Paulo',
 'Sao Paolo',
 'San Pablo',
 'sao paulo',
 'sao Paulo',
 'são Paulo',
 'são paulo',
 'SãoPaulo',
 'Saon Paulo']

In [159]:
text = 'This colonel has the colour or color blue'

# 'u?' makes the 'u' optional, covering both spellings; 'colonel' does not
# match because no 'r' follows 'colo'.
re.findall('colou?r', text)

['colour', 'color']

In [162]:
text = 'These apples are beautiful and the apple is blue.'

# 's?' makes the trailing 's' optional: matches singular and plural.
re.findall('apples?', text)

['apples', 'apple']

# Other methods for regular expressions

In [164]:
# Built-in str.split cuts on a fixed separator string only; the space after
# the comma stays attached to the second piece.
'Maria, Eduarda'.split(',')

['Maria', ' Eduarda']

In [176]:
# Sample text shared by the re.split examples that follow.
text = 'My neighbor, Mr. Rogers, ] has 5 - rogers 1000, blabla'

In [166]:
# re.split cuts the text at every match of the pattern; with no capture
# groups, N matches produce N + 1 pieces.
len(re.split('[Rr]ogers', text))

3

In [168]:
# Piece before the first match of the pattern.
re.split('[Rr]ogers', text)[0]

'My neighbor, Mr. '

In [169]:
# Piece between the first and the second match of the pattern.
re.split('[Rr]ogers', text)[1]

', ] has 5 - '

In [170]:
# Piece after the second match of the pattern.
re.split('[Rr]ogers', text)[2]

' 1000,'

In [177]:
# Split on runs of digits and take the piece before the first number.
# Raw string avoids the invalid-escape-sequence warning for '\d'.
re.split(r'\d+', text)[0]

'My neighbor, Mr. Rogers, ] has '

In [178]:
# Split on runs of digits and take the piece between the first and second
# number. Raw string avoids the invalid-escape-sequence warning for '\d'.
re.split(r'\d+', text)[1]

' - rogers '

In [179]:
# Split on runs of digits and take the piece after the second number.
# Raw string avoids the invalid-escape-sequence warning for '\d'.
re.split(r'\d+', text)[2]

', blabla'