In [None]:
# Regular expressions or regex
# References: 
#   -  https://www.rexegg.com/regex-quickstart.html
#   - https://medium.com/factory-mind/regex-tutorial-a-simple-cheatsheet-by-examples-649dc1c3f285

#   We use regex to search for patterns in a string (imagine the string is huge)
#   Every character type has a corresponding pattern code
#     Examples:
#       - digits: \d (the backslash tells the regex engine this is a special code and not the letter "d")

In [3]:
# Sample sentence with an embedded phone number; it is the string searched by the next cells.
text = "My phone number is 311-313-2543"

In [4]:
import re


# re.search(pattern, string) -> Match object for the first match, or None when nothing matches.
# match_obj.span()  -> (start, end) tuple delimiting the match
# match_obj.start() -> index where the match begins
# match_obj.end()   -> index just past the last matched character
dummy_pattern = "number"
number_match = re.search(pattern=dummy_pattern, string=text)

In [5]:
# A bare expression on the last line of a cell displays its repr (here, the Match object).
number_match

<re.Match object; span=(9, 15), match='number'>

In [6]:
# Show the (start, end) offsets of the match, then the type of that value, one per line.
print(number_match.span(), type(number_match.span()), sep="\n")

(9, 15)
<class 'tuple'>


In [7]:
# Take the match boundaries and slice the searched string with them to recover the matched text.
start = number_match.start()
end = number_match.end()
print(text[start:end])

number


In [None]:
# Multiple Matches

In [8]:
# findall returns every non-overlapping occurrence of the pattern as a list of strings.
te_pattern = 'te'
text = "No fue suficiente, que te amara hasta la muerte, te juré que el corazón te lo daría para siempre"
te_match = re.findall(pattern=te_pattern, string=text)
print(te_match)

['te', 'te', 'te', 'te', 'te']


In [9]:
# Multiple Matches - Returning regex objects
# finditer yields one Match object per occurrence, so the position of each hit is available.
for match in re.finditer(pattern=te_pattern, string=text):
    print((match.start(), match.end()))

(15, 17)
(23, 25)
(45, 47)
(49, 51)
(72, 74)


In [None]:
# Regular Expression Patterns
# Examples:
# 1. Searching for files with common names: logs_2021_01_01.txt, logs_2021_01_02.txt
# 2. Searching for phone numbers: 311-313-2543

In [13]:
# Naive phone pattern: every \d stands for exactly one digit, with literal dashes in between.
phone_pattern_v1 = r'\d\d\d-\d\d\d-\d\d\d\d'
agent_conversation_text = "The number of Alicia is 311-313-2543, the number of Juan is 313-246-7887 and the number of Melanie is 317-345-9642"
# search() stops at the first phone number; keep its boundaries for slicing.
phone_numbers = re.search(pattern=phone_pattern_v1, string=agent_conversation_text)
s = phone_numbers.start()
e = phone_numbers.end()

In [14]:
# Slice the conversation with the offsets s and e to display the matched phone number.
agent_conversation_text[s:e]

'311-313-2543'

In [17]:
# Walk over every phone number in the conversation and report where each one sits.
# The boundaries and the matched text are read straight from the Match object, so the
# loop does not overwrite the `s`/`e` offsets that an earlier cell slices with.
for match in re.finditer(phone_pattern_v1, agent_conversation_text):
    print(f"Match(start={match.start()}, end={match.end()}) found: {match.group()}")
    

Match(start=24, end=36) found: 311-313-2543
Match(start=60, end=72) found: 313-246-7887
Match(start=102, end=114) found: 317-345-9642


In [20]:
# Regular Expression Quantifiers
# {n} repeats the preceding token exactly n times, so \d{3} matches three digits in a row.
phone_pattern_v1 = r'\d{3}-\d{3}-\d{4}'
agent_conversation_text = "The number of Alicia is 311-313-2543, the number of Juan is 313-246-7887 and the number of Melanie is 317-345-9642"
phone_numberss = re.findall(pattern=phone_pattern_v1, string=agent_conversation_text)
print(phone_numberss)

['311-313-2543', '313-246-7887', '317-345-9642']


In [21]:
# Regular Expression Groups
# Parentheses in the pattern create numbered capture groups.
#   match.group()  -> the whole match
#   match.group(i) -> text captured by the i-th parenthesised group, i = 1, 2, ..., k
phone_pattern_v1_gruped = r'(\d{3})-(\d{3})-(\d{4})'
phone_number = re.search(pattern=phone_pattern_v1_gruped, string=agent_conversation_text)

In [24]:
# group(0) is the whole match (same as group()); group(1)..group(3) are the captured pieces.
print(*(phone_number.group(i) for i in range(4)), sep="\n")

311-313-2543
311
313
2543


In [26]:
# Asking for group 4 raises IndexError ("no such group").
# The error is the point of this cell, so catch and print it instead of letting it
# stop a top-to-bottom run of the notebook.
try:
    phone_number.group(4)
except IndexError as exc:
    print(f"IndexError: {exc}")

IndexError: no such group

In [29]:
# Regular Expressions: Pipe (|)
# "a|b" matches either alternative; here the first alternative, "moon", is the one found.
r1 = re.search(pattern=r"moon|mon", string="Talking to the moon")
print(r1, (r1.start(), r1.end()), r1.group(0))

<re.Match object; span=(15, 19), match='moon'> (15, 19) moon


In [30]:
# Regular Expressions: Wildcard(.)
# "." stands for any single character (newline excluded by default), so d.d finds "dad" and "did".
r1 = re.findall(pattern=r"d.d", string="My dad did not make it")
print(r1)

['dad', 'did']


In [32]:
# Regular Expressions: Removing numbers
# [^\d]+ is a negated character class: it matches runs of characters that are NOT digits,
# so the displayed result is the list of text fragments found between the numbers.
text = "there are 4 numbers inside 45 this sentence 31243"
re.findall(pattern=r"[^\d]+", string=text)

['there are ', ' numbers inside ', ' this sentence ']

In [37]:
# Regular Expressions: Removing punctuation
# [^!.,]+ grabs every run of characters that is not one of the listed punctuation marks.
text = "A sentence! Oh that's awesome. It should have some, punctuation i guess."
result = re.findall(r"[^!.,]+", text)
# Each fragment already starts with the space that followed the removed mark,
# so concatenate without a separator; joining on ' ' would double those spaces.
phrase = ''.join(result)
print(phrase)

A sentence  Oh that's awesome  It should have some  punctuation i guess
