In [1]:
# The regular expression library (re) is part of the standard library; import it with:
import re

In [2]:
# Regex patterns are written as raw strings: r"---"

In [3]:
text = "The agent phone number is 408-555-1234. Call soon!"

In [4]:
# A plain substring test needs no regex, but it only answers yes/no.
'phone' in text

True

In [5]:
pattern = 'phone'

In [6]:
# re.search returns a Match object describing the first occurrence of the
# pattern; show both the object and its type.
match = re.search(pattern, text)
print(match, type(match), sep='\n')

<re.Match object; span=(10, 15), match='phone'>
<class 're.Match'>


In [7]:
# (start, end) character offsets of the match inside text
match.span()

(10, 15)

In [8]:
# offset of the first matched character
match.start()

10

In [9]:
# offset just past the last matched character (end is exclusive)
match.end()

15

In [10]:
# the text that was actually matched
match.group()

'phone'

In [11]:
# re.search stops at the first hit; re.findall gathers every
# non-overlapping hit and returns them as plain strings in a list
# (not as re.Match objects).
text = 'my phone once, my phone twice'
pattern = 'phone'
matches = re.findall(pattern, text)

In [12]:
matches

['phone', 'phone']

In [13]:
len(matches)

2

In [14]:
# Each item is a plain string. The loop variable is deliberately not called
# `match`, so that name keeps referring to re.Match objects in this notebook
# instead of being rebound to a str.
for found in matches:
    print(found)

phone
phone


In [15]:
# re.findall only returns strings. To loop over full re.Match objects
# (with their spans), use re.finditer().

for match in re.finditer(pattern, text):
    print(match)

<re.Match object; span=(3, 8), match='phone'>
<re.Match object; span=(18, 23), match='phone'>


In [17]:
# .group() extracts just the matched text from each re.Match object
for match in re.finditer(pattern, text):
    print(match.group())

phone
phone


In [18]:
# Regex patterns
# \d : digit
# \w : Alphanumeric (underscore counts as alphanumeric)
# \s : White space
# \D : non digit
# \W : non alphanumeric
# \S : non white space


In [22]:
text = 'My phone number is 408-555-1234'

In [23]:
# Spell out the format with one \d per digit: 3 digits, 3 digits, 4 digits
phone = re.search(r'\d\d\d-\d\d\d-\d\d\d\d', text)

In [24]:
phone

<re.Match object; span=(19, 31), match='408-555-1234'>

In [25]:
phone.group()

'408-555-1234'

In [27]:
# Quantifiers
# +     : 1+ times
# {n}   : exactly n times
# {m,n} : m to n times
# {n,}  : n or more times
# *     : 0+ times
# ?     : 0 or 1 time

In [29]:
# Same phone pattern, written compactly with {n} quantifiers
phone = re.search(r'\d{3}-\d{3}-\d{4}', text)
phone

<re.Match object; span=(19, 31), match='408-555-1234'>

In [30]:
# To extract the area code: parentheses turn each part of the number into a
# capture group that can be retrieved on its own.
phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')


In [31]:
# phone_pattern is already compiled, so call its own search() method
# rather than routing it back through the module-level re.search().
results = phone_pattern.search(text)

In [32]:
results.group()

'408-555-1234'

In [34]:
# Groups are numbered from 1 in pattern order; passing several indices to
# group() returns them together as a tuple.
results.group(1, 2, 3)

('408', '555', '1234')

In [35]:
# | is alternation: match either 'cat' or 'dog'
re.search(r'cat|dog', 'The dog is here')

<re.Match object; span=(4, 7), match='dog'>

In [42]:
# . is a wildcard for one character, so this finds any single character
# followed by 'at' (including the space in front of a standalone 'at').
re.findall(r'.at', 'The cat in the hat sat there at now')

['cat', 'hat', 'sat', ' at']

In [47]:
# Each dot is one wildcard character: take three characters (or, failing
# that, two) in front of 'at'.
re.findall(r'...at|..at', 'The cat in the hat sat there at at now')

['e cat', 'e hat', ' sat', 're at']

In [48]:
# ^ anchors the pattern to the start of the string
re.findall(r'^\d', '1 is a number')

['1']

In [49]:
# ^ only matches at the very start, and this string starts with 'T',
# so nothing is found
re.findall(r'^\d', 'The 1 is a number')

[]

In [50]:
# $ anchors the pattern to the end of the string
re.findall(r'\d$', 'The number is 2')

['2']

In [70]:
# [^...] is an exclusion class: it matches any character NOT listed inside
# the brackets, so this keeps every run of non-digit characters.
phrase = 'There are 3 numbers 34 inside 5 this sentence'
pattern = r'[^\d]+'
re.findall(pattern, phrase)

['There are ', ' numbers ', ' inside ', ' this sentence']

In [72]:
# The listed punctuation marks are excluded, so findall returns the
# stretches of text that sit between them.
phrase = 'This is a string! But it has punctuation. How can we remove it?'
re.findall(r'[^!.?]+', phrase)

['This is a string', ' But it has punctuation', ' How can we remove it']

In [76]:
"".join(re.findall(r'[^!.?]+', phrase))

'This is a string But it has punctuation How can we remove it'

In [74]:
# Adding a space to the excluded characters splits the text into
# individual words; joining on '' then glues them together with no gaps.
clean = re.findall(r'[^!.? ]+', phrase)
''.join(clean)

'ThisisastringButithaspunctuationHowcanweremoveit'

In [75]:
# Put a single space back between the words
' '.join(clean)

'This is a string But it has punctuation How can we remove it'

In [79]:
# One or more word characters, a literal hyphen, then one or more word
# characters: this matches hyphenated words whatever their length.
text = 'Only find the hyphen-words in this sentence. But you do not know how long-ish they are'
pattern = r'[\w]+-[\w]+'
re.findall(pattern, text)

['hyphen-words', 'long-ish']

In [80]:
# \w is already a character class, so it does not need surrounding
# brackets; this finds the same words as the bracketed form [\w]+-[\w]+.
text = 'Only find the hyphen-words in this sentence. But you do not know how long-ish they are'
pattern = r'\w+-\w+'
re.findall(pattern, text)

['hyphen-words', 'long-ish']

In [84]:
# Parentheses group the alternatives: 'cat' followed by exactly one of
# 'fish', 'nap' or 'claw'.
one = 'Hello, would you  like some catfish?'
two = 'Hello, would you like to take a catnap?'
three = 'Hello, have you seen this caterpillar?'
pattern = r'cat(fish|nap|claw)'

# `three` contains 'caterpillar', which is not one of the listed endings,
# so this search finds nothing.
re.search(pattern, three)