In [28]:
text = "The person's phone number is 408-555-1234. Call soon!"

In [3]:
'phone' in text

True

In [1]:
import re

In [2]:
pattern = 'phone'

In [6]:
match = re.search(pattern, text)

In [8]:
match.start()

13

In [9]:
match.end()

18

In [10]:
match.span()

(13, 18)

In [11]:
text = 'phone once, phone twice'

In [14]:
match = re.findall('phone', text)

In [15]:
len(match)

2

In [18]:
# finditer yields one Match object per non-overlapping occurrence of the pattern;
# group() returns the text that was matched.
for match in re.finditer('phone',text):
    print(match.group())

phone
phone


In [19]:
test_re = 'My contact num is 408-555-1234'

In [20]:
phone = re.search(r'\d\d\d-\d\d\d-\d\d\d\d', test_re)

In [21]:
phone.group()

'408-555-1234'

In [22]:
phone = re.search(r'\d{3}-\d{3}-\d{4}', test_re)

In [23]:
phone.group()

'408-555-1234'

In [29]:
# Compile the pattern once so it can be reused; each (...) is a capture group
# (three digits, three digits, four digits) that can be retrieved by position.
phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')

In [30]:
# Search the string that actually contains a phone number. `text` was rebound to
# 'phone once, phone twice' in an earlier cell, so searching it would return None
# and the following results.group() call would raise AttributeError.
results = phone_pattern.search(test_re)

In [31]:
results.group()

'408-555-1234'

In [32]:
# Individual capture groups can also be retrieved by position.
# Remember that groups are the parts of the pattern enclosed in parentheses ().
# Group numbering starts at 1; passing 0 (or no argument) returns the entire match.
results.group(1)

'408'

In [33]:
results.group(2)

'555'

Additional Regex Syntax : 
Or operator |   
Use the pipe operator to write an "or" statement. For example:

In [34]:
re.search(r"man|woman","This man was here.")

<re.Match object; span=(5, 8), match='man'>

In [35]:
re.search(r"man|woman","This woman was here.")

<re.Match object; span=(5, 10), match='woman'>

The Wildcard Character :

Use a "wildcard" as a placeholder that will match any character placed there (except a newline, by default). You can use a simple period . for this.

For example:

In [36]:
re.findall(r".at","The cat in the hat sat here.")

['cat', 'hat', 'sat']

In [37]:
re.findall(r".at","The bat went splat")

['bat', 'lat']

Notice how the second match is only 'lat' rather than 'splat': each . matches exactly one character, so we need one . for each wildcard letter.

Or use the quantifiers described above to set its own rules.

In [38]:

re.findall(r"...at","The bat went splat")

['e bat', 'splat']

However, this still has the problem of grabbing extra characters before the word (for example, 'e bat' instead of 'bat'). Really we only want words that end with "at".

In [39]:
# One or more non-whitespace that ends with 'at'
re.findall(r'\S+at',"The bat went splat")

['bat', 'splat']

Starts with and Ends With

We can use the ^ to signal starts with, and the $ to signal ends with:

In [40]:
# Ends with a number
re.findall(r'\d$','This ends with a number 2')

['2']

In [41]:
# Starts with a number
re.findall(r'^\d','1 is the loneliest number.')
# Note that this is for the entire string, not individual words!

['1']

In [42]:
re.findall(r'^\d','the loneliest number is 1.')

[]

In [43]:
re.findall(r'^\d','the 1 loneliest number.')

[]

Exclusion

To exclude characters, we can place the ^ symbol as the first character inside a set of brackets []. 

Anything inside the brackets is excluded. For example:

In [44]:
phrase = "there are 3 numbers 34 inside 5 this sentence."
re.findall(r'[^\d]',phrase)

['t',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e',
 '.']

To get the words back together, use a + sign

In [45]:
re.findall(r'[^\d]+',phrase)

['there are ', ' numbers ', ' inside ', ' this sentence.']

We can use this to remove punctuation from a sentence. Here we are removing ! . ? and ' ' (the space character)

In [49]:
test_phrase = 'This is a string! But it has punctuation. How can we remove it?'
re.findall('[^!.? ]+',test_phrase)

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctuation',
 'How',
 'can',
 'we',
 'remove',
 'it']

In [47]:
' '.join(re.findall('[^!.? ]+',test_phrase))

'This is a string But it has punctuation How can we remove it'

Brackets for Grouping

As we showed above we can use brackets to group together options, for example if we wanted to find hyphenated words:

In [50]:
text = 'Only find the hypen-words in this sentence. But you do not know how long-ish they are'
re.findall(r'[\w]+-[\w]+',text)

['hypen-words', 'long-ish']

Parentheses for Multiple Options

If we have multiple options for matching, we can use parentheses to list out these options. For example:

In [51]:
# Find words that start with cat and end with one of these options: 'fish','nap', or 'claw'
text = 'Hello, would you like some catfish?'
texttwo = "Hello, would you like to take a catnap?"
textthree = "Hello, have you seen this caterpillar?"

In [52]:
re.search(r'cat(fish|nap|claw)',text)

<re.Match object; span=(27, 34), match='catfish'>

In [54]:
re.search(r'cat(fish|nap|claw)',texttwo)

<re.Match object; span=(32, 38), match='catnap'>

In [53]:
# None returned
re.search(r'cat(fish|nap|claw)',textthree)