In [1]:
text = "The agent's phone number is 408-555-1234. Call soon!"

In [2]:
"408" in text

True

In [3]:
import re

In [4]:
pattern = 'phone'

In [5]:
re.search(pattern, text)

<re.Match object; span=(12, 17), match='phone'>

In [6]:
pattern = 'NOT IN TEXT'

In [7]:
# The pattern does not occur in text, so re.search returns None and the cell shows no output.
re.search(pattern, text)

In [8]:
pattern = 'phone'

In [9]:
match = re.search(pattern, text)

In [10]:
type(match)

re.Match

In [11]:
match.span()

(12, 17)

In [12]:
match.start()

12

In [13]:
match.end()

17

In [14]:
text = "my phone once my phone twice"

In [15]:
match = re.search(pattern, text)

In [16]:
match

<re.Match object; span=(3, 8), match='phone'>

In [17]:
matches = re.findall('phone', text)

In [18]:
matches

['phone', 'phone']

In [19]:
matches[0]

'phone'

In [20]:
# finditer yields one match object per occurrence, scanning left to right;
# printing a match object shows its span and the matched text.
# Note: the loop variable rebinds the name `match` used by the earlier search cells.
for match in re.finditer('phone', text):
    print(match)

<re.Match object; span=(3, 8), match='phone'>
<re.Match object; span=(17, 22), match='phone'>


In [21]:
# To get the matched text itself (instead of the match object), call .group() on each match

In [22]:
# group() with no argument returns the full substring that the pattern matched.
for match in re.finditer('phone', text):
    print(match.group())

phone
phone


# Patterns

In [29]:
text = "My telephone number is 408-555-1235"

In [30]:
# Literal search for a number ending in 1234; the number in text ends in 1235,
# so nothing matches and phone is None.
phone = re.search('408-555-1234', text)

In [31]:
phone

### We usually don't know the exact number in advance, but we do know the pattern it follows

#### The small r in front of the pattern makes it a raw string: Python leaves the backslashes alone, so sequences like \d reach the regex engine unchanged

In [32]:
# Each \d matches a single digit; the raw-string prefix keeps the backslashes literal.
# Shape matched: three digits, hyphen, three digits, hyphen, four digits.
phone = re.search(r'\d\d\d-\d\d\d-\d\d\d\d', text)

In [33]:
phone

<re.Match object; span=(23, 35), match='408-555-1235'>

### What if we have 100 digits? We would not want to write \d 100 times. For that we need quantifiers.

In [34]:
# {n} repeats the preceding token exactly n times, so \d{3} is shorthand for \d\d\d.
phone = re.search(r'\d{3}-\d{3}-\d{4}', text)

In [35]:
phone

<re.Match object; span=(23, 35), match='408-555-1235'>

### Finding the phone number and then extracting the first three digits

In [36]:
# Compile once so the pattern object can be reused. Each pair of parentheses is a
# capture group, so the three hyphen-separated digit runs are available
# individually as group(1), group(2) and group(3).
phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')

In [38]:
# phone_pattern is already compiled, so call its own search() method instead of
# passing it back through the module-level re.search(); the match is the same.
results = phone_pattern.search(text)

In [39]:
results.group()

'408-555-1235'

In [40]:
results.group(1)

'408'

In [41]:
results.group(2)

'555'

# Additional regex syntax 

## The OR operator

In [42]:
re.search(r'cat', 'The cat is here')

<re.Match object; span=(4, 7), match='cat'>

In [43]:
re.search(r'dog', 'The cat is here')

In [44]:
re.search(r'cat|dog', 'The cat is here')

<re.Match object; span=(4, 7), match='cat'>

In [46]:
re.findall(r'at','The cat in the hat sat there.')

['at', 'at', 'at']

In [47]:
re.findall(r'.at','The cat in the hat sat there.')

['cat', 'hat', 'sat']

In [48]:
# Each '.' matches exactly one character (any character except a newline, spaces
# included), so three dots grab the three characters in front of every 'at'.
re.findall(r'...at','The cat in the hat sat there splat.')

['e cat', 'e hat', 'splat']

## starts with and ends with

#### This is an example of searching for a string that starts with a digit

In [50]:
re.findall(r'^\d', '1 is a number')

['1']

#### Ends with

In [51]:
re.findall(r'\d$', 'this sentence ends with a number5')

['5']

### excluding characters

In [52]:
phrase = 'there are 3 numbers 34 inside 5 this sentence'

Get back every character that is not a digit

In [55]:
# A ^ as the first character inside [] negates the set: match any single character that is not a digit.
pattern = r'[^\d]'

In [56]:
re.findall(pattern, phrase)

['t',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e']

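You can use this kind of exclusion pattern to quickly strip unwanted characters; adding the + quantifier keeps consecutive allowed characters together instead of returning them one at a time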

In [57]:
# The + quantifier matches one or more consecutive non-digit characters,
# so findall returns whole runs of text rather than single characters.
pattern = r'[^\d]+'

In [58]:
re.findall(pattern, phrase)

['there are ', ' numbers ', ' inside ', ' this sentence']

### This is a really common way to get rid of punctuation from a sentence

In [59]:
test_phrase = "This is a string! But it has punctuation. How can we remove it?"

In [61]:
re.findall(r'[^!.?]+', test_phrase)

['This is a string', ' But it has punctuation', ' How can we remove it']

You can add a space in there as well to get a clean list of all the words

In [62]:
# Excluding the space along with ! . ? breaks the phrase at every space and
# punctuation mark, leaving a list of individual words.
clean = re.findall(r'[^!.? ]+', test_phrase)

In [63]:
clean

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctuation',
 'How',
 'can',
 'we',
 'remove',
 'it']

You can join these all with a space

In [65]:
" ".join(clean)

'This is a string But it has punctuation How can we remove it'

### Using the [] to group for inclusion

In [66]:
text = "Only find the hyphen-words in this sentence. But you do not know how long-ish they are."

We are looking for words of the form word-word, but we don't know how many letters come before the hyphen or how many come after it.

In [67]:
pattern = r'[\w]+'

This looks for runs of one or more word characters (\w matches letters, digits and the underscore)

In [68]:
re.findall(pattern, text)

['Only',
 'find',
 'the',
 'hyphen',
 'words',
 'in',
 'this',
 'sentence',
 'But',
 'you',
 'do',
 'not',
 'know',
 'how',
 'long',
 'ish',
 'they',
 'are']

In [69]:
# One or more word characters, a literal hyphen, then one or more word characters.
pattern = r'[\w]+-[\w]+'

In [70]:
re.findall(pattern, text)

['hyphen-words', 'long-ish']

### You can also use parentheses for multiple options

In [71]:
# Goal: find words that start with 'cat' and end with one of these options: 'fish', 'nap', or 'claw'.
# Sample sentences, each containing one 'cat...' word (catfish, catnap, caterpillar).
text = 'Hello, would you like some catfish?'
texttwo = "Hello, would you like to take a catnap?"
textthree = "Hello, have you seen this caterpillar?"

In [72]:
# Parentheses group the alternatives: 'cat' must be followed by one of 'fish', 'nap' or 'claw'.
re.search(r'cat(fish|nap|claw)', text)

<re.Match object; span=(27, 34), match='catfish'>

In [74]:
# 'claw' is swapped for 'erpillar' in the alternatives here, so 'caterpillar' is matched.
re.search(r'cat(fish|nap|erpillar)', textthree)

<re.Match object; span=(26, 37), match='caterpillar'>