In [1]:
import re

In [12]:
# search() stops at the first phone-shaped substring: 3 digits, 3 digits, 4 digits.
message = (
    'In case you find this case, please call either '
    '111-222-3456 or 345-333-5555 before noon'
)
myRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
om = myRegex.search(message)
print(om.group(0))

111-222-3456


In [4]:
# findall() returns every non-overlapping match as a list of strings, not only the first one.
# Relies on `myRegex` and `message` from the previous cell.
om = myRegex.findall(message)
om

['111-222-3456', '345-333-5555']

## Groups

In [29]:
# Parentheses create capture groups: group 1 is the area code, group 2 the local number.
message = 'My number is 123-456-1111'
myRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
objectmatch = myRegex.search(message)
# Single-argument print() calls behave the same on Python 2 and 3; the bare
# `print x` statement is a SyntaxError on Python 3.
print(objectmatch)
print(objectmatch.group())     # the whole match
print(objectmatch.groups())    # tuple with every captured group
print(objectmatch.group(1) + ' ' + objectmatch.group(2))  # individual groups by index

<_sre.SRE_Match object at 0x103ea8b58>
123-456-1111
('123', '456-1111')
123 456-1111


### No results
`objectmatch` will be `None` if nothing is matched.

In [31]:
# Letters where digits are required: search() finds nothing and returns None.
# Relies on `myRegex` compiled in the previous cell.
message = 'My number is 123-ABC-1111'
objectmatch = myRegex.search(message)
print(objectmatch)
# None is a singleton, so identity (`is`) is the idiomatic comparison.
objectmatch is None

None


True

## Matching

### ?

Putting `?` after an item in a regex makes that item optional: it is matched zero or one time.

In [32]:
# `(wo)?` makes the group optional, so plain "Batman" is accepted.
mystring = 'Batman wants a mobile'
myRegex = re.compile(r'Bat(wo)?man')
om = myRegex.search(mystring)
om.group(0)

'Batman'

In [33]:
# The optional group occurs exactly once here, which `?` also allows.
# Relies on `myRegex` from the previous cell.
mystring="Batwoman wants a mobile"
om = myRegex.search(mystring)
om.group()

'Batwoman'

In [34]:
# Two repetitions of "wo" exceed what `?` (zero or one) allows, so nothing matches.
# Relies on `myRegex` from the earlier `?` cell.
mystring = "Batwowoman wants a mobile"
om = myRegex.search(mystring)
# None is a singleton; compare by identity rather than equality.
om is None

True

### * any number of times
`*` means the preceding item is matched zero or more times.

In [36]:
# `(wo)*` accepts any number of "wo" repetitions, including none.
mystring = 'Batwowoman wants a mobile'
myRegex = re.compile(r'Bat(wo)*man')
om = myRegex.search(mystring)
om.group(0)

'Batwowoman'

### + not optional
One or more times

In [37]:
# `(wo)+` needs at least one "wo"; two repetitions are fine.
mystring = 'Batwowoman wants a mobile'
myRegex = re.compile(r'Bat(wo)+man')
om = myRegex.search(mystring)
om.group(0)

'Batwowoman'

In [38]:
# A single "wo" is the minimum that `+` accepts.
mystring = 'Batwoman wants a mobile'
myRegex = re.compile(r'Bat(wo)+man')
om = myRegex.search(mystring)
om.group(0)

'Batwoman'

In [40]:
# With `+` the group is mandatory, so "Batman" (no "wo" at all) does not match.
mystring = "Batman wants a woman"
myRegex = re.compile(r'Bat(wo)+man')
om = myRegex.search(mystring)
# None is a singleton; compare by identity rather than equality.
om is None

True

## Exact match {}

In [41]:
# `{2}` requires exactly two repetitions of the group.
mystring = 'Batwowoman wants a mobile'
myRegex = re.compile(r'Bat(wo){2}man')
om = myRegex.search(mystring)
om.group(0)

'Batwowoman'

In [42]:
# Three repetitions of "wo": `{2}` demands exactly two, so there is no match.
# Relies on `myRegex` still being the `{2}` pattern from the previous cell.
mystring = "Batwowowoman wants a mobile"
om = myRegex.search(mystring)
# None is a singleton; compare by identity rather than equality.
om is None

True

In [44]:
# One repetition of "wo" is too few for `{2}`, so there is no match either.
# Relies on `myRegex` still being the `{2}` pattern compiled earlier.
mystring = "Batwoman wants a mobile"
om = myRegex.search(mystring)
# None is a singleton; compare by identity rather than equality.
om is None

True

Let's try again using this technique with the telephone matching.

In [45]:
# Two "three digits plus dash" groups followed by four single digits.
mystring = 'My number is 123-456-1111'
myRegex = re.compile(r'(\d{3}-){2}(\d){4}')
om = myRegex.search(mystring)
om.group(0)

'123-456-1111'

## Min/Max {n,m}
If there is no upper limit, just leave out the last number (for example `{3,}`). If you leave out the first (for example `{,5}`), it means anything from zero up to the last number.

In [46]:
# `{1,2}` lets the "three digits plus dash" group appear once or twice.
mystring = 'My number is 123-456-1111'
myRegex = re.compile(r'(\d{3}-){1,2}(\d){4}')
om = myRegex.search(mystring)
om.group(0)

'123-456-1111'

In [47]:
# Only one "digits plus dash" group is present, which the `{1,2}` range still allows.
# Relies on `myRegex` from the previous cell.
mystring='My number is 456-1111 without regional code'
om = myRegex.search(mystring)
om.group()

'456-1111'

## Greedy matching
By default a regex matches as much as possible. In the example here it does not settle for the first three digits but goes for the upper limit of five.

In [49]:
# Greedy quantifier: takes as many digits as the range allows (five).
mystring = '1234567890'
myRegex = re.compile(r'\d{3,5}')
om = myRegex.search(mystring)
om.group(0)

'12345'

### Non greedy ?
A question mark directly after a quantifier (such as `{3,5}`, `*` or `+`) makes it non-greedy: it matches as few repetitions as possible.

In [50]:
# The trailing `?` makes the quantifier lazy: it stops at the minimum of three digits.
mystring = '1234567890'
myRegex = re.compile(r'\d{3,5}?')
om = myRegex.search(mystring)
om.group(0)

'123'

## Table with special characters
Note how the CSS in the next cell floats the table to the left, so that it is not rendered centered.

In [53]:
%%html
<style>
/* Float tables to the left so rendered Markdown tables are not centred. */
table {float:left}
</style>

|Shorthand   |   Represents|
|--|--
|\\d    |   Any digit
|\\D| Any non digit
|\\w| Any `word` character, alphanumeric or underscore
|\\W| Any `non word` character
|\\s| Space, Tab or newline
|\\S| Anything that is not a space character
|\.| Anything except for newline

And you can of course define your own using the square brackets, like this example

In [54]:
mystring = 'Batman likes soft food, he is a woman eater'
# A character class in square brackets matches any single character it lists;
# here that is the lowercase vowels.
myRegex = re.compile(r'[aeiou]')
om = myRegex.findall(mystring)
om

['a', 'a', 'i', 'e', 'o', 'o', 'o', 'e', 'i', 'a', 'o', 'a', 'e', 'a', 'e']

In [55]:
# Two vowels in a row. Relies on `mystring` from the previous cell.
myRegex=re.compile(r'[aeiou]{2}')
om = myRegex.findall(mystring)
om

['oo', 'ea']

### Negation ^
Adding this symbol at the start of the character class negates it: instead of vowels it now matches consonants (but spaces, digits and punctuation are accepted as well).

In [58]:
# A leading ^ inside the brackets negates the class: any character that is NOT a lowercase vowel.
# Relies on `mystring` from an earlier cell; only the first ten hits are shown.
myRegex=re.compile(r'[^aeiou]')
consonants = myRegex.findall(mystring)
consonants[:10]

['B', 't', 'm', 'n', ' ', 'l', 'k', 's', ' ', 's']

## Start of string ^


In [59]:
# Outside a character class, ^ anchors the pattern to the start of the string,
# so a "Hello" in the middle of the text is not found.
myRegex = re.compile(r'^Hello')
om = myRegex.findall('She said "Hello"')
om

[]

In [61]:
# Here the text starts with "Hello", so the anchored pattern from the previous cell matches.
om = myRegex.findall('Hello there...')
om

['Hello']

## End of String $

In [62]:
# $ anchors the pattern to the end of the string.
myRegex = re.compile(r'Hello World$')
om = myRegex.findall('Hello World')
om

['Hello World']

In [63]:
# The closing quote comes after "World", so the text no longer ends with the pattern.
# Relies on `myRegex` from the previous cell.
om = myRegex.findall('"Hello World"')
om

[]

## Combined begin and end

In [65]:
# Anchoring at both ends forces the whole string to consist of digits.
allDigitsRegex = re.compile(r'^\d+$')
om = allDigitsRegex.findall('123234234524532')
om

['123234234524532']

In [66]:
# A single non-digit anywhere in the string makes the fully anchored pattern fail.
# Relies on `allDigitsRegex` from the previous cell.
om = allDigitsRegex.findall('123234B234524532')
om

[]

## Any character match

In [68]:
# `.` matches any single character except a newline, so this finds every
# three-character run ending in "at".
# The pattern gets its own name instead of re-binding `allDigitsRegex`, which
# would no longer describe what the variable holds.
atRegex = re.compile(r'.at')
om = atRegex.findall('The cat in the hat sat on the flat mat.')
om

['cat', 'hat', 'sat', 'lat', 'mat']

Notice that the word `flat` was not matched in full here (only `lat`). To catch it we can modify the pattern to allow up to two "anything" characters preceding "at", but that opens up for spaces as well.

In [69]:
# `.{,2}` allows zero to two arbitrary characters before "at"; being greedy, it
# takes two whenever it can, which is why leading spaces show up in the result.
# The pattern gets its own name instead of re-binding `allDigitsRegex`.
atRegex = re.compile(r'.{,2}at')
om = atRegex.findall('The cat in the hat sat on the flat mat.')
om

[' cat', ' hat', ' sat', 'flat', ' mat']

## Parsing with .*

In [71]:
# Each `(.*)` captures the free text that follows its label; with two groups,
# findall() returns one tuple per match.
# The pattern gets its own name instead of re-binding `allDigitsRegex`.
fullNameRegex = re.compile(r'First name: (.*) Last name: (.*)')
om = fullNameRegex.findall('First name: Hank Last name: B. Marvin')
om

[('Hank', 'B. Marvin')]

## re.DOTALL
By default `.` does not match a newline, so `.*` stops at the end of each line, as the first example below shows. Pass `re.DOTALL` when you want `.` to match newlines as well.

In [72]:
# Without flags `.` stops at the newline, so each line is matched separately.
mission = 'To serve\nand protect'
greedyRegex = re.compile(r'.*')
om = greedyRegex.findall(mission)
om

['To serve', '', 'and protect', '']

By passing `re.DOTALL` to the compile call, we get a truly greedy pattern that also runs across newlines.

In [73]:
# re.DOTALL lets `.` match newlines too, so one match spans both lines.
# Relies on `mission` from the previous cell.
greedyRegex=re.compile(r'.*', re.DOTALL)
om=greedyRegex.findall(mission)
om

['To serve\nand protect', '']

## Ignoring case

In [78]:
# Case-sensitive by default: the capital "O" in "Oh" is not reported.
# Identifier spelling corrected from "wovel" to "vowel".
vowelRegex = re.compile(r'[aeiou]')
om = vowelRegex.findall('Oh my god, did you say present?')
om

['o', 'i', 'o', 'u', 'a', 'e', 'e']

In [77]:
# re.I (short for re.IGNORECASE) makes the class match upper-case vowels as well.
# Identifier spelling corrected from "wovel" to "vowel".
vowelRegex = re.compile(r'[aeiou]', re.I)
om = vowelRegex.findall('Oh my god, did you say present?')
om

['O', 'o', 'i', 'o', 'u', 'a', 'e', 'e']

# Search and replace

In [81]:
# "Agent" followed by a space and one run of word characters (the name).
namesRegex = re.compile(r'(Agent \w+)')
om = namesRegex.findall('Agent Bond was interested in Agent Moneypenny')
om

['Agent Bond', 'Agent Moneypenny']

In [82]:
# sub() replaces every match of the pattern with the given text.
# Relies on `namesRegex` from the previous cell.
namesRegex.sub("REDACTED", "Agent Bond was interested in Agent Moneypenny")

'REDACTED was interested in REDACTED'

What if we want a little bit more info - like the first letter of the Agent name?

In [90]:
# Only the first letter of the name is captured; `\w*` consumes the rest uncaptured.
# With a single group, findall() returns just that group's text for each match.
namesRegex = re.compile(r'Agent (\w)\w*')
om = namesRegex.findall('Agent Bond was interested in Agent Moneypenny')
om

['B', 'M']

So now we have the letter we need, we can use it directly in the `sub` statement

In [89]:
# \1 in the replacement inserts whatever group 1 captured, so the initial is kept
# and the rest of the name is masked. Relies on `namesRegex` from the previous cell.
namesRegex.sub(r"Agent \1****", "Agent Bond was interested in Agent Moneypenny")

'Agent B**** was interested in Agent M****'