In [1]:
#Patterns

In [3]:
#What about more complex examples, such as trying to find a telephone number or an email address in a large string of text?

#We could just use the search method if we knew the exact phone number or email address, but what if we don't? We may know the general format,
#and we can use that along with regular expressions to search the document for strings that match a particular pattern.

In [5]:
#Identifiers for Characters in Patterns

#You can use these to build up a pattern string. Notice how these make heavy use of the backslash \ .
#Because of this, when defining a pattern string for a regular expression we use the format:

#r'mypattern'
#Placing the r in front of the string tells Python that the \ characters in the pattern string are not meant to be escape slashes.

#Below you can find a table of all the possible identifiers:

# Character	Description	     Example Pattern Code	Example Match
#
# \d	    A digit	         file_\d\d	            file_25
# \w	    Alphanumeric     \w-\w\w\w	            A-b_1 (A-Za-z0-9 and also _)
# \s	    White space	     a\sb\sc	            a b c
# \D	    A non digit	     \D\D\D	                ABC
# \W	    Non-alphanumeric \W\W\W\W\W	            *-+=)
# \S	    Non-whitespace   \S\S\S\S	            Yoyo

In [7]:
# Sample sentence containing the phone number we want to find.
text = "My phone number is 543-333-2321"

In [9]:
# A purely literal pattern: it can only ever match this one exact number.
text = "My phone number is 543-333-2321"
pattern = "543-333-2321"

In [11]:
# `re` is Python's standard-library regular-expression module.
import re
# Look for `pattern` inside `text`; the Match object shown below reports
# where it was found (span) and the text that matched.
re.search(pattern,text)

<re.Match object; span=(19, 31), match='543-333-2321'>

In [13]:
#What if the number changes, say:

In [15]:
# Same sentence as before, but the last four digits are different.
text = "My phone number is 543-333-7777"

In [17]:
# Nothing is displayed for this cell: the literal pattern no longer occurs
# in `text`, so there is no match to show.
re.search(pattern,text)

In [19]:
#It didn't match: the pattern is an exact value, so it cannot match once the digits change.
#So how do we describe a general pattern to search for?

In [23]:
# \d stands for any single digit, so this pattern describes the shape
# ddd-ddd-dddd instead of one specific number.
text = "My phone number is 543-333-2321"
pattern = r"\d\d\d-\d\d\d-\d\d\d\d"
re.search(pattern, text)

<re.Match object; span=(19, 31), match='543-333-2321'>

In [25]:
# The same digit-shaped pattern also finds a different phone number.
text = "My phone number is 543-333-7777"
pattern = r"\d\d\d-\d\d\d-\d\d\d\d"
re.search(pattern, text)

<re.Match object; span=(19, 31), match='543-333-7777'>

In [27]:
# One more number, unchanged pattern: any ddd-ddd-dddd sequence matches.
text = "My phone number is 543-333-0101"
pattern = r"\d\d\d-\d\d\d-\d\d\d\d"
re.search(pattern, text)

<re.Match object; span=(19, 31), match='543-333-0101'>

In [29]:
#This now works perfectly, but what if tomorrow the number to match has 100 digits? Do we need to write \d 100 times?

In [31]:
#To make such cases easy we need quantifiers, which say how many times a piece of the pattern should be repeated.

In [34]:
#Now that we know the special character designations, we can use them along with quantifiers to define how many we expect.

In [36]:
#  Character	Description	                Example Pattern Code	Example Match
#
#  +	        Occurs one or more times	Version \w-\w+	        Version A-b1_1
#  {3}	        Occurs exactly 3 times	    \D{3}	                abc
#  {2,4}	    Occurs 2 to 4 times	        \d{2,4}	                123
#  {3,}	        Occurs 3 or more	        \w{3,}	                anycharacters
#  *	        Occurs zero or more times	A*B*C*	            AAACC
#  ?	        Once or none	            plurals?	            plural


In [38]:
# Same phone-number pattern as before, with {n} quantifiers replacing the
# repeated \d.
text = "My phone number is 543-333-0101"
pattern = r"\d{3}-\d{3}-\d{4}"
re.search(pattern, text)

<re.Match object; span=(19, 31), match='543-333-0101'>

In [40]:
# The quantified pattern against a second phone number.
text = "My phone number is 543-333-7777"
pattern = r"\d{3}-\d{3}-\d{4}"
re.search(pattern, text)

<re.Match object; span=(19, 31), match='543-333-7777'>

In [42]:
# The quantified pattern against the original phone number.
text = "My phone number is 543-333-2321"
pattern = r"\d{3}-\d{3}-\d{4}"
re.search(pattern, text)

<re.Match object; span=(19, 31), match='543-333-2321'>

In [44]:
#What if we wanted to do two tasks: find phone numbers, and also quickly extract their area code (the first three digits)?
#We can use groups for any general task that involves grouping together regular expressions (so that we can later break them down).

#Using the phone number example, we can separate groups of regular expressions using parentheses:


In [64]:
# Each pair of parentheses is a capture group, so the three parts of a
# matched phone number can be pulled out separately later on.
phone_pattern = re.compile(
    r'(\d{3})-(\d{3})-(\d{4})',
)

In [66]:
# Display the compiled pattern object.
phone_pattern

re.compile(r'(\d{3})-(\d{3})-(\d{4})', re.UNICODE)

In [68]:
# Search with the compiled pattern's own method; `text` is whatever the
# most recently run cell assigned to it.
results = phone_pattern.search(text)

In [70]:
# With no argument, group() gives back the entire matched text.
results.group()

'543-333-2321'

In [72]:
results.group(1) # first capture group: the area code of the phone number

'543'

In [74]:
# Second capture group: the middle three digits.
results.group(2)

'333'

In [76]:
# Third capture group: the final four digits.
results.group(3)

'2321'

In [78]:
# The pattern defines only three capture groups, so asking for a fourth
# raises IndexError. Catch it and show the message so the notebook still
# runs top to bottom without stopping on a traceback.
try:
    results.group(4)
except IndexError as err:
    print(f"IndexError: {err}")

IndexError: no such group