# Regular expressions in Python

In [2]:
import re

## Functionalities 

<tt> findall</tt>: Returns a list containing all matches. <br>
<tt> search</tt>: Returns a <tt>Match</tt> object if there is a match anywhere in the string. <br>
<tt> split</tt>: Returns a list where the string has been split at each match. <br>
<tt> sub</tt>: Replaces one or many matches with a string. <br>
    
Let us test these functions with a simple pattern first. 

In [3]:
# Sample sentence searched by the first pattern-matching examples.
text = "Do not start the blame game."

In [4]:
# findall: gather every non-overlapping occurrence of "ame" into a list.
matches = re.compile("ame").findall(text)
print(matches)

['ame', 'ame']


In [5]:
# search: stop at the first occurrence and return it as a Match object
# (None would be returned if the pattern were absent).
match = re.search(pattern="ame", string=text)
print(match)

<re.Match object; span=(19, 22), match='ame'>


In [6]:
# split: cut the sentence at every occurrence of "ame".
# For a plain literal separator like this one, text.split("ame") also works.
tokens = re.compile("ame").split(text)

print(tokens)

['Do not start the bl', ' g', '.']


In [7]:
# sub: build a new string with each "ame" swapped for "xyz";
# `text` itself is left unchanged.
changed_text = re.sub(pattern="ame", repl="xyz", string=text)
print(changed_text)

Do not start the blxyz gxyz.


## Metacharacters

<tt>[]</tt> : 	A set of characters.<br>

The following are sets of characters. <br>
<tt>[arn]</tt> :	Returns a match where one of the specified characters (a, r, or n) is present. <br>
<tt>[a-n]</tt> :	Returns a match for any lower case character, alphabetically between a and n. <br>
<tt>[^arn]</tt> :	Returns a match for any character EXCEPT a, r, and n. <br>
<tt>[0123]</tt> :	Returns a match where any of the specified digits (0, 1, 2, or 3) are present. <br>
<tt>[0-9]</tt> :	Returns a match for any digit between 0 and 9. <br>
<tt>[0-5][0-9]</tt> :	Returns a match for any two-digit number from 00 to 59. <br>
<tt>[a-zA-Z]</tt> :	Returns a match for any character alphabetically between a and z, lower case OR upper case. <br>
<tt>[+]</tt> :	In sets, <tt>+, *, ., |, (), $, {}</tt> have no special meaning, so <tt>[+]</tt> means: return a match for any <tt>+</tt> character in the string. <br>

In [10]:
# A character set matches one character at a time: here, any "s" or "l".
matches = re.compile("[sl]").findall(text)
print(matches)

['s', 'l']


In [11]:
# Longer sample containing an e-mail address and a phone number.
text = (
    "Customers can contact at customerservice@company.com, "
    "or call Toll-free at +91-1234567890."
)

In [14]:
# Every ASCII letter or digit, one character per match; spaces and
# punctuation are skipped. (The list holds vowels and digits too, so it is
# named for what it contains rather than "consonants".)
alphanumerics = re.findall("[A-Za-z0-9]", text)
print(alphanumerics)

['C', 'u', 's', 't', 'o', 'm', 'e', 'r', 's', 'c', 'a', 'n', 'c', 'o', 'n', 't', 'a', 'c', 't', 'a', 't', 'c', 'u', 's', 't', 'o', 'm', 'e', 'r', 's', 'e', 'r', 'v', 'i', 'c', 'e', 'c', 'o', 'm', 'p', 'a', 'n', 'y', 'c', 'o', 'm', 'o', 'r', 'c', 'a', 'l', 'l', 'T', 'o', 'l', 'l', 'f', 'r', 'e', 'e', 'a', 't', '9', '1', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0']


In [17]:
# A leading ^ negates the set: keep every character that is NOT a lowercase c-z.
m = re.compile("[^c-z]").findall(text)
print(m)

['C', ' ', 'a', ' ', 'a', ' ', 'a', ' ', '@', 'a', '.', ',', ' ', ' ', 'a', ' ', 'T', '-', ' ', 'a', ' ', '+', '9', '1', '-', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '.']


In [20]:
# Two sets in a row match two adjacent characters:
# one from c-f immediately followed by one from u-z.
m = re.compile("[c-f][u-z]").findall(text)
print(m)

['cu']


<tt>\</tt> :	Signals a special sequence (can also be used to escape special characters)<br>

<tt>\A</tt> :	Returns a match if the specified characters are at the beginning of the string	"\AThe"	<br> 

<tt>\b</tt> :	Returns a match where the specified characters are at the beginning or at the end of a word <br>
(the "r" in the beginning is making sure that the string is being treated as a "raw string")	r"\bain"
r"ain\b"	<br>

<tt>\B </tt> :	Returns a match where the specified characters are present, but NOT at the beginning (or at the end) of a word<br>
(the "r" in the beginning is making sure that the string is being treated as a "raw string")	r"\Bain"
r"ain\B"	<br>

<tt>\d </tt> :	Returns a match where the string contains digits (numbers from 0-9)	"\d"<br>

<tt>\D </tt> :	Returns a match where the string DOES NOT contain digits	"\D" <br>

<tt>\s </tt> :	Returns a match where the string contains a white space character	"\s"	<br>

<tt>\S</tt> :	Returns a match where the string DOES NOT contain a white space character	"\S"	<br>

<tt>\w</tt> :	Returns a match where the string contains any word characters (letters from a to z and A to Z, digits from 0-9, and the underscore _ character)	"\w"	<br>

<tt>\W </tt> :	Returns a match where the string DOES NOT contain any word characters	"\W" <br>

<tt>\Z </tt> :	Returns a match if the specified characters are at the end of the string	"Spain\Z" <br>

In [21]:
# Four consecutive digits. The raw string (r"...") stops Python from treating
# "\d" as a string escape (it is not a valid one and triggers a warning on
# recent versions); {4} replaces the four repeated [\d] sets.
m = re.findall(r"\d{4}", text)
print(m)

['1234', '5678']


### More metacharacters

<tt>.</tt> :	Any character (except newline character) "he..o"	<br>

<tt>^</tt> :	Starts with	"^hello" <br>

<tt>\$</tt> :	Ends with	"world$"	<br>
    
<tt>\*</tt> :	Zero or more occurrences	"aix*"	<br>

<tt>+</tt> :	One or more occurrences	"aix+"	<br>

<tt>{}</tt> :	Exactly the specified number of occurrences	"al{2}"	<br>

<tt>|</tt> :	Either or	"falls|stays"	<br>
    
<tt>() </tt> :	Capture and group

In [22]:
# + means "one or more", so each run of consecutive digits is a single match.
# Raw string: "\d" is not a valid Python string escape.
m = re.findall(r"\d+", text)
print(m)

['91', '1234567890']


In [24]:
# Let's try to match the whole phone number.
phone = "The phone number is 123-345-567."
phone1 = "12345 34532"

# \d+-\d+ covers exactly two hyphen-separated digit groups, so only "123-345"
# is returned; the trailing "-567" is left unmatched.
# Raw string: "\d" is not a valid Python string escape.
m = re.findall(r"\d+-\d+", phone)
print(m)

['123-345']


## Let us try to extract email IDs

In [94]:
# Sample with a dotted local part and a multi-level domain in the address.
text = (
    "Customers can contact at customer.service@company.com.au, "
    "or call Toll-free at +91-1234567890."
)

In [95]:
# Simplest one
emails = re.findall("\w[\.\w-]+@[\w\.]+\w+", text)

print(emails)

['customer.service@company.com.au']
