# What is Regex?
A regular expression, regex or regexp (sometimes called a rational expression) is a sequence of characters that defines a search pattern. Usually such patterns are used by string-searching algorithms for “find” or “find and replace” operations on strings, or for input validation. It is a technique developed in theoretical computer science and formal language theory.

In [45]:
import re
from PIL import Image

Find Explicit characters

In [46]:
# Sample sentence that contains the literal substring "term1" once.
text = 'This is a string with term1, but it does not have the other term.'

# A pattern without metacharacters matches itself; findall returns every
# non-overlapping occurrence as a list of strings.
re.findall("term1", text)

['term1']

In [47]:
# The 26 lowercase ASCII letters, in order.
lowercase_alphabet = "abcdefghijklmnopqrstuvwxyz"
print(lowercase_alphabet)

# "abc" occurs exactly once, at the very start of the alphabet.
re.findall("abc", lowercase_alphabet)

abcdefghijklmnopqrstuvwxyz


['abc']

In [48]:
# Upper-case copy of the lowercase alphabet.
uppercase_alphabet = str.upper(lowercase_alphabet)
print(uppercase_alphabet)

# Matching is case-sensitive by default, so lowercase "abc" finds nothing in
# the upper-case text.
re.findall("abc", uppercase_alphabet)

ABCDEFGHIJKLMNOPQRSTUVWXYZ


[]

In [49]:
# With an upper-case pattern the upper-case text matches.
print(uppercase_alphabet)
re.findall("ABC", uppercase_alphabet)

ABCDEFGHIJKLMNOPQRSTUVWXYZ


['ABC']

Anchors:<br/><br/>
 <img src="images/REs_anchor.png" width="300"/>

In [50]:
text = 'This is a string with term1, but it does not have the other term'
patterns = ['term1', 'term2']  # not used by the anchor examples below

# "^" anchors the match at the start of the string: the text starts with "T".
s1 = re.findall("^T", text)
print(s1)

# The text does not start with "h", so nothing matches.
s2 = re.findall("^h", text)
print(s2)

# "$" anchors the match at the end of the string: the text ends with "m".
s1 = re.findall("m$", text)
print(s1)

# The text does not end with "r", so nothing matches.
s2 = re.findall("r$", text)
print(s2)

['T']
[]
['m']
[]


Matching literal characters

In [51]:
website = "www.medium.com"
print(website)

# An unescaped "." matches ANY character, so "www.medium.com" would also match
# text such as "wwwxmedium.com". Escaping the dots makes them literal.
website_pattern = r"www\.medium\.com"
re.findall(website_pattern, website)

www.medium.com


['www.medium.com']

Quantifiers and Alternation:<br/><br/>
 <img src="images/REs_Quantifiers_Alternation.png" width="300"/>

In [52]:
phone_numbers = """123-456-7890
987.654.311
234-567-8901
654.111.987
345-678-9012
311.654.978
456-789-1111
111111-2345"""

# "*" means zero or more of the preceding token, so "1*" matches every run of
# 1s and also produces an empty match wherever the next character is not a 1
# (and once more at the end of the text).
list1 = re.findall("1*", phone_numbers)
print(list1)

# The list length is the total number of matches (runs of 1s plus all the
# empty matches); it is NOT the number of 1s in the text.
print(len(list1))

['1', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '11', '', '', '', '', '', '', '', '', '', '', '', '', '1', '', '', '', '', '', '111', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '1', '', '', '', '11', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '1111', '', '111111', '', '', '', '', '', '']
88


In [53]:
# Eight sample numbers, one per line.
phone_numbers = "\n".join([
    "123-456-7890",
    "987.654.311",
    "234-567-8901",
    "654.111.987",
    "345-678-9012",
    "311.654.978",
    "456-789-1111",
    "111111-2345",
])
print(phone_numbers)

# "+" requires at least one occurrence, so only non-empty runs of 1s match.
list1 = re.findall("1+", phone_numbers)
print(list1)

123-456-7890
987.654.311
234-567-8901
654.111.987
345-678-9012
311.654.978
456-789-1111
111111-2345
['1', '11', '1', '111', '1', '11', '1111', '111111']


In [54]:
# Eight sample numbers, one per line.
phone_numbers = (
    "123-456-7890\n"
    "987.654.311\n"
    "234-567-8901\n"
    "654.111.987\n"
    "345-678-9012\n"
    "311.654.978\n"
    "456-789-1111\n"
    "111111-2345"
)
print(phone_numbers)

# "?" means zero or one: every 1 is matched on its own, and every position
# that does not hold a 1 yields an empty match.
list1 = re.findall("1?", phone_numbers)
print(list1)


123-456-7890
987.654.311
234-567-8901
654.111.987
345-678-9012
311.654.978
456-789-1111
111111-2345
['1', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '1', '1', '', '', '', '', '', '', '', '', '', '', '', '', '1', '', '', '', '', '', '1', '1', '1', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '1', '', '', '', '1', '1', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '1', '1', '1', '1', '', '1', '1', '1', '1', '1', '1', '', '', '', '', '', '']


In [55]:
# Eight sample numbers, one per line.
phone_numbers = "\n".join((
    "123-456-7890",
    "987.654.311",
    "234-567-8901",
    "654.111.987",
    "345-678-9012",
    "311.654.978",
    "456-789-1111",
    "111111-2345",
))
print(phone_numbers)

# "{2}" matches exactly two consecutive 1s. Longer runs are consumed as
# non-overlapping pairs, e.g. "1111" contributes two matches.
list1 = re.findall("1{2}", phone_numbers)
print(list1)

123-456-7890
987.654.311
234-567-8901
654.111.987
345-678-9012
311.654.978
456-789-1111
111111-2345
['11', '11', '11', '11', '11', '11', '11', '11']


In [56]:
# Eight sample numbers, one per line.
phone_numbers = (
    "123-456-7890\n"
    "987.654.311\n"
    "234-567-8901\n"
    "654.111.987\n"
    "345-678-9012\n"
    "311.654.978\n"
    "456-789-1111\n"
    "111111-2345"
)
print(phone_numbers)

# "{2,}" matches two or more consecutive 1s; being greedy, it takes each
# whole run in a single match.
list1 = re.findall("1{2,}", phone_numbers)
print(list1)

123-456-7890
987.654.311
234-567-8901
654.111.987
345-678-9012
311.654.978
456-789-1111
111111-2345
['11', '111', '11', '1111', '111111']


In [69]:
phone_numbers = """123-456-7890
987.654.311
234-567-8901
654.111.987
345-678-9012
311.654.978
456-789-1111
111111-2345"""
print(phone_numbers)

# "{2,4}" matches between two and four consecutive 1s (2, 3 or 4 — not just
# "two or four"). It is greedy, so a run of six 1s is split into "1111" + "11".
list1 = re.findall("1{2,4}", phone_numbers)

print(list1)

123-456-7890
987.654.311
234-567-8901
654.111.987
345-678-9012
311.654.978
456-789-1111
111111-2345
['11', '111', '11', '1111', '1111', '11']


In [71]:
# Eight sample numbers, one per line.
phone_numbers = "\n".join([
    "123-456-7890",
    "987.654.311",
    "234-567-8901",
    "654.111.987",
    "345-678-9012",
    "311.654.978",
    "456-789-1111",
    "111111-2345",
])
print(phone_numbers)

# "+?" is the lazy form of "+": it takes as few 1s as possible, i.e. one per
# match, so the number of matches equals the number of 1s in the text.
list1 = re.findall("1+?", phone_numbers)
print(list1)
print(len(list1))

123-456-7890
987.654.311
234-567-8901
654.111.987
345-678-9012
311.654.978
456-789-1111
111111-2345
['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1']
20


In [72]:
# Eight sample numbers, one per line.
phone_numbers = (
    "123-456-7890\n"
    "987.654.311\n"
    "234-567-8901\n"
    "654.111.987\n"
    "345-678-9012\n"
    "311.654.978\n"
    "456-789-1111\n"
    "111111-2345"
)
print(phone_numbers)

# "|" is alternation: match either "12" or "21".
list1 = re.findall("12|21", phone_numbers)
print(list1)

123-456-7890
987.654.311
234-567-8901
654.111.987
345-678-9012
311.654.978
456-789-1111
111111-2345
['12', '12']


Matching by Patterns:

<img src="images/REs_Character_Classes.png" width="300"/>

In [73]:
# Match the literal "." character.
# Raw strings (r"...") pass backslashes through to the regex engine unchanged
# and avoid Python's invalid-escape-sequence warning for "\^" and "\.".
special_characters = r"[\^$.|?*+()"
print(special_characters)
re.findall(r"\.", special_characters)

[\^$.|?*+()


['.']

In [74]:
# Match the literal "|" character.
# Raw strings (r"...") pass backslashes through to the regex engine unchanged
# and avoid Python's invalid-escape-sequence warning for "\^" and "\|".
special_characters = r"[\^$.|?*+()"
print(special_characters)
re.findall(r"\|", special_characters)

[\^$.|?*+()


['|']

In [75]:
# An unescaped "." matches any single character except a newline, so every
# character of the string is returned as its own match.
# The raw string keeps the backslash literal without an invalid-escape warning.
special_characters = r"[\^$.|?*+()"
print(special_characters)
re.findall(".", special_characters)

[\^$.|?*+()


['[', '\\', '^', '$', '.', '|', '?', '*', '+', '(', ')']

In [76]:
phone_numbers = """987.654.311
234-567-8901
654.111.987
345-678-9012
311.654.978
456-789-1111
111111-2345"""
print(phone_numbers)

# "\D" matches any character that is NOT a digit; here that means the dots,
# the hyphens and the newlines. The raw string keeps "\D" from being read as
# an (invalid) Python string escape.
list3 = re.findall(r"\D", phone_numbers)

# print list
print(list3)

987.654.311
234-567-8901
654.111.987
345-678-9012
311.654.978
456-789-1111
111111-2345
['.', '.', '\n', '-', '-', '\n', '.', '.', '\n', '-', '-', '\n', '.', '.', '\n', '-', '-', '\n', '-']


In [64]:
sentence2 = "States are represented in the House of Representatives in approximate proportion to their populations, There are currently 435 voting-representatives."
print(sentence2)

# "\W" matches any character that is NOT a word character (letter, digit or
# underscore); here that means the spaces and the punctuation. The raw string
# keeps "\W" from being read as an (invalid) Python string escape.
list4 = re.findall(r"\W", sentence2)

# print list
print(list4)

States are represented in the House of Representatives in approximate proportion to their populations, There are currently 435 voting-representatives.
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ',', ' ', ' ', ' ', ' ', ' ', '-', '.']


In [65]:
# \w   = any word character (letter, digit or underscore)
# {4,} = four or more repetitions

sentence1 = "1000 We need to understand that if we all work on inclusion together, it's going to be faster, broader, better, and more thorough than anything we can do on our own"
print(sentence1)

# Translation: \w{4,} matches every run of [four or more] [word characters]
# and puts each run in a list. Digits count as word characters, so "1000" is
# included. The raw string keeps "\w" from being an invalid string escape.
SentenceList = re.findall(r"\w{4,}", sentence1)
print(SentenceList)

1000 We need to understand that if we all work on inclusion together, it's going to be faster, broader, better, and more thorough than anything we can do on our own
['1000', 'need', 'understand', 'that', 'work', 'inclusion', 'together', 'going', 'faster', 'broader', 'better', 'more', 'thorough', 'than', 'anything']


In [66]:
# Match [exactly two word characters] with a whitespace character on both
# sides. The surrounding whitespace is part of each match and is consumed, so
# two-letter words at the very start or end of the text, or ones sharing a
# space with the previous match, are not returned.
print(sentence1)
re.findall(r"\s\w{2}\s", sentence1)

1000 We need to understand that if we all work on inclusion together, it's going to be faster, broader, better, and more thorough than anything we can do on our own


[' We ', ' to ', ' if ', ' on ', ' to ', ' we ', ' do ']

In [67]:
# Match an "o", followed by [any single character], followed by an "r",
# and put each match in a list.
print(sentence1)
SentenceList1 = re.findall("o.r", sentence1)

print(SentenceList1)

1000 We need to understand that if we all work on inclusion together, it's going to be faster, broader, better, and more thorough than anything we can do on our own
['our']


In [77]:
phone_numbers = """ 987.654.311
                    234-567-8901
                    654.111.987
                    345-678-9012
                    311.654.978
                    456-789-1111
                    111111-2345"""
print(phone_numbers)

# '\d'  = any digit
# '{3}' = exactly three
# '\.'  = a literal dot (escaped; a bare '.' would match any character)
# '\d'  = any digit
# '{3}' = exactly three
# '\.'  = a literal dot
# '\d'  = any digit
# '{3}' = exactly three

# Translation: `\d{3}\.\d{3}\.\d{3}` matches [exactly three digits] followed by
# [a dot], [exactly three digits] followed by [a dot], and [exactly three
# digits]. Hyphen-separated numbers such as 234-567-8901 are not matched.
# The raw string keeps "\d" and "\." from being invalid Python string escapes.
re.findall(r"\d{3}\.\d{3}\.\d{3}", phone_numbers)

 987.654.311
                    234-567-8901
                    654.111.987
                    345-678-9012
                    311.654.978
                    456-789-1111
                    111111-2345


['987.654.311', '654.111.987', '311.654.978']