In [40]:
# Import the re module, Python's standard library for regular expressions
import re

In [41]:
# Example 1: report whether the word "good" occurs anywhere in the sentence
text = "This is a good day"

# re.search yields a Match object (truthy) on success and None otherwise,
# so its result can drive the choice of message directly
print("Wonderful!" if re.search(pattern="good", string=text) else ":<")

Wonderful!


In [42]:
# Example 2: cut the sentence apart at every occurrence of a name
text = "Amy works diligently. Amy gets good grades. Our student Amy is successful."

# Every "Amy" acts as a separator and the pieces in between come back as a list
re.split(pattern="Amy", string=text)

['',
 ' works diligently. ',
 ' gets good grades. Our student ',
 ' is successful.']

In [43]:
# Collect every non-overlapping occurrence of "Amy" in the text
re.findall(pattern="Amy", string=text)

['Amy', 'Amy', 'Amy']

In [44]:
# Example 3: anchors
text = "Amy works diligently. Amy gets good grades. Our student Amy is successful."

# "^" anchors the pattern to the start of the text
# "$" anchors the pattern to the end of the text
# Here: succeed only if the text begins with "Amy"
re.search(pattern="^Amy", string=text)

<re.Match object; span=(0, 3), match='Amy'>

# Patterns and Character Classes

In [45]:
# Example 4: a student's grade history encoded as one letter per grade
grades = "ACAAAABCBCBAA"

# Pull out each individual "B"
re.findall(pattern="B", string=grades)

['B', 'B', 'B']

In [46]:
# A character class [...] matches any ONE of the listed characters,
# so this returns every single "A" or "B" in order of appearance
re.findall(pattern="[AB]", string=grades)

['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'A', 'A']

In [47]:
# An "A" immediately followed by a "B" or a "C", written with a range class
print(re.findall(pattern="[A][B-C]", string=grades))

# The same search spelled out with the alternation operator "|"
print(re.findall(pattern="AB|AC", string=grades))

['AC', 'AB']
['AC', 'AB']


In [48]:
# A leading "^" inside [...] negates the class: every character that is not "A"
re.findall(pattern="[^A]", string=grades)

['C', 'B', 'C', 'B', 'C', 'B']

In [49]:
# "^" has two meanings depending on where it appears:
#   inside []  -> negation: match any character NOT listed
#   outside [] -> anchor: the match must begin at the start of the text
# Here: return the first character only if it is something other than "A"
re.findall(pattern="^[^A]", string=grades)

[]

# Quantifiers

In [50]:
# Quantifiers are the number of times a pattern is matched
# The most basic quantifier is expressed as e{m,n} (no space inside the braces)
# e: expression or character to be matched
# m: minimum number of times
# n: maximum number of times

In [51]:
# Runs of between 2 and 10 consecutive A's
# NOTE: the braces must not contain a space
re.findall(pattern="A{2,10}", string=grades)

['AAAA', 'AA']

In [52]:
# Two A's back to back, written as two single-character quantifiers in sequence
re.findall(pattern="A{1,1}A{1,1}", string=grades)

['AA', 'AA', 'AA']

In [53]:
# Equivalent form: spell the two A's out literally
re.findall(pattern="AA", string=grades)

['AA', 'AA', 'AA']

In [54]:
# A single number in the braces means m = n, i.e. exactly that many repetitions
re.findall(pattern="A{2}", string=grades)

['AA', 'AA', 'AA']

In [55]:
# A decreasing trend in the grades: one or more A's, then B's, then C's
# (each run limited to at most 10 characters)
re.findall(pattern="A{1,10}B{1,10}C{1,10}", string=grades)

['AAAABC']

In [56]:
# Example: load the whole text file into the variable `wiki`
# (the with-block closes the file as soon as the read finishes)
with open("ferpa.txt", mode="r") as file:
    wiki = file.read()

In [57]:
# Get a list of headers
# just the last word of the headers
# The pattern is a raw string (r"...") so that "\[" reaches the regex engine
# unchanged; in a normal string literal "\[" is an invalid escape sequence
# and triggers a warning on recent Python versions.
re.findall(r"[a-zA-Z]{1,100}\[edit\]", wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [58]:
# Use \w to match any word character: letters, digits and the underscore
# Raw string (r"...") keeps "\w" and "\[" intact for the regex engine instead
# of relying on Python passing invalid string escapes through with a warning.
re.findall(r"[\w]{1,100}\[edit\]", wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [59]:
# \s matches any whitespace character.
# * matches 0 or more times (shorthand that replaces the {} quantifier)
# Raw string (r"...") so the backslashes in "\w" and "\[" are not treated as
# (invalid) Python string escapes.
re.findall(r"[\w]*\[edit\]", wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [60]:
# Allow spaces inside the title by adding the space character to the class
# Raw string (r"...") so the backslashes in "\w" and "\[" are not treated as
# (invalid) Python string escapes.
re.findall(r"[\w ]*\[edit\]", wiki)

['Overview[edit]',
 'Access to public records[edit]',
 'Student medical records[edit]']

In [61]:
# Iterating through list of titles
# Both patterns are raw strings (r"...") so that "\w" and "\[" are handed to
# the regex engine as written rather than being invalid Python string escapes.
for title in re.findall(r"[\w ]*\[edit\]", wiki):
    # Split on the first "[" and keep the part before it, dropping "[edit]"
    print(re.split(r"\[", title)[0])

Overview
Access to public records
Student medical records


# Groups

In [62]:
# Match with different patterns, called GROUPS
# Using ()
# Example: with two groups, findall returns one (title, "[edit]") tuple per match
# Raw string (r"...") avoids the invalid string escapes "\w" and "\[".
re.findall(r"([\w ]*)(\[edit\])", wiki)

[('Overview', '[edit]'),
 ('Access to public records', '[edit]'),
 ('Student medical records', '[edit]')]

In [63]:
# Iterate over Match objects using finditer() and show each match's groups
# Raw string (r"...") avoids the invalid string escapes "\w" and "\[".
for item in re.finditer(r"([\w ]*)(\[edit\])", wiki):
    print(item.groups())

('Overview', '[edit]')
('Access to public records', '[edit]')
('Student medical records', '[edit]')


In [64]:
# Get an individual group using group(number)
# group(0) is the whole match; group(1), group(2), ... are the parenthesised parts
# Raw string (r"...") avoids the invalid string escapes "\w" and "\[".
for item in re.finditer(r"([\w ]*)(\[edit\])", wiki):
    print(item.group(1))

Overview
Access to public records
Student medical records


In [65]:
# Naming groups
# Syntax (?P<name>...)
# ?P indicates that this is an extension to basic regexes
# <name> becomes the key under which the group appears in groupdict()
# Raw string (r"...") avoids the invalid string escapes "\w" and "\[".
for item in re.finditer(r"(?P<title>[\w ]*)(?P<edit_link>\[edit\])", wiki):
    print(item.groupdict()["title"])

Overview
Access to public records
Student medical records


In [66]:
# Get whole dictionary
# NOTE: `item` is not assigned in this cell; it is the loop variable left over
# from an earlier finditer loop, so this prints the groups of whichever match
# that loop visited last (and fails with NameError on a kernel where no such
# loop has run).
print(item.groupdict())

{'title': 'Student medical records', 'edit_link': '[edit]'}


In [67]:
# []: match individual character patterns
# (): match groups
# *: zero or more times
# +: 1 or more times
# ?P<name>: name a group (Python extension to the basic regex syntax)
# e{n}: expression e repeated exactly n times in a row
# \w: word character
# \d: digit character
# \s: whitespace character

# Look-ahead and Look-behind

In [68]:
# Require "[edit]" to follow the title but keep it out of the match itself
# Use a look-ahead: (?=...)
# Raw string (r"...") avoids the invalid string escapes "\w" and "\[".
for item in re.finditer(r"(?P<title>[\w ]+)(?=\[edit\])", wiki):
    print(item)

<re.Match object; span=(0, 8), match='Overview'>
<re.Match object; span=(2715, 2739), match='Access to public records'>
<re.Match object; span=(3692, 3715), match='Student medical records'>


# Example: Wikipedia Data

In [69]:
# Replace `wiki` with the contents of the second sample file
with open("buddhist.txt", mode="r") as file:
    wiki = file.read()

In [73]:
# Multi-line regexes
# With re.VERBOSE, unescaped whitespace and newlines in the pattern are
# ignored, so each group can sit on its own line; a literal space must be
# written as "\ ".
# The pattern is a raw triple-quoted string so that "\ " and "\w" are passed
# to the regex engine unchanged instead of being invalid Python string escapes.
pattern = r"""
(?P<title>.*)
(-\ located\ in\ )
(?P<city>\w*)
(,\ )
(?P<state>\w*)"""

# Call finditer() and show the named groups of every match
for item in re.finditer(pattern, wiki, re.VERBOSE):
    print(item.groupdict())