Click <a href='https://www.dataquest.io/blog/web-scraping-tutorial-python/'>here</a> to learn about Regular Expressions (RegEx) using Python.

In [None]:
########################
# DO NOT RUN THIS CELL #
########################

a, X, 9, < -- ordinary characters just match themselves exactly.
. (a period) -- matches any single character except newline '\n'
\w -- matches a "word" character: a letter or digit or underbar [a-zA-Z0-9_].
\W -- matches any non-word character.
\b -- matches word boundary (in between a word character and a non word character)
\s -- matches a single whitespace character -- space, newline, return, tab
\S -- matches any non-whitespace character.
\t, \n, \r -- tab, newline, return
\d -- matches any numeric digit [0-9]
\D matches any non-numeric character.
^ -- matches the beginning of the string; as the first character inside square brackets ([^...]) it negates the set, i.e. specifies the omission of the listed characters
$ -- matches the end of the string
\ -- escapes special character.
(x|y|z) matches exactly one of x, y or z.
(x) in general is a remembered group. We can get the value of what matched by using the groups() method of the object returned by re.search.
x? matches an optional x character (in other words, it matches an x zero or one times).
x* matches x zero or more times.
x+ matches x one or more times.
x{m,n} matches an x character at least m times, but not more than n times.
?: matches an expression but does not capture it. Non-capturing group.
?= matches a suffix but exclude it from capture. Positive lookahead.
a(?=b) will match the "a" in "ab", but not the "a" in "ac"
In other words, a(?=b) matches the "a" which is followed by the string 'b', without consuming what follows the a.
?! matches if suffix is absent. Negative look ahead.
a(?!b) will match the "a" in "ac", but not the "a" in "ab"
?<= positive look behind
[] matches any single character from the set listed inside the brackets, e.g. [abc] or the range [a-z]
?<! negative look behind

########################
# DO NOT RUN THIS CELL #
########################

What are word boundaries?
--------------------------------------------------
Before the first character in the string, if the first character is a word character.<br>
After the last character in the string, if the last character is a word character.<br>
Between two characters in the string, where one is a word character and the other is not a word character<br>

In [2]:
import re

# Read the whole text file into memory. The `with` block closes the file
# handle automatically, even if read() raises an exception.
with open('./names.txt', encoding='utf-8') as file:
    data = file.read()

In [None]:
# find_digits(4, data) => <re.Match object; span=(XX, XX), match='5555'>

In [3]:
# re.match() only succeeds when the pattern matches at the very start of the string
re.match(pattern=r'Hawkins', string=data)

<re.Match object; span=(0, 7), match='Hawkins'>

In [4]:
# The text does not start with 'Time Lord', so match() returns None and nothing is displayed
re.match(pattern=r'Time Lord', string=data)

In [5]:
# .search() - scans the whole string and returns the first match, wherever it occurs
re.search(r'Time Lord', data)

<re.Match object; span=(480, 489), match='Time Lord'>

In [7]:
# Ordinary characters match themselves, so this finds the first literal 'Johnson'
re.search(pattern=r'Johnson', string=data)

<re.Match object; span=(169, 176), match='Johnson'>

In [8]:
# One word character, a comma and a space, then one more word character
re.search(pattern=r'\w, \w', string=data)

<re.Match object; span=(6, 10), match='s, D'>

In [9]:
# Seven word characters, a comma and a space, then five word characters,
# spelled out one \w at a time
re.search(pattern=r'\w\w\w\w\w\w\w, \w\w\w\w\w', string=data)

<re.Match object; span=(0, 14), match='Hawkins, Derek'>

In [10]:
# Same search written with {count} quantifiers instead of repeating \w
re.search(pattern=r'\w{7}, \w{5}', string=data)

<re.Match object; span=(0, 14), match='Hawkins, Derek'>

In [12]:
# A phone number such as (555) 555-5555; the parentheses are escaped with a
# backslash so they match literally instead of opening a group
re.search(pattern=r'\(\d\d\d\) \d\d\d-\d\d\d\d', string=data)

<re.Match object; span=(38, 52), match='(555) 555-5555'>

<strong>Exercise 1</strong>:<br>
Write a function that searches a string for n consecutive digits and returns the match

In [13]:
"a" * 67

'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'

In [15]:
def find_digits(n, searchable_text):
    """Find the first run of ``n`` consecutive digits in ``searchable_text``.

    Args:
        n: How many digits in a row to look for; must not be negative.
        searchable_text: The string to scan.

    Returns:
        The ``re.Match`` for the first ``n`` consecutive digits, or ``None``
        when the text contains no such run.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be zero or greater, got {n}")
    # \d{n} is the quantifier form of writing \d n times in a row.
    return re.search(r'\d{%d}' % n, searchable_text)

# Look for the first run of three consecutive digits in the file text
find_digits(3, data)

<re.Match object; span=(39, 42), match='555'>

In [17]:
# .findall() - collects every match found anywhere in the text into a list.
# Sample text holding one phone number in each of three formats:
phone_numbers = (
    "(555) 555-5555 "  # parentheses around the area code
    "555-555-5555 "    # hyphen after the area code
    "555 555-5555"     # space after the area code
)

In [19]:
# Only the format with parentheses around the area code
re.findall(pattern=r'\(\d{3}\) \d{3}-\d{4}', string=phone_numbers)

['(555) 555-5555']

In [20]:
# \(? and \)? make each parenthesis optional, so numbers without parentheses match as well
re.findall(pattern=r'\(?\d{3}\)? \d{3}-\d{4}', string=phone_numbers)

['(555) 555-5555', '555 555-5555']

In [22]:
# \s?-? allows an optional whitespace character and an optional hyphen after
# the area code, which adds the fully hyphenated format
re.findall(pattern=r'\(?\d{3}\)?\s?-?\d{3}-\d{4}', string=phone_numbers)

['(555) 555-5555', '555-555-5555', '555 555-5555']

In [23]:
# Every "word, word" pair in the text: one or more word characters,
# a comma and a space, then one or more word characters
re.findall(pattern=r'\w+, \w+', string=data)

['Hawkins, Derek',
 'Teacher, Coding',
 'Milliken, Connor',
 'Teacher, Coding',
 'Johnson, Joe',
 'Carter, Joel',
 'Österberg, Sven',
 'Governor, Norrbotten',
 'Enchanter, Killer',
 'Butz, Ryan',
 'CEO, Coding',
 'Doctor, The',
 'Lord, Gallifrey',
 'Exampleson, Example',
 'Example, Example',
 'Obama, Barack',
 'President, United',
 'Patel, Ripal',
 'Teacher, Coding',
 'Vader, Darth',
 'Lord, Galactic',
 'Sanz, María',
 'Minister, Spanish']

In [24]:
# Search for emails
# Inside [...], a hyphen between two characters defines a range, so the hyphen
# is placed last in each set to be matched literally. \w already covers digits.
re.findall(r"[\w'+.-]+@[\w.-]+", data)

['derek@codingtemple.com',
 'connor@codingtemple.com',
 'joejohnson@codingtemple.com',
 'governor@norrbotten.co.se',
 'tim@killerrabbit.com',
 'ryanb@codingtemple.com',
 'doctor+companion@tardis.co.uk',
 'me@example.com',
 'president.44@us.gov',
 'ripalp@codingtemple.com',
 'vader@empire.gov',
 'mtfvs@spain.gov']

In [26]:
# re.VERBOSE / re.X - lets a regular expression be spread over several lines
#                     and carry its own # comments
# re.IGNORECASE / re.I - matches letters regardless of case

# Sample records, one per line, in the form "Last, First: email : phone"
information = """
Patel, Ripal: ripalp@codingtemple.com : 555 555-5555
Carter, Joel: joelc@codingtemple.com : (555) 555-5555
Butz, Ryan: ryanb@codingtemple.gov : (555) 555-5555
Stanton, Brain: brians@codingtemple.com : 555-555-5555
Davitt, Sam: samd@codingtemple.com : (555) 555-5555
"""

# Bare last expression: displays the raw string, newline characters included
information

'\nPatel, Ripal: ripalp@codingtemple.com : 555 555-5555\nCarter, Joel: joelc@codingtemple.com : (555) 555-5555\nButz, Ryan: ryanb@codingtemple.gov : (555) 555-5555\nStanton, Brain: brians@codingtemple.com : 555-555-5555\nDavitt, Sam: samd@codingtemple.com : (555) 555-5555\n'

In [28]:
re.findall(r'''
    @[\w.-]+ # the @ symbol, then one or more word characters, dots or hyphens
''', information, re.X|re.I)

['@codingtemple.com',
 '@codingtemple.com',
 '@codingtemple.gov',
 '@codingtemple.com',
 '@codingtemple.com']

In [38]:
# With several capture groups, re.findall() returns one tuple per match that
# holds the text captured by each group, in order.
info = re.findall(r'''
   (\w+,\s\w+)                         # last, first name
   (:\s[\w'+.-]+@[\w.-]+)              # colon separator, then the email (hyphen last in each set so it is literal)
   (\s:\s\(?\d{3}\)?\s?-?\d{3}-\d{4})  # colon separator, then the phone
''', information, re.X)

info

[('Patel, Ripal', ': ripalp@codingtemple.com', ' : 555 555-5555'),
 ('Carter, Joel', ': joelc@codingtemple.com', ' : (555) 555-5555'),
 ('Butz, Ryan', ': ryanb@codingtemple.gov', ' : (555) 555-5555'),
 ('Stanton, Brain', ': brians@codingtemple.com', ' : 555-555-5555'),
 ('Davitt, Sam', ': samd@codingtemple.com', ' : (555) 555-5555')]

In [41]:
# Build one dict per record. The email and phone captures still carry their
# leading ": " and " : " separators, so those are sliced off here.
info_list = [
    {'name': name, 'email': email[2:], 'phone': phone[3:]}
    for name, email, phone in info
]

info_list

[{'name': 'Patel, Ripal',
  'email': 'ripalp@codingtemple.com',
  'phone': '555 555-5555'},
 {'name': 'Carter, Joel',
  'email': 'joelc@codingtemple.com',
  'phone': '(555) 555-5555'},
 {'name': 'Butz, Ryan',
  'email': 'ryanb@codingtemple.gov',
  'phone': '(555) 555-5555'},
 {'name': 'Stanton, Brain',
  'email': 'brians@codingtemple.com',
  'phone': '555-555-5555'},
 {'name': 'Davitt, Sam',
  'email': 'samd@codingtemple.com',
  'phone': '(555) 555-5555'}]

In [44]:
# Pre-compiled pattern with named groups (?P<name>...), so each captured part
# can be read back by name instead of by position.
compiler = re.compile(r'''
    (?P<name>\w+,\s\w+)                          # last, first name
    (?P<email>:\s[\w'+.-]+@[\w.-]+)              # colon separator, then the email (hyphen last in each set so it is literal)
    (?P<phone>\s:\s\(?\d{3}\)?\s?-?\d{3}-\d{4})  # colon separator, then the phone
''', re.X)

In [53]:
# Walk over every match and read the named groups back in one call.
for record in compiler.finditer(information):
    name, email, phone = record.group('name', 'email', 'phone')
    # Drop the ": " and " : " separators captured at the front of each group.
    print(f"Name: {name}\nEmail: {email[2:]}\nPhone: {phone[3:]}\n")

Name: Patel, Ripal
Email: ripalp@codingtemple.com
Phone: 555 555-5555

Name: Carter, Joel
Email: joelc@codingtemple.com
Phone: (555) 555-5555

Name: Butz, Ryan
Email: ryanb@codingtemple.gov
Phone: (555) 555-5555

Name: Stanton, Brain
Email: brians@codingtemple.com
Phone: 555-555-5555

Name: Davitt, Sam
Email: samd@codingtemple.com
Phone: (555) 555-5555



### Homework Exercise <br>
<p>Print each persons name and twitter handle etc., using groups, should look like:</p>

In [107]:
# [
#     (First and last name,
#      email, 
#      phone,
#      title,
#      Twitter handle)
# ]