Click <a href='https://www.dataquest.io/blog/web-scraping-tutorial-python/'>here</a> to learn about Regular Expressions (RegEx) using Python.

In [None]:
########################
# DO NOT RUN THIS CELL #
########################

a, X, 9, < -- ordinary characters just match themselves exactly.
. (a period) -- matches any single character except newline '\n'
\w -- matches a "word" character: a letter or digit or underbar [a-zA-Z0-9_].
\W -- matches any non-word character.
\b -- matches word boundary (in between a word character and a non word character)
\s -- matches a single whitespace character -- space, newline, return, tab
\S -- matches any non-whitespace character.
\t, \n, \r -- tab, newline, return
\d -- matches any numeric digit [0-9]
\D matches any non-numeric character.
^ -- matches the beginning of the string, or specify omition of certain characters
$ -- matches the end of the string
\ -- escapes special character.
(x|y|z) matches exactly one of x, y or z.
(x) in general is a remembered group. We can get the value of what matched by using the groups() method of the object returned by re.search.
x? matches an optional x character (in other words, it matches an x zero or one times).
x* matches x zero or more times.
x+ matches x one or more times.
x{m,n} matches an x character at least m times, but not more than n times.
?: matches an expression but do not capture it. Non capturing group.
?= matches a suffix but exclude it from capture. Positive lookahead.
a(?=b) will match the "a" in "ab", but not the "a" in "ac"
In other words, a(?=b) matches the "a" which is followed by the string 'b', without consuming what follows the a.
?! matches if suffix is absent. Negative look ahead.
a(?!b) will match the "a" in "ac", but not the "a" in "ab"
?<= positive look behind
[] matches for groupings of consecutive characters
?<! negative look behind

########################
# DO NOT RUN THIS CELL #
########################

What are word boundaries?
--------------------------------------------------
Before the first character in the string, if the first character is a word character.<br>
After the last character in the string, if the last character is a word character.<br>
Between two characters in the string, where one is a word character and the other is not a word character<br>

In [4]:
import re

# open a new data stream into a file
file = open('./names.txt', encoding='utf-8')

# read the text of the file and store it as Python data
data = file.read()

# always close the data stream
file.close()

### Search for names

In [5]:
type(b'e0r9dfizbjpgSd')

bytes

In [6]:
type('asdfdgnf')

str

In [7]:
type(r'aisojdsio')

str

In [8]:
# re.match - Searches from the very beginning of the string only

In [9]:
re.match(r'Hawkins', data)

<re.Match object; span=(0, 7), match='Hawkins'>

In [10]:
re.match(r'Vader', data)

In [11]:
# re.search - Returns the first match anywhere within the string

In [12]:
re.search(r'Hawkins', data)

<re.Match object; span=(0, 7), match='Hawkins'>

In [13]:
re.search(r'Vader', data)

<re.Match object; span=(754, 759), match='Vader'>

In [14]:
re.search(r'\w', data)

<re.Match object; span=(0, 1), match='H'>

In [15]:
re.search(r'\w, \w', data)

<re.Match object; span=(6, 10), match='s, D'>

In [16]:
re.search(r'\w\w\w\w\w\w\w, \w\w\w\w\w', data)

<re.Match object; span=(0, 14), match='Hawkins, Derek'>

In [17]:
re.search(r'\w{7}, \w{5}', data)

<re.Match object; span=(0, 14), match='Hawkins, Derek'>

In [18]:
re.search(r'\w+, \w+', data)

<re.Match object; span=(0, 14), match='Hawkins, Derek'>

### Search for phone numbers

In [19]:
'Derek's Hawkins' 

SyntaxError: invalid syntax (<ipython-input-19-21c77f103b24>, line 1)

In [20]:
'Derek\'s Hawkins' 

"Derek's Hawkins"

In [21]:
re.search(r'(\d\d\d) \d\d\d-\d\d\d\d', data)

<re.Match object; span=(613, 625), match='555 555-5551'>

In [22]:
re.search(r'\(\d\d\d\) \d\d\d-\d\d\d\d', data)

<re.Match object; span=(38, 52), match='(555) 555-5555'>

In [23]:
re.search(r'\(\d\d\d\) \d\d\d-\d\d\d\d', data)

<re.Match object; span=(38, 52), match='(555) 555-5555'>

In [24]:
re.search(r'\(\d{3}\) \d{3}-\d{4}', data)

<re.Match object; span=(38, 52), match='(555) 555-5555'>

In [25]:
re.search(r'\(\d{3}\)\s\d{3}-\d{4}', data)

<re.Match object; span=(38, 52), match='(555) 555-5555'>

<strong>Exercise 1</strong>:<br>
Write a function that checks for n number of consecutive digits and returns the match

In [26]:
re.search(fr'\d{3}', data)

<re.Match object; span=(405, 407), match='43'>

In [27]:
def find_digits(n, searchable_text):
    return re.search(rf'\d{n}', searchable_text)

In [28]:
find_digits(3, data)

<re.Match object; span=(405, 407), match='43'>

In [29]:
# THIS WORKS. DON'T KNOW WHY
def find_digits(n, searchable_text):
    return re.search(fr'\d{{{n}}}', searchable_text)

In [30]:
find_digits(3, data)

<re.Match object; span=(39, 42), match='555'>

In [31]:
def find_digits(n, searchable_text):
    return re.search(fr'\d' * n, searchable_text)

In [32]:
find_digits(3, data)

<re.Match object; span=(39, 42), match='555'>

In [33]:
def find_digits(n, searchable_text):
    return re.search(r'\d{3}', searchable_text)

In [34]:
find_digits(3, data)

<re.Match object; span=(39, 42), match='555'>

In [46]:
# .findall - Returns a list of matching patterns

phone_numbers = "(555) 555-5555 (555) +1 (555) 555-5548 (555) 555-5554 555-555-5553 555-555-5552 555-555-5551 555 555-5550 555 555-5549"

In [48]:
# return with parans
re.findall(r'\(\d{3}\)\s\d{3}-\d{4}', phone_numbers)

['(555) 555-5555', '(555) 555-5548', '(555) 555-5554']

In [49]:
# return w/o parans
re.findall(r'\d{3}-\d{3}-\d{4}', phone_numbers)

['555-555-5553', '555-555-5552', '555-555-5551']

In [50]:
# return with only a space parans
re.findall(r'\d{3}\s\d{3}-\d{4}', phone_numbers)

['555 555-5550', '555 555-5549']

In [52]:
# Search for all phone number patterns
re.findall(r'\+?\d?\s?\(?\d{3}\)?\s?-?\d{3}-\d{4}', phone_numbers)

['(555) 555-5555',
 '+1 (555) 555-5548',
 ' (555) 555-5554',
 ' 555-555-5553',
 ' 555-555-5552',
 ' 555-555-5551',
 ' 555 555-5550',
 ' 555 555-5549']

In [55]:
# Search for names
re.findall(r'\w+,\s\w+', data)

['Hawkins, Derek',
 'Teacher, Coding',
 'Milliken, Connor',
 'Teacher, Coding',
 'Johnson, Joe',
 'Carter, Joel',
 'Österberg, Sven',
 'Governor, Norrbotten',
 'Enchanter, Killer',
 'Butz, Ryan',
 'CEO, Coding',
 'Doctor, The',
 'Lord, Gallifrey',
 'Exampleson, Example',
 'Example, Example',
 'Obama, Barack',
 'President, United',
 'Patel, Ripal',
 'Teacher, Coding',
 'Vader, Darth',
 'Lord, Galactic',
 'Sanz, María',
 'Minister, Spanish']

In [57]:
# Search for emails
re.findall(r'[\d\w\'-+.]+@[-.\d\w]+', data)

['derek@codingtemple.com',
 'connor@codingtemple.com',
 'joejohnson@codingtemple.com',
 'governor@norrbotten.co.se',
 'tim@killerrabbit.com',
 'ryanb@codingtemple.com',
 'doctor+companion@tardis.co.uk',
 'me@example.com',
 'president.44@us.gov',
 'ripalp@codingtemple.com',
 'vader@empire.gov',
 'mtfvs@spain.gov']

In [16]:
information = """
Patel, Ripal : ripalp@codingtemple.com : 555 555-5555
Carter, Joel : joelc@codingtemple.com : (555) 555-5555
Girolamo, Frank : frankg@codingtemple.com : +1 (555) 555-5555
Stanton, Brian : brains@ilstu.edu : 555-555-5555
Castle, Lu : lucasl@codingtemple.com : (555) 555-5555
Butz, Ryan : ryanb@codingtemple.edu : +1 (555) 555-5555
"""

re.findall(r'@[-.\d\w]+', information)

['@codingtemple.com',
 '@codingtemple.com',
 '@codingtemple.com',
 '@ilstu.edu',
 '@codingtemple.com',
 '@codingtemple.edu']

In [71]:
re.findall(r'[\d\w\'-+.]+@[tmplecoding]+[.com]+', information)

['ripalp@codingtemple.com',
 'joelc@codingtemple.com',
 'frankg@codingtemple.com',
 'lucasl@codingtemple.com',
 'ryanb@codingtemple.']

<strong>Exercise 2</strong>:<br>
Use Regular Expressions to pull the last phone number with the country code in the list using .findall()

In [None]:
# our_emails = ['jordanw@codingtemple.orgcom', 'pocohontas1234@gmail.com', 'helloworld@aol..com', 'yourfavoriteband@g6.org', '@codingtemplecom']

##### Exercise 2:

Write a function using regular expressions to find the domain name in the given email addresses (and return None for the invalid email addresses)<br><b>HINT: Use '|' for either or</b>

In [72]:
# skip

##### Exercise 3: 

Use a regular expression to find every number in the given string

In [73]:
# skip

In [None]:
# VERBOSE - re.X
# MULTILINE - re.M
# IGNORECASE - re.I

In [17]:

info = re.findall(r'''
    ([\w]+,\s[\w]+)                             # last name, first name
    \s:\s
    ([\d\w\'-+.]+@[.-/\w\d]+)                   # email
    \s:\s
    (\+?\d?\s?\(?\d{3}\)?\s?-?\d{3}-\d{4})      # phone number
''', information, re.X)

people = []

for person in info:
    person_dict = {
        'name': person[0],
        'email': person[1],
        'phone': person[2],
    }
    people.append(person_dict)
    
for p in people:
    print(f"Name: {p['name']}")
    print(f"Email: {p['email']}")
    print(f"Phone: {p['phone']}")
    print()

Name: Patel, Ripal
Email: ripalp@codingtemple.com
Phone: 555 555-5555

Name: Carter, Joel
Email: joelc@codingtemple.com
Phone: (555) 555-5555

Name: Girolamo, Frank
Email: frankg@codingtemple.com
Phone: +1 (555) 555-5555

Name: Stanton, Brian
Email: brains@ilstu.edu
Phone: 555-555-5555

Name: Castle, Lu
Email: lucasl@codingtemple.com
Phone: (555) 555-5555

Name: Butz, Ryan
Email: ryanb@codingtemple.edu
Phone: +1 (555) 555-5555



In [12]:
newer_information = """
Hawkins, Derek : derekh@codingtemple.com : 555-555-5555
Davis, Miles : milesd@codingtemple.com : (555) 555-5555
Davitt, Sam : samd@codingtemple.edu : +1 (555) 555-5555
"""

In [13]:
new_info = re.compile(r'''
    (?P<name>[\w]+,\s[\w]+)                              # last name, first name
    \s:\s
    (?P<email>[\d\w\'-+.]+@[.-/\w\d]+)                   # email
    \s:\s
    (?P<phone>\+?\d?\s?\(?\d{3}\)?\s?-?\d{3}-\d{4})      # phone number
''', re.X)

In [14]:
for p in new_info.finditer(newer_information):
    print(f"Name: {p.group('name')}\nEmail: {p.group('email')}\nPhone: {p.group('phone')}\n")

Name: Hawkins, Derek
Email: derekh@codingtemple.com
Phone: 555-555-5555

Name: Davis, Miles
Email: milesd@codingtemple.com
Phone: (555) 555-5555

Name: Davitt, Sam
Email: samd@codingtemple.edu
Phone: +1 (555) 555-5555



In [143]:
re.findall(r'[\s\w]+?,[\s\w]+', data)

['Hawkins, Derek\tderek',
 '5555\tTeacher, Coding Temple\t',
 'derekhawkins\nMilliken, Connor\tconnor',
 '5554\tTeacher, Coding Temple\nJohnson',
 'com\t\tCarter, Joel\nÖsterberg',
 'se\t\tGovernor, Norrbotten\t',
 'sverik\n, Tim\ttim',
 'com\t\tEnchanter, Killer Rabbit Cave\nButz',
 '5543\tCEO, Coding Temple\t',
 'ryanbutz\nDoctor, The\tdoctor',
 'uk\t\tTime Lord, Gallifrey\nExampleson',
 '5552\tExample, Example Co',
 'example\nObama, Barack\tpresident',
 '5551\tPresident, United States of America\t',
 'potus44\nPatel, Ripal\tripalp',
 '5553\tTeacher, Coding Temple\t',
 'ripalp\nVader, Darth\tdarth',
 '4444\tSith Lord, Galactic Empire\t',
 'darthvader\nFernández de la Vega Sanz, María Teresa\tmtfvs',
 'gov\t\tFirst Deputy Prime Minister, Spanish Govt']

### Homework Exercise <br>
<p>Print each persons name and twitter handle etc., using groups, should look like:</p>

In [107]:
# [
#     ([first name] [last name],
#      email, 
#      phone,
#      title,
#      Twitter handle)
# ]

In [2]:
import re

# open a new data stream into a file
file = open('./names.txt', encoding='utf-8')

# read the text of the file and store it as Python data
data = file.read()

# always close the data stream
file.close()

In [38]:
info = re.findall(r"""
    ([\w]+?,\s[\w]+)                             # last name, first name
    \s
    ([\d\w\'-+.]+@[.-/\w\d]+)?                   # email
    \s
    (\+?\d?\s?\(?\d{3}\)?\s?-?\d{3}-\d{4})?      # phone number
    \s
    ([\w]+,\s[\w]+\s[\w]+)?                             # title
    (\s@[\w]+)?                                    # twitter
""", data, re.X)

people = []

for person in info:
    person_dict = {
        'name': person[0],
        'email': person[1],
        'phone': person[2],
        'title': person[3],
        'twitter': person[4],
    }
    people.append(person_dict)
    
for p in people:
    print(f"Name: {p['name']}")
    print(f"Email: {p['email']}")
    print(f"Phone: {p['phone']}")
    print(f"Title: {p['title']}")
    print(f"Twitter: {p['twitter']}")
    print()

# twitter = re.findall(r'(\s@[\w]+)', data)
# print(info)

Name: Hawkins, Derek
Email: derek@codingtemple.com
Phone: (555) 555-5555
Title: Teacher, Coding Temple
Twitter: 	@derekhawkins

Name: Milliken, Connor
Email: connor@codingtemple.com
Phone: (555) 555-5554
Title: Teacher, Coding Temple
Twitter: 

Name: Johnson, Joe
Email: joejohnson@codingtemple.com
Phone: 
Title: Carter, Joel
Österberg
Twitter: 

Name: Butz, Ryan
Email: ryanb@codingtemple.com
Phone: (555) 555-5543
Title: CEO, Coding Temple
Twitter: 	@ryanbutz

Name: Doctor, The
Email: doctor+companion@tardis.co.uk
Phone: 
Title: 
Twitter: 

Name: Exampleson, Example
Email: me@example.com
Phone: 555-555-5552
Title: Example, Example Co
Twitter: 

Name: Obama, Barack
Email: president.44@us.gov
Phone: 555 555-5551
Title: President, United States
Twitter: 

Name: Patel, Ripal
Email: ripalp@codingtemple.com
Phone: (555) 555-5553
Title: Teacher, Coding Temple
Twitter: 	@ripalp

