# Advanced Regular Expressions

Complete the following set of exercises to solidify your knowledge of regular expressions.

In [2]:
import re

### 1. Use a regular expression to find and extract all vowels in the following text.

In [3]:
text = "This is going to be a sentence with a good number of vowels in it."

In [4]:
text_vowels = re.findall(r'[aeiou]', text.lower()) #lower case everything and find any vowel in string 'text'
text_vowels

['i',
 'i',
 'o',
 'i',
 'o',
 'e',
 'a',
 'e',
 'e',
 'e',
 'i',
 'a',
 'o',
 'o',
 'u',
 'e',
 'o',
 'o',
 'e',
 'i',
 'i']

### 2. Use a regular expression to find and extract all occurrences and tenses (singular and plural) of the word "puppy" in the text below.

In [5]:
text = "The puppy saw all the rest of the puppies playing and wanted to join them. I saw this and wanted a puppy of my own!"

In [6]:
puppy_words = re.findall(r'\bpupp\w*', text.lower())
#find a word in 'text' that:
    #\bpupp starts with 'pupp'
    #\w followed by any word character [a-b]
    # * and what follows
puppy_words

['puppy', 'puppies', 'puppy']

### 3. Use a regular expression to find and extract all tenses (present and past) of the word "run" in the text below.

In [7]:
text = "I ran the relay race the only way I knew how to run it."

In [8]:
run_words = re.findall(r'\br\Sn\b', text.lower())
# \br where \b stands for starts with and 'r' is the condition
# \S stands for a string contains any non-whitespace character
# n\b where n stands for condition and \b for ends with
run_words

['ran', 'run']

### 4. Use a regular expression to find and extract all words that begin with the letter "r" from the previous text.

In [9]:
r_words = re.findall(r'\br\S*', text.lower())
# \br where \b stands for starts with and 'r' is the condition
# \S stands for a string contains any non-whitespace character
# * any occurrences that match \S
r_words

['ran', 'relay', 'race', 'run']

### 5. Use a regular expression to find and substitute the letter "i" for the exclamation marks in the text below.

In [10]:
text = "Th!s !s a sentence w!th spec!al characters !n !t."

In [11]:
excl_marks = re.findall(r'\S*[!]\S*', text.lower()) #find occurrences
excl_marks

['th!s', '!s', 'w!th', 'spec!al', '!n', '!t.']

In [12]:
text_good = re.sub('[!]', 'i', text)
# new_string = re.sub(pattern, replace, string)

text_good

'This is a sentence with special characters in it.'

### 6. Use a regular expression to find and extract words longer than 4 characters in the text below.

In [13]:
text = "This sentence has words of varying lengths."

In [14]:
four_or_more = re.findall(r'\w{4,}', text)
# \w followed by any word character [a-b]
# {4,} word of lenght 4 or more
four_or_more

['This', 'sentence', 'words', 'varying', 'lengths']

### 7. Use a regular expression to find and extract all occurrences of the letter "b", some letter(s), and then the letter "t" in the sentence below. 
Ex. beat, bat & bot.

In [15]:
text = "I bet the robot couldn't beat the other bot with a bat, but instead it bit me."

In [19]:
r_to_b = re.findall(r'[a-zA-Z]*b[a-zA-Z]*t', text)
# [a-zA-Z]* any letter occurring zero or more times
# b the letter 'b' somewhere in the middle
# [a-zA-Z]* any letter occurring zero or more times
# t and the letter t later on

r_to_b

['bet', 'robot', 'beat', 'bot', 'bat', 'but', 'bit']

### 8. Use a regular expression to find and extract all words that contain either "ea" or "eo" in them.

In [20]:
text = "During many of the peaks and troughs of history, the people living it didn't fully realize what was unfolding. But we all know we're navigating breathtaking history: Nearly every day could be — maybe will be — a book."


In [24]:
eo_ea = re.findall(r'[a-zA-Z]*e[a,o][a-zA-Z]*', text)
# [a-zA-Z]* any letter occurring zero or more times
# e[a,o] the letter 'e' somewhere in the middle followed by "a" or "o"
# [a-zA-Z]* any letter occurring zero or more times

eo_ea

['peaks', 'people', 'realize', 'breathtaking', 'Nearly']

### 9. Use a regular expression to find and extract all the capitalized words in the text below individually.

In [27]:
text = "Teddy Roosevelt and Abraham Lincoln walk into a bar."

In [28]:
cap_words = re.findall(r'\b[A-Z][a-z]*', text)
# \b[A-Z] starting with a capital letter
# [a-z]* followed by any letter in lower case 0 or more times

cap_words

['Teddy', 'Roosevelt', 'Abraham', 'Lincoln']

### 10. Use a regular expression to find and extract all the sets of consecutive capitalized words in the text above.

In [35]:
cap_foll_words = re.findall(r'\b[A-Z][a-z]*(?:\s[A-Z][a-z]*)*', text)
# \b[A-Z] starting with a capital letter
# [a-z]* followed by any letter in lower case 0 or more times
# (?=\s[A-Z]) with space and uppercase letter following it
# [a-z]* followed by any letter in lower case 0 or more times
# last * defines this could happend consecutively more than 1 time

cap_foll_words

['Teddy Roosevelt', 'Abraham Lincoln']

### 11. Use a regular expression to find and extract all the quotes from the text below.

*Hint: This one is a little more complex than the single quote example in the lesson because there are multiple quotes in the text.*

In [55]:
text = 'Roosevelt says to Lincoln, "I will bet you $50 I can get the bartender to give me a free drink." Lincoln says, "I am in!"'


In [56]:
text_no_quotes = text.replace(r'"', '').replace(r"'", "")
text_no_quotes

'Roosevelt says to Lincoln, I will bet you $50 I can get the bartender to give me a free drink. Lincoln says, I am in!'

In [58]:
text_quotes = re.findall(r'"[a-z]*', text.lower())
text_quotes

['"i', '"', '"i', '"']

### 12. Use a regular expression to find and extract all the numbers from the text below.

In [67]:
text = "There were 30 students in the class. Of the 30 students, 14 were male and 16 were female. Only 10 students got A's on the exam."


In [69]:
text_numbers = re.findall(r'[0-9][0-9]*', text)
text_numbers

['30', '30', '14', '16', '10']

### 13. Use a regular expression to find and extract all the social security numbers from the text below.

In [70]:
text = """
Henry's social security number is 876-93-2289 and his phone number is (847)789-0984.
Darlene's social security number is 098-32-5295 and her phone number is (987)222-0901.
"""

In [120]:
ss_num = re.findall(r'(?<=security number is\s)\d+-*\d+-*\d+-*', text)
# (?<=security number is\s) whatever follows this and a white space (\s)
# \d+-* a number or more (\d+) followed by a '-'
# \d+-*\d+-*\d+-* repetition of the pattern

ss_num

['876-93-2289', '098-32-5295']

### 14. Use a regular expression to find and extract all the phone numbers from the text below.

In [122]:
phone_num = re.findall(r'(?<=phone number is\s).*[13]', text)
phone_num

['(987)222-0901']

### 15. Use a regular expression to find and extract all the formatted numbers (both social security and phone) from the text below.

In [124]:
phone_ss_num = re.findall(r'(?<=security number is\s)\d+-*\d+-*\d+-*|(?<=phone number is\s).*[13]', text)
phone_ss_num

['876-93-2289', '098-32-5295', '(987)222-0901']