# To open this file in Google Colab add 'tocolab' just after github in the URL
## https://github.com/...........
## https://githubtocolab.com/...........

In [202]:
#############################################
############ REGULAR EXPRESSIONS ############
#############################################

In [6]:
### METACHARACTER
#
#  .       - Any Character Except New Line
#  \d      - Digit (0-9)
#  \D      - Not a Digit (0-9)
#  \w      - Word Character (a-z, A-Z, 0-9, _)
#  \W      - Not a Word Character
#  \s      - Whitespace (space, tab, newline)
#  \S      - Not Whitespace (space, tab, newline)
#
#  \b      - Word Boundary
#  \B      - Not a Word Boundary
#  ^       - Beginning of a String
#  $       - End of a String
#
#  []      - Matches Characters in brackets
#  [^ ]    - Matches Characters NOT in brackets
#  |       - Either Or
#  ( )     - Group


### QUANTIFIER
#
#  *       - 0 or More
#  +       - 1 or More
#  ?       - 0 or One
#  {n}     - Exact Number
#  {min,}     - Range of Numbers (Minimum Defined Only)
#  {,max}     - Range of Numbers (Maximum Defined Only)
#  {min,max}  - Range of Numbers (Minimum, Maximum)

In [9]:
# Import library for regular expression

import re

In [7]:
# Search for a literal substring without regular expressions.
# The `in` operator only answers True/False; it does not say where the text occurs.

text = "The phone number of the agent is 408-555-1234. Call soon!"
"408-555-1234" in text

True

In [10]:
# Search for a pattern with re.search: instead of a plain True/False it returns
# a Match object that records where the match is (span) and which text matched.

text = "The phone number of the agent is 207-123-1234. Call soon!"
pattern = 'phone'
re.search(pattern, text)

<re.Match object; span=(4, 9), match='phone'>

In [13]:
# Store the Match object in a variable so its position can be inspected:
# span() gives (start, end); start() and end() give each index on its own.

my_match = re.search(pattern, text)
print(f"Pattern's starting and ending position: {my_match.span()}")
print(f"Pattern's starting position: {my_match.start()}")
print(f"Pattern's ending position: {my_match.end()}")

Pattern's starting and ending position: (4, 9)
Pattern's starting position: 4
Pattern's ending position: 9


In [14]:
# When the pattern occurs more than once, re.search reports only the first occurrence

text = 'My phone is a new phone.'
match = re.search('phone', text)
match.span()

(3, 8)

In [18]:
# Store all matches in a variable: re.findall returns every occurrence of the
# pattern as a list of the matched strings

text = 'My phone is a new phone.'
all_matches = re.findall('phone', text)
print(f'Matches are stored in a list: {all_matches}')
print(f'Length of list: {len(all_matches)}')

Matches are stored in a list: ['phone', 'phone']
Length of list: 2


In [19]:
# Find the position of all matches.
# re.finditer yields one Match object per occurrence, so each span is available;
# enumerate numbers the matches starting from 1.

text = 'My phone is a new phone.'

for count, match in enumerate(re.finditer('phone', text), start=1):
    print(f'Match number {count}: {match.span()}')

Match number 1: (3, 8)
Match number 2: (18, 23)


In [128]:
# Create a string


In [22]:
# Create a pattern for a phone number written as ddd-ddd-dddd,
# spelling out one \d for every digit
# Check the video for the references

text = 'My phone number is 207-123-1234.'
pattern = r'\d\d\d-\d\d\d-\d\d\d\d'
phone_number = re.search(pattern, text)
print(phone_number)  # the whole Match object (span and matched text)
print(phone_number.group())  # only the matched text

<re.Match object; span=(19, 31), match='207-123-1234'>
207-123-1234


In [130]:
# Match the pattern with the text


In [131]:
# Print the phone number


<re.Match object; span=(19, 31), match='207-123-1234'>

In [132]:
# Group only the matching object
# Similar to group by


'207-123-1234'

In [133]:
# Quantifiers for the regular expression: \d{3} means "exactly three digits",
# so this pattern is a shorter way to write \d\d\d-\d\d\d-\d\d\d\d
# Check the video for more references
pattern = r"\d{3}-\d{3}-\d{4}"

In [134]:
# Match the pattern with the text (both defined in earlier cells);
# re.search returns a Match object for the first occurrence
phone_number = re.search(pattern, text)

In [135]:
# Return only the matched text from the Match object.
# group() with no argument gives the whole match; it is not related to a
# SQL/pandas-style "group by".
phone_number.group()

'207-123-1234'

In [25]:
############################################
###### REGULAR EXPRESSIONS - PART TWO ######
############################################

In [143]:
# Group the pattern using parentheses: each ( ) pair is a numbered group,
# here group 1 = first three digits, group 2 = next three, group 3 = last four
pattern = r"(\d{3})-(\d{3})-(\d{4})"

In [144]:
# Match the grouped pattern with the text (both defined in earlier cells)
my_match = re.search(pattern, text)

In [145]:
# Return the whole matched text from the Match object.
# group() with no argument ignores the parenthesised groups and gives the full
# match; it is not related to a SQL/pandas-style "group by".
my_match.group()

'207-123-1234'

In [146]:
# Retrieve the first parenthesised group of the match (groups are numbered from 1)
# For example, to find the area code only
my_match.group(1)

'207'

In [151]:
# Or operator (|) in regular expression: the pattern matches if either
# alternative is found; in this sentence the match is 'woman'
re.search(r"man|woman", "He is a good woman.")

<re.Match object; span=(13, 18), match='woman'>

In [159]:
# Wildcard in regular expression: "." stands for any single character except a
# newline, so ".at" matches exactly one character followed by "at".
# That is why "splat" contributes only "lat".
re.findall(r".at", "The cat in the hat sat splat")

['cat', 'hat', 'sat', 'lat']

In [160]:
# "Starts with" in regular expression: ^ anchors the pattern to the beginning
# of the string, so this finds a digit only when the string starts with one
re.findall(r"^\d", "1 is the loneliest number")

['1']

In [164]:
# "Ends with" in regular expression: $ anchors the pattern to the end of the
# string, so this finds a digit only when the string ends with one
re.findall(r"\d$", "This line ends with the number 2")

['2']

In [182]:
# Exclude characters with a negated set: [^\d]+ matches runs of one or more
# characters that are NOT digits, so the numbers are left out of the result
phrase = "There are 3 numbers 34 inside 5 this sentence"
re.findall(r"[^\d]+", phrase)

['There are ', ' numbers ', ' inside ', ' this sentence']

In [171]:
# Exclude punctuation from the sentence: [^.!? ]+ matches runs of characters
# that are not ".", "!", "?" or a space, which yields the individual words
test_phrase = "This is a string! But it has punctuation. How to remove it?"
re.findall(r"[^.!? ]+", test_phrase)

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctuation',
 'How',
 'to',
 'remove',
 'it']

In [175]:
# Join the list of words back into a single string, separated by spaces
my_list = re.findall(r"[^.!? ]+", test_phrase)
" ".join(my_list)

'This is a string But it has punctuation How to remove it'

In [176]:
# Find hyphenated words. "+" is a quantifier meaning "one or more" (it does not
# group), so the pattern reads: one or more word characters, a hyphen, then one
# or more word characters.
text = "Only find the hyphen-words. Were are the long-ish dash words?"
re.findall(r"[\w]+-[\w]+", text)

['hyphen-words', 'long-ish']