In [0]:
# https://docs.python.org/3/howto/regex.html

# "Regular expressions (called REs, or regexes, or regex patterns) are essentially a tiny, highly specialized programming 
# language embedded inside Python and made available through the re module. Using this little language, you specify the 
# rules for the set of possible strings that you want to match; this set might contain English sentences, or e-mail addresses, 
# or TeX commands, or anything you like. You can then ask questions such as “Does this string match the pattern?”, or 
# “Is there a match for the pattern anywhere in this string?”. You can also use REs to modify a string or to split it apart in 
# various ways."

In [0]:
# Note: 
#    - alphanumeric here implies 0-9, a-z, A-Z, or _
#    - a word is defined as a sequence of alphanumeric characters

# metacharacters
#    [ ]      matches character class specified within the square brackets
#             - and ^ have special meaning within character class
#             $ does not have special meaning within character class
#     -      when used inside a character class set, implies range of characters
#    ^      when used as first character inside a character class set, implies match of complementing character class set
#    \       is used either to strip a metacharacter of its special meaning, or to introduce a special sequence
#    .        matches anything except a newline character
#    *       previous character is matched 0 or more times
#    +      previous character is matched 1 or more times
#    ?       previous character is matched 0 or 1 times
#    { }     {m,n} means there must be at least m repetitions, and at most n
#           {0,} is the same as *, {1,} is equivalent to +, and {0,1} is the same as ?
#    ^     when used outside a character class, matches at the beginning of a line
#    \A    matches only at the start of a string (equivalent to ^ in non-MULTILINE mode)
#    $     matches at the end of a line
#    \Z    matches only at the end of a string (unlike $, never just before a trailing newline)
#    \b    matches only at the beginning or end of a word (that is, at a word boundary)
#    \B    matches only when not at the beginning or end of a word (that is, not at a word boundary)
#    |      matches the expression on either side of the | operator
#    ( )    used to group together the expressions contained inside;
#           you can then repeat the contents of a group with a repeating qualifier, such as *, +, ?, or {m,n}

# special sequences (all of these sequences can be included in a character class)
#    \d    matches any digit character; equivalent to [0-9]
#    \D    matches any non-digit character; equivalent to [^0-9]
#    \s     matches any whitespace character; equivalent to [ \t\n\r\f\v] => space, tab, newline, carriage return, form feed, vertical tab
#    \S     matches any non-whitespace character; equivalent to [^ \t\n\r\f\v]
#    \w    matches any alphanumeric character; equivalent to [0-9a-zA-Z_]
#    \W    matches any non-alphanumeric character; equivalent to [^0-9a-zA-Z_]

# Raw Strings
# Regular expressions use the backslash character ('\') to indicate special forms or to allow 
#    special characters to be used without invoking their special meaning. 
# This conflicts with Python’s usage of the same character for the same purpose in string literals.
# The solution is to use Python’s raw string notation for regular expressions.
# This is done by prefixing the pattern's string literal with r, as in r"..."

In [0]:
# Regular Expressions are compiled into pattern objects:
#    import re
#    regex = re.compile(pattern, options)
#        - pattern: created using metacharacters and special sequences
#        - options: can be re.IGNORECASE, re.VERBOSE, etc

# Once a pattern object is created, you can use one of several methods on it to look for matches
# match(): determines if the pattern matches at the beginning of the string
# search(): determines if the pattern matches at any location of the string
# findall(): find all substrings where pattern matches, and return them as a list
# finditer(): find all substrings where pattern matches, and return them as an iterator

# Once a match object is created, you can query it for information about the matching string
# group(): returns string matched by the pattern
# start(): return starting position of the match
# end(): return ending position of the match
# span(): return a tuple containing (start, end) position of the match

# Once a pattern object is created, you can also use the following methods to modify strings
# split(string[, maxsplit=0]): 
#               split the string into a list, splitting wherever the pattern matches 
#               if maxsplit is non-zero, at most maxsplit splits are performed (otherwise all splits are done)
# sub(replacement, string[, count=0]): 
#               find all substrings where the pattern matches, and replace them with a different string
#               if count is non-zero, at most count replacements are performed (otherwise all replacements are done)
# subn(): same as sub, but returns new string and number of replacements

In [0]:
import re
import string

In [0]:
# Sample sentence that the substitution demo operates on.
oldstr = "Hey! Are we still on for lunch today at 11am?"

# The character class [a!1] matches exactly one character: 'a', '!' or '1'.
regex = re.compile(r"[a!1]")

# Mask every character the pattern matches with '#'.
newstr = re.sub(regex, '#', oldstr)

# Show the sentence before and after masking, one per line.
print(oldstr, newstr, sep="\n")

Hey! Are we still on for lunch today at 11am?
Hey# Are we still on for lunch tod#y #t ###m?


In [0]:
# Sample sentence that the substitution demo operates on.
oldstr = "Hey! Are we still on for lunch today at 11am?"

# Inside [...], a-z is a range: any single lowercase ASCII letter.
regex = re.compile(r"[a-z]")

# Mask every lowercase letter with '#'; uppercase letters, digits and
# punctuation are left untouched.
newstr = re.sub(regex, '#', oldstr)

print(oldstr, newstr, sep="\n")

Hey! Are we still on for lunch today at 11am?
H##! A## ## ##### ## ### ##### ##### ## 11##?


In [0]:
# Sample sentence that the substitution demo operates on.
oldstr = "Hey! Are we still on for lunch today at 11am?"

# Two ranges in one class: any lowercase OR uppercase ASCII letter.
regex = re.compile(r"[a-zA-Z]")

# Mask every letter with '#'; digits, spaces and punctuation survive.
newstr = re.sub(regex, '#', oldstr)

print(oldstr, newstr, sep="\n")

Hey! Are we still on for lunch today at 11am?
###! ### ## ##### ## ### ##### ##### ## 11##?


In [0]:
# Sample sentence that the substitution demo operates on.
oldstr = "Hey! Are we still on for lunch today at 11am?"

# Same lowercase range as before, but re.IGNORECASE makes the match
# case-insensitive, so uppercase letters are matched as well.
regex = re.compile(r"[a-z]", flags=re.IGNORECASE)

# Mask every matched letter with '#'.
newstr = re.sub(regex, '#', oldstr)

print(oldstr, newstr, sep="\n")

Hey! Are we still on for lunch today at 11am?
###! ### ## ##### ## ### ##### ##### ## 11##?


In [0]:
# Sample sentence that the substitution demo operates on.
oldstr = "Hey! Are we still on for lunch today at 11am?"

# \d is the digit shorthand; it may be used inside a character class.
# For this ASCII text, re.compile(r"[0-9]") would behave the same way.
regex = re.compile(r"[\d]")

# Mask every digit with '#'.
newstr = re.sub(regex, '#', oldstr)

print(oldstr, newstr, sep="\n")

Hey! Are we still on for lunch today at 11am?
Hey! Are we still on for lunch today at ##am?


In [0]:
# Sample sentence that the substitution demo operates on.
oldstr = "Hey! Are we still on for lunch today at 11am?"

# A leading ^ inside [...] complements the class: this matches any single
# character that is NOT an ASCII letter.
regex = re.compile(r"[^a-zA-Z]")

# Mask every non-letter (spaces, digits, punctuation) with '#'.
newstr = re.sub(regex, '#', oldstr)

print(oldstr, newstr, sep="\n")

Hey! Are we still on for lunch today at 11am?
Hey##Are#we#still#on#for#lunch#today#at###am#


In [0]:
oldstr = "Hey! Are we still on for lunch today at 11am?"

# Build a negated character class: a leading ^ inside [...] complements the set,
# so [^\d] matches any single character that is NOT a digit.
# Then replace every matched character with '#'.
regex = re.compile(r"[^\d]")
# Equivalent spelling for this ASCII text: re.compile(r"[^0-9]").
# Note the ^ must sit INSIDE the brackets; r"^[0-9]" would instead anchor a
# single digit to the start of the string and match nothing here.
newstr = regex.sub('#', oldstr)
print(oldstr)
print(newstr)

Hey! Are we still on for lunch today at 11am?
########################################11###


In [0]:
# Sample sentence that the substitution demo operates on.
oldstr = "Hey! Are we still on for lunch today at 11am?"

# \s is the whitespace shorthand (space, tab, newline, ...).
regex = re.compile(r"[\s]")

# Mask every whitespace character with '#'.
newstr = re.sub(regex, '#', oldstr)

print(oldstr, newstr, sep="\n")

Hey! Are we still on for lunch today at 11am?
Hey!#Are#we#still#on#for#lunch#today#at#11am?


In [0]:
oldstr = "Why Lisa, why, WHY"
print(oldstr)

# Unanchored: case-insensitive "why" wherever it occurs in the string.
regex1 = re.compile(r"why", flags=re.IGNORECASE)
newstr1 = re.sub(regex1, '#', oldstr)
print(newstr1)

# Anchored with ^: case-insensitive "why" only at the beginning of the string.
regex2 = re.compile(r"^why", flags=re.IGNORECASE)
newstr2 = re.sub(regex2, '#', oldstr)
print(newstr2)

# Anchored with $: case-insensitive "why" only at the end of the string.
regex3 = re.compile(r"why$", flags=re.IGNORECASE)
newstr3 = re.sub(regex3, '#', oldstr)
print(newstr3)

Why Lisa, why, WHY
# Lisa, #, #
# Lisa, why, WHY
Why Lisa, why, #


In [0]:
oldstr = "the cat will catch-up with you in muscat"
print(oldstr)

# No boundaries: "cat" anywhere, including inside longer words.
regex1 = re.compile(r"cat")
newstr1 = re.sub(regex1, '#', oldstr)
print(newstr1)

# \b on both sides: "cat" only as a whole word.
regex2 = re.compile(r"\bcat\b")
newstr2 = re.sub(regex2, '#', oldstr)
print(newstr2)

# Leading \b only: "cat" at the beginning of a word.
regex3 = re.compile(r"\bcat")
newstr3 = re.sub(regex3, '#', oldstr)
print(newstr3)

# Trailing \b only: "cat" at the end of a word.
regex4 = re.compile(r"cat\b")
newstr4 = re.sub(regex4, '#', oldstr)
print(newstr4)

the cat will catch-up with you in muscat
the # will #ch-up with you in mus#
the # will catch-up with you in muscat
the # will #ch-up with you in muscat
the # will catch-up with you in mus#


In [0]:
# Exercise 1:
#    - locate file names shaped like base.extension
#    - display the names that were found

fnamestr = "The two files are foo1.bar and foo2.bar. There are no other files."
print(fnamestr)

# A whole word, a literal dot, then another whole word.
# [.] keeps the dot literal instead of "any character".
regex = re.compile(r"\b\w+[.]\w+\b")

# The pattern has no capture groups, so each match's full text is the file name.
fnames = [hit.group() for hit in regex.finditer(fnamestr)]
print(fnames)

The two files are foo1.bar and foo2.bar. There are no other files.
['foo1.bar', 'foo2.bar']


In [0]:
# Exercise 2:
#     - find punctuation characters and digits
#     - replace them with the empty string (i.e. delete them)

oldstr = "Hey! Are we still on for lunch today at 11am?"
print(oldstr)

# string.punctuation contains characters that are special inside [...]
# (backslash, ], ^, -).  Interpolated raw, the "\]" pair is read as an escaped
# "]", so a literal backslash is never matched, and ",-." is read as a range.
# re.escape() makes every character a literal member of the class.
punct_class = re.escape(string.punctuation)

# Step 1: strip punctuation only.
regex1 = re.compile(r"[%s]" % punct_class)
newstr1 = regex1.sub('', oldstr)
print(newstr1)

# Step 2: strip punctuation and digits (string.digits needs no escaping).
regex1 = re.compile(r"[%s%s]" % (punct_class, string.digits))
newstr1 = regex1.sub('', oldstr)
print(newstr1)

Hey! Are we still on for lunch today at 11am?
Hey Are we still on for lunch today at 11am
Hey Are we still on for lunch today at am
