In [1]:
import re

In [2]:
# Shakespeare
text = """
Ha! let me see her: out, alas! he's cold:
Her blood is settled, and her joints are stiff;
Life and these lips have long been separated:
Death lies on her like an untimely frost
Upon the sweetest flower of all the field.
"""

In [3]:
re.findall('her',text)

['her', 'her', 'her']

## 1. Special Symbols

In [4]:
# quick reference 
# \n - the new line 
# \t - tab character 
# \s - whitespace character 
# \S - Negation of white space  character 
# \w - word character ( A-Z,a-z,0-9,_)
# \W - Negation of word character 
# \b - word boundary (zero-width: matches a position, not a character)
# \d - digit character [0-9]
# \D - nondigit character [^0-9]

In [5]:
print(re.findall(r'\w+\W+\w+', text))
# The pattern is a raw string (r'...') so that \w and \W reach the regex
# engine unchanged; in a plain string they are invalid escape sequences
# and trigger a DeprecationWarning (SyntaxWarning from Python 3.12).
# Matches each pair of words separated by one or more non-word characters.
# Matched text is consumed and doesn't match again,
# that's why the combination 'let me' is not a matching substring.

['Ha! let', 'me see', 'her: out', 'alas! he', 's cold', 'Her blood', 'is settled', 'and her', 'joints are', 'stiff;\nLife', 'and these', 'lips have', 'long been', 'separated:\nDeath', 'lies on', 'her like', 'an untimely', 'frost\nUpon', 'the sweetest', 'flower of', 'all the']


In [6]:
# Raw string keeps \d intact for the regex engine ('\d' in a plain string is
# an invalid escape sequence). The text contains no digits, so nothing matches.
print(re.findall(r'\d', text))

[]


In [7]:
# searching all occurrences of a newline character followed by a tab character
print(re.findall('\n\t',text))
# no match because no line in the text starts with a tab character:
# every newline is followed directly by a word (or by the end of the string).

[]


In [8]:
# lets match the new line 
print(re.findall('\n',text))

['\n', '\n', '\n', '\n', '\n', '\n']


In [9]:
# Raw string so that \s and \w are passed to the regex engine unchanged
# (in a plain string they are invalid escape sequences).
print(re.findall(r'\s\w\w\s', text))
# matches two-character words that have a whitespace character on each side

[' me ', ' is ', ' on ', ' an ', ' of ']


## 2.Character Sets  

In [10]:
# [a-z] matches any character from a to z
# [0-9] matches any character from 0 to 9

In [11]:
re.findall('[a-e]','hello world')

['e', 'd']

In [12]:
# multiple character ranges 
pattern = '[a-eA-E0-4]'
re.findall(pattern,'hello WORLD 42!')

['e', 'D', '4', '2']

In [13]:
# negating the character sets
re.findall('[^a-e]+','hello world')
# + indicates one or many

['h', 'llo worl']

In [14]:
# puzzle 
def special_name(name):
    """Check whether ``name`` begins with a 'j...n' style prefix.

    The pattern is 'j', then one or more lowercase letters a-z, then 'n'.
    It is applied with ``match``, which anchors only at the start of the
    string, so any text after the matched prefix is ignored.

    Returns the ``re.Match`` object on success, otherwise ``None``.
    """
    name_pattern = re.compile('j[a-z]+n')
    return name_pattern.match(name)

In [15]:
special_name('chris') # no match

In [16]:
special_name('joan')

<re.Match object; span=(0, 4), match='joan'>

In [17]:
special_name('joan john') 
# matches the pattern at the beginning and returns a match object

<re.Match object; span=(0, 4), match='joan'>

In [18]:
# accessing the start and end positions of the match in the string 
m = re.match('h...o','hello world')
m.start(),m.end()

(0, 5)

In [19]:
'hello world'[0:5]

'hello'

In [20]:
# Shakespeare
text = """
Ha! let me see her: out, alas! he's cold:
Her blood is settled, and her joints are stiff;
Life and these lips have lips long been separated:
Death lies on her like an untimely frost
Upon the sweetest flower of all the field.
"""

In [21]:
m = re.match('(.|\n)*lips',text)
text[m.start():m.end()]

"\nHa! let me see her: out, alas! he's cold:\nHer blood is settled, and her joints are stiff;\nLife and these lips have lips"

In [22]:
m = re.match('.*lips',text,flags = re.DOTALL)
# re.DOTALL flag ensures that the dot matches all characters,
# including the new line character. 
text[m.start():m.end()]

"\nHa! let me see her: out, alas! he's cold:\nHer blood is settled, and her joints are stiff;\nLife and these lips have lips"

In [23]:
# re.match(pattern,string) returns a match object 
# re.findall(pattern,string) returns all matches in the string. 

In [24]:
text = 'Python is superior to Python'
re.match('Py...n',text)

<re.Match object; span=(0, 6), match='Python'>

In [25]:
re.findall('Py...n',text)

['Python', 'Python']

In [26]:
# re.fullmatch() returns a match object if the pattern matches
# the whole string. 

In [27]:
text = '''
Call me Ishmael. Some years ago--never mind how long
    precisely
--having little or no money in my purse, and nothing
    particular
to interest me on shore, I thought I would sail about
    a little
and see the watery part of the world.
'''

In [28]:
re.fullmatch('Call(.|\n)*',text)

In [29]:
re.fullmatch('\nCall(.|\n)*',text)

<re.Match object; span=(0, 241), match='\nCall me Ishmael. Some years ago--never mind how>

In [30]:
text = 'More with less'
re.match('More.*',text)

<re.Match object; span=(0, 14), match='More with less'>

In [31]:
re.fullmatch('More.*',text)

<re.Match object; span=(0, 14), match='More with less'>

In [32]:
re.match('More',text)

<re.Match object; span=(0, 4), match='More'>

In [33]:
re.fullmatch('More',text)

## Dot Regex 

In [34]:
# dot matches all characters except the newline character

In [35]:
text = """But then I saw no harm, and then I
    heard
Each syllable that breath made up between them.
"""

In [36]:
re.findall('B..',text)

['But']

In [37]:
re.findall('heard.Each',text)

[]

In [38]:
re.findall('heard\nEach',text)

['heard\nEach']

## Flags

In [39]:
# re.ASCII = for matching only ASCII characters 
# re.DEBUG = for printing some useful information to the shell 
# re.IGNORECASE = for case-insensitive matching 
# re.MULTILINE = makes ^ and $ match at the beginning and end of each line 
# re.DOTALL = matching all characters including the new line character
# re.VERBOSE = improving the readability 

In [40]:
text = '''
    Ha! let me see her: out, alas! he's cold:
    Her blood is settled, and her joints are stiff;
    Life and these lips have long been separated:
    Death lies on her like an untimely frost
    Upon the sweetest flower of all the field.
'''

In [41]:
re.findall('HER',text,flags = re.IGNORECASE)

['her', 'Her', 'her', 'her']

In [42]:
# re.VERBOSE ignores the whitespace and the '# ignored' comment inside the pattern.
# Flags are bit flags, so they are combined with bitwise OR (|) rather than +.
re.findall('  HER # ignored', text, flags=re.IGNORECASE | re.VERBOSE)

['her', 'Her', 'her', 'her']

## Matching newline 

In [43]:
# dot regex . matches a single arbitrary character except the newline

In [44]:
s = '''hello
python
'''
re.findall('o.p',s)

[]

In [45]:
re.findall('o.p',s,flags = re.DOTALL)

['o\np']

In [46]:
# NOTE: inside a character set the dot loses its wildcard meaning, so [.\n]
# matches only a literal '.' or a newline -- here it is the '\n' that matches.
re.findall('o[.\n]p',s)

['o\np']

In [47]:
# To match a literal . (period) character, escape it with a backslash: \.
# The pattern is a raw string so the backslashes reach the regex engine
# unchanged ('\w' and '\.' are invalid escape sequences in a plain string).
text = 'Python. is a great Language.'
re.findall(r'\w+\.', text)

['Python.', 'Language.']

In [48]:
# a literal period followed by one whitespace character (raw string pattern)
re.findall(r'\.\s', text)

['. ']

In [49]:
# an 'n' or a 't' immediately followed by a literal period (raw string pattern)
re.findall(r'[nt]\.', text)

['n.']

In [50]:
text = 'Speedy Gonzalez'
matches = re.findall('.',text)
result= len(matches) == len(text)
result

True

## The Asterisk * Quantifier

In [51]:
# A* matches zero or more occurrences of A

In [54]:
text = 'jupyter for f fast and fun python learning'
re.findall('f.* ',text)

['for f fast and fun python ']

In [55]:
re.findall('f[a-z]*',text)

['for', 'f', 'fast', 'fun']

In [56]:
text = 'Python is ***great***'
# \* matches a literal asterisk; the raw string keeps the backslash intact
# ('\*' in a plain string is an invalid escape sequence).
re.findall(r'\*', text)

['*', '*', '*', '*', '*', '*']

In [57]:
# zero or more literal asterisks: also yields an empty match at every
# position where no asterisk starts (raw string pattern)
re.findall(r'\**', text)

['', '', '', '', '', '', '', '', '', '', '***', '', '', '', '', '', '***', '']

In [59]:
# one or more literal asterisks: no empty matches (raw string pattern)
re.findall(r'\*+', text)

['***', '***']

In [63]:
famous_tweet = '''
I'm going give someone random who retweets this
tweet $10,000 because it's my birthday and I feel
like being nice (you have to be following me so
I can dm you the code if you win)
@MrBeastYT
#Twitter
'''
user = re.findall('@.*', famous_tweet)
print(user)

['@MrBeastYT']


## The plus + operator 

In [67]:
website ="""
<!doctype html> 
<html>
    <head>
        <meta charset = 'utf-8'>
    </head>
    <body>
        hello world
    </body>
</html>
"""

re.findall('<.+>', website)

['<!doctype html>',
 '<html>',
 '<head>',
 "<meta charset = 'utf-8'>",
 '</head>',
 '<body>',
 '</body>',
 '</html>']

## Look Behind 