# Searching for a target

In [2]:
import re

In [3]:
# Sample sentence that the lookups in this section run against.
text = 'HKU Business School'

# re.search returns a Match object for the first hit, or None when the
# pattern does not occur anywhere in the string.
findHKU, findMSBA = (re.search(needle, text) for needle in ('HKU', 'MSBA'))

# Show both results, one per line.
print(findHKU, findMSBA, sep='\n')

<re.Match object; span=(0, 3), match='HKU'>
None


In [4]:
# Inspect the Match object, one item per line: its type, the (start, end)
# span of the hit, each end of the span on its own, and the searched string.
print(
    type(findHKU),
    findHKU.span(),
    findHKU.start(),
    findHKU.end(),
    findHKU.string,
    sep='\n',
)

<class 're.Match'>
(0, 3)
0
3
HKU Business School


In [5]:
# A Match object is truthy and None is falsy, so a search result can be
# used directly as a condition.
print("Yes, HKU" if findHKU else "No, HKU")
print("Yes, MSBA" if findMSBA else "No, MSBA")

Yes, HKU
No, MSBA


# Using metacharacters and sets

In [6]:
# text[-6:] is the last six characters of text
re.findall('.', text[-6:])
# . matches any one character except a newline, so every character is returned on its own

['S', 'c', 'h', 'o', 'o', 'l']

In [39]:
re.findall('.+', text[-6:])
# + repeats the preceding . one or more times and takes as many characters as possible
# (greedy matching); it still stops at a newline

['School']

In [38]:
re.findall('.+?', text[-6:])
# a ? placed after + turns off greedy matching: each match stops after a single character

['S', 'c', 'h', 'o', 'o', 'l']

In [7]:
# Multi-line sample: three "From"-style lines that contain e-mail addresses,
# plus a last line that contains an @ but is not an e-mail address.
text2 = '''From chao.ding@hku.hk end
From: eric.wong@hku.hk over
from michael.chau4@hku.hk done
exam@friday'''

text2    # displaying the value shows every line break as \n

'From chao.ding@hku.hk end\nFrom: eric.wong@hku.hk over\nfrom michael.chau4@hku.hk done\nexam@friday'

In [8]:
re.findall('F', text2)
# extract every "F" in the text; matching is case-sensitive, so the lowercase "from" is not counted

['F', 'F']

In [9]:
re.findall('^F', text2)
# ^ anchors the match at the beginning of the text, so only the first "F" is extracted

['F']

In [10]:
re.findall('e', text2)
# extract every "e" in the text, wherever it occurs

['e', 'e', 'e', 'e', 'e', 'e']

In [11]:
re.findall('e$', text2)
# $ anchors the match at the end of the text; text2 ends with "y", so nothing is found

[]

In [12]:
re.findall('From|from', text2)
# | means "or": extract either From or from

['From', 'From', 'from']

In [13]:
re.findall('[Ff]rom', text2)
# the set [Ff] matches one character that is either F or f, so this finds the same words as 'From|from'

['From', 'From', 'from']

In [14]:
pattern = re.compile("[a-z.]+@[a-z.]+")
# first attempt at a pattern that extracts e-mail addresses
# inside a set, a dot matches a literal dot (it is not the "any character" wildcard)

pattern.findall(text2)
# michael.chau4@hku.hk is missed because the digit 4 is not in the set [a-z.]
# exam@friday is matched although it is not an e-mail address, which is not desired

['chao.ding@hku.hk', 'eric.wong@hku.hk', 'exam@friday']

In [15]:
pattern = re.compile("[a-z0-9.]+@[a-z]+[.][a-z]+")
# add the range 0-9 to the set in front of @ so that digits in the name are accepted
# after @, require letters, one literal dot, then letters, which rules out exam@friday

pattern.findall(text2)
# now all three e-mail addresses are found

['chao.ding@hku.hk', 'eric.wong@hku.hk', 'michael.chau4@hku.hk']

In [16]:
# to extract only the part before @, wrap it in a pair of parentheses (a group):
# when the pattern contains one group, findall returns the group's text instead of the whole match

pattern = re.compile("([a-z0-9.]+)@[a-z]+[.][a-z]+")
pattern.findall(text2)

['chao.ding', 'eric.wong', 'michael.chau4']

In [17]:
text3 = 'My 2 favorite numbers are 19 and 42'
re.findall('[0-9]', text3)
# [0-9] matches exactly one digit, so multi-digit numbers are split into single digits

['2', '1', '9', '4', '2']

In [18]:
re.findall('[0-9]+', text3)
# + is greedy: each match takes as many consecutive digits as possible

['2', '19', '42']

In [19]:
re.findall('[0-9]+?', text3)
# adding ? after + turns greedy matching into non-greedy: each match stops after one digit

['2', '1', '9', '4', '2']

In [20]:
re.findall('[0-9]{2}', text3)
# use {} to specify the exact number of occurrences (here: exactly two digits)

['19', '42']

In [167]:
re.findall('[^0-9 ]{3,5}', text3)
# ^ at the start of a set negates it: [^0-9 ] matches any character that is neither a digit nor a space
# {3,5} matches runs of 3 to 5 such characters
# note there is no space after the comma in {3,5}
# {} is also greedy matching

['favor', 'ite', 'numbe', 'are', 'and']

In [40]:
re.findall('[^0-9 ]{3,}', text3)
# {3,} has no upper bound: as many characters as possible, but at least three

['favorite', 'numbers', 'are', 'and']

# Using escape characters

In [21]:
text4 = 'From <chao.ding@hku.hk> Assignment 1'
# Raw string (r'...') so that \S reaches the regex engine unchanged; in a
# normal string literal \S is an invalid escape sequence and Python warns about it.
# \S matches any non-whitespace character; the parentheses capture the
# address between the angle brackets.
pattern = re.compile(r'<(\S+@\S+)>')
matches = pattern.findall(text4)
print(matches)

['chao.ding@hku.hk']


In [22]:
re.findall(r'@(\S+)>', text4)[0]
# raw string keeps the backslash of \S intact for the regex engine
# findall returns a list; the index [0] picks the first captured string

'hku.hk'

In [23]:
re.findall(r'\w+', text4)
# raw string keeps the backslash of \w intact for the regex engine
# \w+ matches runs of word characters:
# letters (a-z and A-Z), digits 0-9, and the underscore _ character

['From', 'chao', 'ding', 'hku', 'hk', 'Assignment', '1']

In [24]:
re.findall(r'\w+\.\w+', text4)
# raw string keeps the backslashes intact for the regex engine
# \. matches a literal dot, the same as [.]

['chao.ding', 'hku.hk']

In [25]:
text5 = 'We just received $10.88 for 20 cookies.'
re.findall(r'\d+', text5)
# \d matches one digit, so \d+ extracts every run of digits
# the dot in 10.88 is not a digit, which is why 10 and 88 come out separately
# raw string keeps the backslash of \d intact for the regex engine

['10', '88', '20']

In [26]:
re.findall(r'\$\d+', text5)[0]
# \$ matches a literal dollar sign (an unescaped $ is the end-of-text anchor)
# [0] picks the first match from the list returned by findall
# raw string keeps the backslashes intact for the regex engine

'$10'

In [27]:
# \b: word boundary
# \b must be written in a raw string (r'...'):
# in a normal string literal, '\b' is the backspace character

text6 = "Lis1bon is2 an oasis3"
re.findall(r'is[0-9]', text6)
# without \b, "is" followed by a digit is found anywhere, even inside a word

['is1', 'is2', 'is3']

In [28]:
re.findall(r'\bis[0-9]', text6)
# \b in front: "is" must start a word, which excludes Lis1bon and oasis3

['is2']

In [29]:
re.findall(r'is[0-9]\b', text6)
# \b at the end: the digit must end a word, which excludes Lis1bon

['is2', 'is3']

In [30]:
re.findall(r'\bis[0-9]\b', text6)
# \b on both sides: only the standalone word is2 matches

['is2']

# Splitting strings & Substituting substrings

In [31]:
text7 = 'The University of Hong Kong (HKU)'
re.split(r'\s', text7, maxsplit=2)
# maxsplit=2: make at most 2 splits; the rest of the string is returned as the last element
# maxsplit is passed by keyword because passing it positionally is deprecated in recent Python versions
# raw string keeps the backslash of \s intact for the regex engine

['The', 'University', 'of Hong Kong (HKU)']

In [32]:
re.split(r'\W+', text7)
# \W matches any non-word character, so the text is split on runs of spaces and parentheses
# the trailing '' appears because the text ends with a separator, the closing parenthesis
# raw string keeps the backslash of \W intact for the regex engine

['The', 'University', 'of', 'Hong', 'Kong', 'HKU', '']

In [33]:
re.sub(r'\(|\)', '--', text7)
# replace each parenthesis with --
# \( and \) match literal parentheses; the raw string keeps the backslashes intact

'The University of Hong Kong --HKU--'

In [34]:
re.sub(r'\s\(.+\)', '', text7)
# remove the content in the parentheses, together with the whitespace character in front of it
# raw string keeps the backslashes intact for the regex engine

'The University of Hong Kong'

In [35]:
re.sub(r'\s[(].+[)]', '', text7)
# same removal written with sets: inside [], ( and ) are literal and need no backslash
# raw string keeps the backslash of \s intact for the regex engine

'The University of Hong Kong'

In [36]:
# another way to remove the content in the parentheses: using string slicing
# find('(') gives the index of the opening parenthesis; the -1 also drops the character in front of it
# caveat: find returns -1 when there is no '(', and the slice would then cut off the last two characters
text7[0 : text7.find('(')-1]

'The University of Hong Kong'

# About flags

In [5]:
re.findall('[a-z]+', text7, flags = re.I)
# re.I (IGNORECASE): [a-z] also matches uppercase letters

['The', 'University', 'of', 'Hong', 'Kong', 'HKU']

In [21]:
# to use multiple flags, combine them with |
re.findall(r"""[a-z]{2}     # {2}: exactly two letters
               \s           # \s: one whitespace character
               [a-z]{4}     # {4}: exactly four letters""",
          text7, flags=re.I | re.X)

#  re.X (VERBOSE): whitespace in the pattern is ignored and # starts a comment,
#  so the pattern can be laid out and explained over several lines.
#  The pattern is a raw string so that \s reaches the regex engine unchanged.

['he Univ', 'of Hong']

In [26]:
s = """Regex
Flags"""

re.findall(r'^\w+', s, flags=re.M)
# re.M (MULTILINE): ^ matches at the start of every line, not only at the start of the text
# raw string keeps the backslash of \w intact; the flag is passed by keyword for clarity

['Regex', 'Flags']