# CH07 模式匹配与正则表达式


- ?匹配零次或一次前面的分组。
- *匹配零次或多次前面的分组。
- +匹配一次或多次前面的分组。
- {n}匹配 n 次前面的分组。
- {n,}匹配 n 次或更多前面的分组。
- {,m}匹配零次到 m 次前面的分组。
- {n,m}匹配至少 n 次、至多 m 次前面的分组。
- {n,m}?或*?或+?对前面的分组进行非贪心匹配。
- ^spam 意味着字符串必须以 spam 开始。
- spam$意味着字符串必须以 spam 结束。
- .匹配所有字符，换行符除外。
- \d、\w 和\s 分别匹配数字、单词和空格。
- \D、\W 和\S 分别匹配除数字、单词和空格外的所有字符。
- [abc]匹配方括号内的任意字符（诸如 a、b 或 c）。
- [^abc]匹配不在方括号内的任意字符。

In [2]:
import re

# Prefixing the opening quote with r marks the literal as a raw string, so
# the backslashes reach the regex engine as written instead of being treated
# as string escape sequences.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
# search() yields a Match object; group() gives back the matched text.
mo = phoneNumRegex.search('My number is 415-555-4242.')
print(f'Phone number found: {mo.group()}')


Phone number found: 415-555-4242


In [7]:
# Grouping with parentheses
# Group 1 captures the area code, group 2 the remainder of the number.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('My number is 415-555-4242.')
# group() accepts several indexes at once and returns them as a tuple;
# index 0 (like calling group() with no argument) is the entire match.
print(*mo.group(1, 2, 0), sep='\n')
print(mo.group())


# groups() returns every captured group together, ready for unpacking.
allGroups = mo.groups()
print(allGroups)
areaCode, mainNumber = allGroups
print(areaCode, mainNumber, sep='\n')


# Parentheses that are part of the text itself are escaped as \( and \).
phoneNumRegex = re.compile(r'(\(\d\d\d\)) (\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('My number is (415) 555-4242.')
for groupNumber in (1, 2):
    print(mo.group(groupNumber))

415
555-4242
415-555-4242
415-555-4242
('415', '555-4242')
415
555-4242
(415)
555-4242


In [10]:
# Matching one of several alternatives with the pipe
# When more than one alternative occurs in the searched text, the occurrence
# that appears first is the one returned as the Match object.
heroRegex = re.compile(r'Batman|Tina Fey')
mo1 = heroRegex.search('Batman and Tina Fey.')
mo2 = heroRegex.search('Tina Fey and Batman.')
print(mo1.group(), mo2.group(), sep='\n')

# A pipe inside parentheses lets the alternatives share the prefix "Bat".
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search('Batmobile lost a wheel')
print(mo.group())   # the whole match
print(mo.group(1))  # only the alternative captured by the group

Batman
Tina Fey
Batmobile
mobile


In [12]:
# Optional matching with the question mark
# (wo)? makes the group optional: it may appear zero times or once.
batRegex = re.compile(r'Bat(wo)?man')
mo1 = batRegex.search('The Adventures of Batman')
mo2 = batRegex.search('The Adventures of Batwoman')
for found in (mo1, mo2):
    print(found.group())

Batman
Batwoman


In [15]:
# Matching zero or more repetitions with the star
# (wo)* accepts the group any number of times, including not at all.
batRegex = re.compile(r'Bat(wo)*man')
mo1, mo2, mo3 = (
    batRegex.search(title)
    for title in (
        'The Adventures of Batman',
        'The Adventures of Batwoman',
        'The Adventures of Batwowowoman',
    )
)
for found in (mo1, mo2, mo3):
    print(found.group())

Batman
Batwoman
Batwowowoman


In [18]:
# Matching one or more repetitions with the plus
# (wo)+ requires the group at least once, so plain "Batman" does not match.
batRegex = re.compile(r'Bat(wo)+man')
mo1 = batRegex.search('The Adventures of Batman')
# search() returns None when nothing matches; None is a singleton, so it is
# tested by identity rather than with ==.
print(mo1 is None)

mo2 = batRegex.search('The Adventures of Batwoman')
print(mo2.group())

mo3 = batRegex.search('The Adventures of Batwowowoman')
print(mo3.group())

True
Batwoman
Batwowowoman


In [20]:
# Matching a specific number of repetitions with curly braces
#   (Ha){3}   matches exactly 3 repetitions,
#   (Ha){3,}  matches 3 or more repetitions,
#   (Ha){,5}  matches 0 to 5 repetitions.
haRegex = re.compile(r'(Ha){3}')
mo1 = haRegex.search('HaHaHa')
print(mo1.group())

# A single 'Ha' is fewer than three repetitions, so search() returns None;
# None is a singleton and is therefore tested by identity.
mo2 = haRegex.search('Ha')
print(mo2 is None)

HaHaHa
True


In [22]:
# Greedy and non-greedy matching
# Python regular expressions are greedy by default: {3,5} takes as many
# repetitions as it can. A trailing ? switches the quantifier to non-greedy,
# so it settles for the fewest repetitions allowed.
haText = 'HaHaHaHaHa'
greedyHaRegex = re.compile(r'(Ha){3,5}')
nongreedyHaRegex = re.compile(r'(Ha){3,5}?')

mo1 = greedyHaRegex.search(haText)
mo2 = nongreedyHaRegex.search(haText)
print(mo1.group())
print(mo2.group())

HaHaHaHaHa
HaHaHa


In [25]:
# The findall() method
txt = 'Cell: 415-555-9999 Work: 212-555-0000'

# search() reports only the first occurrence, whereas findall() collects
# every occurrence. With no groups in the pattern the result is a list of
# the matched strings.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d') # has no groups
firstMatch = phoneNumRegex.search(txt)
print(firstMatch.group())
allMatches = phoneNumRegex.findall(txt)
print(allMatches)

# Once the pattern contains groups, findall() returns a list of tuples
# instead, one tuple of group texts per match.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)') # has groups
firstMatch = phoneNumRegex.search(txt)
print(firstMatch.group())
allMatches = phoneNumRegex.findall(txt)
print(allMatches)

415-555-9999
['415-555-9999', '212-555-0000']
415-555-9999
[('415', '555', '9999'), ('212', '555', '0000')]


In [26]:
# Character classes
# A class such as [0-5] matches only the digits 0 through 5, which is much
# shorter than typing (0|1|2|3|4|5). The pattern below uses the shorthand
# classes: one or more digits, one whitespace character, then one or more
# word characters.
xmasRegex = re.compile(r'\d+\s\w+')
lyrics = '12 drummers, 11 pipers, 10 lords, 9 ladies, 8 maids, 7 swans, 6 geese, 5 rings, 4 birds, 3 hens, 2 doves, 1 partridge'
# Left as a bare expression so the notebook echoes the resulting list.
xmasRegex.findall(lyrics)

['12 drummers',
 '11 pipers',
 '10 lords',
 '9 ladies',
 '8 maids',
 '7 swans',
 '6 geese',
 '5 rings',
 '4 birds',
 '3 hens',
 '2 doves',
 '1 partridge']

In [28]:
# Building your own character classes
sample = 'RoboCop eats baby food. BABY FOOD.'

# Every vowel, in either case.
vowelRegex = re.compile(r'[aeiouAEIOU]')
# NOTE: a notebook cell echoes only the value of its last expression, so
# this result is computed but not displayed.
vowelRegex.findall(sample)

# A ^ right after the opening bracket negates the class: every character
# that is NOT a vowel.
consonantRegex = re.compile(r'[^aeiouAEIOU]')
consonantRegex.findall(sample)

['R',
 'b',
 'C',
 'p',
 ' ',
 't',
 's',
 ' ',
 'b',
 'b',
 'y',
 ' ',
 'f',
 'd',
 '.',
 ' ',
 'B',
 'B',
 'Y',
 ' ',
 'F',
 'D',
 '.']

In [34]:
# The caret and dollar sign characters
# A leading ^ requires the match to begin at the start of the searched text.
beginsWithHello = re.compile(r'^Hello')
mo1 = beginsWithHello.search('Hello world!')
print(mo1.group())

# 'Hello' is present but not at the start, so search() returns None.
# None is a singleton and is therefore tested by identity, not with ==.
mo2 = beginsWithHello.search('Say Hello')
print(mo2 is None)

# With ^ in front and $ at the end, the digits must span the text from its
# start to its end.
wholeStringIsNum = re.compile(r'^\d+$')
m = wholeStringIsNum.search('1234567890')
print(m.group())

m = wholeStringIsNum.search('123abc456')
print(m is None)


m = wholeStringIsNum.search('123 456')
print(m is None)

Hello
True
1234567890
True
True


In [35]:
# The wildcard character
# The dot stands for exactly one character, which is why 'flat' contributes
# only 'lat' to the results.
atRegex = re.compile(r'.at')
sentence = 'The cat in the hat sat on the flat mat.'
# Left as a bare expression so the notebook echoes the resulting list.
atRegex.findall(sentence)

['cat', 'hat', 'sat', 'lat', 'mat']

In [38]:
# Matching everything with dot-star
# Each (.*) group captures whatever text follows its label.
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
mo = nameRegex.search('First Name: Al Last Name: Sweigart')
print(*mo.groups(), sep='\n')

# The same text searched twice: .*? stops at the first '>', while the greedy
# .* runs on to the last one.
taggedText = '<To serve man> for dinner.>'

nongreedyRegex = re.compile(r'<.*?>')
mo = nongreedyRegex.search(taggedText)
print(mo.group())

greedyRegex = re.compile(r'<.*>')
mo = greedyRegex.search(taggedText)
print(mo.group())

Al
Sweigart
<To serve man>
<To serve man> for dinner.>


In [40]:
# Matching newlines with the dot character
# A plain triple-quoted string: the text has no placeholders, so no f prefix
# is needed.
txt = """Serve the public trust.
Protect the innocent.
Uphold the law.
"""
# Without flags the dot does not match a newline, so .* stops at the end of
# the first line.
noNewlineRegex = re.compile(r'.*')
print(noNewlineRegex.search(txt).group())

# re.DOTALL lets the dot match newlines as well, so .* takes in all of txt.
newlineRegex = re.compile(r'.*', re.DOTALL)
print(newlineRegex.search(txt).group())

Serve the public trust.
Serve the public trust.
Protect the innocent.
Uphold the law.



In [42]:
# Case-insensitive matching
# re.IGNORECASE (re.I is its short spelling) makes letters match regardless
# of case; group() still returns the text exactly as it appears in the input.
robocop = re.compile(r'robocop', re.IGNORECASE)
for sentence in (
    'RoboCop is part man, part machine, all cop.',
    'ROBOCOP protects the innocent.',
):
    print(robocop.search(sentence).group())

RoboCop
ROBOCOP


In [44]:
# Substituting strings with the sub() method
# A compiled pattern's sub() takes the replacement text first and the string
# to be searched second; it returns a new string with every match replaced.
namesRegex = re.compile(r'Agent \w+')
leakReport = 'Agent Alice gave the secret documents to Agent Bob.'
s = namesRegex.sub(repl='CENSORED', string=leakReport)
print(s)

# \1 in the replacement stands for the text of group 1 (the first letter of
# each name), so only the initial survives, followed by asterisks.
agentNamesRegex = re.compile(r'Agent (\w)\w*')
agentGossip = 'Agent Alice told Agent Carol that Agent Eve knew Agent Bob was a double agent.'
s = agentNamesRegex.sub(repl=r'\1****', string=agentGossip)
print(s)

CENSORED gave the secret documents to CENSORED.
A**** told C**** that E**** knew B**** was a double agent.
