#2.1使用多个界定符分割字符串

In [12]:
import re
line = 'asdf fjdk; afed, fjek,asdf, foo'
# Delimiters are a comma (,), a semicolon (;) or whitespace, each followed by
# any amount of extra whitespace -- hence the trailing \s* (zero or more).
re.split(r'[;,\s]\s*', line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

In [13]:
#使用括号捕获分组会让匹配文本也出现在结果中
fields = re.split(r'(;|,|\s)\s*', line)

fields

['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']

In [22]:
values = fields[::2] #从第一个元素每隔两个取值
delimiters = fields[1::2] + [''] #从第二个元素开始每隔两个取值

#重新组成新的字符串
''.join(v+d for v, d in zip(values, delimiters))


'asdf fjdk;afed,fjek,asdf,foo'

#2.2字符串开头或结尾匹配

In [24]:
filename = 'stam.txt'
print(filename.endswith('.txt'))
print(filename.startswith('file:'))


True
False


In [28]:
import os
file_names = os.listdir('.')
print(file_names)
#检查多种匹配可能
[name for name in file_names if name.endswith(('.py', '.md'))]


['.git', '.idea', 'first.py', 'FirstChapter.ipynb', 'README.md', 'SecondChapter.ipynb', '__pycache__']


['first.py', 'README.md']

In [29]:
#注意startswith()/endswith()要进行多值匹配时必须传入元组(不能是列表)
from urllib.request import urlopen

def read_data(name):
    """Return the contents of *name*, which is either a URL or a local path.

    Names starting with ``http:``, ``https:`` or ``ftp:`` are fetched with
    ``urlopen`` and returned as ``bytes``; anything else is opened as a local
    text file and returned as ``str``.
    """
    # startswith() needs a tuple (not a list) to test several prefixes. Each
    # prefix keeps its trailing colon so that a local file whose name merely
    # begins with "http" (e.g. "http_notes.txt") is not mistaken for a URL.
    if name.startswith(('http:', 'https:', 'ftp:')):
        # Close the response deterministically instead of leaking it.
        with urlopen(name) as response:
            return response.read()
    with open(name) as f:
        return f.read()

choices = ['http:', 'ftp:']
url = 'http://www.python.org'
url.startswith(tuple(choices))



True

In [32]:
#开头结尾匹配还可以使用字符串切片检查
filename = 'spam.txt'
filename[-4:] == '.txt'


True

#2.3用Shell通配符匹配字符串

In [2]:
from fnmatch import fnmatch, fnmatchcase

print(fnmatch('foo.txt', '*.txt'))
print(fnmatch('foo.txt', '?oo.txt'))
print(fnmatch('Dat45.txt', 'Dat[0-9]*.txt'))


True
True
True


In [4]:
#如果对大小写在意，可以用fnmatchcase实现完全大小写匹配
print(fnmatchcase('foo.txt', '*.TXT'))
print(fnmatch('foo.txt', '*.txt'))


False
True


In [5]:
addresses = [
'5412 N CLARK ST',
'1039 W GRANVILLE AVE',
'2122 N CLARK ST',
'4802 N BROADWAY',
]

[addr for addr in addresses if fnmatch(addr, '*ST')]

['5412 N CLARK ST', '2122 N CLARK ST']

#2.4字符串匹配和搜索

In [7]:
text = 'yeah, but no, but yeah, but no, but yeah'
print(text.startswith('yeah'))
print(text.endswith('yeah'))
#查找第一个no出现的位置
text.find('no')

True
True


10

In [9]:
import re
text1 = '11/27/2012'
text2 = 'Nov 27, 2012'
#简单匹配\d+意味着匹配一个或多个数字
if re.match(r'\d+/\d+/\d+', text1):
    print('yes')
else:
    print('no')

if re.match(r'\d+/\d+/\d+', text2):
    print('yes')
else:
    print('no')

yes
no


In [12]:
# 多次匹配，应该将模式字符串编译为模式对象
datepat = re.compile(r'\d+/\d+/\d+')
if datepat.match(text1):
    print('yes')
else:
    print('no')

yes


In [13]:
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
datepat.findall(text)


['11/27/2012', '3/13/2013']

In [21]:
#定义正则表达式用()捕获分组
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
m = datepat.match('11/27/2012')
print(m.group(0))
print(m.group(1))
print(m.group(2))
print(m.group(3))
print(m.groups())
month, day, year = m.groups()
print(month + day + year)
for month, day, year in datepat.findall(text):
    #注意s和r的差别
    print('{!s}-{!r}-{!r}'.format(year,month,day))



11/27/2012
11
27
2012
('11', '27', '2012')
11272012
2012-'11'-'27'
2013-'3'-'13'


In [22]:
#findall() 方法会搜索文本并以列表形式返回所有的匹配。 如果你想以迭代方式返回匹配， 可以使用
#finditer() 方法来代替
for m in datepat.finditer(text):
    print(m.groups())


('11', '27', '2012')
('3', '13', '2013')


In [23]:
#如果只想做一次文本匹配搜索，可以略过编译步骤，直接使用re模块级别的函数
re.findall(r'(\d+)/(\d+)/(\d+)', text)


[('11', '27', '2012'), ('3', '13', '2013')]

#2.5字符串搜索和替换

In [24]:
text = 'yeah, but no, but yeah, but no, but yeah'
text.replace('yeah', 'yep')


'yep, but no, but yep, but no, but yep'

In [27]:
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
import re
#第一个参数是被匹配的模式， 第二个参数是替换模式。 反斜杠加数字(比如\3)指向前面模式中对应编号的捕获组
re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2',text)


'Today is 2012-11-27. PyCon starts 2013-3-13.'

In [28]:
#多次使用先编译
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
datepat.sub(r'\3-\1-\2', text)



'Today is 2012-11-27. PyCon starts 2013-3-13.'

In [29]:
#对于更复杂的替换可以传递回调函数
from calendar import month_abbr
def change_date(m):
    """Build the ``sub()`` replacement text for one matched date.

    *m* is a match whose groups 1, 2 and 3 hold the month number, the day and
    the year; the result is ``'<day> <abbreviated month name> <year>'``.
    """
    month, day, year = m.group(1, 2, 3)
    return f'{day} {month_abbr[int(month)]} {year}'

datepat.sub(change_date, text)

'Today is 27 Nov 2012. PyCon starts 13 Mar 2013.'

In [31]:
#如果想知道一共发生了多少次替换，可以使用subn
newtext, n = datepat.subn(r'\3-\1-\2', text)
n

2

#2.6字符串忽略大小写的搜索替换

In [8]:
import re
text = 'UPPER PYTHON, lower python, Mixed Python'
#无视大小写查找所有匹配
re.findall('python', text, flags=re.IGNORECASE)
#笨方法按照原大小区别替换
text1 = re.sub('python', 'snake',text)
text2 = re.sub('Python', 'Snake',text1)
re.sub('PYTHON', 'SNAKE',text2)


'UPPER SNAKE, lower snake, Mixed Snake'

In [11]:
#高级方法,matchcase返回的是一个回调函数，sub允许接受
def matchcase(word):
    """Return a ``re.sub()`` callback that substitutes *word*, mirroring case.

    The callback receives each match object and returns *word* upper-cased,
    lower-cased or title-cased depending on whether the matched text is all
    upper case, all lower case, or starts with an upper-case letter. In any
    other situation (including an empty match) *word* is returned unchanged.
    """
    def replace(match):
        # sub() invokes this once for every match it finds.
        matched = match.group()
        if matched.isupper():
            return word.upper()
        if matched.islower():
            return word.lower()
        # Slice rather than index: an empty match (e.g. from a pattern such
        # as 'x*') would make matched[0] raise IndexError.
        if matched[:1].isupper():
            return word.title()
        return word
    return replace

re.sub('python', matchcase('snake'), text, flags=re.IGNORECASE)

'UPPER SNAKE, lower snake, Mixed Snake'

#2.7最短匹配模式

In [22]:
#默认最长匹配会存在问题
#PLUS:双引号中包含单引号不用加转义字符\，反过来单引号中包含双引号也不需要加\
str_pat = re.compile(r'"(.*)"')
text2 = 'Computer says "no." Phone says "yes."'
str_pat.findall(text2)



['no." Phone says "yes.']

In [42]:
#最短匹配
#PLUS: *会匹配0或多个表达式, +会匹配1或多个表达式,都属于贪婪匹配
#加上?号结尾可以转为非贪婪模式
str_pat = re.compile(r'"(.*?)"')
str_pat.findall(text2)

['no.', 'yes.']

#2.8多行匹配模式

In [46]:
#.匹配符不接受换行符,因此我们要修改模式字符串
comment = re.compile(r'/\*(.*?)\*/')
text2 = '''/* this is a
 multiline comment */
'''
print(comment.findall(text2))
#(?:.|\n)指定了非捕获组
comment_plus = re.compile(r'/\*((?:.|\n)*?)\*/')
comment_plus.findall(text2)

[]


[' this is a\n multiline comment ']

In [47]:
#使用标志参数也可以解决
comment = re.compile(r'/\*(.*?)\*/', re.DOTALL)
comment.findall(text2)

[' this is a\n multiline comment ']

#2.9将Unicode文本标准化

In [48]:
s1 = 'Spicy Jalape\u00f1o'
s2 = 'Spicy Jalapen\u0303o'

print(s1)
print(s2)

Spicy Jalapeño
Spicy Jalapeño


In [50]:
#标准化
import unicodedata
t1 = unicodedata.normalize('NFC', s1)
t2 = unicodedata.normalize('NFC', s2)
t1 == t2


True

#2.10在正则式中使用Unicode

In [51]:
import re
# Raw string: '\d' in a plain literal is an invalid escape sequence and
# triggers a warning on recent Python versions.
num = re.compile(r'\d+')
# For str patterns \d matches Unicode decimal digits, so these
# ARABIC-INDIC digits match as well.
num.match('\u0661\u0662\u0663')

<re.Match object; span=(0, 3), match='١٢٣'>

#2.11删除字符串中不需要的字符

In [55]:
s = ' hello world \n'
#删除开头和结尾字符
print(s.strip())
#删除开头字符
print(s.lstrip())
#删除结尾字符
print(s.rstrip())

hello world
hello world 

 hello world


In [60]:
t = '----hello====='
print(t.lstrip('-'))
print(t.rstrip('='))
print(t.strip('-='))

hello=====
----hello
hello


In [61]:
#如果想处理中间的空格，可以用replace
s = 'hello    world'
s.replace(' ','')


'helloworld'

In [64]:
#或者用正则
import re
place = re.compile(r'\s+')
place.sub(' ',s)

'hello world'

#2.12审查清理文本字符串

In [65]:
s = 'pýtĥöñ\fis\tawesome\r\n'
s

'pýtĥöñ\x0cis\tawesome\r\n'

In [66]:
#使用translate清理空白字符
remap = {
    ord('\t') : ' ',
    ord('\f') : ' ',
    ord('\r') : None,
}
#translate接收字典映射清理字符
a = s.translate(remap)
a


'pýtĥöñ is awesome\n'

In [70]:
import unicodedata
import sys
# Strip every combining character (diacritical mark).
# First build a dict whose keys are all combining code points, each mapped to
# None so that str.translate() deletes them. sys.maxunicode is itself a valid
# code point, hence the "+ 1" to make the range inclusive.
cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode + 1)
                         if unicodedata.combining(chr(c)))
#标准化字符串
b = unicodedata.normalize('NFD', a)
b.translate(cmb_chrs)


'python is awesome\n'

In [72]:
#也可以使用编码解码进行清理
b = unicodedata.normalize('NFD', a)
b.encode('ascii', 'ignore').decode('ascii')


'python is awesome\n'

In [75]:
#清理字符串的方法中replace是最快的，因此我们可以直接连续调用replace来清理空白字符
def clean_spaces(s):
    """Return *s* with carriage returns removed and tabs/form feeds as spaces."""
    # Applied in order: drop '\r', then turn '\t' and '\f' into one space each.
    for old, new in (('\r', ''), ('\t', ' '), ('\f', ' ')):
        s = s.replace(old, new)
    return s

clean_spaces(s)

'pýtĥöñ is awesome\n'

#2.13字符串对齐