#2.1使用多个界定符分割字符串

In [207]:
import re
line = 'asdf fjdk; afed, fjek,asdf, foo'
# A delimiter is a comma, a semicolon or a whitespace character, followed by
# any amount of extra whitespace. The trailing \s* (zero or more) is required:
# with a bare \s, a delimiter that is not followed by a space is not split on.
re.split(r'[;,\s]\s*', line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

In [208]:
#使用括号捕获分组会让匹配文本也出现在结果中
fields = re.split(r'(;|,|\s)\s*', line)

fields

['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']

In [209]:
values = fields[::2] #从第一个元素每隔两个取值
delimiters = fields[1::2] + [''] #从第二个元素开始每隔两个取值

#重新组成新的字符串
''.join(v+d for v, d in zip(values, delimiters))


'asdf fjdk;afed,fjek,asdf,foo'

#2.2字符串开头或结尾匹配

In [210]:
filename = 'stam.txt'
print(filename.endswith('.txt'))
print(filename.startswith('file:'))


True
False


In [211]:
import os
file_names = os.listdir('.')
print(file_names)
#检查多种匹配可能
[name for name in file_names if name.endswith(('.py', '.md'))]


['.git', '.idea', 'first.py', 'FirstChapter.ipynb', 'README.md', 'SecondChapter.ipynb', '__pycache__']


['first.py', 'README.md']

In [212]:
#注意startswith()/endswith()要进行多值匹配时必须传入元组(不能是列表)
from urllib.request import urlopen

def read_data(name):
    """Return the contents of *name*, which is either a URL or a local path.

    Names starting with ``http:``, ``https:`` or ``ftp:`` are fetched with
    ``urlopen`` and the raw response body is returned.  Any other name is
    opened as a local text file and its text is returned.
    """
    # startswith() needs a tuple to test several prefixes at once.  Every
    # prefix carries its colon so that a local file whose name merely begins
    # with "http" (e.g. "httpdata.txt") is not mistaken for a URL.
    if name.startswith(('http:', 'https:', 'ftp:')):
        # Use the response as a context manager so it is closed after reading.
        with urlopen(name) as response:
            return response.read()
    with open(name) as f:
        return f.read()

choices = ['http:', 'ftp:']
url = 'http://www.python.org'
url.startswith(tuple(choices))



True

In [213]:
#开头结尾匹配还可以使用字符串切片检查
filename = 'spam.txt'
filename[-4:] == '.txt'


True

#2.3用Shell通配符匹配字符串

In [214]:
from fnmatch import fnmatch, fnmatchcase

print(fnmatch('foo.txt', '*.txt'))
print(fnmatch('foo.txt', '?oo.txt'))
print(fnmatch('Dat45.txt', 'Dat[0-9]*.txt'))


True
True
True


In [215]:
#如果对大小写在意，可以用fnmatchcase实现完全大小写匹配
print(fnmatchcase('foo.txt', '*.TXT'))
print(fnmatch('foo.txt', '*.txt'))


False
True


In [216]:
addresses = [
'5412 N CLARK ST',
'1039 W GRANVILLE AVE',
'2122 N CLARK ST',
'4802 N BROADWAY',
]

[addr for addr in addresses if fnmatch(addr, '*ST')]

['5412 N CLARK ST', '2122 N CLARK ST']

#2.4字符串匹配和搜索

In [217]:
text = 'yeah, but no, but yeah, but no, but yeah'
print(text.startswith('yeah'))
print(text.endswith('yeah'))
#查找第一个no出现的位置
text.find('no')

True
True


10

In [218]:
import re
text1 = '11/27/2012'
text2 = 'Nov 27, 2012'
#简单匹配\d+意味着匹配一个或多个数字
if re.match(r'\d+/\d+/\d+', text1):
    print('yes')
else:
    print('no')

if re.match(r'\d+/\d+/\d+', text2):
    print('yes')
else:
    print('no')

yes
no


In [219]:
# 多次匹配，应该将模式字符串编译为模式对象
datepat = re.compile(r'\d+/\d+/\d+')
if datepat.match(text1):
    print('yes')
else:
    print('no')

yes


In [220]:
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
datepat.findall(text)


['11/27/2012', '3/13/2013']

In [221]:
#定义正则表达式用()捕获分组
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
m = datepat.match('11/27/2012')
print(m.group(0))
print(m.group(1))
print(m.group(2))
print(m.group(3))
print(m.groups())
month, day, year = m.groups()
print(month + day + year)
for month, day, year in datepat.findall(text):
    #注意s和r的差别
    print('{!s}-{!r}-{!r}'.format(year,month,day))



11/27/2012
11
27
2012
('11', '27', '2012')
11272012
2012-'11'-'27'
2013-'3'-'13'


In [222]:
#findall() 方法会搜索文本并以列表形式返回所有的匹配。 如果你想以迭代方式返回匹配， 可以使用
#finditer() 方法来代替
for m in datepat.finditer(text):
    print(m.groups())


('11', '27', '2012')
('3', '13', '2013')


In [223]:
#如果只想做一次文本匹配搜索，可以略过编译部分直接re模块
re.findall(r'(\d+)/(\d+)/(\d+)', text)


[('11', '27', '2012'), ('3', '13', '2013')]

#2.5字符串搜索和替换

In [224]:
text = 'yeah, but no, but yeah, but no, but yeah'
text.replace('yeah', 'yep')


'yep, but no, but yep, but no, but yep'

In [225]:
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
import re
#第一个参数是被匹配的模式， 第二个参数是替换模式。 反斜杠加数字(比如\3)指向前面模式中对应的捕获组号
re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2',text)


'Today is 2012-11-27. PyCon starts 2013-3-13.'

In [226]:
#多次使用先编译
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
datepat.sub(r'\3-\1-\2', text)



'Today is 2012-11-27. PyCon starts 2013-3-13.'

In [227]:
#对于更复杂的替换可以传递回调函数
from calendar import month_abbr
def change_date(m):
    """Substitution callback: rewrite a month/day/year match as 'day Mon year'.

    *m* is a match object whose groups 1, 2 and 3 hold the month number,
    the day and the year.  The month number is looked up in
    ``calendar.month_abbr`` to get its abbreviated name.
    """
    month_name = month_abbr[int(m.group(1))]
    day = m.group(2)
    year = m.group(3)
    return '{} {} {}'.format(day, month_name, year)

datepat.sub(change_date, text)

'Today is 27 Nov 2012. PyCon starts 13 Mar 2013.'

In [228]:
#如果想知道一共发生了多少次替换，可以使用subn
newtext, n = datepat.subn(r'\3-\1-\2', text)
n

2

#2.6字符串忽略大小写的搜索替换

In [229]:
import re
text = 'UPPER PYTHON, lower python, Mixed Python'
#无视大小写查找全部匹配
re.findall('python', text, flags=re.IGNORECASE)
#笨方法按照原大小区别替换
text1 = re.sub('python', 'snake',text)
text2 = re.sub('Python', 'Snake',text1)
re.sub('PYTHON', 'SNAKE',text2)


'UPPER SNAKE, lower snake, Mixed Snake'

In [230]:
#高级方法,matchcase返回的是一个回调函数，sub允许接受
def matchcase(word):
    """Return a re.sub() callback that inserts *word* in the case of each match.

    An all-uppercase match yields ``word.upper()``, an all-lowercase match
    yields ``word.lower()``, a match that starts with an uppercase letter
    yields ``word.title()``, and anything else yields *word* unchanged.
    """
    def replace(match):
        # re.sub() calls this once per match; decide the case from the matched text.
        found = match.group()
        if found.isupper():
            return word.upper()
        if found.islower():
            return word.lower()
        # Mixed case: only the first character decides.
        return word.title() if found[0].isupper() else word
    return replace

re.sub('python', matchcase('snake'), text, flags=re.IGNORECASE)

'UPPER SNAKE, lower snake, Mixed Snake'

#2.7最短匹配模式

In [231]:
#默认最长匹配会存在问题
#PLUS:双引号中包含单引号不用加转义字符\，反过来单引号中包含双引号也不需要加\
str_pat = re.compile(r'"(.*)"')
text2 = 'Computer says "no." Phone says "yes."'
str_pat.findall(text2)



['no." Phone says "yes.']

In [232]:
#最短匹配
#PLUS: *会匹配0或多个表达式, +会匹配1或多个表达式,都属于贪婪匹配
#加上?号结尾可以转为非贪婪模式
str_pat = re.compile(r'"(.*?)"')
str_pat.findall(text2)

['no.', 'yes.']

#2.8多行匹配模式

In [233]:
#.匹配符不接受换行符,因此我们要修改模式字符串
comment = re.compile(r'/\*(.*?)\*/')
text2 = '''/* this is a
 multiline comment */
'''
print(comment.findall(text2))
#(?:.|\n)指定了非捕获组
comment_plus = re.compile(r'/\*((?:.|\n)*?)\*/')
comment_plus.findall(text2)

[]


[' this is a\n multiline comment ']

In [234]:
#使用标志参数也可以解决
comment = re.compile(r'/\*(.*?)\*/', re.DOTALL)
comment.findall(text2)

[' this is a\n multiline comment ']

#2.9将Unicode文本标准化

In [235]:
s1 = 'Spicy Jalape\u00f1o'
s2 = 'Spicy Jalapen\u0303o'

print(s1)
print(s2)

Spicy Jalapeño
Spicy Jalapeño


In [236]:
#标准化
import unicodedata
t1 = unicodedata.normalize('NFC', s1)
t2 = unicodedata.normalize('NFC', s2)
t1 == t2


True

#2.10在正则式中使用Unicode

In [237]:
import re
# Always write regex patterns as raw strings: in a normal literal '\d' is an
# invalid escape sequence, which current Python versions warn about.
num = re.compile(r'\d+')
# For str patterns \d matches any Unicode decimal digit, so these
# Arabic-Indic digits match as well.
num.match('\u0661\u0662\u0663')

<re.Match object; span=(0, 3), match='١٢٣'>

#2.11删除字符串中不需要的字符

In [238]:
s = ' hello world \n'
#删除开头和结尾字符
print(s.strip())
#删除开头字符
print(s.lstrip())
#删除结尾字符
print(s.rstrip())

hello world
hello world 

 hello world


In [239]:
t = '----hello====='
print(t.lstrip('-'))
print(t.rstrip('='))
print(t.strip('-='))

hello=====
----hello
hello


In [240]:
#如果想处理中间的空格，可以用replace
s = 'hello    world'
s.replace(' ','')


'helloworld'

In [241]:
#或者用正则
import re
place = re.compile(r'\s+')
place.sub(' ',s)

'hello world'

#2.12审查清理文本字符串

In [242]:
s = 'pýtĥöñ\fis\tawesome\r\n'
s

'pýtĥöñ\x0cis\tawesome\r\n'

In [243]:
#使用translate清理空白字符
remap = {
    ord('\t') : ' ',
    ord('\f') : ' ',
    ord('\r') : None,
}
#translate接收字典映射清理字符
a = s.translate(remap)
a


'pýtĥöñ is awesome\n'

In [244]:
import unicodedata
import sys
# Remove all combining characters (accents and other diacritical marks).
# First build a translation table keyed by the code point of every combining
# character; dict.fromkeys() maps each key to None, which makes
# str.translate() delete that character.
# range() excludes its end value, so + 1 is needed to include sys.maxunicode.
cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode + 1)
                         if unicodedata.combining(chr(c)))
#标准化字符串
b = unicodedata.normalize('NFD', a)
b.translate(cmb_chrs)


'python is awesome\n'

In [245]:
#也可以使用编码解码进行清理
b = unicodedata.normalize('NFD', a)
b.encode('ascii', 'ignore').decode('ascii')


'python is awesome\n'

In [246]:
#清理字符串的方法中replace是最快的，因此对于简单的清理可以直接连续调用replace
def clean_spaces(s):
    """Return *s* with carriage returns removed and tabs/form feeds turned into spaces."""
    # Three chained str.replace() passes: drop '\r', then map '\t' and '\f' to ' '.
    return s.replace('\r', '').replace('\t', ' ').replace('\f', ' ')

clean_spaces(s)

'pýtĥöñ is awesome\n'

#2.13字符串对齐

In [247]:
text = 'Hello World'
print(text.ljust(20))
print(text.rjust(20))
print(text.center(20))

Hello World         
         Hello World
    Hello World     


In [248]:
print(text.ljust(20, '-'))
print(text.rjust(20, '+'))
print(text.center(20, '='))

Hello World---------
+++++++++Hello World
====Hello World=====


In [249]:
#format同理
print(format(text, '=>20'))
print(format(text, '=<20'))
print(format(text, '=^20'))


=========Hello World
Hello World=========
====Hello World=====


In [250]:
'{:>10}  {:>10}'.format(text, text)

'Hello World  Hello World'

In [251]:
#format可以格式化任何值
x = 1.2345
print(format(x, '>10'))
print(format(x, '^10.2f'))


    1.2345
   1.23   


In [252]:
#老版本
'%20s' %'hello'

'               hello'

#2.14合并拼接字符串

In [253]:
parts = ['Is', 'Chicage', 'Not', 'Chicage']
print(' '.join(parts))
print(','.join(parts))
print('.'.join(parts))

Is Chicage Not Chicage
Is,Chicage,Not,Chicage
Is.Chicage.Not.Chicage


In [254]:
a = 'Is Chicago'
b = 'Not Chicago'
a + ' ' + b

'Is Chicago Not Chicago'

In [255]:
print('{} {}'.format(a, b))
#把两个字符串放到一起直接合并
print('hello''world')

Is Chicago Not Chicago
helloworld


In [256]:
#利用生成器转换数据格式同时合并字符串
data = ['ACME', 50, 91.1]
','.join(str(d) for d in data)


'ACME,50,91.1'

In [257]:
#注意不必要的字符连接!
print('a', 'b', 'c', sep=':')

a:b:c


In [258]:
#大量小字符串输出代码可以使用生成器函数，利用yield语句
def sample():
    """Generator yielding the fragments 'Is', 'Chicago', 'Not', 'Chicago' in order."""
    yield from ('Is', 'Chicago', 'Not', 'Chicago')
'.'.join(sample())


'Is.Chicago.Not.Chicago'

#2.15字符串中插入变量

In [259]:
s = '{name} has {n} messages.'
s.format(name='Guido', n=37)

'Guido has 37 messages.'

In [260]:
name = "Guido"
n = 37
s.format_map(vars())

'Guido has 37 messages.'

In [261]:
#vars()可以用于对象实例
class Info:
    """Plain attribute holder: stores *name* and *n* on the instance.

    vars(instance) therefore yields a mapping with the keys 'name' and 'n'.
    """
    def __init__(self, name, n):
        self.name, self.n = name, n
    
a = Info('Guido', 37)
s.format_map(vars(a))

'Guido has 37 messages.'

In [262]:
s.format(name='Guido')

KeyError: 'n'

In [263]:
#避免传入参数数量不够的情况
class safesub(dict):
    """dict whose missing keys resolve to the key wrapped in braces.

    Used with str.format_map() so that a field with no value is left in the
    output as its original ``{field}`` placeholder instead of raising KeyError.
    """
    def __missing__(self, key):
        # Rebuild the placeholder text for the field that had no value.
        return ''.join(('{', key, '}'))

del n
s.format_map(safesub(vars()))

In [264]:
import sys

def sub(text):
    """Fill the ``{name}`` fields of *text* from the caller's local variables.

    The caller's locals are wrapped in safesub before being passed to
    str.format_map().
    """
    # sys._getframe(1) is the stack frame of whoever called sub().
    caller_locals = sys._getframe(1).f_locals
    return text.format_map(safesub(caller_locals))

name = "Guido"
n = 37

print(sub('Hello {name}'))
print(sub('Hello {color}'))

In [267]:
#另一种方法
name = 'Guido'
n = 37
print('%s has %s messages.' %(name,n))


Guido has 37 messages.


In [268]:
import string
s = string.Template('$name has $n messages.')
s.substitute(vars())


'Guido has 37 messages.'

#2.16以指定列宽格式化字符串

In [4]:
import textwrap
s = '''Look into my eyes, look into my eyes, the eyes, the eyes, 
the eyes, not around the eyes, don't look around the eyes, 
look into my eyes, you're under.'''

print(textwrap.fill(s, 70))
print(textwrap.fill(s, 40))
print(textwrap.fill(s, 40, initial_indent='    '))
print(textwrap.fill(s, 40, subsequent_indent='    '))

Look into my eyes, look into my eyes, the eyes, the eyes,  the eyes,
not around the eyes, don't look around the eyes,  look into my eyes,
you're under.
Look into my eyes, look into my eyes,
the eyes, the eyes,  the eyes, not
around the eyes, don't look around the
eyes,  look into my eyes, you're under.
    Look into my eyes, look into my
eyes, the eyes, the eyes,  the eyes, not
around the eyes, don't look around the
eyes,  look into my eyes, you're under.
Look into my eyes, look into my eyes,
    the eyes, the eyes,  the eyes, not
    around the eyes, don't look around
    the eyes,  look into my eyes, you're
    under.


#2.17在字符串中处理html和xml

In [5]:
import html
s = 'Elements are written as "<tag>text</tag>".'
print(html.escape(s))
print(html.escape(s, quote=False))


Elements are written as &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.
Elements are written as "&lt;tag&gt;text&lt;/tag&gt;".


In [8]:
#带有编码值的原始文本手动替换
s = 'Spicy &quot;Jalape&#241;o&quot.'
from html.parser import HTMLParser
print(html.unescape(s))
from xml.sax.saxutils import unescape
t = 'The prompt is &gt;&gt;&gt;'
print(unescape(t))

Spicy "Jalapeño".
The prompt is >>>


#2.18字符串令牌解析

In [9]:
import re
from collections import namedtuple
text = 'foo = 23 + 42 * 10'
#使用命名捕获组定义所有令牌
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
TIMES = r'(?P<TIMES>\*)'
EQ = r'(?P<EQ>=)'
WS = r'(?P<WS>\s+)'

master_pat = re.compile('|'.join([NAME, NUM, PLUS, TIMES, EQ, WS]))

In [10]:
def generate_tokens(pat, text):
    """Yield Token(type, value) tuples by repeatedly matching *pat* against *text*.

    *pat* is a compiled pattern made of named groups; the name of the group
    that matched becomes the token type.  Generation ends as soon as the
    scanner fails to match at the current position.
    """
    Token = namedtuple('Token', ['type', 'value'])
    next_match = pat.scanner(text).match
    while True:
        hit = next_match()
        if hit is None:
            return
        yield Token(hit.lastgroup, hit.group())
        
for tok in generate_tokens(master_pat, 'foo = 42'):
    print(tok)
    

Token(type='NAME', value='foo')
Token(type='WS', value=' ')
Token(type='EQ', value='=')
Token(type='WS', value=' ')
Token(type='NUM', value='42')


#2.19实现一个简单的递归下降分析器

In [13]:
import re
import collections
# Token specification: one named group per token type, so that
# match.lastgroup reports which alternative matched.
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
MINUS = r'(?P<MINUS>-)'
TIMES = r'(?P<TIMES>\*)'
DIVIDE = r'(?P<DIVIDE>/)'
LPAREN = r'(?P<LPAREN>\()'
RPAREN = r'(?P<RPAREN>\))'
WS = r'(?P<WS>\s+)'
master_pat = re.compile('|'.join([NUM, PLUS, MINUS, TIMES,
                                  DIVIDE, LPAREN, RPAREN, WS]))
# Tokenizer
Token = collections.namedtuple('Token', ['type', 'value'])
def generate_tokens(text):
    """Yield a Token for every non-whitespace token found in *text*.

    Scanning ends at the first position where no token pattern matches.
    """
    scanner = master_pat.scanner(text)
    for m in iter(scanner.match, None):
        tok = Token(m.lastgroup, m.group())
        if tok.type != 'WS':
            yield tok
# Parser
class ExpressionEvaluator:
    '''
    Implementation of a recursive descent parser. Each method
    implements a single grammar rule. Use the ._accept() method
    to test and accept the current lookahead token. Use the ._expect()
    method to exactly match and discard the next token on the input
    (or raise a SyntaxError if it doesn't match).
    '''
    def parse(self, text):
        'Tokenize text, evaluate it as an expression and return the value'
        self.tokens = generate_tokens(text)
        self.tok = None # Last symbol consumed
        self.nexttok = None # Next symbol tokenized
        self._advance() # Load first lookahead token
        return self.expr()
    def _advance(self):
        'Advance one token ahead'
        # nexttok becomes None once the token stream is exhausted.
        self.tok, self.nexttok = self.nexttok, next(self.tokens, None)
    def _accept(self, toktype):
        'Test and consume the next token if it matches toktype'
        if self.nexttok and self.nexttok.type == toktype:
            self._advance()
            return True
        else:
            return False
    def _expect(self, toktype):
        'Consume next token if it matches toktype or raise SyntaxError'
        if not self._accept(toktype):
            raise SyntaxError('Expected ' + toktype)
    # Grammar rules follow
    def expr(self):
        "expression ::= term { ('+'|'-') term }*"
        exprval = self.term()
        while self._accept('PLUS') or self._accept('MINUS'):
            op = self.tok.type
            right = self.term()
            if op == 'PLUS':
                exprval += right
            elif op == 'MINUS':
                exprval -= right
        # Return only after the loop: returning from inside it yields None
        # for a lone term and stops after the first operator of a chain.
        return exprval
    def term(self):
        "term ::= factor { ('*'|'/') factor }*"
        termval = self.factor()
        while self._accept('TIMES') or self._accept('DIVIDE'):
            op = self.tok.type
            right = self.factor()
            if op == 'TIMES':
                termval *= right
            elif op == 'DIVIDE':
                termval /= right
        return termval
    def factor(self):
        "factor ::= NUM | ( expr )"
        if self._accept('NUM'):
            return int(self.tok.value)
        elif self._accept('LPAREN'):
            exprval = self.expr()
            self._expect('RPAREN')
            return exprval
        else:
            raise SyntaxError('Expected NUMBER or LPAREN')

def descent_parser():
    'Print the value of a few sample expressions'
    e = ExpressionEvaluator()
    print(e.parse('2'))
    print(e.parse('2 + 3'))
    print(e.parse('2 + 3 * 4'))
    print(e.parse('2 + (3 + 4) * 5'))
# print(e.parse('2 + (3 + * 4)'))
# Traceback (most recent call last):
# File "<stdin>", line 1, in <module>
# File "exprparse.py", line 40, in parse
# return self.expr()
# File "exprparse.py", line 67, in expr
# right = self.term()
# File "exprparse.py", line 77, in term
# termval = self.factor()
# File "exprparse.py", line 93, in factor
# exprval = self.expr()
# File "exprparse.py", line 67, in expr
# right = self.term()
# File "exprparse.py", line 77, in term
# termval = self.factor()
# File "exprparse.py", line 97, in factor
# raise SyntaxError("Expected NUMBER or LPAREN")
# SyntaxError: Expected NUMBER or LPAREN
if __name__ == '__main__':
    descent_parser()

2
5
14
37


#2.20字节字符串上的操作

In [15]:
data = b'Hello World'
print(data[0:5])
print(data.startswith(b'Hello'))
print(data.replace(b'Hello', b'Hello Cruel'))

b'Hello'
True
b'Hello Cruel World'


In [17]:
#字节数组
data = bytearray(b'Hello World')
print(data[0:5])
print(data.startswith(b'Hello'))
print(data.replace(b'Hello', b'Hello Cruel'))



bytearray(b'Hello')
True
bytearray(b'Hello Cruel World')


In [18]:
import re
data = b'FOO:BAR,SPAM'

re.split(b'[:,]', data)


[b'FOO', b'BAR', b'SPAM']

In [21]:
#字节字符串的索引操作返回整数而不是单独字符
b = b'Hello World'
print(b[0])
print(b[1])
print(b.decode('utf-8'))
print(b)

72
101
Hello World
b'Hello World'


In [22]:
'{:10s} {:10d} {:10.2f}'.format('ACME', 100, 490.1).encode('ascii')

b'ACME              100     490.10'