# 字符串和文本
---

### 使用多个界定符分割字符串

In [1]:
line = 'asdf fjdk; afed, fjek,asdf, foo'

In [2]:
import re
# Split on one ';', ',' or whitespace character, plus any whitespace that follows it
re.split(r'[;,\s]\s*', line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

In [3]:
# Same split written as an alternation; (?:...) is non-capturing, so the
# delimiters themselves are not included in the result list
re.split(r'(?:,|;|\s)\s*', line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

---

### 把文本中的变音符转成英文字母

In [4]:
s = 'Spicy Jalape\u00f1o'

In [5]:
s

'Spicy Jalapeño'

In [6]:
# Method 1: decompose the text, then drop the combining marks
import unicodedata
# NFD splits each accented character into its base letter followed by combining mark(s)
t = unicodedata.normalize('NFD', s)
# combining() returns 0 for non-combining characters, so only those are kept
''.join(c for c in t if not unicodedata.combining(c))

'Spicy Jalapeno'

In [7]:
# Method 2: build a deletion table once and strip the marks with str.translate
import sys
# Map every combining code point to None so that translate() deletes it.
# range() excludes its stop value, so "+ 1" is required to also cover
# sys.maxunicode itself (the largest valid code point).
cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode + 1)
                         if unicodedata.combining(chr(c)))
# Decompose first so that accents become separate combining characters
b = unicodedata.normalize('NFD', s)
b.translate(cmb_chrs)

'Spicy Jalapeno'

---

### 字符串对齐

In [8]:
text = 'Hello World'

In [9]:
text.ljust(20)

'Hello World         '

In [10]:
text.rjust(20)

'         Hello World'

In [11]:
text.center(20)

'    Hello World     '

In [12]:
text.rjust(20,'=')

'=========Hello World'

In [13]:
text.center(20,'*')

'****Hello World*****'

In [14]:
format(text, '>20')

'         Hello World'

In [15]:
format(text, '<20')

'Hello World         '

In [16]:
format(text, '^20')

'    Hello World     '

In [17]:
format(text, '=>20s')

'=========Hello World'

In [18]:
format(text, '*^20s')

'****Hello World*****'

In [19]:
'{:>10s} {:>10s}'.format('Hello', 'World')

'     Hello      World'

In [20]:
'%-20s' % text

'Hello World         '

In [21]:
'%20s' % text

'         Hello World'

---

### 以指定列宽格式化字符串

In [22]:
s = "Look into my eyes, look into my eyes, the eyes, the eyes, \
the eyes, not around the eyes, don't look around the eyes, \
look into my eyes, you're under."

In [23]:
import textwrap
textwrap.fill(s, 70)

"Look into my eyes, look into my eyes, the eyes, the eyes, the eyes,\nnot around the eyes, don't look around the eyes, look into my eyes,\nyou're under."

In [24]:
textwrap.fill(s, 40, initial_indent='    ')

"    Look into my eyes, look into my\neyes, the eyes, the eyes, the eyes, not\naround the eyes, don't look around the\neyes, look into my eyes, you're under."

In [25]:
textwrap.fill(s, 40, subsequent_indent='    ')

"Look into my eyes, look into my eyes,\n    the eyes, the eyes, the eyes, not\n    around the eyes, don't look around\n    the eyes, look into my eyes, you're\n    under."

In [26]:
# Get the terminal width, e.g. to wrap text so that it fits the terminal.
# shutil.get_terminal_size() is the high-level call: it falls back to
# 80 columns x 24 lines when the size cannot be determined (output piped,
# captured, or not attached to a terminal), whereas the low-level
# os.get_terminal_size() raises OSError in that situation.
import os
import shutil
shutil.get_terminal_size().columns

80

---

### 在字符串中处理html和xml

In [27]:
s = 'Elements are written as "<tag>text</tag>".'

In [28]:
import html
# Escape the HTML-special characters '&', '<' and '>'; with the default
# quote=True the quote characters are escaped as well
html.escape(s)

'Elements are written as &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.'

In [29]:
html.escape(s, quote=False)

'Elements are written as "&lt;tag&gt;text&lt;/tag&gt;".'

In [30]:
# Replace non-ASCII characters with their numeric character references (&#NNN;)
s = 'Spicy Jalapeño'
s.encode('ascii', errors='xmlcharrefreplace')

b'Spicy Jalape&#241;o'

In [31]:
# Decode HTML text: named and numeric character references become characters.
# Both &quot; references are terminated with ';' -- an unterminated "&quot"
# only decodes because HTML5 tolerates a few legacy names without it.
s = 'Spicy &quot;Jalape&#241;o&quot;.'
html.unescape(s)

'Spicy "Jalapeño".'

In [32]:
# Decode XML text
t = 'The prompt is &gt;&gt;&gt;'
from xml.sax.saxutils import unescape
# unescape() converts &amp;, &lt; and &gt; back to the literal characters
unescape(t)

'The prompt is >>>'

---

### 字符串令牌解析

In [33]:
import re

# One named group per token kind; the name of the group that matched is
# what a match object reports through .lastgroup.
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'  # identifier
NUM = r'(?P<NUM>\d+)'                       # run of digits
PLUS = r'(?P<PLUS>\+)'                      # literal '+'
TIMES = r'(?P<TIMES>\*)'                    # literal '*'
EQ = r'(?P<EQ>=)'                           # literal '='
WS = r'(?P<WS>\s+)'                         # run of whitespace

# Alternatives are tried left to right, so the order below is significant.
master_pat = re.compile(
    '|'.join((NAME, NUM, PLUS, TIMES, EQ, WS))
)

In [34]:
from collections import namedtuple

def generate_tokens(pat, text):
    """Yield ``Token(type, value)`` tuples for consecutive matches of *pat* in *text*.

    ``type`` is the name of the group that matched (``m.lastgroup``) and
    ``value`` is the matched text. Scanning starts at the beginning of
    *text* and ends, without raising, at the first position where *pat*
    does not match (or at the end of the text).
    """
    Token = namedtuple('Token', ['type', 'value'])
    # Each call to the scanner's match() tries the pattern at the position
    # where the previous match ended and returns None once nothing matches.
    next_match = pat.scanner(text).match
    while True:
        found = next_match()
        if found is None:
            return
        yield Token(found.lastgroup, found.group())

for tok in generate_tokens(master_pat, 'foo = 42'):
    print(tok)

Token(type='NAME', value='foo')
Token(type='WS', value=' ')
Token(type='EQ', value='=')
Token(type='WS', value=' ')
Token(type='NUM', value='42')
