In [3]:
"""字符串

比特(bit)是计算机最小单位
1个字节(byte)等于8个比特，能表示的最大整数为255
ASCII编码是1个字节，Unicode编码通常是2个字节
UTF-8把英文字母编码成1个字节，汉字通常是3个字节，是一种“可变长编码”

在计算机内存中，统一使用Unicode编码，当需要保存到硬盘或者需要传输的时候，就转换成UTF-8编码
用记事本编辑的时候，从文件读取的UTF-8字符被转换为Unicode字符到内存中，编辑完成后，再把Unicode转换为UTF-8保存到文件
浏览网页的时候，服务器会把动态生成的Unicode内容转换为UTF-8再传输到浏览器

Python的字符串类型是str，在内存中以Unicode表示，一个字符对应若干个字节
如果要在网络上传输，或者保存到磁盘上，就需要把str变为以字节为单位的bytes
"""

# Code point of a character
print(*map(ord, ('A', '中')), sep='\n')

# Character for a code point
print(*map(chr, (65, 20013)), sep='\n')

# Characters written as hexadecimal \u escapes
print('\u4e2d\u6587')

# Every element of a bytes object occupies exactly one byte
# A str can be turned into bytes of a chosen encoding with encode()
# Bytes that cannot be shown as ASCII characters are displayed as \x##
print('ABC'.encode(encoding='ascii'))
print('中文'.encode(encoding='utf-8'))

# Data read from the network or from disk arrives as bytes
# decode() turns bytes back into a str
# Pass errors='ignore' to skip bytes that cannot be decoded
print(b'ABC'.decode(encoding='ascii'))
print(b'\xe4\xb8\xad\xe6\x96\x87'.decode(encoding='utf-8'))

# len() of a str counts characters
print(*map(len, ('ABC', '中文')), sep='\n')

# len() of a bytes object counts bytes
print(*map(len, (b'ABC', b'\xe4\xb8\xad\xe6\x96\x87')), sep='\n')

65
20013
A
中
中文
b'ABC'
b'\xe4\xb8\xad\xe6\x96\x87'
ABC
中文
3
2
3
6


In [4]:
"""字符串属于可迭代对象

可以使用unicodedata.normalize()对字符进行标准化处理
字符标准化对需要以一致的方式处理Unicode文本的程序非常重要

对文本字符串的大部分操作也适用于字节字符串
"""

msg_str = 'hello world'
# Iterating over a str yields one character at a time, so list() collects them
print(list(msg_str))

['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd']


In [5]:
"""字符串分割

简单字符串分割：str.split()
复杂字符串分割：re.split()
"""

import re

# Split on a single fixed delimiter with the str method
date_str = '2020-06-01'
print(date_str.split('-'))

# Split on several delimiters at once (semicolon, comma or whitespace,
# each followed by any amount of whitespace) with a regular expression
msg_str = 'hello, world; are you, ok'
print(re.compile(r'[;,\s]\s*').split(msg_str))

['2020', '06', '01']
['hello', 'world', 'are', 'you', 'ok']


In [6]:
"""字符串匹配"""

import re

# Check a prefix and a suffix
file_name = 'hello_world.py'
print(file_name.startswith('hello'))
print(file_name.endswith('py'))

# To test several candidates at once, pass them as a tuple
files = ['test.py', 'readme.md', 'test.c', 'hello_world.py', 'test.h']
print([name for name in files if name.startswith(('test', 'hello'))])
print([name for name in files if name.endswith(('py', 'md'))])
print(any(name.startswith('test') and name.endswith(('py', 'c')) for name in files))
print([name for name in files if name.startswith('test') and name.endswith(('py', 'c'))])

# Match with a regular expression
url = 'http://www.abc.com'
print(re.match('http:|https:|ftp:', url))

True
True
['test.py', 'test.c', 'hello_world.py', 'test.h']
['test.py', 'readme.md', 'hello_world.py']
True
['test.py', 'test.c']
<re.Match object; span=(0, 5), match='http:'>


In [7]:
"""用通配符匹配字符串

*可以占位多个字符，?只能占位一个字符
fnmatch()函数匹配能力介于简单的字符串方法和正则方法之间
fnmatch()和fnmatchcase()并不局限于做文件名的匹配
"""

from fnmatch import fnmatch, fnmatchcase

# '*' stands in for several characters, '?' for exactly one
print(fnmatch('test.py', '*.py'))
print(fnmatch('data1.py', 'data?.py'))

files = ['test.py', 'data1.csv', 'readme.md', 'data2.csv', 'data3.csv', 'hello.java']
print(list(filter(lambda name: fnmatch(name, 'data*.csv'), files)))

# fnmatchcase() always compares case-sensitively; fnmatch() follows the
# rules of the operating system it runs on
print(fnmatch('test.py', '*.PY'))
print(fnmatchcase('test.py', '*.PY'))

True
True
['data1.csv', 'data2.csv', 'data3.csv']
True
False


In [8]:
"""字符串模式匹配

首先使用re.compile()编译正则表达式
然后使用以下方法进行匹配：
match()从字符串开头进行匹配
findall()匹配全部内容
finditer()返回可迭代结果
"""

import re

date_str = 'Today is 2020/6/3, and tomorrow is 2020/6/4.'
date_pat = re.compile(r'\d+/\d+/\d+')

# match() only looks at the start of the string, so nothing is found here
print(date_pat.match(date_str))
# findall() collects every occurrence into a list
print(date_pat.findall(date_str))

# finditer() yields the match objects one at a time
for d in date_pat.finditer(date_str):
    print(d.group(0))

None
['2020/6/3', '2020/6/4']
2020/6/3
2020/6/4


In [9]:
"""字符串搜索和替换

简单的字面模式，直接使用str.replace()方法即可

对于较复杂的模式，可以使用re.sub()方法
第一个参数是被匹配的模式，第二个参数是替换模式
"""

import re

hello_msg = 'Hello Kobe, Hello James, Hello Jordan'
print(hello_msg.replace('Hello', 'Hi'))

date_str = 'Today is 3/6/2020, and tomorrow is 4/6/2020'
# A backslash followed by a number (such as \3) refers to that capture group of the pattern
print(re.sub(pattern=r'(\d+)/(\d+)/(\d+)', repl=r'\3-\2-\1', string=date_str))

# Compile the pattern first when it will be substituted repeatedly
date_pat = re.compile(r'(\d+)/(\d+)/(\d+)')
print(date_pat.sub(r'\3-\2-\1', date_str))

# For more involved replacements a callback function can be passed instead
# subn() also reports how many replacements were made
new_date_str, replace_num = date_pat.subn(r'\3-\2-\1', date_str)
print(new_date_str, replace_num, sep='\n')

Hi Kobe, Hi James, Hi Jordan
Today is 2020-6-3, and tomorrow is 2020-6-4
Today is 2020-6-3, and tomorrow is 2020-6-4
Today is 2020-6-3, and tomorrow is 2020-6-4
2


In [10]:
"""re模块

使用re.IGNORECASE标志参数忽略大小写
"""

import re

txt = 'UPPER PYTHON, lower python, Mixed Python'
# flags=re.IGNORECASE makes the pattern match regardless of letter case
print(re.findall(pattern='python', string=txt, flags=re.IGNORECASE))
print(re.sub(pattern='python', repl='snake', string=txt, flags=re.IGNORECASE))


# 替换字符串不会自动跟匹配字符串的大小写保持一致
# 定义一个辅助函数，该函数返回一个回调函数
def matchcase(word):
    """Build a re.sub() callback that inserts `word` in the same case as the match.

    Args:
        word: The replacement text.

    Returns:
        A function that takes a match object and returns `word` upper-cased
        when the matched text is all upper case, lower-cased when it is all
        lower case, capitalized when it starts with an upper-case character,
        and unchanged otherwise (including for an empty match).
    """
    def replace(m):
        t = m.group()
        if t.isupper():
            return word.upper()
        elif t.islower():
            return word.lower()
        elif t[:1].isupper():
            # Slice rather than index: t[0] raises IndexError when the
            # pattern produced an empty match.
            return word.capitalize()
        else:
            return word

    return replace

# Hand the callback built by matchcase() to re.sub() as the replacement
print(re.sub(pattern='python', repl=matchcase('snake'), string=txt, flags=re.IGNORECASE))

['PYTHON', 'python', 'Python']
UPPER snake, lower snake, Mixed snake
UPPER SNAKE, lower snake, Mixed Snake


In [11]:
"""最短匹配模式

.点操作符可以匹配除换行符以外的所有字符
"""

import re

txt = 'Python says "YES", and PHP says "NO"'

# Greedy: .* runs on to the last closing quote
pat1 = re.compile(r'\"(.*)\"')
# Non-greedy: .*? stops at the nearest closing quote
pat2 = re.compile(r'\"(.*?)\"')

print(pat1.findall(txt), pat2.findall(txt), sep='\n')

['YES", and PHP says "NO']
['YES', 'NO']


In [12]:
"""删除字符串中的指定字符

strip(), lstrip(), rstrip()默认删除空白字符
可以给以上方法添加参数，删除指定字符
"""

msg = ' _hello world_ '

# Without an argument only the surrounding whitespace is removed
print(msg.strip(), msg.lstrip(), msg.rstrip(), sep='\n')

# The argument is a set of characters to strip; their order does not matter
print(msg.strip('_ '), msg.strip(' _'), sep='\n')

_hello world_
_hello world_ 
 _hello world_
hello world
hello world


In [13]:
"""审查清理文本字符串"""

msg = 'Apple is \tred,\n tree is \tgreen,\n sky is \tblue.\r\n'
print(msg)

# Translation table keyed by code point: drop tabs, turn newlines into
# spaces, delete carriage returns (a None value deletes the character)
rmap = str.maketrans({'\t': '', '\n': ' ', '\r': None})
print(msg.translate(rmap))

Apple is 	red,
 tree is 	green,
 sky is 	blue.

Apple is red,  tree is green,  sky is blue. 


In [14]:
"""字符串拼接"""

# Join two strings directly with +
txt1, txt2 = 'hello', 'world'
txt = txt1 + ' ' + txt2
print(txt)

# Join the strings held in an iterable
colors = ['red', 'green', 'blue']
print(' '.join(colors), ','.join(colors), sep='\n')

# Adjacent string literals in source code are merged into one string
s = ('hello' 'world')
print(s)

hello world
red green blue
red,green,blue
helloworld


In [15]:
"""字符串对齐、填充及格式化"""

txt = 'hello world'

# Left, right and centre alignment within a field of 20 characters
print(txt.ljust(20), txt.rjust(20), txt.center(20), sep='\n')

# Padding with a fill character
print(txt.center(20, '*'))

# The same alignment and padding through format()
print(*(format(txt, spec) for spec in ('<20', '>20', '^20', '*^20')), sep='\n')

# Formatting several values at once
print('{:>10s} {:>10s}'.format('hello', 'world'))

hello world         
         hello world
    hello world     
****hello world*****
hello world         
         hello world
    hello world     
****hello world*****
     hello      world


In [16]:
# 以指定列宽格式化字符串

import textwrap

# Adjacent literals are merged into one long single-line string
txt = (
    'Playoff basketball is back! '
    'It has been a long regular season (177 days, to be exact), but the postseason has arrived, '
    'and it looks a lot different than it did a year ago. '
    'For the first time since 2010, a team not led by LeBron James will emerge from the Eastern Conference, '
    'and for the first time since 2005, James will not be involved in the festivities.As for the players,'
    'teams and matchups to know, well, we\'ve got you covered.'
)

# print(textwrap.fill(txt, 50))
# print(textwrap.fill(txt, 50, initial_indent='  '))  # indent the first line
print(textwrap.fill(txt, width=50, subsequent_indent='  '))  # indent every line but the first

Playoff basketball is back! It has been a long
  regular season (177 days, to be exact), but the
  postseason has arrived, and it looks a lot
  different than it did a year ago. For the first
  time since 2010, a team not led by LeBron James
  will emerge from the Eastern Conference, and for
  the first time since 2005, James will not be
  involved in the festivities.As for the
  players,teams and matchups to know, well, we've
  got you covered.


In [17]:
"""处理字符串中的HTML标签"""

import html

txt = 'Elements are written as "<tag>text</tag>"'
print(txt)

# Replace the markup characters with HTML entities; with quote=False the
# double quotes are left as they are
h = html.escape(s=txt, quote=False)
print(h)

# Turn the entities back into the original characters
t = html.unescape(s=h)
print(t)

Elements are written as "<tag>text</tag>"
Elements are written as "&lt;tag&gt;text&lt;/tag&gt;"
Elements are written as "<tag>text</tag>"
