# 数据获取

## 字符串操作

### 格式化字符串
字符串的组成

#### `%`操作符

In [1]:
import sys


name = "Quan"
val = 10

# printf-style formatting: %s inserts str(name), %d inserts val as a decimal integer.
print("Hi %s, val=%d" % (name, val))

# Same message, sent to standard error instead of standard output.
print("Hi %s, val=%d" % (name, val), file=sys.stderr)

# The % expression yields an ordinary str that can be stored and reused.
s = "Hi %s, val=%d" % (name, val)

print(s)

Hi Quan, val=10
Hi Quan, val=10


Hi Quan, val=10


#### `format`方法

In [3]:
# Positional fields: {0} and {1} index into the arguments passed to format().
print("Hi {0}, val={1}".format(name, val))

# Named fields: {n} and {v} are filled from the keyword arguments.
print("Hi {n}, val={v}".format(n=name, v=val))

Hi Quan, val=10
Hi Quan, val=10


In [5]:
# f-string: the expressions inside {} are evaluated in place.
print(f"Hi {name}, val={val}")

# Any expression is allowed; the part after ':' is a format spec
# (here: minimum width 10, precision 4).
print(f"Hi {name * 2}, val={float(val):10.4}")

Hi Quan, val=10
Hi QuanQuan, val=      10.0


In [7]:
# Conversions and format specs available inside f-string replacement fields.
f"{name!r}"  # repr() -> "'Quan'"
f"{val:#0x}"  # hex() -> "0xa"
# A trailing '=' echoes the expression text (whitespace included) followed by its repr.
f"{name =}"  # "name ='Quan'"

from math import pi
width, precision = 10, 4
# Parts of the format spec may themselves be nested {} fields.
f"{pi:{width}.{precision}}"  # '     3.142'

'     3.142'

### 格式化提取

正则表达式

- 匹配字符串中的特定模式
- 正则表达式参考资料：https://deerchao.cn/tutorials/regex/regex.htm
- 标准库：re
- 第三方库：regex （与标准库接口兼容）

In [None]:
import re

# Matches an anchor element <a ...>...</a> and captures its inner text.
# - Raw string: "<", ">" and "/" need no escaping in a regex, and sequences such
#   as "\<" are invalid *string* escapes that trigger a warning on recent Python.
# - Non-greedy (.*?): with a greedy (.*) two anchors on the same line would be
#   merged into one match running from the first <a> to the last </a>.
a_tag = re.compile(r"<a[^>]*>(.*?)</a>")

a_tag.findall("Link to <a href=\"https://docs.python.org/\">Python Docs</a>.")

In [None]:
# Schematic overview of the compiled-pattern API: each `...` is a placeholder
# for real arguments (e.g. the subject string) and must be filled in before running.
a_tag.match(...)  # test for a match at the beginning of the string
a_tag.search(...)  # find and locate the first match
a_tag.finditer(...)  # find and locate every match; iterator counterpart of findall
a_tag.sub(...)  # replace matches
a_tag.split(...)  # split the string at matches
# Captured groups are read from a Match object; on a compiled pattern,
# `groups` is an integer attribute (the number of capturing groups), not a method.
a_tag.search(...).groups()  # groups captured by a match

## 文件数据输入

### 标准输入输出

In [None]:
# print() writes its argument to standard output.
print("Hello")
# input() shows the prompt and returns the line the user types.
input("What is your name? ")

### 文件输入输出

In [None]:
# Manual open/close: each file has to be closed explicitly.
# NOTE: if write() or read() raises, the close() call after it is never reached.
fout = open("sample.txt", "w")
fout.write("Hello\n")
fout.close()

# Read back the file written above and show its contents.
fin = open("sample.txt", "r")
text = fin.read()
fin.close()
print(text)

上下文管理器

In [None]:
# The with statement closes the file when the block exits, even on an exception.
with open("sample.txt", "w") as fout:
    fout.write("Hello\n")

# Read the file back and print its contents.
with open("sample.txt", "r") as fin:
    print(fin.read())

文件系统中查找文件

In [None]:
from glob import glob

# glob() expands the shell-style wildcard into the list of matching paths;
# a relative pattern like this one is resolved against the current directory.
for fname in glob("*.cpp"):
    # Print the file name on its own line, then the file's contents.
    print(f"\n{fname}")
    with open(fname, "r") as fin:
        print(fin.read())

文件格式化存储

In [2]:
def fib(n):
    """Yield the first ``n`` Fibonacci numbers, starting from 0.

    The sequence produced is 0, 1, 1, 2, 3, 5, ...; nothing is yielded
    when ``n`` is 0 or negative.
    """
    current, upcoming = 0, 1
    for _ in range(n):
        yield current
        # Slide the two-number window one step forward.
        total = current + upcoming
        current = upcoming
        upcoming = total

In [None]:
# Materialize the first 1000 Fibonacci numbers.
fibs = list(fib(1000))

# save: one decimal number per line, encoded to bytes for the binary-mode file
with open("fibs.txt", "wb") as fout:
    for i in fibs:
        fout.write(str(i).encode() + b"\n")

# load: keep reading lines until readline() returns an empty bytes object (EOF)
fibs = []
with open("fibs.txt", "rb") as fin:
    while line := fin.readline():
        fibs.append(int(line.decode()))

In [4]:
fibs = list(fib(1000))

# save: writelines() consumes the generator, writing one encoded line per number
with open("fibs.txt", "wb") as fout:
    fout.writelines(f"{i}\n".encode() for i in fibs)

# load: iterating over the file object yields its contents line by line
with open("fibs.txt", "rb") as fin:
    fibs = [int(line.decode()) for line in fin]

In [3]:
import pickle

fibs = list(fib(1000))

# save: pickle produces bytes, so the file is opened in binary mode
with open("fibs.txt", "wb") as fout:
    pickle.dump(fibs, fout)

# load
# NOTE: only unpickle files you trust; pickle.load can execute arbitrary code.
with open("fibs.txt", "rb") as fin:
    fibs = pickle.load(fin)

In [None]:
import json

fibs = list(fib(1000))

# save: JSON is a text format, so the file is opened in text mode
with open("fibs.txt", "w") as fout:
    json.dump(fibs, fout)

# load: parse the JSON text back into a Python list
with open("fibs.txt", "r") as fin:
    fibs = json.load(fin)

字符串存储和字节（二进制）存储的区别

In [None]:
# dumps() returns the pickled object as a bytes value instead of writing it to a file.
pickle.dumps(10086)

In [None]:
# Hexadecimal rendering of the integer; '#' adds the 0x prefix.
f"{903147453:#0x}"  # '0x35d4efbd'

In [None]:
# Hex character codes of each digit of the decimal string "903147453"
# (hex() output with its "0x" prefix sliced off):
# 39 30 33 31 34 37 34 35 33
" ".join(map(lambda h: h[2:], map(hex, map(ord, "903147453"))))

## 网络数据输入

标准库：urllib

In [None]:
from urllib import request

# Download the page and decode the response body as UTF-8 text.
with request.urlopen("http://github.com") as response:
    page = response.read().decode("utf-8")

# a_tag is not defined in this cell; it must already exist in the session.
# Print capture group 1 of every match found in the page.
for link in a_tag.finditer(page):
    print(link.group(1))

In [None]:
from urllib import request, parse
import re

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"}
# Query parameters. urlencode() turns the space into "+", producing
# "l=jupyter+notebook"; a literal "+" in the value would be sent as "%2B".
data = {"l": "jupyter notebook"}

# A GET request carries its parameters in the URL's query string. Passing them
# through `data=` would place them in a request body instead of the URL.
req = request.Request(
    "https://github.com/topics/python?" + parse.urlencode(data),
    method="GET",
    headers=headers,
)

with request.urlopen(req) as res:
    page = res.read().decode("utf-8")

# Raw string so the pattern reaches `re` without relying on invalid string escapes.
# Group 1: an href starting with "/" (site-relative link); group 2: the link text.
h3a = re.compile(r'<a[^>]* href="(/[^"]*)" [^>]*>([^<]*)</a>', flags=re.M | re.S)
skiplist = ["/team", "/enterprise", "/marketplace", "/explore", "/issues", "/topics"]
# str.startswith accepts a tuple of prefixes, so build it once outside the loop.
skip_prefixes = tuple(skiplist)
for link in h3a.finditer(page):
    href, text = link.groups()
    if href.startswith(skip_prefixes):
        continue
    print(href, text.strip())


第三方库：requests、selenium、scrapy

Let's move to [PyAI-Lecture-5-2.ipynb](PyAI-Lecture-5-2.ipynb)