In [1]:
import asyncio
import random
import time
from turtle import title

import aiohttp
import nest_asyncio

nest_asyncio.apply()
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

import requests
from aiohttp import ClientSession
from bs4 import BeautifulSoup
from faker import Faker
from lxml import etree


def saveFile(html, name, dir):
    """Extract the chapter text from ``html`` and write it to ``<dir>/<name>.txt``.

    Args:
        html: Chapter page markup; passed unchanged to ``getContent``.
        name: Chapter title, used as the file stem and as the first line of the file.
        dir: Output directory. Created (including parents) if it does not exist.

    The file is written as GBK; characters GBK cannot encode are dropped
    (``errors="ignore"``).
    """
    text = getContent(html)
    out_dir = Path(dir)
    # A missing directory would make write_text raise FileNotFoundError.
    out_dir.mkdir(parents=True, exist_ok=True)
    path = out_dir.joinpath(name + ".txt")
    path.write_text("\n".join((name, text)), encoding="gbk", errors="ignore")


def getContent(html):
    """Parse a chapter page and return the text of its ``<div id="content">``.

    Args:
        html: Chapter page markup, parsed by BeautifulSoup with the lxml backend.

    Returns:
        The element's text nodes joined with newlines, with non-breaking
        spaces removed.

    Raises:
        ValueError: If the page contains no ``<div id="content">`` element.
    """
    root = BeautifulSoup(html, "lxml")
    node = root.find("div", id="content")
    if node is None:
        # Used to surface as an opaque AttributeError on None.
        raise ValueError('page has no <div id="content"> element')
    text = node.get_text("\n")
    # The parser has already decoded the &nbsp; entity to U+00A0, so that
    # character must be stripped as well; the literal form is still removed
    # for pages that carry it as plain text.
    return text.replace("&nbsp;", "").replace("\xa0", "")


def getCatalogue(url):
    """Download the book's index page and return its chapters in random order.

    Synchronous. Reads the module-level ``fake`` to build a User-Agent header.

    Args:
        url: URL of the book's table-of-contents page.

    Returns:
        An iterator yielding two tuples, ``(hrefs, titles)``, aligned by index
        and shuffled together. Both tuples are empty when no chapter link is
        found.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
    """
    header = {"user-agent": fake.chrome()}
    # A timeout keeps a stalled server from hanging the notebook forever.
    with requests.get(url, headers=header, timeout=20) as resp:
        resp.raise_for_status()
        resp.encoding = "gbk"
        tree = etree.HTML(resp.text)
    # dd[position() >= 13] skips the first 12 <dd> entries
    # (presumably a "latest chapters" box -- TODO confirm).
    query = '//*[@id="list"]/dl/dd[position() >= 13]/a'
    anchors = tree.xpath(query) if tree is not None else []
    # Take href and text from the same <a> so the pairs can never go out of step.
    pags = [(a.get("href"), "".join(a.itertext())) for a in anchors if a.get("href")]
    if not pags:
        # zip(*[]) would yield nothing and break two-way unpacking in callers.
        return iter(((), ()))
    random.shuffle(pags)
    return zip(*pags)


async def getChapter(link, referer):
    """Download one chapter page asynchronously and return its raw body.

    A new ``ClientSession`` with its own connector and a 20 s total timeout is
    opened for this single request. Reads the module-level ``fake`` for a
    User-Agent.

    Args:
        link: Absolute URL of the chapter page.
        referer: Value sent in the ``referer`` request header.

    Returns:
        The response body as returned by ``resp.read()``.
    """
    # NOTE(review): the connector is created on every call, so limit=30 only
    # caps this call's connections, not the number of concurrent calls.
    pool = aiohttp.TCPConnector(limit=30)
    deadline = aiohttp.ClientTimeout(total=20)  # whole request must finish within 20 s
    request_headers = {"user-agent": fake.chrome(), "referer": referer}
    async with ClientSession(connector=pool, timeout=deadline) as client:
        async with client.get(link, headers=request_headers) as response:
            # Variant: the body is returned from inside both contexts, with no
            # artificial delay (the optional random sleep is disabled here).
            body = await response.read()
            return body


async def getBook(url, href, num, offset):
    """Fetch the first ``num`` chapter pages in sequential batches of ``offset``.

    Each batch is downloaded concurrently with ``asyncio.gather`` and followed
    by a 0.5 s pause; sending every request at once makes the server refuse
    access (original author's note). Pages are appended, in ``href`` order, to
    the module-level list ``htmls``.

    Args:
        url: Base URL of the book; prefixed to each entry of ``href`` and sent
            as the referer.
        href: Sequence of chapter paths relative to ``url``.
        num: Maximum number of chapters to fetch.
        offset: Batch size, i.e. requests issued concurrently per batch.

    An exception raised by any ``getChapter`` call propagates out of
    ``gather`` and aborts the run.
    """
    href = href[:num]
    # The slice can be shorter than ``num``; never iterate past the real data.
    total = min(num, len(href))
    for i in range(0, total, offset):
        batch = href[i : i + offset]
        began = time.time()
        # Referer comes from the ``url`` argument, not from the global ``book``.
        tasks = (getChapter(url + path, url) for path in batch)
        chapters = await asyncio.gather(*tasks)
        over = time.time()
        count = len(batch)
        print(f"获取{count}个页面耗时{over-began}秒")
        htmls.extend(chapters)
        # time.sleep() would block the event loop; yield to it instead.
        await asyncio.sleep(0.5)


def main(data, dir):
    """Parse and save every chapter in ``data`` using a pool of worker processes.

    Args:
        data: Iterable of ``(html, name)`` pairs.
        dir: Output directory, forwarded to ``saveFile``.

    Note (from the original author): the process pool must be created from the
    ``__main__`` process and the submitted function must be importable by the
    workers, so the pool is wrapped in this function.

    Worker failures are reported with ``print`` instead of being discarded.
    """
    jobs = list(data)
    if not jobs:
        return  # nothing to save: do not spin up a process pool
    with ProcessPoolExecutor() as pool:
        submitted = [
            (name, pool.submit(saveFile, html=html, name=name, dir=dir))
            for html, name in jobs
        ]
    # Leaving the ``with`` block waits for the workers, so every future is done.
    for name, job in submitted:
        error = job.exception()
        if error is not None:
            print(f"保存《{name}》失败: {error!r}")


if __name__ == "__main__":
    # Random User-Agent source; read as a global by getCatalogue and getChapter.
    fake = Faker()
    # Plain literal: the original f-prefix had no placeholders.
    book = "https://www.xbiquge.so/book/4772/"
    download_dir = "./download"
    href, title = map(list, getCatalogue(book))
    htmls = list()  # downloaded chapter pages, filled by getBook

    asyncio.run(getBook(book, href, len(href), 200))
    # Create the target directory up front so worker writes cannot fail on a
    # missing path.
    Path(download_dir).mkdir(parents=True, exist_ok=True)
    main(zip(htmls, title), download_dir)

获取200个页面耗时1.1203410625457764秒
获取200个页面耗时1.3495452404022217秒
获取200个页面耗时1.1437115669250488秒
获取200个页面耗时1.3108525276184082秒
获取200个页面耗时1.5129530429840088秒
获取200个页面耗时1.1395964622497559秒
获取200个页面耗时1.320845603942871秒
获取200个页面耗时1.172393798828125秒
获取77个页面耗时1.0077250003814697秒


In [2]:
import asyncio
import random
import time
from turtle import title

import aiohttp
import nest_asyncio

nest_asyncio.apply()
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

import requests
from aiohttp import ClientSession
from bs4 import BeautifulSoup
from faker import Faker
from lxml import etree


def saveFile(html, name, dir):
    """Extract the chapter text from ``html`` and write it to ``<dir>/<name>.txt``.

    Args:
        html: Chapter page markup; passed unchanged to ``getContent``.
        name: Chapter title, used as the file stem and as the first line of the file.
        dir: Output directory. Created (including parents) if it does not exist.

    The file is written as GBK; characters GBK cannot encode are dropped
    (``errors="ignore"``).
    """
    text = getContent(html)
    out_dir = Path(dir)
    # A missing directory would make write_text raise FileNotFoundError.
    out_dir.mkdir(parents=True, exist_ok=True)
    path = out_dir.joinpath(name + ".txt")
    path.write_text("\n".join((name, text)), encoding="gbk", errors="ignore")


def getContent(html):
    """Parse a chapter page and return the text of its ``<div id="content">``.

    Args:
        html: Chapter page markup, parsed by BeautifulSoup with the lxml backend.

    Returns:
        The element's text nodes joined with newlines, with non-breaking
        spaces removed.

    Raises:
        ValueError: If the page contains no ``<div id="content">`` element.
    """
    root = BeautifulSoup(html, "lxml")
    node = root.find("div", id="content")
    if node is None:
        # Used to surface as an opaque AttributeError on None.
        raise ValueError('page has no <div id="content"> element')
    text = node.get_text("\n")
    # The parser has already decoded the &nbsp; entity to U+00A0, so that
    # character must be stripped as well; the literal form is still removed
    # for pages that carry it as plain text.
    return text.replace("&nbsp;", "").replace("\xa0", "")


def getCatalogue(url):
    """Download the book's index page and return its chapters in random order.

    Synchronous. Reads the module-level ``fake`` to build a User-Agent header.

    Args:
        url: URL of the book's table-of-contents page.

    Returns:
        An iterator yielding two tuples, ``(hrefs, titles)``, aligned by index
        and shuffled together. Both tuples are empty when no chapter link is
        found.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
    """
    header = {"user-agent": fake.chrome()}
    # A timeout keeps a stalled server from hanging the notebook forever.
    with requests.get(url, headers=header, timeout=20) as resp:
        resp.raise_for_status()
        resp.encoding = "gbk"
        tree = etree.HTML(resp.text)
    # dd[position() >= 13] skips the first 12 <dd> entries
    # (presumably a "latest chapters" box -- TODO confirm).
    query = '//*[@id="list"]/dl/dd[position() >= 13]/a'
    anchors = tree.xpath(query) if tree is not None else []
    # Take href and text from the same <a> so the pairs can never go out of step.
    pags = [(a.get("href"), "".join(a.itertext())) for a in anchors if a.get("href")]
    if not pags:
        # zip(*[]) would yield nothing and break two-way unpacking in callers.
        return iter(((), ()))
    random.shuffle(pags)
    return zip(*pags)


async def getChapter(link, referer):
    """Download one chapter page asynchronously and return its raw body.

    A new ``ClientSession`` with its own connector and a 20 s total timeout is
    opened for this single request. Reads the module-level ``fake`` for a
    User-Agent.

    Args:
        link: Absolute URL of the chapter page.
        referer: Value sent in the ``referer`` request header.

    Returns:
        The response body as returned by ``resp.read()``.
    """
    # NOTE(review): the connector is created on every call, so limit=30 only
    # caps this call's connections, not the number of concurrent calls.
    pool = aiohttp.TCPConnector(limit=30)
    deadline = aiohttp.ClientTimeout(total=20)  # whole request must finish within 20 s
    request_headers = {"user-agent": fake.chrome(), "referer": referer}
    async with ClientSession(connector=pool, timeout=deadline) as client:
        async with client.get(link, headers=request_headers) as response:
            payload = await response.read()
    # Variant: the body is returned only after the response and the session
    # have both been closed; the optional random sleep stays disabled.
    return payload


async def getBook(url, href, num, offset):
    """Fetch the first ``num`` chapter pages in sequential batches of ``offset``.

    Each batch is downloaded concurrently with ``asyncio.gather`` and followed
    by a 0.5 s pause; sending every request at once makes the server refuse
    access (original author's note). Pages are appended, in ``href`` order, to
    the module-level list ``htmls``.

    Args:
        url: Base URL of the book; prefixed to each entry of ``href`` and sent
            as the referer.
        href: Sequence of chapter paths relative to ``url``.
        num: Maximum number of chapters to fetch.
        offset: Batch size, i.e. requests issued concurrently per batch.

    An exception raised by any ``getChapter`` call propagates out of
    ``gather`` and aborts the run.
    """
    href = href[:num]
    # The slice can be shorter than ``num``; never iterate past the real data.
    total = min(num, len(href))
    for i in range(0, total, offset):
        batch = href[i : i + offset]
        began = time.time()
        # Referer comes from the ``url`` argument, not from the global ``book``.
        tasks = (getChapter(url + path, url) for path in batch)
        chapters = await asyncio.gather(*tasks)
        over = time.time()
        count = len(batch)
        print(f"获取{count}个页面耗时{over-began}秒")
        htmls.extend(chapters)
        # time.sleep() would block the event loop; yield to it instead.
        await asyncio.sleep(0.5)


def main(data, dir):
    """Parse and save every chapter in ``data`` using a pool of worker processes.

    Args:
        data: Iterable of ``(html, name)`` pairs.
        dir: Output directory, forwarded to ``saveFile``.

    Note (from the original author): the process pool must be created from the
    ``__main__`` process and the submitted function must be importable by the
    workers, so the pool is wrapped in this function.

    Worker failures are reported with ``print`` instead of being discarded.
    """
    jobs = list(data)
    if not jobs:
        return  # nothing to save: do not spin up a process pool
    with ProcessPoolExecutor() as pool:
        submitted = [
            (name, pool.submit(saveFile, html=html, name=name, dir=dir))
            for html, name in jobs
        ]
    # Leaving the ``with`` block waits for the workers, so every future is done.
    for name, job in submitted:
        error = job.exception()
        if error is not None:
            print(f"保存《{name}》失败: {error!r}")


if __name__ == "__main__":
    # Random User-Agent source; read as a global by getCatalogue and getChapter.
    fake = Faker()
    # Plain literal: the original f-prefix had no placeholders.
    book = "https://www.xbiquge.so/book/4772/"
    download_dir = "./download"
    href, title = map(list, getCatalogue(book))
    htmls = list()  # downloaded chapter pages, filled by getBook

    asyncio.run(getBook(book, href, len(href), 200))
    # Create the target directory up front so worker writes cannot fail on a
    # missing path.
    Path(download_dir).mkdir(parents=True, exist_ok=True)
    main(zip(htmls, title), download_dir)

获取200个页面耗时16.297772884368896秒
获取200个页面耗时4.675647497177124秒
获取200个页面耗时6.743532657623291秒
获取200个页面耗时6.023595809936523秒
获取200个页面耗时4.108102321624756秒
获取200个页面耗时8.21902871131897秒
获取200个页面耗时5.2087249755859375秒
获取200个页面耗时4.323275804519653秒
获取77个页面耗时5.453036785125732秒


In [3]:
import asyncio
import random
import time
from turtle import title

import aiohttp
import nest_asyncio

nest_asyncio.apply()
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

import requests
from aiohttp import ClientSession
from bs4 import BeautifulSoup
from faker import Faker
from lxml import etree


def saveFile(html, name, dir):
    """Extract the chapter text from ``html`` and write it to ``<dir>/<name>.txt``.

    Args:
        html: Chapter page markup; passed unchanged to ``getContent``.
        name: Chapter title, used as the file stem and as the first line of the file.
        dir: Output directory. Created (including parents) if it does not exist.

    The file is written as GBK; characters GBK cannot encode are dropped
    (``errors="ignore"``).
    """
    text = getContent(html)
    out_dir = Path(dir)
    # A missing directory would make write_text raise FileNotFoundError.
    out_dir.mkdir(parents=True, exist_ok=True)
    path = out_dir.joinpath(name + ".txt")
    path.write_text("\n".join((name, text)), encoding="gbk", errors="ignore")


def getContent(html):
    """Parse a chapter page and return the text of its ``<div id="content">``.

    Args:
        html: Chapter page markup, parsed by BeautifulSoup with the lxml backend.

    Returns:
        The element's text nodes joined with newlines, with non-breaking
        spaces removed.

    Raises:
        ValueError: If the page contains no ``<div id="content">`` element.
    """
    root = BeautifulSoup(html, "lxml")
    node = root.find("div", id="content")
    if node is None:
        # Used to surface as an opaque AttributeError on None.
        raise ValueError('page has no <div id="content"> element')
    text = node.get_text("\n")
    # The parser has already decoded the &nbsp; entity to U+00A0, so that
    # character must be stripped as well; the literal form is still removed
    # for pages that carry it as plain text.
    return text.replace("&nbsp;", "").replace("\xa0", "")


def getCatalogue(url):
    """Download the book's index page and return its chapters in random order.

    Synchronous. Reads the module-level ``fake`` to build a User-Agent header.

    Args:
        url: URL of the book's table-of-contents page.

    Returns:
        An iterator yielding two tuples, ``(hrefs, titles)``, aligned by index
        and shuffled together. Both tuples are empty when no chapter link is
        found.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
    """
    header = {"user-agent": fake.chrome()}
    # A timeout keeps a stalled server from hanging the notebook forever.
    with requests.get(url, headers=header, timeout=20) as resp:
        resp.raise_for_status()
        resp.encoding = "gbk"
        tree = etree.HTML(resp.text)
    # dd[position() >= 13] skips the first 12 <dd> entries
    # (presumably a "latest chapters" box -- TODO confirm).
    query = '//*[@id="list"]/dl/dd[position() >= 13]/a'
    anchors = tree.xpath(query) if tree is not None else []
    # Take href and text from the same <a> so the pairs can never go out of step.
    pags = [(a.get("href"), "".join(a.itertext())) for a in anchors if a.get("href")]
    if not pags:
        # zip(*[]) would yield nothing and break two-way unpacking in callers.
        return iter(((), ()))
    random.shuffle(pags)
    return zip(*pags)


async def getChapter(link, referer):
    """Download one chapter page asynchronously and return its raw body.

    A new ``ClientSession`` with its own connector and a 20 s total timeout is
    opened for this single request. Reads the module-level ``fake`` for a
    User-Agent. This variant waits a random 1-2 s after the response headers
    arrive and before the body is read.

    Args:
        link: Absolute URL of the chapter page.
        referer: Value sent in the ``referer`` request header.

    Returns:
        The response body as returned by ``resp.read()``.
    """
    # NOTE(review): the connector is created on every call, so limit=30 only
    # caps this call's connections, not the number of concurrent calls.
    pool = aiohttp.TCPConnector(limit=30)
    deadline = aiohttp.ClientTimeout(total=20)  # whole request must finish within 20 s
    delay = random.randint(1, 2)
    request_headers = {"user-agent": fake.chrome(), "referer": referer}
    async with ClientSession(connector=pool, timeout=deadline) as client:
        async with client.get(link, headers=request_headers) as response:
            await asyncio.sleep(delay)
            body = await response.read()
            # Author's observation: returning the body from after the
            # ``async with`` blocks instead took longer.
            return body


async def getBook(url, href, num, offset):
    """Fetch the first ``num`` chapter pages in sequential batches of ``offset``.

    Each batch is downloaded concurrently with ``asyncio.gather`` and followed
    by a 0.5 s pause; sending every request at once makes the server refuse
    access (original author's note). Pages are appended, in ``href`` order, to
    the module-level list ``htmls``.

    Args:
        url: Base URL of the book; prefixed to each entry of ``href`` and sent
            as the referer.
        href: Sequence of chapter paths relative to ``url``.
        num: Maximum number of chapters to fetch.
        offset: Batch size, i.e. requests issued concurrently per batch.

    An exception raised by any ``getChapter`` call propagates out of
    ``gather`` and aborts the run.
    """
    href = href[:num]
    # The slice can be shorter than ``num``; never iterate past the real data.
    total = min(num, len(href))
    for i in range(0, total, offset):
        batch = href[i : i + offset]
        began = time.time()
        # Referer comes from the ``url`` argument, not from the global ``book``.
        tasks = (getChapter(url + path, url) for path in batch)
        chapters = await asyncio.gather(*tasks)
        over = time.time()
        count = len(batch)
        print(f"获取{count}个页面耗时{over-began}秒")
        htmls.extend(chapters)
        # time.sleep() would block the event loop; yield to it instead.
        await asyncio.sleep(0.5)


def main(data, dir):
    """Parse and save every chapter in ``data`` using a pool of worker processes.

    Args:
        data: Iterable of ``(html, name)`` pairs.
        dir: Output directory, forwarded to ``saveFile``.

    Note (from the original author): the process pool must be created from the
    ``__main__`` process and the submitted function must be importable by the
    workers, so the pool is wrapped in this function.

    Worker failures are reported with ``print`` instead of being discarded.
    """
    jobs = list(data)
    if not jobs:
        return  # nothing to save: do not spin up a process pool
    with ProcessPoolExecutor() as pool:
        submitted = [
            (name, pool.submit(saveFile, html=html, name=name, dir=dir))
            for html, name in jobs
        ]
    # Leaving the ``with`` block waits for the workers, so every future is done.
    for name, job in submitted:
        error = job.exception()
        if error is not None:
            print(f"保存《{name}》失败: {error!r}")


if __name__ == "__main__":
    # Random User-Agent source; read as a global by getCatalogue and getChapter.
    fake = Faker()
    # Plain literal: the original f-prefix had no placeholders.
    book = "https://www.xbiquge.so/book/4772/"
    download_dir = "./download"
    href, title = map(list, getCatalogue(book))
    htmls = list()  # downloaded chapter pages, filled by getBook

    asyncio.run(getBook(book, href, len(href), 200))
    # Create the target directory up front so worker writes cannot fail on a
    # missing path.
    Path(download_dir).mkdir(parents=True, exist_ok=True)
    main(zip(htmls, title), download_dir)

获取200个页面耗时4.23507833480835秒
获取200个页面耗时3.1086573600769043秒
获取200个页面耗时3.1544721126556396秒
获取200个页面耗时3.7060840129852295秒
获取200个页面耗时4.870309352874756秒
获取200个页面耗时3.3080151081085205秒
获取200个页面耗时3.1477842330932617秒
获取200个页面耗时3.1457390785217285秒
获取77个页面耗时3.0622262954711914秒


In [4]:
import asyncio
import random
import time
from turtle import title

import aiohttp
import nest_asyncio

nest_asyncio.apply()
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

import requests
from aiohttp import ClientSession
from bs4 import BeautifulSoup
from faker import Faker
from lxml import etree


def saveFile(html, name, dir):
    """Extract the chapter text from ``html`` and write it to ``<dir>/<name>.txt``.

    Args:
        html: Chapter page markup; passed unchanged to ``getContent``.
        name: Chapter title, used as the file stem and as the first line of the file.
        dir: Output directory. Created (including parents) if it does not exist.

    The file is written as GBK; characters GBK cannot encode are dropped
    (``errors="ignore"``).
    """
    text = getContent(html)
    out_dir = Path(dir)
    # A missing directory would make write_text raise FileNotFoundError.
    out_dir.mkdir(parents=True, exist_ok=True)
    path = out_dir.joinpath(name + ".txt")
    path.write_text("\n".join((name, text)), encoding="gbk", errors="ignore")


def getContent(html):
    """Parse a chapter page and return the text of its ``<div id="content">``.

    Args:
        html: Chapter page markup, parsed by BeautifulSoup with the lxml backend.

    Returns:
        The element's text nodes joined with newlines, with non-breaking
        spaces removed.

    Raises:
        ValueError: If the page contains no ``<div id="content">`` element.
    """
    root = BeautifulSoup(html, "lxml")
    node = root.find("div", id="content")
    if node is None:
        # Used to surface as an opaque AttributeError on None.
        raise ValueError('page has no <div id="content"> element')
    text = node.get_text("\n")
    # The parser has already decoded the &nbsp; entity to U+00A0, so that
    # character must be stripped as well; the literal form is still removed
    # for pages that carry it as plain text.
    return text.replace("&nbsp;", "").replace("\xa0", "")


def getCatalogue(url):
    """Download the book's index page and return its chapters in random order.

    Synchronous. Reads the module-level ``fake`` to build a User-Agent header.

    Args:
        url: URL of the book's table-of-contents page.

    Returns:
        An iterator yielding two tuples, ``(hrefs, titles)``, aligned by index
        and shuffled together. Both tuples are empty when no chapter link is
        found.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
    """
    header = {"user-agent": fake.chrome()}
    # A timeout keeps a stalled server from hanging the notebook forever.
    with requests.get(url, headers=header, timeout=20) as resp:
        resp.raise_for_status()
        resp.encoding = "gbk"
        tree = etree.HTML(resp.text)
    # dd[position() >= 13] skips the first 12 <dd> entries
    # (presumably a "latest chapters" box -- TODO confirm).
    query = '//*[@id="list"]/dl/dd[position() >= 13]/a'
    anchors = tree.xpath(query) if tree is not None else []
    # Take href and text from the same <a> so the pairs can never go out of step.
    pags = [(a.get("href"), "".join(a.itertext())) for a in anchors if a.get("href")]
    if not pags:
        # zip(*[]) would yield nothing and break two-way unpacking in callers.
        return iter(((), ()))
    random.shuffle(pags)
    return zip(*pags)


async def getChapter(link, referer):
    """Download one chapter page asynchronously and return its raw body.

    A new ``ClientSession`` with its own connector and a 20 s total timeout is
    opened for this single request. Reads the module-level ``fake`` for a
    User-Agent. This variant waits a fixed 1 s after the response headers
    arrive and before the body is read.

    Args:
        link: Absolute URL of the chapter page.
        referer: Value sent in the ``referer`` request header.

    Returns:
        The response body as returned by ``resp.read()``.
    """
    # NOTE(review): the connector is created on every call, so limit=30 only
    # caps this call's connections, not the number of concurrent calls.
    pool = aiohttp.TCPConnector(limit=30)
    deadline = aiohttp.ClientTimeout(total=20)  # whole request must finish within 20 s
    request_headers = {"user-agent": fake.chrome(), "referer": referer}
    async with ClientSession(connector=pool, timeout=deadline) as client:
        async with client.get(link, headers=request_headers) as response:
            await asyncio.sleep(1)
            body = await response.read()
            # Author's observation: returning the body from after the
            # ``async with`` blocks instead took longer.
            return body


async def getBook(url, href, num, offset):
    """Fetch the first ``num`` chapter pages in sequential batches of ``offset``.

    Each batch is downloaded concurrently with ``asyncio.gather``; batches run
    back to back, because sending every request at once makes the server
    refuse access (original author's note). Pages are appended, in ``href``
    order, to the module-level list ``htmls``.

    Args:
        url: Base URL of the book; prefixed to each entry of ``href`` and sent
            as the referer.
        href: Sequence of chapter paths relative to ``url``.
        num: Maximum number of chapters to fetch.
        offset: Batch size, i.e. requests issued concurrently per batch.

    An exception raised by any ``getChapter`` call propagates out of
    ``gather`` and aborts the run.
    """
    href = href[:num]
    # The slice can be shorter than ``num``; never iterate past the real data.
    total = min(num, len(href))
    for i in range(0, total, offset):
        batch = href[i : i + offset]
        began = time.time()
        # Referer comes from the ``url`` argument, not from the global ``book``.
        tasks = (getChapter(url + path, url) for path in batch)
        chapters = await asyncio.gather(*tasks)
        over = time.time()
        count = len(batch)
        print(f"获取{count}个页面耗时{over-began}秒")
        htmls.extend(chapters)


def main(data, dir):
    """Parse and save every chapter in ``data`` using a pool of worker processes.

    Args:
        data: Iterable of ``(html, name)`` pairs.
        dir: Output directory, forwarded to ``saveFile``.

    Note (from the original author): the process pool must be created from the
    ``__main__`` process and the submitted function must be importable by the
    workers, so the pool is wrapped in this function.

    Worker failures are reported with ``print`` instead of being discarded.
    """
    jobs = list(data)
    if not jobs:
        return  # nothing to save: do not spin up a process pool
    with ProcessPoolExecutor() as pool:
        submitted = [
            (name, pool.submit(saveFile, html=html, name=name, dir=dir))
            for html, name in jobs
        ]
    # Leaving the ``with`` block waits for the workers, so every future is done.
    for name, job in submitted:
        error = job.exception()
        if error is not None:
            print(f"保存《{name}》失败: {error!r}")


if __name__ == "__main__":
    # Random User-Agent source; read as a global by getCatalogue and getChapter.
    fake = Faker()
    # Plain literal: the original f-prefix had no placeholders.
    book = "https://www.xbiquge.so/book/4772/"
    download_dir = "./download"
    href, title = map(list, getCatalogue(book))
    htmls = list()  # downloaded chapter pages, filled by getBook

    asyncio.run(getBook(book, href, len(href), 200))
    # Create the target directory up front so worker writes cannot fail on a
    # missing path.
    Path(download_dir).mkdir(parents=True, exist_ok=True)
    main(zip(htmls, title), download_dir)

获取200个页面耗时3.034601926803589秒
获取200个页面耗时3.2510640621185303秒
获取200个页面耗时3.2358458042144775秒
获取200个页面耗时3.9722886085510254秒
获取200个页面耗时3.8287055492401123秒
获取200个页面耗时4.011605978012085秒
获取200个页面耗时3.7667717933654785秒
获取200个页面耗时2.132991313934326秒
获取77个页面耗时2.0240566730499268秒


In [5]:
import asyncio
import random
import time
from turtle import title

import aiohttp
import nest_asyncio

nest_asyncio.apply()
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

import requests
from aiohttp import ClientSession
from bs4 import BeautifulSoup
from faker import Faker
from lxml import etree


def saveFile(html, name, dir):
    """Extract the chapter text from ``html`` and write it to ``<dir>/<name>.txt``.

    Args:
        html: Chapter page markup; passed unchanged to ``getContent``.
        name: Chapter title, used as the file stem and as the first line of the file.
        dir: Output directory. Created (including parents) if it does not exist.

    The file is written as GBK; characters GBK cannot encode are dropped
    (``errors="ignore"``).
    """
    text = getContent(html)
    out_dir = Path(dir)
    # A missing directory would make write_text raise FileNotFoundError.
    out_dir.mkdir(parents=True, exist_ok=True)
    path = out_dir.joinpath(name + ".txt")
    path.write_text("\n".join((name, text)), encoding="gbk", errors="ignore")


def getContent(html):
    """Parse a chapter page and return the text of its ``<div id="content">``.

    Args:
        html: Chapter page markup, parsed by BeautifulSoup with the lxml backend.

    Returns:
        The element's text nodes joined with newlines, with non-breaking
        spaces removed.

    Raises:
        ValueError: If the page contains no ``<div id="content">`` element.
    """
    root = BeautifulSoup(html, "lxml")
    node = root.find("div", id="content")
    if node is None:
        # Used to surface as an opaque AttributeError on None.
        raise ValueError('page has no <div id="content"> element')
    text = node.get_text("\n")
    # The parser has already decoded the &nbsp; entity to U+00A0, so that
    # character must be stripped as well; the literal form is still removed
    # for pages that carry it as plain text.
    return text.replace("&nbsp;", "").replace("\xa0", "")


def getCatalogue(url):
    """Download the book's index page and return its chapters in random order.

    Synchronous. Reads the module-level ``fake`` to build a User-Agent header.

    Args:
        url: URL of the book's table-of-contents page.

    Returns:
        An iterator yielding two tuples, ``(hrefs, titles)``, aligned by index
        and shuffled together. Both tuples are empty when no chapter link is
        found.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
    """
    header = {"user-agent": fake.chrome()}
    # A timeout keeps a stalled server from hanging the notebook forever.
    with requests.get(url, headers=header, timeout=20) as resp:
        resp.raise_for_status()
        resp.encoding = "gbk"
        tree = etree.HTML(resp.text)
    # dd[position() >= 13] skips the first 12 <dd> entries
    # (presumably a "latest chapters" box -- TODO confirm).
    query = '//*[@id="list"]/dl/dd[position() >= 13]/a'
    anchors = tree.xpath(query) if tree is not None else []
    # Take href and text from the same <a> so the pairs can never go out of step.
    pags = [(a.get("href"), "".join(a.itertext())) for a in anchors if a.get("href")]
    if not pags:
        # zip(*[]) would yield nothing and break two-way unpacking in callers.
        return iter(((), ()))
    random.shuffle(pags)
    return zip(*pags)


async def getChapter(link, referer):
    """Download one chapter page asynchronously and return its raw body.

    A new ``ClientSession`` with its own connector and a 20 s total timeout is
    opened for this single request. Reads the module-level ``fake`` for a
    User-Agent. This variant waits a fixed 1 s after the session is opened and
    before the request is sent.

    Args:
        link: Absolute URL of the chapter page.
        referer: Value sent in the ``referer`` request header.

    Returns:
        The response body as returned by ``resp.read()``.
    """
    # NOTE(review): the connector is created on every call, so limit=30 only
    # caps this call's connections, not the number of concurrent calls.
    pool = aiohttp.TCPConnector(limit=30)
    deadline = aiohttp.ClientTimeout(total=20)  # whole request must finish within 20 s
    request_headers = {"user-agent": fake.chrome(), "referer": referer}
    async with ClientSession(connector=pool, timeout=deadline) as client:
        await asyncio.sleep(1)
        async with client.get(link, headers=request_headers) as response:
            body = await response.read()
            # Author's observation: returning the body from after the
            # ``async with`` blocks instead took longer.
            return body


async def getBook(url, href, num, offset):
    """Fetch the first ``num`` chapter pages in sequential batches of ``offset``.

    Each batch is downloaded concurrently with ``asyncio.gather``; batches run
    back to back, because sending every request at once makes the server
    refuse access (original author's note). Pages are appended, in ``href``
    order, to the module-level list ``htmls``.

    Args:
        url: Base URL of the book; prefixed to each entry of ``href`` and sent
            as the referer.
        href: Sequence of chapter paths relative to ``url``.
        num: Maximum number of chapters to fetch.
        offset: Batch size, i.e. requests issued concurrently per batch.

    An exception raised by any ``getChapter`` call propagates out of
    ``gather`` and aborts the run.
    """
    href = href[:num]
    # The slice can be shorter than ``num``; never iterate past the real data.
    total = min(num, len(href))
    for i in range(0, total, offset):
        batch = href[i : i + offset]
        began = time.time()
        # Referer comes from the ``url`` argument, not from the global ``book``.
        tasks = (getChapter(url + path, url) for path in batch)
        chapters = await asyncio.gather(*tasks)
        over = time.time()
        count = len(batch)
        print(f"获取{count}个页面耗时{over-began}秒")
        htmls.extend(chapters)


def main(data, dir):
    """Parse and save every chapter in ``data`` using a pool of worker processes.

    Args:
        data: Iterable of ``(html, name)`` pairs.
        dir: Output directory, forwarded to ``saveFile``.

    Note (from the original author): the process pool must be created from the
    ``__main__`` process and the submitted function must be importable by the
    workers, so the pool is wrapped in this function.

    Worker failures are reported with ``print`` instead of being discarded.
    """
    jobs = list(data)
    if not jobs:
        return  # nothing to save: do not spin up a process pool
    with ProcessPoolExecutor() as pool:
        submitted = [
            (name, pool.submit(saveFile, html=html, name=name, dir=dir))
            for html, name in jobs
        ]
    # Leaving the ``with`` block waits for the workers, so every future is done.
    for name, job in submitted:
        error = job.exception()
        if error is not None:
            print(f"保存《{name}》失败: {error!r}")


if __name__ == "__main__":
    # Random User-Agent source; read as a global by getCatalogue and getChapter.
    fake = Faker()
    # Plain literal: the original f-prefix had no placeholders.
    book = "https://www.xbiquge.so/book/4772/"
    download_dir = "./download"
    href, title = map(list, getCatalogue(book))
    htmls = list()  # downloaded chapter pages, filled by getBook

    asyncio.run(getBook(book, href, len(href), 200))
    # Create the target directory up front so worker writes cannot fail on a
    # missing path.
    Path(download_dir).mkdir(parents=True, exist_ok=True)
    main(zip(htmls, title), download_dir)

获取200个页面耗时2.2931997776031494秒
获取200个页面耗时2.2936036586761475秒
获取200个页面耗时2.3401360511779785秒
获取200个页面耗时2.190629005432129秒
获取200个页面耗时2.2158334255218506秒
获取200个页面耗时5.158896207809448秒
获取200个页面耗时5.726853132247925秒
获取200个页面耗时2.37021541595459秒
获取77个页面耗时2.2269649505615234秒


In [6]:
import asyncio
import random
import time
from turtle import title

import aiohttp
import nest_asyncio

nest_asyncio.apply()
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

import requests
from aiohttp import ClientSession
from bs4 import BeautifulSoup
from faker import Faker
from lxml import etree


def saveFile(html, name, dir):
    """Extract the chapter text from ``html`` and write it to ``<dir>/<name>.txt``.

    Args:
        html: Chapter page markup; passed unchanged to ``getContent``.
        name: Chapter title, used as the file stem and as the first line of the file.
        dir: Output directory. Created (including parents) if it does not exist.

    The file is written as GBK; characters GBK cannot encode are dropped
    (``errors="ignore"``).
    """
    text = getContent(html)
    out_dir = Path(dir)
    # A missing directory would make write_text raise FileNotFoundError.
    out_dir.mkdir(parents=True, exist_ok=True)
    path = out_dir.joinpath(name + ".txt")
    path.write_text("\n".join((name, text)), encoding="gbk", errors="ignore")


def getContent(html):
    """Parse a chapter page and return the text of its ``<div id="content">``.

    Args:
        html: Chapter page markup, parsed by BeautifulSoup with the lxml backend.

    Returns:
        The element's text nodes joined with newlines, with non-breaking
        spaces removed.

    Raises:
        ValueError: If the page contains no ``<div id="content">`` element.
    """
    root = BeautifulSoup(html, "lxml")
    node = root.find("div", id="content")
    if node is None:
        # Used to surface as an opaque AttributeError on None.
        raise ValueError('page has no <div id="content"> element')
    text = node.get_text("\n")
    # The parser has already decoded the &nbsp; entity to U+00A0, so that
    # character must be stripped as well; the literal form is still removed
    # for pages that carry it as plain text.
    return text.replace("&nbsp;", "").replace("\xa0", "")


def getCatalogue(url):
    """Download the book's index page and return its chapters in random order.

    Synchronous. Reads the module-level ``fake`` to build a User-Agent header.

    Args:
        url: URL of the book's table-of-contents page.

    Returns:
        An iterator yielding two tuples, ``(hrefs, titles)``, aligned by index
        and shuffled together. Both tuples are empty when no chapter link is
        found.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
    """
    header = {"user-agent": fake.chrome()}
    # A timeout keeps a stalled server from hanging the notebook forever.
    with requests.get(url, headers=header, timeout=20) as resp:
        resp.raise_for_status()
        resp.encoding = "gbk"
        tree = etree.HTML(resp.text)
    # dd[position() >= 13] skips the first 12 <dd> entries
    # (presumably a "latest chapters" box -- TODO confirm).
    query = '//*[@id="list"]/dl/dd[position() >= 13]/a'
    anchors = tree.xpath(query) if tree is not None else []
    # Take href and text from the same <a> so the pairs can never go out of step.
    pags = [(a.get("href"), "".join(a.itertext())) for a in anchors if a.get("href")]
    if not pags:
        # zip(*[]) would yield nothing and break two-way unpacking in callers.
        return iter(((), ()))
    random.shuffle(pags)
    return zip(*pags)


async def getChapter(link, referer):
    """Download one chapter page asynchronously and return its raw body.

    A new ``ClientSession`` with its own connector and a 20 s total timeout is
    opened for this single request. Reads the module-level ``fake`` for a
    User-Agent. This variant adds no artificial delay.

    Args:
        link: Absolute URL of the chapter page.
        referer: Value sent in the ``referer`` request header.

    Returns:
        The response body as returned by ``resp.read()``.
    """
    # NOTE(review): the connector is created on every call, so limit=30 only
    # caps this call's connections, not the number of concurrent calls.
    pool = aiohttp.TCPConnector(limit=30)
    deadline = aiohttp.ClientTimeout(total=20)  # whole request must finish within 20 s
    request_headers = {"user-agent": fake.chrome(), "referer": referer}
    async with ClientSession(connector=pool, timeout=deadline) as client:
        # The 1 s pre-request sleep tried in another variant is disabled here.
        async with client.get(link, headers=request_headers) as response:
            body = await response.read()
            # Author's observation: returning the body from after the
            # ``async with`` blocks instead was the slowest option.
            return body


async def getBook(url, href, num, offset):
    """Fetch the first ``num`` chapter pages in sequential batches of ``offset``.

    Each batch is downloaded concurrently with ``asyncio.gather``; batches run
    back to back, because sending every request at once makes the server
    refuse access (original author's note). Pages are appended, in ``href``
    order, to the module-level list ``htmls``.

    Args:
        url: Base URL of the book; prefixed to each entry of ``href`` and sent
            as the referer.
        href: Sequence of chapter paths relative to ``url``.
        num: Maximum number of chapters to fetch.
        offset: Batch size, i.e. requests issued concurrently per batch.

    An exception raised by any ``getChapter`` call propagates out of
    ``gather`` and aborts the run.
    """
    href = href[:num]
    # The slice can be shorter than ``num``; never iterate past the real data.
    total = min(num, len(href))
    for i in range(0, total, offset):
        batch = href[i : i + offset]
        began = time.time()
        # Referer comes from the ``url`` argument, not from the global ``book``.
        tasks = (getChapter(url + path, url) for path in batch)
        chapters = await asyncio.gather(*tasks)
        over = time.time()
        count = len(batch)
        print(f"获取{count}个页面耗时{over-began}秒")
        htmls.extend(chapters)


def main(data, dir):
    """Parse and save every chapter in ``data`` using a pool of worker processes.

    Args:
        data: Iterable of ``(html, name)`` pairs.
        dir: Output directory, forwarded to ``saveFile``.

    Note (from the original author): the process pool must be created from the
    ``__main__`` process and the submitted function must be importable by the
    workers, so the pool is wrapped in this function.

    Worker failures are reported with ``print`` instead of being discarded.
    """
    jobs = list(data)
    if not jobs:
        return  # nothing to save: do not spin up a process pool
    with ProcessPoolExecutor() as pool:
        submitted = [
            (name, pool.submit(saveFile, html=html, name=name, dir=dir))
            for html, name in jobs
        ]
    # Leaving the ``with`` block waits for the workers, so every future is done.
    for name, job in submitted:
        error = job.exception()
        if error is not None:
            print(f"保存《{name}》失败: {error!r}")


if __name__ == "__main__":
    # Random User-Agent source; read as a global by getCatalogue and getChapter.
    fake = Faker()
    # Plain literal: the original f-prefix had no placeholders.
    book = "https://www.xbiquge.so/book/4772/"
    download_dir = "./download"
    href, title = map(list, getCatalogue(book))
    htmls = list()  # downloaded chapter pages, filled by getBook

    asyncio.run(getBook(book, href, len(href), 200))
    # Create the target directory up front so worker writes cannot fail on a
    # missing path.
    Path(download_dir).mkdir(parents=True, exist_ok=True)
    main(zip(htmls, title), download_dir)

TimeoutError: 

In [7]:
import asyncio
import random
import time
from turtle import title

import aiohttp
import nest_asyncio

nest_asyncio.apply()
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

import requests
from aiohttp import ClientSession
from bs4 import BeautifulSoup
from faker import Faker
from lxml import etree


def saveFile(html, name, dir):
    """Extract the chapter text from ``html`` and write it to ``<dir>/<name>.txt``.

    Args:
        html: Chapter page markup; passed unchanged to ``getContent``.
        name: Chapter title, used as the file stem and as the first line of the file.
        dir: Output directory. Created (including parents) if it does not exist.

    The file is written as GBK; characters GBK cannot encode are dropped
    (``errors="ignore"``).
    """
    text = getContent(html)
    out_dir = Path(dir)
    # A missing directory would make write_text raise FileNotFoundError.
    out_dir.mkdir(parents=True, exist_ok=True)
    path = out_dir.joinpath(name + ".txt")
    path.write_text("\n".join((name, text)), encoding="gbk", errors="ignore")


def getContent(html):
    """Parse a chapter page and return the text of its ``<div id="content">``.

    Args:
        html: Chapter page markup, parsed by BeautifulSoup with the lxml backend.

    Returns:
        The element's text nodes joined with newlines, with non-breaking
        spaces removed.

    Raises:
        ValueError: If the page contains no ``<div id="content">`` element.
    """
    root = BeautifulSoup(html, "lxml")
    node = root.find("div", id="content")
    if node is None:
        # Used to surface as an opaque AttributeError on None.
        raise ValueError('page has no <div id="content"> element')
    text = node.get_text("\n")
    # The parser has already decoded the &nbsp; entity to U+00A0, so that
    # character must be stripped as well; the literal form is still removed
    # for pages that carry it as plain text.
    return text.replace("&nbsp;", "").replace("\xa0", "")


def getCatalogue(url):
    """Download the book's index page and return its chapters in random order.

    Synchronous. Reads the module-level ``fake`` to build a User-Agent header.

    Args:
        url: URL of the book's table-of-contents page.

    Returns:
        An iterator yielding two tuples, ``(hrefs, titles)``, aligned by index
        and shuffled together. Both tuples are empty when no chapter link is
        found.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
    """
    header = {"user-agent": fake.chrome()}
    # A timeout keeps a stalled server from hanging the notebook forever.
    with requests.get(url, headers=header, timeout=20) as resp:
        resp.raise_for_status()
        resp.encoding = "gbk"
        tree = etree.HTML(resp.text)
    # dd[position() >= 13] skips the first 12 <dd> entries
    # (presumably a "latest chapters" box -- TODO confirm).
    query = '//*[@id="list"]/dl/dd[position() >= 13]/a'
    anchors = tree.xpath(query) if tree is not None else []
    # Take href and text from the same <a> so the pairs can never go out of step.
    pags = [(a.get("href"), "".join(a.itertext())) for a in anchors if a.get("href")]
    if not pags:
        # zip(*[]) would yield nothing and break two-way unpacking in callers.
        return iter(((), ()))
    random.shuffle(pags)
    return zip(*pags)


async def getChapter(link, referer):
    """Download one chapter page asynchronously and return its raw body.

    A new ``ClientSession`` with its own connector and a 20 s total timeout is
    opened for this single request. Reads the module-level ``fake`` for a
    User-Agent. This variant adds no artificial delay.

    Args:
        link: Absolute URL of the chapter page.
        referer: Value sent in the ``referer`` request header.

    Returns:
        The response body as returned by ``resp.read()``.
    """
    # NOTE(review): the connector is created on every call, so limit=30 only
    # caps this call's connections, not the number of concurrent calls.
    pool = aiohttp.TCPConnector(limit=30)
    deadline = aiohttp.ClientTimeout(total=20)  # whole request must finish within 20 s
    request_headers = {"user-agent": fake.chrome(), "referer": referer}
    async with ClientSession(connector=pool, timeout=deadline) as client:
        # The 1 s pre-request sleep tried in another variant is disabled here.
        async with client.get(link, headers=request_headers) as response:
            body = await response.read()
            # Author's observation: returning the body from after the
            # ``async with`` blocks instead was the slowest option.
            return body


async def getBook(url, href, num, offset):
    """Fetch the first ``num`` chapter pages in sequential batches of ``offset``.

    Each batch is downloaded concurrently with ``asyncio.gather`` and followed
    by a 0.5 s pause; sending every request at once makes the server refuse
    access (original author's note). Pages are appended, in ``href`` order, to
    the module-level list ``htmls``.

    Args:
        url: Base URL of the book; prefixed to each entry of ``href`` and sent
            as the referer.
        href: Sequence of chapter paths relative to ``url``.
        num: Maximum number of chapters to fetch.
        offset: Batch size, i.e. requests issued concurrently per batch.

    An exception raised by any ``getChapter`` call propagates out of
    ``gather`` and aborts the run.
    """
    href = href[:num]
    # The slice can be shorter than ``num``; never iterate past the real data.
    total = min(num, len(href))
    for i in range(0, total, offset):
        batch = href[i : i + offset]
        began = time.time()
        # Referer comes from the ``url`` argument, not from the global ``book``.
        tasks = (getChapter(url + path, url) for path in batch)
        chapters = await asyncio.gather(*tasks)
        over = time.time()
        count = len(batch)
        print(f"获取{count}个页面耗时{over-began}秒")
        htmls.extend(chapters)
        # time.sleep() would block the event loop; yield to it instead.
        await asyncio.sleep(0.5)


def main(data, dir):
    """Parse and save every chapter in ``data`` using a pool of worker processes.

    Args:
        data: Iterable of ``(html, name)`` pairs.
        dir: Output directory, forwarded to ``saveFile``.

    Note (from the original author): the process pool must be created from the
    ``__main__`` process and the submitted function must be importable by the
    workers, so the pool is wrapped in this function.

    Worker failures are reported with ``print`` instead of being discarded.
    """
    jobs = list(data)
    if not jobs:
        return  # nothing to save: do not spin up a process pool
    with ProcessPoolExecutor() as pool:
        submitted = [
            (name, pool.submit(saveFile, html=html, name=name, dir=dir))
            for html, name in jobs
        ]
    # Leaving the ``with`` block waits for the workers, so every future is done.
    for name, job in submitted:
        error = job.exception()
        if error is not None:
            print(f"保存《{name}》失败: {error!r}")


if __name__ == "__main__":
    # Random User-Agent source; read as a global by getCatalogue and getChapter.
    fake = Faker()
    # Plain literal: the original f-prefix had no placeholders.
    book = "https://www.xbiquge.so/book/4772/"
    download_dir = "./download"
    href, title = map(list, getCatalogue(book))
    htmls = list()  # downloaded chapter pages, filled by getBook

    asyncio.run(getBook(book, href, len(href), 200))
    # Create the target directory up front so worker writes cannot fail on a
    # missing path.
    Path(download_dir).mkdir(parents=True, exist_ok=True)
    main(zip(htmls, title), download_dir)

获取200个页面耗时1.9270589351654053秒
获取200个页面耗时3.471113681793213秒
获取200个页面耗时1.0841667652130127秒
获取200个页面耗时3.7214713096618652秒
获取200个页面耗时4.400755167007446秒


TimeoutError: 

In [8]:
import asyncio
import random
import time
from turtle import title

import aiohttp
import nest_asyncio

nest_asyncio.apply()
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

import requests
from aiohttp import ClientSession
from bs4 import BeautifulSoup
from faker import Faker
from lxml import etree


def saveFile(html, name, dir):
    """Extract the chapter text from ``html`` and write it to ``<dir>/<name>.txt``.

    Args:
        html: Chapter page markup; passed unchanged to ``getContent``.
        name: Chapter title, used as the file stem and as the first line of the file.
        dir: Output directory. Created (including parents) if it does not exist.

    The file is written as GBK; characters GBK cannot encode are dropped
    (``errors="ignore"``).
    """
    text = getContent(html)
    out_dir = Path(dir)
    # A missing directory would make write_text raise FileNotFoundError.
    out_dir.mkdir(parents=True, exist_ok=True)
    path = out_dir.joinpath(name + ".txt")
    path.write_text("\n".join((name, text)), encoding="gbk", errors="ignore")


def getContent(html):
    """Parse a chapter page and return the text of its ``<div id="content">``.

    Args:
        html: Chapter page markup, parsed by BeautifulSoup with the lxml backend.

    Returns:
        The element's text nodes joined with newlines, with non-breaking
        spaces removed.

    Raises:
        ValueError: If the page contains no ``<div id="content">`` element.
    """
    root = BeautifulSoup(html, "lxml")
    node = root.find("div", id="content")
    if node is None:
        # Used to surface as an opaque AttributeError on None.
        raise ValueError('page has no <div id="content"> element')
    text = node.get_text("\n")
    # The parser has already decoded the &nbsp; entity to U+00A0, so that
    # character must be stripped as well; the literal form is still removed
    # for pages that carry it as plain text.
    return text.replace("&nbsp;", "").replace("\xa0", "")


def getCatalogue(url):
    """Download the book's index page and return its chapters in random order.

    Synchronous. Reads the module-level ``fake`` to build a User-Agent header.

    Args:
        url: URL of the book's table-of-contents page.

    Returns:
        An iterator yielding two tuples, ``(hrefs, titles)``, aligned by index
        and shuffled together. Both tuples are empty when no chapter link is
        found.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
    """
    header = {"user-agent": fake.chrome()}
    # A timeout keeps a stalled server from hanging the notebook forever.
    with requests.get(url, headers=header, timeout=20) as resp:
        resp.raise_for_status()
        resp.encoding = "gbk"
        tree = etree.HTML(resp.text)
    # dd[position() >= 13] skips the first 12 <dd> entries
    # (presumably a "latest chapters" box -- TODO confirm).
    query = '//*[@id="list"]/dl/dd[position() >= 13]/a'
    anchors = tree.xpath(query) if tree is not None else []
    # Take href and text from the same <a> so the pairs can never go out of step.
    pags = [(a.get("href"), "".join(a.itertext())) for a in anchors if a.get("href")]
    if not pags:
        # zip(*[]) would yield nothing and break two-way unpacking in callers.
        return iter(((), ()))
    random.shuffle(pags)
    return zip(*pags)


async def getChapter(link, referer):
    """Download one chapter page asynchronously and return its raw body.

    A new ``ClientSession`` with its own connector and a 20 s total timeout is
    opened for this single request. Reads the module-level ``fake`` for a
    User-Agent. This variant waits a fixed 1 s after the session is opened and
    before the request is sent.

    Args:
        link: Absolute URL of the chapter page.
        referer: Value sent in the ``referer`` request header.

    Returns:
        The response body as returned by ``resp.read()``.
    """
    # NOTE(review): the connector is created on every call, so limit=30 only
    # caps this call's connections, not the number of concurrent calls.
    pool = aiohttp.TCPConnector(limit=30)
    deadline = aiohttp.ClientTimeout(total=20)  # whole request must finish within 20 s
    request_headers = {"user-agent": fake.chrome(), "referer": referer}
    async with ClientSession(connector=pool, timeout=deadline) as client:
        await asyncio.sleep(1)
        async with client.get(link, headers=request_headers) as response:
            body = await response.read()
            # Author's observation: returning the body from after the
            # ``async with`` blocks instead was the slowest option.
            return body


async def getBook(url, href, num, offset):
    """Fetch the first ``num`` chapter pages in sequential batches of ``offset``.

    Each batch is downloaded concurrently with ``asyncio.gather`` and followed
    by a 0.5 s pause; sending every request at once makes the server refuse
    access (original author's note). Pages are appended, in ``href`` order, to
    the module-level list ``htmls``.

    Args:
        url: Base URL of the book; prefixed to each entry of ``href`` and sent
            as the referer.
        href: Sequence of chapter paths relative to ``url``.
        num: Maximum number of chapters to fetch.
        offset: Batch size, i.e. requests issued concurrently per batch.

    An exception raised by any ``getChapter`` call propagates out of
    ``gather`` and aborts the run.
    """
    href = href[:num]
    # The slice can be shorter than ``num``; never iterate past the real data.
    total = min(num, len(href))
    for i in range(0, total, offset):
        batch = href[i : i + offset]
        began = time.time()
        # Referer comes from the ``url`` argument, not from the global ``book``.
        tasks = (getChapter(url + path, url) for path in batch)
        chapters = await asyncio.gather(*tasks)
        over = time.time()
        count = len(batch)
        print(f"获取{count}个页面耗时{over-began}秒")
        htmls.extend(chapters)
        # time.sleep() would block the event loop; yield to it instead.
        await asyncio.sleep(0.5)


def main(data, dir):
    """Parse and save every chapter in ``data`` using a pool of worker processes.

    Args:
        data: Iterable of ``(html, name)`` pairs.
        dir: Output directory, forwarded to ``saveFile``.

    Note (from the original author): the process pool must be created from the
    ``__main__`` process and the submitted function must be importable by the
    workers, so the pool is wrapped in this function.

    Worker failures are reported with ``print`` instead of being discarded.
    """
    jobs = list(data)
    if not jobs:
        return  # nothing to save: do not spin up a process pool
    with ProcessPoolExecutor() as pool:
        submitted = [
            (name, pool.submit(saveFile, html=html, name=name, dir=dir))
            for html, name in jobs
        ]
    # Leaving the ``with`` block waits for the workers, so every future is done.
    for name, job in submitted:
        error = job.exception()
        if error is not None:
            print(f"保存《{name}》失败: {error!r}")


if __name__ == "__main__":
    # Random User-Agent source; read as a global by getCatalogue and getChapter.
    fake = Faker()
    # Plain literal: the original f-prefix had no placeholders.
    book = "https://www.xbiquge.so/book/4772/"
    download_dir = "./download"
    href, title = map(list, getCatalogue(book))
    htmls = list()  # downloaded chapter pages, filled by getBook

    asyncio.run(getBook(book, href, len(href), 200))
    # Create the target directory up front so worker writes cannot fail on a
    # missing path.
    Path(download_dir).mkdir(parents=True, exist_ok=True)
    main(zip(htmls, title), download_dir)

获取200个页面耗时5.604754447937012秒
获取200个页面耗时4.615993499755859秒
获取200个页面耗时4.760902643203735秒
获取200个页面耗时5.117491722106934秒
获取200个页面耗时5.04739785194397秒
获取200个页面耗时2.1671507358551025秒
获取200个页面耗时2.7661960124969482秒
获取200个页面耗时4.3974151611328125秒
获取77个页面耗时2.032909870147705秒
