In [None]:
#请求函数
def request(url='https://www.18ii.net/bookstack/', timeout=10, proxies=None):
    """Download *url* and return it as a parsed HTML tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout.
        proxies: Optional ``requests``-style proxy mapping. When ``None``
            (the default) the local proxy at ``127.0.0.1:10808`` is used
            for both HTTP and HTTPS, as before. Pass an empty dict to skip
            that built-in proxy.

    Returns:
        The value returned by ``lxml.etree.HTML`` for the response body, or
        ``None`` when the request fails, the server answers with an error
        status, or the body cannot be parsed.
    """
    import requests
    from lxml import etree

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    if proxies is None:
        proxies = {
            'http': 'http://127.0.0.1:10808',
            'https': 'http://127.0.0.1:10808'
        }
    try:
        response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx answers into exceptions
        response.encoding = 'utf-8'  # Force UTF-8 decoding of the body
        return etree.HTML(response.text)
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
    except (etree.LxmlError, ValueError) as e:
        # lxml raises its own errors for unparsable input and ValueError for
        # a str that carries an XML encoding declaration. Callers only check
        # for None, so report the problem instead of letting it propagate.
        print(f"Failed to parse response from {url}: {e}")
    return None


In [None]:
#拉取书单
import time
import pandas as pd

# Poll the book-list page five times and append every title that is not yet
# recorded in booklist.csv.
count = 0
while count < 5:
    # Advance the counter first so every exit path below ends the round.
    count += 1

    html = request()
    if html is None:
        print('拉取失败，跳过本次循环。')
        time.sleep(6)
        continue
    print('拉取成功')

    # Collect the title and link of every book on the page. The DataFrame
    # constructor raises when the two lists differ in length.
    try:
        df = pd.DataFrame({
            'title': html.xpath('//li/h3/a/text()'),
            'url': html.xpath('//li/h3/a/@href'),
        })
    except Exception as e:
        print(f"处理书单时发生错误: {e}")
        time.sleep(6)
        continue
    # A title listed twice on the same page must only be stored once.
    df = df.drop_duplicates(subset='title')

    # Load the books recorded so far. A missing or zero-byte file means the
    # header row still has to be written; otherwise the next read would take
    # the first book as the header and the 'title' check below would fail.
    try:
        download = pd.read_csv('booklist.csv')
        write_header = False
    except (FileNotFoundError, pd.errors.EmptyDataError):
        download = pd.DataFrame(columns=['title', 'url'])
        write_header = True

    print(f'共有{len(download)}本书')
    # The de-duplication below needs the 'title' column.
    if 'title' not in download.columns:
        raise ValueError("booklist.csv 文件中缺少 'title' 列，请检查文件内容！")

    # Keep only the books whose title has not been recorded yet.
    existing_titles = set(download['title'].values)
    df_to_add = df[~df['title'].isin(existing_titles)]

    print(f'共有{len(df_to_add)}本书未找到')

    # Append the new books to the file.
    if not df_to_add.empty:
        df_to_add.to_csv('booklist.csv', mode='a', header=write_header, index=False)
        print(f'已将 {len(df_to_add)} 本新书写入 booklist.csv')
    else:
        print('没有新的书需要写入。')
    time.sleep(6)

In [None]:
import pandas as pd
import time
import os

# Download every book listed in booklist.csv chapter by chapter, recording
# the last saved chapter per book in booklist_downloaded.csv so that a later
# run resumes where this one stopped.

# Create the progress file with its header if it does not exist yet.
if not os.path.exists('booklist_downloaded.csv'):
    pd.DataFrame(columns=['title', 'url', 'last_chapter']).to_csv('booklist_downloaded.csv', index=False)

to_download_list = pd.read_csv('booklist.csv')
downloaded_list = pd.read_csv('booklist_downloaded.csv')

# Progress files without a 'last_chapter' column get one filled with 0.
if 'last_chapter' not in downloaded_list.columns:
    downloaded_list['last_chapter'] = 0

# Titles come from the scraped page and are used as file names, so replace
# path separators and the characters Windows forbids in file names.
unsafe_filename_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

for _, current_book in to_download_list.iterrows():
    book_title = current_book['title']
    book_url = current_book['url']
    book_file = f'{str(book_title).translate(unsafe_filename_chars)}.txt'

    # Label of this book's row in the progress table, or None for a new book.
    matches = downloaded_list.index[downloaded_list['title'] == book_title]
    row_label = matches[0] if len(matches) else None

    start_chapter = 1
    if row_label is not None:
        # Resume after the last saved chapter. A blank cell is read back as
        # NaN, which int() cannot convert, so fall back to chapter 1.
        last_chapter = downloaded_list.loc[row_label, 'last_chapter']
        if pd.notna(last_chapter):
            start_chapter = int(last_chapter) + 1
        print(f"继续下载 {book_title}，从第 {start_chapter} 章开始。")
    else:
        print(f"开始下载新书 {book_title}，从第 {start_chapter} 章开始。")

    j = start_chapter
    while True:
        try:
            url = f'https://www.18ii.net{book_url}{j}.html'
            html = request(url)
            if html is None:
                print(f"无法获取 {book_title} 第{j}章内容，跳过。")
                break
            content = html.xpath('//div[@class="content"]/text()')

            # No text nodes: treated as the end of the book (or a changed
            # page layout), so stop working on this book.
            if not content:
                print(f"{book_title} 第{j}章内容为空，可能已是最后一章或页面结构有变。")
                break

            with open(book_file, 'a', encoding='utf-8') as f:
                # The trailing newline keeps the next chapter from being
                # glued onto the last line of this one.
                f.write('\n'.join(content) + '\n')
            print(f'{book_title} 第{j}章下载完成')

            # Record the chapter that was just saved.
            if row_label is None:
                # First saved chapter of a new book: add its row and
                # remember the label for the following chapters.
                new_row = pd.DataFrame([{**current_book.to_dict(), 'last_chapter': j}])
                downloaded_list = pd.concat([downloaded_list, new_row], ignore_index=True)
                row_label = downloaded_list.index[-1]
            else:
                downloaded_list.loc[row_label, 'last_chapter'] = j

            # Persist after every chapter so an interruption loses nothing.
            downloaded_list.to_csv('booklist_downloaded.csv', index=False)

            # Fewer than three text nodes is taken as the end-of-book marker.
            if len(content) < 3:
                break

        except Exception as e:
            print(f"下载 {book_title} 第{j}章时发生错误: {e}")
            break
        j += 1
        time.sleep(1)

