In [6]:
import requests
from bs4 import BeautifulSoup

# ***************************************** Begin ************************************************
# Request the web page
def request_page(url, headers, timeout=10):
    """Download ``url`` and return its body decoded as GB18030 text.

    Args:
        url: Address of the page to download.
        headers: Mapping of HTTP request headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a silent host.

    Returns:
        The page text, or ``None`` when the request fails (connection error,
        timeout, or an HTTP error status raised by ``raise_for_status``).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        # Force GB18030 decoding instead of the charset guessed by requests.
        response.encoding = 'gb18030'
        return response.text
    except requests.RequestException as e:
        print(f"请求失败: {e}")
        return None

# Parse the web page
def parse_page(html):
    """Print rank, title and heat for each data row of the page's table.

    Args:
        html: Page markup, or ``None`` when the download failed.

    Returns:
        None. Results are written to standard output.
    """
    # Nothing was collected when the download failed, so return before the
    # success message is printed.
    if html is None:
        return

    print("数据采集成功。")

    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('tr')

    for item in items[1:]:  # skip the header row
        rank_cell = item.find('td', class_='first')
        title_link = item.find('a', class_='list-title')
        hot_cell = item.find('td', class_='last')

        # ``find`` returns None for a missing element; rows lacking any of
        # the three would raise AttributeError on ``.text``, so skip them.
        if rank_cell is None or title_link is None or hot_cell is None:
            continue

        rank = rank_cell.text.strip()
        title = title_link.text.strip()
        hot = hot_cell.text.strip()

        print("排名：{0:^4}\t标题：{1:^15}\t热度：{2:^8}".format(rank, title, hot))

# ***************************************** End ************************************************

if __name__ == '__main__':
    # Page to download.
    url = 'http://top.baidu.com/buzz?b=1&fr=20811'
    # Replace with your own header information
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'}
    # Download the page, then print the parsed rows.
    html = request_page(url, headers)
    parse_page(html)

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import json

# Function 1: request a web page
def page_request(session, url, ua, timeout=10):
    """Fetch ``url`` through ``session`` and return the body decoded as UTF-8.

    Args:
        session: Object exposing ``get(url, headers=..., timeout=...)``,
            e.g. a ``requests.Session``.
        url: Address of the page to download.
        ua: Mapping of HTTP request headers (carries the User-Agent).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the call can block indefinitely on a silent host.

    Returns:
        The response body as a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    response = session.get(url, headers=ua, timeout=timeout)
    return response.content.decode('utf-8')

# Function 2: parse a listing page
def page_parse(html):
    """Extract sentences and their detail-page links from a listing page.

    Args:
        html: Markup of one listing page.

    Returns:
        ``[href_list, sentence_list]``, two parallel lists:
        ``sentence_list[k]`` is the text of the k-th kept entry with newline
        characters removed, and ``href_list[k]`` is
        ``"https://so.gushiwen.org"`` followed by the ``href`` of that
        entry's first direct ``<a>`` child. Entries without such a link are
        skipped so the two lists always stay aligned.
    """
    soup = BeautifulSoup(html, 'lxml')
    entries = soup.select('body > div.main3 > div.left > div.sons > div.cont')

    sentence_list = []
    href_list = []

    for entry in entries:
        # Look the link up inside its own entry rather than indexing a
        # second, separately selected list: if any entry lacked a link the
        # positional pairing would raise IndexError or attach a sentence to
        # the wrong URL.
        link = entry.find('a', recursive=False)
        if link is None:
            continue
        href = link.get('href')
        if href is None:
            continue

        sentence_list.append(''.join(entry.get_text().split('\n')))
        href_list.append("https://so.gushiwen.org" + href)

    return [href_list, sentence_list]

# Append the collected entries to a text file
def save_txt(filename, info_list):
    """Append each element of ``info_list`` to ``filename`` as JSON text.

    Args:
        filename: Path of the output file; opened in append mode as UTF-8.
        info_list: Iterable of JSON-serialisable elements.

    Every element is written with ``ensure_ascii=False`` (non-ASCII
    characters are kept as-is) and is followed by one blank line.
    """
    with open(filename, 'a', encoding='utf-8') as out:
        out.writelines(
            json.dumps(entry, ensure_ascii=False) + '\n\n'
            for entry in info_list
        )

# Sub-page handler: request and parse every detail page
def sub_page_request_parse(session, urls, ua):
    """Download every URL in ``urls`` and collect the poem text of each page.

    Args:
        session: Session object forwarded to ``page_request``.
        urls: Iterable of detail-page URLs.
        ua: Request headers forwarded to ``page_request``.

    Returns:
        A list with the stripped text of the first
        ``div.left > div.sons > div.cont > div.contson`` element of each
        page; pages without such an element contribute nothing.
    """
    # All pages are downloaded first; parsing starts only afterwards.
    pages = []
    for link in urls:
        pages.append(page_request(session, link, ua))

    poems = []
    for page in pages:
        matches = BeautifulSoup(page, 'lxml').select(
            'div.left > div.sons > div.cont > div.contson')
        if not matches:
            continue
        poems.append(matches[0].get_text().strip())

    return poems

if __name__ == '__main__':
    print("****************开始爬取古诗文网站******************")
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36'}

    # Number of listing pages to crawl (pages 1 .. total_pages).
    total_pages = 4

    # One session is shared by every request so connections are reused; the
    # context manager closes it when the crawl ends, even after an error.
    with requests.Session() as session:
        for i in range(1, total_pages + 1):
            url = f'https://so.gushiwen.cn/mingjus/default.aspx?page={i}&tstr=&astr=&cstr=&xstr='
            # Pause for one second before each listing-page request.
            time.sleep(1)

            html = page_request(session, url, ua)
            info_list = page_parse(html)

            # info_list is [href_list, sentence_list]; store the sentences.
            save_txt('/root/sentence.txt', info_list[1])

            # Handle the sub-pages linked from this listing page.
            print(f"开始解析第{i}页")
            sub_poem_list = sub_page_request_parse(session, info_list[0], ua)
            save_txt('/root/poems.txt', sub_poem_list)

    print("******************爬取完成*********************")
    # Report the configured page count rather than the leftover loop index.
    print(f"共爬取{total_pages}页古诗词名句，保存在如下路径：/root/sentence.txt")
    print(f"共爬取{total_pages}页古诗词，保存在如下路径：/root/poems.txt")


我把启动 MySQL 服务以及创建数据库和数据表的指令写成了一个脚本，这样可以方便地一次完成服务启动和数据库初始化，脚本如下：
```bash
#!/bin/bash

# Start the MySQL service.
service mysql start

# Create the database and table, then convert both to utf8mb4.
# NOTE(review): the password is given on the command line; the mysql client
# warns that this can be insecure.
# The heredoc delimiter is quoted so the shell passes the SQL through
# without performing any expansion on it.
if ! mysql -u root -p123123 <<'EOF'
CREATE DATABASE IF NOT EXISTS webdb;
USE webdb;
CREATE TABLE IF NOT EXISTS search_index (
    id INT,
    keyword CHAR(20),
    number INT
);
ALTER DATABASE webdb CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
ALTER TABLE search_index CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
EOF
then
    # Report the failure instead of unconditionally announcing success.
    echo "MySQL database and table setup failed." >&2
    exit 1
fi

echo "MySQL database and table setup completed."
```
这个脚本会先启动 MySQL 服务，接着创建名为 `webdb` 的数据库，并在其中创建名为 `search_index` 的表（包含 `id`、`keyword` 和 `number` 三个字段），最后将数据库和表的字符集设置为 utf8mb4。将脚本保存为 `setup_mysql.sh`，然后在 bash 中执行 `bash setup_mysql.sh`，即可完成数据库和表的创建。

In [None]:
import requests
from bs4 import BeautifulSoup
import pymysql.cursors

# **************************** Begin *********************************
# Read the local HTML file
def get_html(path='/data/workspace/myshixun/step3/web_demo.html'):
    """Return the contents of an HTML file decoded as UTF-8.

    Args:
        path: Location of the file to read. Defaults to the exercise's
            ``web_demo.html`` so existing zero-argument calls keep working.

    Returns:
        The whole file as a single ``str``.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return file.read()

# Parse the HTML document
def parse_html(html):
    """Extract ``(rank, keyword, index)`` tuples from the table rows in ``html``.

    Args:
        html: Markup containing a ``<table>``; its first ``<tr>`` is treated
            as the header and ignored.

    Returns:
        A list of ``(int, str, int)`` tuples built from the first three
        ``<td>`` cells of each remaining row. Rows with fewer than three
        ``<td>`` cells are skipped.

    Raises:
        ValueError: If the first or third cell of a row is not an integer.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Every row of every table in the document.
    rows = soup.select('table tr')
    info_list = []
    # Skip the first row (the header).
    for row in rows[1:]:
        columns = row.find_all('td')
        # Indexing a row with fewer than three cells would raise IndexError.
        if len(columns) < 3:
            continue
        rank = int(columns[0].get_text())
        # int() already ignores surrounding whitespace; strip the keyword
        # too so markup indentation does not end up in the stored value.
        keyword = columns[1].get_text().strip()
        index = int(columns[2].get_text())
        info_list.append((rank, keyword, index))
    return info_list

# Save the rows to the database
def save_mysql(info_list):
    """Insert the tuples of ``info_list`` into the ``search_index`` table.

    Args:
        info_list: Sequence of ``(id, keyword, number)`` tuples; each one
            becomes a row of ``search_index`` in the ``webdb`` database.

    Opens a connection to the MySQL server at 127.0.0.1, runs one
    ``executemany`` insert, commits, and prints a confirmation. The
    connection is closed whether or not the insert succeeds.
    """
    settings = {
        'host': '127.0.0.1',
        'user': 'root',
        'password': '123123',
        'database': 'webdb',
        'charset': 'utf8mb4',
        'cursorclass': pymysql.cursors.DictCursor,
    }
    conn = pymysql.connect(**settings)

    try:
        insert_stmt = "INSERT INTO search_index (id, keyword, number) VALUES (%s, %s, %s)"
        with conn.cursor() as cur:
            # One parameterised statement covers every tuple in info_list.
            cur.executemany(insert_stmt, info_list)

        # Commit only after the cursor block has finished.
        conn.commit()
        print('成功插入数据')
    finally:
        # Always close the connection.
        conn.close()
# **************************** End *********************************

if __name__ =='__main__':
    html = get_html()  # read the HTML file
    info_list = parse_html(html)  # parse the HTML into a list of row tuples
    save_mysql(info_list)  # store the rows in the MySQL database
