In [1]:
import requests
import pandas as pd
import re
import time
import os
from lxml import html
import urllib.parse
import glob
import openai
from tqdm.notebook import tqdm
from pydantic import BaseModel, field_validator
from datetime import datetime
from openai import OpenAI

In [2]:
# Index pages to crawl, one per historical period of the stamp catalogue
# (Qing dynasty, Republic of China, Liberated Areas, People's Republic of
# China).  The crawl step takes the last path segment of each URL as the
# collection name.
period_urls = [
    "https://meetstamps.com/中国清代邮票目录",
    "https://meetstamps.com/中华民国邮票目录",
    "https://meetstamps.com/中国解放区邮票目录",
    "https://meetstamps.com/中华人民共和国邮票目录"
]

In [None]:
# Step 1: collect the URL of every stamp page.
#
# A period index page links either directly to stamp pages (their href
# contains "archives") or to an intermediate sub-collection page which in turn
# links to the stamp pages.  The result is `target_urls`, a list of dicts with
# the keys "collection_name", "subcollection_name" and "url".
CRAWL_TIMEOUT_S = 30  # requests waits indefinitely unless a timeout is given
CONTENT_LINKS_XPATH = '//*[@id="content"]//p/a/@href'


def fetch_content_links(page_url):
    """Return the absolute hrefs found in the content area of ``page_url``.

    Network errors, non-200 answers and empty bodies are reported and yield an
    empty list, so one unreachable page does not abort the whole crawl.
    """
    try:
        response = requests.get(page_url, timeout=CRAWL_TIMEOUT_S)
    except requests.RequestException as exc:
        print(f"Error fetching {page_url}: {exc}")
        return []
    if response.status_code != 200:
        print(f"Failed to fetch {page_url}: {response.status_code}")
        return []
    if not response.text.strip():
        # lxml refuses to parse an empty document.
        print(f"Empty page: {page_url}")
        return []
    page = html.fromstring(response.text)
    # Resolve relative hrefs against the page they were found on; absolute
    # hrefs are returned unchanged by urljoin.
    return [
        urllib.parse.urljoin(page_url, href)
        for href in page.xpath(CONTENT_LINKS_XPATH)
    ]


def last_path_segment(link):
    """Return the percent-decoded last path segment, ignoring a trailing slash."""
    return urllib.parse.unquote(link.rstrip("/").split("/")[-1])


target_urls = []
seen_targets = set()  # (collection, subcollection, url) triples already recorded

for period_url in period_urls:
    collection_name = last_path_segment(period_url)

    for link in fetch_content_links(period_url):
        if "archives" in link:
            # Direct link to a stamp page.
            candidates = [("archives", link)]
        else:
            # Some stamp pages are only reachable through a sub-collection
            # page, so follow the link and harvest the stamp links from there.
            subcollection_name = last_path_segment(link)
            candidates = [
                (subcollection_name, stamp_link)
                for stamp_link in fetch_content_links(link)
                if "archives" in stamp_link
            ]

        for subcollection_name, stamp_url in candidates:
            key = (collection_name, subcollection_name, stamp_url)
            if key in seen_targets:
                continue  # the same stamp page can be linked more than once
            seen_targets.add(key)
            target_urls.append(
                {
                    "collection_name": collection_name,
                    "subcollection_name": subcollection_name,
                    "url": stamp_url,
                }
            )

In [None]:
# Step 2: save the HTML of every stamp page to
# data/html/<collection_name>/<subcollection_name>/<last URL segment>.html
DOWNLOAD_TIMEOUT_S = 30  # requests waits indefinitely unless a timeout is given
failed_pages = []  # targets that could not be downloaded, kept for inspection/retry

for target in target_urls:
    output_path = f"data/html/{target['collection_name']}/{target['subcollection_name']}"
    os.makedirs(output_path, exist_ok=True)

    # rstrip so a trailing slash does not produce the file name ".html".
    page_name = target["url"].rstrip("/").split("/")[-1]
    output_file = f"{output_path}/{page_name}.html"

    # Pages already on disk are not fetched again, which keeps a re-run of
    # the notebook cheap.  Delete the file to force a fresh download.
    if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
        continue

    try:
        response = requests.get(target["url"], timeout=DOWNLOAD_TIMEOUT_S)
    except requests.RequestException as exc:
        # One unreachable page must not abort the remaining downloads.
        print(f"Error downloading {target['url']}: {exc}")
        failed_pages.append(target)
        continue

    if response.status_code != 200:
        print(f"Failed to download {target['url']}: {response.status_code}")
        failed_pages.append(target)
        continue

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(response.text)
    print(f"Saved {target['url']} to {output_file}")

In [None]:
import glob
import os
import requests
from lxml import html
import pandas as pd

# Step 3: parse every saved stamp page, download the images it links to and
# write one CSV row per image to stamp_data.csv.
IMAGE_TIMEOUT_S = 30  # requests waits indefinitely unless a timeout is given


def collection_names_from_path(html_path):
    """Return ``(collection_name, subcollection_name)`` for a saved page.

    Pages are stored as data/html/<collection>/<subcollection>/<page>.html, so
    the two names are the last two directory components.  ``os.path`` is used
    rather than splitting on a backslash so the lookup works with the native
    path separator of any platform.
    """
    subcollection_dir = os.path.dirname(os.path.normpath(html_path))
    collection_dir = os.path.dirname(subcollection_dir)
    return os.path.basename(collection_dir), os.path.basename(subcollection_dir)


def read_html(html_path):
    """Read a saved page as text, trying UTF-8 first and GBK as a fallback."""
    try:
        with open(html_path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        with open(html_path, 'r', encoding='gbk') as f:
            return f.read()


def download_image(image_url, image_path):
    """Stream ``image_url`` to ``image_path`` and return True on success.

    An image that is already on disk is not fetched again.  The body is first
    written to a ``.part`` file and renamed once complete, so an interrupted
    transfer never leaves a truncated image under the final name.
    """
    if os.path.exists(image_path) and os.path.getsize(image_path) > 0:
        return True

    partial_path = image_path + ".part"
    try:
        # The context manager closes the streamed response (and releases its
        # connection) even when the body is never read, e.g. on a non-200.
        with requests.get(image_url, stream=True, timeout=IMAGE_TIMEOUT_S) as response:
            if response.status_code != 200:
                print(f"Failed to download {image_url}: {response.status_code}")
                return False
            with open(partial_path, 'wb') as img_file:
                for chunk in response.iter_content(1024):
                    img_file.write(chunk)
    except requests.RequestException as e:
        print(f"Error downloading {image_url}: {e}")
        return False

    os.replace(partial_path, image_path)
    return True


files = glob.glob("data/html/*/*/*.html")
data = []

for file in files:
    content = read_html(file)
    if not content.strip():
        # lxml refuses to parse an empty document.
        print(f"Skipping {file}: file is empty")
        continue

    collection_name, subcollection_name = collection_names_from_path(file)
    stamp_page = html.fromstring(content)

    page_links = stamp_page.xpath('//*[@class="posttitle"]//a/@href')
    title_texts = stamp_page.xpath('//*[@class="posttitle"]//text()')
    if not page_links or not title_texts:
        # Not a regular stamp page (e.g. an error page saved by mistake).
        print(f"Skipping {file}: no post title found")
        continue
    page_url = page_links[0]
    stamp_title = title_texts[0]
    stamp_description = " ".join(stamp_page.xpath('//*[@class="postentry"]/p/text()'))

    # URLs of the full-size images.
    stamp_images = stamp_page.xpath('//*[@class="postentry"]/p/a/@href')

    image_folder = f"data/images/{collection_name}/{subcollection_name}"
    if stamp_images:
        # Make sure the target directory exists.
        os.makedirs(image_folder, exist_ok=True)

    for stamp_image in stamp_images:
        # Build the path the image is saved to.
        image_filename = stamp_image.split("/")[-1]
        if not image_filename:
            # A link ending in "/" has no file name to save under.
            print(f"Skipping {stamp_image}: no file name in URL")
            continue
        image_path = os.path.join(image_folder, image_filename)

        # Download and save the image; failures are reported by the helper.
        download_image(stamp_image, image_path)

        # Record one row per image, whether or not the download succeeded.
        stamp_info = {
            "url": page_url,
            "collection_name": collection_name,
            "subcollection_name": subcollection_name,
            "title": stamp_title,
            "image_url": stamp_image,  # URL of the full-size image
            "image_path": image_path,
            "description": stamp_description,
        }
        data.append(stamp_info)

df = pd.DataFrame(data)
df.to_csv("stamp_data.csv", index=False)  # persist the collected rows as CSV
