In [None]:
!pip install chardet

In [None]:
import os
import json
import hashlib
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import random
import re
import chardet

In [None]:
# Listing-index URL template; the single `{}` placeholder is filled with the
# year number via BASE_URL.format(year).
BASE_URL = (
    "https://shijuan.zww.cn"
    "/chuzhong/yingyu/nianji{}/list/"
)

# Directory-name template with one `{}` placeholder.
OUTPUT_DIR = "exercise{}"

# HTTP headers passed to requests.get() for outgoing requests.
HEADERS = {
    "User-Agent": "Mozilla/5.0",
}

In [None]:
def get_page_links(url):
    """Fetch one listing page and return the article URLs it links to.

    Args:
        url: Address of the listing page to download.

    Returns:
        A de-duplicated list of URLs (in no particular order). An anchor is
        kept when its href contains "/chuzhong/yingyu/nianji" and the
        resulting URL does not end with "list/".

    Raises:
        Exception: If the server answers with a status other than 200.
        Errors raised by requests.get (including the 20 s timeout) propagate.
    """
    response = requests.get(url, headers=HEADERS, timeout=20)
    if response.status_code != 200:
        raise Exception(f"Failed to access {url}")

    soup = BeautifulSoup(response.content, 'html.parser')

    collected = []
    for anchor in soup.select("a"):
        href = anchor.get("href")

        # Skip anchors without an href and anchors outside the section.
        if not href or "/chuzhong/yingyu/nianji" not in href:
            continue

        # Root-relative hrefs are prefixed with the site host; anything else
        # is used unchanged.
        # NOTE(review): this prefix is http:// while BASE_URL uses https:// --
        # confirm the scheme difference is intended.
        if href.startswith("/"):
            target = "http://shijuan.zww.cn" + href
        else:
            target = href

        # URLs ending in "list/" are listing indexes, not articles.
        if target.endswith("list/"):
            continue

        collected.append(target)

    return list(set(collected))

In [None]:
# Each task is (year, total pages).
# For a quick trial run, use smaller page counts, e.g. {(1, 2), (2, 4), (3, 4)}.
tasks = {(1, 47), (2, 48), (3, 49)}

article_links = {}  # year -> list of article URLs
failed_link = {}    # year -> listing-page URLs that could not be fetched

# sorted() makes the years run in ascending order instead of set order.
for year, total_pages in sorted(tasks):
    year_link = BASE_URL.format(year)

    # The index page itself is fetched first; numbered pages 1..total_pages-1
    # are fetched by the inner loop.
    article_links[year] = get_page_links(year_link)
    failed_link[year] = []

    for i in tqdm(range(1, total_pages)):
        page_url = f"{year_link}{i}.htm"
        try:
            article_links[year].extend(get_page_links(page_url))
        except Exception as exc:
            # `except Exception` (not a bare except) keeps Ctrl-C / kernel
            # interrupt working during this long crawl; the reason is logged
            # so failures can be diagnosed and retried from failed_link.
            print(f"Failed to get page {i} for year {year}: {exc!r}")
            failed_link[year].append(page_url)
        # add a random delay to avoid being blocked
        time.sleep(random.uniform(1, 3))

    # De-duplicate across pages; sorting gives the same order on every run
    # (plain set order changes between kernel sessions).
    article_links[year] = sorted(set(article_links[year]))

In [None]:
# Save the collected links to article_links_<year>.txt, one URL per line.
for year, links in article_links.items():
    with open(f"article_links_{year}.txt", "w") as links_file:
        links_file.writelines(link + "\n" for link in links)


In [None]:
import requests
from bs4 import BeautifulSoup
import json
import re
import chardet  # To detect encoding dynamically

# Function to detect the correct encoding
def detect_encoding(content):
    """Return the encoding chardet reports for `content`.

    Falls back to "utf-8" when chardet's "encoding" entry is empty or None.
    """
    guessed = chardet.detect(content)["encoding"]
    return guessed or "utf-8"

# Function to clean and normalize text
# Single-character substitutions applied by clean_text in one pass:
# Chinese punctuation -> ASCII equivalents, plus space-like characters -> " ".
_CLEAN_TEXT_TABLE = str.maketrans({
    "。": ".", "，": ",", "、": ",", "？": "?", "！": "!", "：": ":", "；": ";",
    "（": "(", "）": ")", "【": "[", "】": "]", "“": "\"", "”": "\"", "‘": "'", "’": "'",
    "《": "<", "》": ">",
    "\u3000": " ",  # full-width (ideographic) space
    "\u00a0": " ",  # non-breaking space, i.e. what a parsed "&nbsp;" becomes
})


def clean_text(text):
    """Normalize a scraped string into single-line text with ASCII punctuation.

    Steps, in order:
      1. Drop characters that cannot be encoded as UTF-8.
      2. Map Chinese punctuation to ASCII and full-width / non-breaking
         spaces to a plain space.
      3. Replace a literal "&nbsp;" and any run of "¡" with a space.
      4. Replace every line break ("\\r\\n", "\\r" or "\\n") with a space.
      5. Strip leading and trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Ensure the text is valid UTF-8 (unencodable characters are dropped).
    text = text.encode("utf-8", "ignore").decode("utf-8")

    text = text.translate(_CLEAN_TEXT_TABLE)

    # A literal "&nbsp;" only survives when the markup was not entity-decoded;
    # the decoded form (U+00A0) is handled by the table above.
    text = text.replace("&nbsp;", " ")

    # Remove "¡¡¡¡" (encoding artifacts from web scraping)
    text = re.sub(r"¡+", " ", text)

    # Each line break becomes one space; "\r\n" counts as a single break.
    text = re.sub(r"\r\n|\r|\n", " ", text)

    return text.strip()

# Function to extract content and generate JSON output
def get_article_data(url):
    """Download one article page and return its paragraphs as cleaned text.

    Args:
        url: Address of the article page.

    Returns:
        A dict with two keys:
          "url": the `url` argument, unchanged.
          "content": a list with one cleaned string per <p> element found
            inside the page's <div class="content">.

    Raises:
        Exception: If the status code is not 200, or if the page has no
            <div class="content">.
        Errors raised by requests.get (including the 20 s timeout) propagate.
    """
    # Same headers and timeout as the listing-page requests, so a stalled
    # connection cannot hang the crawl indefinitely.
    response = requests.get(url, headers=HEADERS, timeout=20)

    # Check the status before doing any work on the body.
    if response.status_code != 200:
        raise Exception(f"Failed to fetch article: {url}")

    # Detect the encoding from the raw bytes so response.text decodes with it.
    response.encoding = detect_encoding(response.content)

    soup = BeautifulSoup(response.text, "html.parser")

    # Find the <div> with class 'content'; the article paragraphs live in it.
    content_div = soup.find("div", class_="content")
    if content_div is None:
        raise Exception("Content not found")

    cleaned_paragraphs = []
    for p_tag in content_div.find_all("p"):
        # Underlined spans are kept visible in the plain text by replacing
        # each <u> element with its text wrapped in underscores.
        for u_tag in p_tag.find_all('u'):
            u_tag.replace_with(f'_{u_tag.get_text()}_')

        # NOTE(review): get_text(strip=True) strips every text fragment before
        # joining, so spaces next to inline tags are dropped -- confirm that
        # is the intended output.
        cleaned_paragraphs.append(clean_text(p_tag.get_text(strip=True)))

    return {
        "url": url,
        "content": cleaned_paragraphs
    }


In [None]:
# year -> article URLs that could not be downloaded or saved
failed_link = {}

for year in range(1, 4):
    out_dir = f"zww.cn_nianji{year}"
    os.makedirs(out_dir, exist_ok=True)
    failed_link[year] = []

    # One URL per line; blank lines are skipped so they do not turn into
    # requests for an empty URL.
    with open("article_links_{}.txt".format(year), "r") as f:
        filtered_list = [line.strip() for line in f if line.strip()]

    for filtered_link in tqdm(filtered_list):
        try:
            ret = get_article_data(filtered_link)

            # File name = last URL segment without its extension
            # (works for ".htm", ".html" or no extension at all).
            filename = os.path.splitext(filtered_link.split("/")[-1])[0]

            # `with` guarantees the JSON file is flushed and closed.
            with open(f"{out_dir}/{filename}.json", "w", encoding="utf-8") as out_file:
                json.dump(ret, out_file, ensure_ascii=False, indent=4)
        except Exception as exc:
            # Record the failure and its reason, then carry on with the rest.
            print(f"Failed to process {filtered_link}: {exc!r}")
            failed_link[year].append(filtered_link)

        # Random delay after every attempt, failed ones included, to avoid
        # being blocked.
        time.sleep(random.uniform(1, 3))