In [1]:

import requests
import sys
import time
import html
import re
import json
from typing import Dict, Any, List, Tuple
from bs4 import BeautifulSoup
# Anki deck that the later cells query and update.
deck_name = "CambridgeDeck"
# JSON export that the cells below iterate as `word_list`.
json_path = r"G:\Code\Python\Project\Reader\data\backup\info\2025-10-09.json"
with open(json_path, encoding="utf-8") as f:
    word_list = json.loads(f.read())


In [2]:
def replace_alnum_with_underscores(match_obj: re.Match) -> str:
    """
    Mask the text of a regex match.

    Every alphanumeric character of the matched text becomes ``_``; all other
    characters (spaces, punctuation, ...) are kept, so the shape of the match
    stays visible. Intended as the ``repl`` callback of ``Pattern.sub``.
    """
    masked_chars = []
    for ch in match_obj.group(0):
        if ch.isalnum():
            masked_chars.append('_')
        else:
            masked_chars.append(ch)
    return ''.join(masked_chars)

def _pronunciation_rows(pos: Dict[str, Any]) -> List[str]:
    """Build one ``audio-row`` div per accent (UK first, then US) that has a phonetic or an audio URL."""
    rows: List[str] = []
    for label, key in (("UK", "pronunciationUK"), ("US", "pronunciationUS")):
        pron = pos.get(key) or {}
        # `or ""` guards against explicit nulls: html.escape(None) would raise.
        phonetic = pron.get("phonetic") or ""
        pron_url = pron.get("pronUrl") or ""
        if not (phonetic or pron_url):
            continue
        row = f"{label}: {html.escape(phonetic)}"
        if pron_url:
            row += f" <audio controls src=\"{html.escape(pron_url)}\"></audio>"
        rows.append(f"<div class='audio-row'>{row}</div>")
    return rows


def _definition_items(defs: List[Dict[str, Any]]) -> List[str]:
    """Render each definition as an ``<li>`` holding the English and the Chinese meaning."""
    items: List[str] = []
    for d in defs:
        en = (d.get("enMeaning") or d.get("en") or "").strip()
        ch = (d.get("chMeaning") or d.get("ch") or "").strip()
        items.append(
            "<li>"
            f"<div class='definition-en'>{html.escape(en)}</div>"
            f"<div class='definition-ch'>{html.escape(ch)}</div>"
            "</li>"
        )
    return items


def _phrase_items(phrases: List[Any], phrase_defs: List[Dict[str, Any]]) -> List[str]:
    """Render each phrase as an ``<li>``, paired by index with its (possibly missing) definition."""
    items: List[str] = []
    for i, phrase in enumerate(phrases):
        phrase_def = (phrase_defs[i] if i < len(phrase_defs) else None) or {}
        en = (phrase_def.get("enMeaning") or phrase_def.get("en") or "").strip()
        ch = (phrase_def.get("chMeaning") or phrase_def.get("ch") or "").strip()
        items.append(
            "<li>"
            f"<span class='phrase'>{html.escape(str(phrase))}</span> — <span class='definition-en'>{html.escape(en)}</span>"
            f"<div class='definition-ch'>{html.escape(ch)}</div>"
            "</li>"
        )
    return items


def _render_example(sentence_text: str, target: str) -> Tuple[str, str]:
    """Return (HTML with the target in <strong>, HTML with the target's letters/digits blanked)."""
    escaped_sentence = html.escape(sentence_text)
    if not target:
        # Nothing to highlight or blank; both variants are the plain escaped sentence.
        return escaped_sentence, escaped_sentence

    # Highlighting runs on the ESCAPED sentence, so the pattern uses the escaped target.
    highlight_re = re.compile(r'\b' + re.escape(html.escape(target)) + r'\b', re.IGNORECASE)
    highlighted = highlight_re.sub(lambda m: f"<strong>{m.group(0)}</strong>", escaped_sentence)

    # Blanking runs on the RAW sentence (it is escaped afterwards), so the pattern must use
    # the raw target; an escaped target such as "R&amp;D" would never match "R&D".
    if " " in target:
        # Multi-word phrase: no word boundaries.
        blank_re = re.compile(re.escape(target), re.IGNORECASE)
    else:
        # Single word: word boundaries prevent matching inside longer words.
        blank_re = re.compile(r'\b' + re.escape(target) + r'\b', re.IGNORECASE)
    blanked = blank_re.sub(replace_alnum_with_underscores, sentence_text)
    return highlighted, html.escape(blanked)


def build_html_from_word_info(word_info: Dict[str, Any]) -> Dict[str, str]:
    """
    Build the HTML content of the note fields from one ``word_info`` entry.

    Reads ``word_info["partOfSpeech"]`` (type, pronunciationUK/US, definitions,
    phrases, phraseDefinitions) and ``word_info["sentences"]`` (notes, text,
    bookName); missing or null values are treated as empty.

    Returns a dict with these keys, each a newline-joined HTML string:
      * ``POS_Definitions``  - per part of speech: title, audio rows, definitions, phrases
      * ``Pronunciation``    - audio rows only
      * ``Definition``       - per part of speech: title + definition list (no audio)
      * ``Examples``         - example sentences with the target wrapped in <strong>
      * ``Blanked_Examples`` - same sentences with the target's letters/digits replaced by ``_``
    """
    pos_html_parts: List[str] = []
    pronunciation_parts: List[str] = []
    definition_parts: List[str] = []
    examples_parts: List[str] = []
    blanked_examples_parts: List[str] = []

    word_to_highlight = word_info.get("word") or ""

    # Parts of speech: pronunciation, definitions and phrases.
    for pos in word_info.get("partOfSpeech") or []:
        pos_type = pos.get("type") or ""
        title_html = f"<div class='pos-title'>{html.escape(str(pos_type)).capitalize()}</div>"
        audio_rows = _pronunciation_rows(pos)
        defs = pos.get("definitions") or []
        def_items = _definition_items(defs)

        pronunciation_parts.extend(audio_rows)

        if defs:
            # The standalone Definition field omits the title when the type is empty.
            definition_parts.append(
                "".join([title_html if pos_type else "", "<ul>", *def_items, "</ul>"])
            )

        part_lines: List[str] = [title_html, *audio_rows]
        if defs:
            part_lines += ["<ul>", *def_items, "</ul>"]

        phrases = pos.get("phrases") or []
        if phrases:
            part_lines.append("<div><b>Phrases:</b><ul>")
            part_lines.extend(_phrase_items(phrases, pos.get("phraseDefinitions") or []))
            part_lines.append("</ul></div>")

        pos_html_parts.append("<div>" + "\n".join(part_lines) + "</div>")

    # Example sentences.
    for s in word_info.get("sentences") or []:
        sentence_text = (s.get("notes") or "").strip()
        if not sentence_text:
            continue

        # The sentence's own marked text wins; fall back to the headword.
        target = (s.get("text") or word_to_highlight or "").strip()
        highlighted, escaped_blanked = _render_example(sentence_text, target)

        # Source information.
        book = s.get("bookName") or ""
        meta = f" — 《{html.escape(str(book))}》" if book else ""

        examples_parts.append(f"<div class='example'><div class='example-text'>{highlighted}</div><div class='example-meta'>{meta}</div></div>")
        blanked_examples_parts.append(f"<div class='example'><div class='example-text'>{escaped_blanked}</div><div class='example-meta'>{meta}</div></div>")

    return {
        "POS_Definitions": "\n".join(pos_html_parts),
        "Pronunciation": "\n".join(pronunciation_parts),
        "Definition": "\n".join(definition_parts),
        "Examples": "\n".join(examples_parts),
        "Blanked_Examples": "\n".join(blanked_examples_parts)
    }

In [3]:
# Smoke test: render the note fields for one sample entry and display the resulting dict.
build_html_from_word_info(word_list[5])

{'POS_Definitions': '<div><div class=\'pos-title\'>Adjective</div>\n<div class=\'audio-row\'>UK: / ˌpær.əˈmɪl.ɪ.tri / <audio controls src="https://dictionary.cambridge.org/media/english-chinese-simplified/uk_pron/u/ukp/ukpar/ukparag014.mp3"></audio></div>\n<div class=\'audio-row\'>US: / ˌper.əˈmɪl.ə.ter.i / <audio controls src="https://dictionary.cambridge.org/media/english-chinese-simplified/us_pron/p/par/param/paramilitary.mp3"></audio></div>\n<ul>\n<li><div class=\'definition-en\'>A paramilitary group is organized like an army but is not official and often not legal .</div><div class=\'definition-ch\'>准军事的 ， 非法军事组织的</div></li>\n<li><div class=\'definition-en\'>connected with and helping the official armed forces</div><div class=\'definition-ch\'>与正规军有联系的 ， 辅助军事的</div></li>\n</ul></div>\n<div><div class=\'pos-title\'>Noun [ c ]</div>\n<div class=\'audio-row\'>UK: / ˌpær.əˈmɪl.ɪ.tri / <audio controls src="https://dictionary.cambridge.org/media/english-chinese-simplified/uk_pron/u/ukp/

In [4]:
ANKI_CONNECT_URL = "http://localhost:8765"  # local AnkiConnect HTTP endpoint
MODEL_NAME = "WordType"
REQUEST_TIMEOUT = 2.0  # seconds, passed to requests.post


def invoke(action: str, **params):
    """
    Send one AnkiConnect request and return the decoded JSON reply.

    Args:
        action: AnkiConnect action name (e.g. "findNotes", "updateNoteFields").
        **params: Action parameters, sent as the request's "params" object.

    Returns:
        The reply as decoded by ``Response.json()``. Callers in this notebook
        read its "result" key.

    Side effects:
        * On any ``requests.RequestException`` an error line is printed and
          ``sys.exit(1)`` is called. ``SystemExit`` is not an ``Exception``
          subclass, so ``except Exception`` in callers does not intercept it.
        * When the reply carries a non-empty "error" value, a warning is
          printed so that a rejected action is not mistaken for a success.
          The reply is still returned unchanged.
    """
    try:
        r = requests.post(
            ANKI_CONNECT_URL,
            json={"action": action, "version": 6, "params": params},
            timeout=REQUEST_TIMEOUT
        )
        r.raise_for_status()
        reply = r.json()
    except requests.RequestException as e:
        print(f"[错误] 无法连接 AnkiConnect（{ANKI_CONNECT_URL}）：{e}")
        sys.exit(1)

    if isinstance(reply, dict) and reply.get("error"):
        print(f"[警告] AnkiConnect 动作 {action} 返回错误：{reply['error']}")
    return reply

In [5]:

# Mark every existing note as "phrase" (headword contains a space) or "word".
for word_info in word_list:
    parts_of_speech = word_info.get("partOfSpeech") or []
    word = parts_of_speech[0].get("wordPrototype", "") if parts_of_speech else ""
    if not word:
        # No headword: a query for "Word:" would not identify a single note.
        continue

    tags = "phrase" if " " in word else "word"
    print(word)
    query = f'deck:"{deck_name}" "Word:{word}"'
    try:
        find_res = invoke("findNotes", query=query)
    except Exception as e:
        # Skip this word; falling through would reuse the previous word's
        # `find_res` and update the wrong note.
        print(f"findNotes 调用异常: {e}")
        continue

    note_ids = find_res.get("result", []) or []
    if note_ids:
        note_id = note_ids[0]
        # NOTE(review): this writes to a note FIELD named "Tags"; if real Anki tags
        # are intended, AnkiConnect's "addTags" action is the one to use. Confirm.
        res = invoke("updateNoteFields", note={"id": note_id, "fields": {"Tags": tags}})


burst up to six feet


KeyboardInterrupt: 

In [None]:

# Backfill two derived fields on existing notes, last entries first:
#   * Blanked_Examples: Examples with letters/digits inside <strong>...</strong>
#     replaced by "_" and the <strong> tags dropped.
#   * Definition: POS_Definitions with the ".audio-row" elements removed.
# A field is only filled when it is currently empty and its source is not.
for word_info in reversed(word_list):
    parts_of_speech = word_info.get("partOfSpeech") or []
    word = parts_of_speech[0].get("wordPrototype", "") if parts_of_speech else ""
    if not word:
        continue
    print(word)
    query = f'deck:"{deck_name}" "Word:{word}"'
    try:
        find_res = invoke("findNotes", query=query)
    except Exception as e:
        # Skip this word; falling through would reuse the previous word's `find_res`.
        print(f"findNotes 调用异常: {e}")
        continue

    note_ids = find_res.get("result", []) or []
    if not note_ids:
        continue

    info_res = invoke("notesInfo", notes=[note_ids[0]])
    note_infos = info_res.get("result") or []
    fields = ((note_infos[0] or {}).get("fields") or {}) if note_infos else {}

    updates = {}

    blanked_example_sentence_html = (fields.get("Blanked_Examples") or {}).get("value") or ""
    if len(blanked_example_sentence_html) == 0:
        example_sentence_html = (fields.get("Examples") or {}).get("value") or ""
        # Replace letters/digits inside <strong>...</strong> (non-greedy) with "_" and drop the tags.
        blanked = re.sub(
            r'<strong>(.*?)</strong>',
            lambda m: re.sub(r'[A-Za-z0-9]', '_', m.group(1)),
            example_sentence_html,
            flags=re.DOTALL
        )
        if blanked:
            updates["Blanked_Examples"] = blanked

    definition = (fields.get("Definition") or {}).get("value") or ""
    if len(definition) == 0:
        pos_definitions = (fields.get("POS_Definitions") or {}).get("value") or ""
        soup = BeautifulSoup(pos_definitions, "html.parser")
        for el in soup.select(".audio-row"):
            el.decompose()
        definition = str(soup)
        if definition:
            updates["Definition"] = definition

    if updates:
        # One request carries every changed field of the note.
        upd = invoke("updateNoteFields", note={"id": note_ids[0], "fields": updates})
        if "Blanked_Examples" in updates:
            print(f"update blank of {word}")
        if "Definition" in updates:
            print(f"update definition of {word}")



In [None]:
def extract_raw_sentence(existing_examples_html: str) -> List[str]:
    """
    Pull the plain text of every ``<div class='example-text'>...</div>`` out of
    an existing Examples field.

    Inner tags (e.g. ``<strong>``) are removed, HTML entities are unescaped and
    surrounding whitespace is stripped; entries that end up empty are dropped.
    The returned list keeps the order in which the sentences appear.
    """
    if not existing_examples_html:
        return []

    example_block = re.compile(
        r"<div\s+class=['\"]example-text['\"]\s*>(.*?)</div>", flags=re.S | re.I
    )
    any_tag = re.compile(r"<[^>]+>")

    cleaned = (
        html.unescape(any_tag.sub("", hit.group(1))).strip()
        for hit in example_block.finditer(existing_examples_html)
    )
    return [text for text in cleaned if text]

In [None]:

# One-off experiment on the FIRST entry of word_list only: append each example
# field to itself and write both back.
# NOTE(review): this is not idempotent; every run doubles the stored content.
for word_info in word_list:
    parts_of_speech = word_info.get("partOfSpeech") or []
    word = word_prototype = parts_of_speech[0].get("wordPrototype", "") if parts_of_speech else ""
    print(word)
    query = f'deck:"{deck_name}" "Word:{word}"'
    try:
        find_res = invoke("findNotes", query=query)
    except Exception as e:
        print(f"findNotes 调用异常: {e}")
        break

    note_ids = find_res.get("result", []) or []
    if not note_ids:
        # Nothing to update; do not fall through to note_ids[0].
        print(f"no note found for {word!r}")
        break

    info_res = invoke("notesInfo", notes=[note_ids[0]])
    note_infos = info_res.get("result") or []
    fields = ((note_infos[0] or {}).get("fields") or {}) if note_infos else {}
    example_sentence_html = (fields.get("Examples") or {}).get("value") or ""
    # `or ""` also covers a null field value, which would otherwise make `+` fail.
    example_blanked_sentence_html = (fields.get("Blanked_Examples") or {}).get("value") or ""
    new_sentence_html = example_sentence_html + example_sentence_html
    new_blanked_sentence_html = example_blanked_sentence_html + example_blanked_sentence_html

    # One request carries both fields.
    upd = invoke(
        "updateNoteFields",
        note={
            "id": note_ids[0],
            "fields": {
                "Examples": new_sentence_html,
                "Blanked_Examples": new_blanked_sentence_html,
            },
        },
    )
    break

解决很多例句没有chapter，只有bookname的问题

In [None]:
import re

def fix_meta(html, bookName, chapter):
    """
    Fill in ``example-meta`` divs that carry no chapter information.

    For every ``<div class="example-meta">...</div>`` in ``html``, the content
    is inspected after removing any ``《...》`` book title and surrounding
    dashes, colons and whitespace. If no word character remains (the meta is
    empty or only names the book), the div is replaced by
    ``——《bookName》: chapter``. Otherwise the content is kept.

    Args:
        html: HTML of an Examples / Blanked_Examples field.
        bookName: Book title to write into incomplete metas (HTML-escaped here).
        chapter: Chapter title to write into incomplete metas (HTML-escaped here).

    Returns:
        ``(new_html, need_update)``; ``need_update`` is True when at least one
        meta div was rewritten with the book and chapter.
    """
    # The `html` parameter shadows the standard-library module inside this
    # function, so the escape helper is imported under its own name.
    from html import escape

    need_update = False
    # Escape like the rest of the notebook does for book names, so titles
    # containing &, < or > cannot break the stored markup.
    filled_meta = (
        f"<div class='example-meta'>——《{escape(str(bookName))}》: {escape(str(chapter))}</div>"
    )

    def repl(m):
        nonlocal need_update
        inner = m.group(1)
        # Drop the book title, then leading/trailing dashes, colons and whitespace.
        cleaned = re.sub(r'《.*?》', '', inner)
        cleaned = re.sub(r'^[\s\-\u2013\u2014:]+|[\s\-\u2013\u2014:]+$', '', cleaned)
        if not re.search(r'\w', cleaned, flags=re.UNICODE):
            need_update = True
            return filled_meta
        return f"<div class='example-meta'>{inner}</div>"

    # `\s*` before `>` tolerates the same spacing that extract_raw_sentence accepts.
    new_html = re.sub(
        r'<div\s+class=[\'"]example-meta[\'"]\s*>(.*?)</div>', repl, html, flags=re.DOTALL
    )
    return new_html, need_update

# Demo of fix_meta on one stored example whose meta names the book but not the chapter.
# The sample must NOT be bound to the name `html`: that would overwrite the `html`
# module imported in the first cell and break every later html.escape / html.unescape call.
sample_html = """<div class="example"><div class="example-text">gave unemployed artists stipends, studios and ___________ ______, in effect serving as a scouting network.</div><div class="example-meta"> — 《The Economist [Sep 27th 2025]》</div></div>"""
chapter = "A restaurant scandal sticks in China’s throat"
bookName = "The Economist [Sep 27th 2025]"
new_html, need_update = fix_meta(sample_html, bookName, chapter)
print(new_html)
print(need_update)


In [None]:

from tqdm import tqdm
import math
# For every word that already has a note, fill in the chapter on example metas
# that only name the book (see fix_meta). The first sentence of the entry
# supplies the book name and the chapter.
for word_info in tqdm(reversed(word_list), total=len(word_list), desc="Processing words"):
    parts_of_speech = word_info.get("partOfSpeech") or []
    word = parts_of_speech[0].get("wordPrototype", "") if parts_of_speech else ""
    if not word:
        continue
    query = f'deck:"{deck_name}" "Word:{word}"'
    print(word)
    try:
        find_res = invoke("findNotes", query=query)
    except Exception as e:
        # Skip this word; falling through would reuse the previous word's `find_res`.
        print(f"findNotes 调用异常: {e}")
        continue
    note_ids = find_res.get("result", []) or []
    if not note_ids:
        continue

    sentences = word_info.get("sentences") or []
    if not sentences:
        continue
    chapter = sentences[0].get("chapter")
    # Skip when the chapter is None, NaN or blank.
    if chapter is None or (isinstance(chapter, float) and math.isnan(chapter)) or str(chapter).strip() == "":
        continue
    bookName = sentences[0].get("bookName")

    info_res = invoke("notesInfo", notes=[note_ids[0]])
    note_infos = info_res.get("result") or []
    fields = ((note_infos[0] or {}).get("fields") or {}) if note_infos else {}

    updates = {}
    for field_name in ("Examples", "Blanked_Examples"):
        current_html = (fields.get(field_name) or {}).get("value") or ""
        fixed_html, need_update = fix_meta(current_html, bookName, chapter)
        if need_update:
            print(fixed_html)
            updates[field_name] = fixed_html

    if updates:
        # One request carries every changed field of the note.
        upd = invoke("updateNoteFields", note={"id": note_ids[0], "fields": updates})





In [None]:

# Manually repair the Blanked_Examples meta of one specific note.
query = f'deck:"{deck_name}" "Word:performance venue"'
find_res = invoke("findNotes", query=query)
note_ids = find_res.get("result", []) or []
print(note_ids)

upd = None
if note_ids:
    # Read the fields of the note that was just found. Reusing `info_res` left
    # over from an earlier cell could write another note's content here.
    info_res = invoke("notesInfo", notes=[note_ids[0]])
    note_infos = info_res.get("result") or []
    fields = ((note_infos[0] or {}).get("fields") or {}) if note_infos else {}
    blanked_example_sentence_html = (fields.get("Blanked_Examples") or {}).get("value") or ""
    print(blanked_example_sentence_html)
    # NOTE(review): bookName / chapter are whatever an earlier cell last assigned;
    # confirm they belong to this note before running.
    blanked_example_sentence_html, need_update = fix_meta(blanked_example_sentence_html, bookName, chapter)
    print(blanked_example_sentence_html)

    upd = invoke("updateNoteFields", note={"id": note_ids[0], "fields": {"Blanked_Examples": blanked_example_sentence_html}})
upd

上传文章音频到anki中，用于周期性复习和听力

1. 遍历notes.json，获得所有文章标题
2. 输入音频所在地址，以及输入拷贝截屏
3. 基于文章标题、音频和截屏生成 Anki 卡片并上传到 Anki，同时将已有的记录以 JSON 形式保存。JSON 中保存音频路径和截屏路径；音频和截屏需保存到 JSON 所在的文件夹，并重命名为文章标题。

In [6]:
import os
import datetime
from datetime import date,datetime
import json
from pathlib import Path
import re

# === Configuration ===
AUDIO_BASE_DIR = Path(r"G:\Book\Economist")  # root directory of the Economist folder
AUDIO_PREFIX = "TEco-"                 # prefix of the audio folder names
AUDIO_SUFFIX = "音频"                   # suffix of the audio folder names

In [7]:

def load_latest_dict_list(folder="./saved"):
    """
    Load the JSON file in ``folder`` whose date-stamped name is closest to today.

    Only files named like ``YYYY-MM-DD.json`` are considered; names of the
    right length that do not parse as a date are ignored. Distance is the
    absolute number of days from today, and on a tie the file listed first
    by ``os.listdir`` wins.

    Returns the decoded JSON content, or ``[]`` when the folder does not
    exist or holds no usable file.
    """
    if not os.path.exists(folder):
        return []

    expected_length = len("2025-09-18.json")
    today = date.today()

    # (distance in days, file name) for every candidate with a parsable date.
    candidates = []
    for name in os.listdir(folder):
        if not (name.endswith(".json") and len(name) == expected_length):
            continue
        try:
            stamp = datetime.strptime(name[:-5], "%Y-%m-%d").date()
        except ValueError:
            continue
        candidates.append((abs((today - stamp).days), name))

    if not candidates:
        return []

    # min() returns the first minimal element, i.e. the earliest-listed file on ties.
    _, nearest_name = min(candidates, key=lambda pair: pair[0])
    with open(os.path.join(folder, nearest_name), "r", encoding="utf-8") as handle:
        return json.load(handle)

def extract_date(s):
    """
    Parse the date written in square brackets in ``s``, e.g. ``[Sep 6th 2025]``.

    The bracket content is first tried as ``"%b %dth %Y"``; then, for each of
    the suffixes st / nd / rd / th in that order, the content with that suffix
    removed is tried as ``"%b %d %Y"``. Returns ``datetime.max`` when there is
    no bracket or nothing parses, so such items sort last.
    """
    bracketed = re.search(r'\[(.*?)\]', s)
    if bracketed is None:
        return datetime.max  # no bracket found: sort last

    date_str = bracketed.group(1)
    attempts = [(date_str, "%b %dth %Y")]
    attempts.extend(
        (date_str.replace(suffix, ""), "%b %d %Y") for suffix in ("st", "nd", "rd", "th")
    )
    for candidate, fmt in attempts:
        try:
            return datetime.strptime(candidate, fmt)
        except ValueError:
            pass
    return datetime.max
    
def list_mp3_files(base_dir: Path):
    """
    Recursively collect every file under ``base_dir`` whose name ends in
    ``.mp3`` (case-insensitive), returned as ``Path`` objects in ``os.walk`` order.
    """
    return [
        Path(folder) / filename
        for folder, _subdirs, filenames in os.walk(base_dir)
        for filename in filenames
        if filename.lower().endswith(".mp3")
    ]


In [11]:
cur_notes = load_latest_dict_list(r"G:\Code\Python\Project\Reader\data\backup\notes")

# Unique (bookName, chapter) pairs of all notes whose book name mentions "Economist".
passage = set()
for note in cur_notes:
    book_name = note.get("bookName")
    # Notes without a textual bookName (missing key, None, NaN) are skipped
    # rather than crashing the membership test.
    if isinstance(book_name, str) and "Economist" in book_name:
        passage.add((book_name, note.get("chapter")))

# Order by the date written in the book name's square brackets.
sorted_passage = sorted(passage, key=lambda x: extract_date(x[0]))
for p in sorted_passage:
    print(p[0], p[1])

The Economist [Sep 6th 2025] Xi Jinping’s anti-American party
The Economist [Sep 13th 2025] Japan’s new leadership struggle is far from business as usual
The Economist [Sep 13th 2025] Charlie Kirk challenged liberals until the day he was murdered
The Economist [Sep 20th 2025] How Israel is losing America
The Economist [Sep 20th 2025] The UN’s grim future
The Economist [Sep 20th 2025] The surprising nature of protest in China
The Economist [Sep 20th 2025] The health benefits of sunlight may outweigh the risk of skin cancer
The Economist [Sep 20th 2025] What Elon Musk gets wrong about Europe’s hard right
The Economist [Sep 20th 2025] Don’t fret over China’s new climate targets
The Economist [Sep 27th 2025] A restaurant scandal sticks in China’s throat
The Economist [Sep 27th 2025] A made-in-China plan for world domination
The Economist [Sep 27th 2025] Could Tony Blair run Gaza?
The Economist [Sep 27th 2025] Would you shelter under Pakistan’s nuclear umbrella?
The Economist [Sep 27th 2025