In [1]:
import pandas as pd
import html
import pathlib
from typing import Set, Dict, TextIO
import base64
import json
from pypinyin import pinyin, lazy_pinyin, Style
from __future__ import annotations

In [2]:
# Load every .xls workbook found under input/ into one DataFrame.
# The paths are sorted because directory listing order depends on the
# filesystem; without a fixed order the row positions (and everything keyed on
# them further down) could change between runs.
df = pd.concat(
    [pd.read_excel(path) for path in sorted(pathlib.Path("input").glob("*.xls"))]
)
cols_to_unescape = ["题目类型", "题干", "选项", "答案"]
# Decode HTML entities in the text columns; na_action="ignore" leaves missing
# cells untouched instead of passing them to html.unescape.
df[cols_to_unescape] = df[cols_to_unescape].map(html.unescape, na_action="ignore")

In [3]:
# Replace the per-file row labels left over from the concat with a single
# 0..n-1 range, then display the frame.
df = df.reset_index(drop=True)
df

Unnamed: 0,题号,题目类型,题干,选项,答案
0,15139,判断题,体育锻炼的负荷量无论是强度、时间还是密度都要因人、因时而异，应根据自身的实际情况安排运动负荷。,,正确
1,15140,判断题,同一个人对运动负荷量的承受能力是一成不变的。,,错误
2,15141,判断题,体育锻炼对人体的积极作用在短时间内就能取得成效。,,错误
3,15142,判断题,耐力可分为有氧耐力和无氧耐力两种。,,正确
4,15146,判断题,体质健康测试中的体重指数（BMI）=体重（千克）/身高2（米2）\n,,正确
...,...,...,...,...,...
928,142759,单选题,1932年国立浙江大学正式设立体育部，当时规定学生__________不及格且不补足学分就不...,A．早操\nB．体操\nC．体测\nD．跑步,A
929,142760,单选题,普通健康人锻炼身体的适宜负荷量，一般采用___________来确定，即以本人最大心率的__...,A．心率百分数、65～85%\nB．心率百分数、60～80%\nC．心率、60～80%\nD...,B
930,142761,单选题,运动效果取决于运动刺激的__________，运动量太小，对机体的影响轻微，运动效果不佳，运...,A．强度与密度、疾病\nB．密度、运动性疾病\nC．强度、运动性疾病\nD．难度、疾病与损伤,B
931,142762,单选题,当患者因意外事故心脏停止跳动时，必须立即实施________________，争取在最短时间...,A．人工呼吸 \nB．心肺复苏术\nC．止血\nD．抗休克,B


In [4]:
class TrieNode:
    """A single node of the character trie.

    Attributes:
        count: counter that ``Trie.insert`` increments each time an inserted
            word passes through this node.
        is_end_of_word: set to True when an inserted word ends at this node.
        children: child nodes keyed by the next character.
        content: free-form dict, empty on creation, filled in later by the
            traversal code.
        prob_index: index passed to ``Trie.insert`` for the word ending here;
            -1 while no word ends at this node.
        node_id: identifier supplied by the creator of the node.
    """

    count: int
    is_end_of_word: bool
    children: Dict[str, TrieNode]
    content: dict
    prob_index: int
    node_id: int

    def __init__(self, node_id):
        """Create an empty node carrying the given ``node_id``."""
        self.node_id = node_id
        # Counters / flags start in their "nothing inserted yet" state.
        self.count = 0
        self.prob_index = -1
        self.is_end_of_word = False
        # Fresh containers per instance so nodes never share state.
        self.children = dict()
        self.content = dict()


class Trie:
    """Character trie whose nodes receive sequential ids in creation order."""

    def __init__(self):
        """Create a trie holding only the root node (id 0)."""
        self.root = TrieNode(0)
        # Next id to hand out; the root already consumed id 0.
        self.node_count = 1

    def insert(self, word, prob_index):
        """Insert ``word`` and tag its final node with ``prob_index``.

        Every node on the path (root included) gets its ``count`` incremented.
        Inserting the same word again overwrites the stored ``prob_index``.
        """
        current = self.root
        current.count += 1
        for ch in word:
            child = current.children.get(ch)
            if child is None:
                # First time this character follows the current prefix.
                child = TrieNode(self.node_count)
                current.children[ch] = child
                self.node_count += 1
            child.count += 1
            current = child
        # The node reached after the last character marks a complete word.
        current.is_end_of_word = True
        current.prob_index = prob_index

    def _walk(self, text):
        """Follow ``text`` from the root; return the node reached or None."""
        current = self.root
        for ch in text:
            current = current.children.get(ch)
            if current is None:
                return None
        return current

    def search(self, word):
        """Return True only if ``word`` was inserted as a complete word."""
        node = self._walk(word)
        return node is not None and node.is_end_of_word

    def starts_with(self, prefix):
        """Return True if some inserted word starts with ``prefix``."""
        return self._walk(prefix) is not None

In [5]:
# Map row label -> question stem, then index every stem in a trie keyed by
# its alphanumeric characters only (everything else is dropped).
probs = df["题干"].to_dict()
h = Trie()
for row_label, stem in probs.items():
    trie_key = "".join(ch for ch in stem if ch.isalnum())
    h.insert(trie_key, row_label)

In [6]:
# Spot check: answer stored for the row labelled 77.
df.at[77, "答案"]

'D'

In [7]:
def format_prob(index: int):
    """Build the dict describing the question stored at row ``index`` of ``df``.

    The stem ("题干") is always included; the options ("选项") and the answer
    ("答案") are included only when their cell is not missing.
    """
    record = {"题干": df.loc[index, "题干"]}
    for column in ("选项", "答案"):
        value = df.loc[index, column]
        if pd.notna(value):
            record[column] = value
    return record

In [8]:
def Dfs(prefix: str, x: "TrieNode") -> tuple[str, int]:
    """Fill ``content`` for the subtree rooted at ``x`` and return its handle.

    A node that does not end a word and has exactly one child is skipped: the
    call is forwarded to that child and the node's own ``content`` stays
    empty, so single-child chains collapse into their first branching or
    word-ending node.

    Args:
        prefix: characters on the path from the root down to ``x``.
        x: node to process.

    Returns:
        ``(prefix, node_id)`` of the node that ended up representing ``x``
        (``x`` itself, or the descendant it was collapsed into).
    """
    if x.is_end_of_word:
        x.content["prob"] = format_prob(x.prob_index)
    elif len(x.children) == 1:
        # Fully represented by its only child.
        ((char, child),) = x.children.items()
        return Dfs(prefix + char, child)
    sons = [Dfs(prefix + char, child) for char, child in x.children.items()]
    # The root has an empty prefix and gets a fixed label instead. The label
    # is computed outside the f-string: reusing the enclosing quote character
    # inside an f-string expression is a SyntaxError before Python 3.12.
    label = prefix or "起始索引"
    x.content["prefix"] = f"{label}: \n"
    if sons:
        x.content["sons"] = sons
    x.content["node_id"] = x.node_id
    return prefix, x.node_id


# Populate every node's content, starting from the root with an empty prefix.
Dfs(prefix="", x=h.root)


def Dfs2(x: "TrieNode", out: "list | None" = None) -> None:
    """Collect the non-empty ``content`` dicts of a subtree in pre-order.

    Args:
        x: root of the subtree to walk.
        out: list that receives the dicts. When omitted, the module-level
            ``entries`` list is used, which must already exist at call time.

    Returns:
        None; results are appended to ``out``.
    """
    if out is None:
        # Default kept for existing one-argument calls.
        out = entries
    if x.content:
        out.append(x.content)
    for child in x.children.values():
        Dfs2(child, out)


# Flat list of the content dicts of all nodes that have any, in pre-order.
entries = list()
Dfs2(x=h.root)

In [11]:
def _initial_sort_key(son):
    """Sort key for a ``(prefix, node_id)`` pair, based on the prefix's first character.

    ASCII characters sort as themselves; any other character sorts by the
    first item pypinyin returns for it in TONE3 style.
    """
    initial = son[0][0]
    if initial.isascii():
        return initial
    return pinyin(initial, style=Style.TONE3)[0][0]


# Reorder the children listed in the first entry (assumed to be the root's
# entry -- TODO confirm the root always produces content) and prepend the
# pinyin of the first character to every non-ASCII prefix.
e0s = []
for son in sorted(entries[0]["sons"], key=_initial_sort_key):
    son = list(son)
    if not son[0][0].isascii():
        son[0] = f"{pinyin(son[0][0])[0][0]}-{son[0]}"
    e0s.append(son)
entries[0]["sons"] = e0s
# ensure_ascii=False writes the Chinese text verbatim, so the file encoding is
# pinned to UTF-8 rather than left to the platform's locale default.
with open("data.json", "w", encoding="utf-8") as f:
    json.dump(
        entries,
        f,
        ensure_ascii=False,
        indent=4,
    )