In [None]:
import re
import os
import json

from tqdm import tqdm

In [None]:
# Ordered table of regex rewrites used to normalise raw text.
# Keys are regular-expression patterns and values are replacement templates.
# PreProcessText passes every pair to re.sub in insertion order, so an earlier
# rule can change what a later rule gets to see.
substitutions = {
    # pattern: replacement
    # roman numerals (single-character numeral forms -> ASCII letters)
    'Ⅰ': 'I',
    'Ⅱ': 'II',
    'Ⅲ': 'III',
    'Ⅳ': 'IV',
    'Ⅴ': 'V',
    'Ⅵ': 'VI',
    'Ⅶ': 'VII',
    'Ⅷ': 'VIII',
    'Ⅸ': 'IX',
    'Ⅹ': 'X',
    'Ⅺ': 'XI',
    'Ⅻ': 'XII',
    'Ⅼ': 'L',
    'Ⅽ': 'C',
    'Ⅾ': 'D',
    'Ⅿ': 'M',
    # "digit + full stop" characters (1.-20.) and fullwidth digits -> ASCII
    "⒈": "1.", "⒉": "2.", "⒊": "3.", "⒋": "4.", "⒌": "5.", 
    "⒍": "6.", "⒎": "7.", "⒏": "8.", "⒐": "9.", "⒑": "10.",
    "⒒": "11.", "⒓": "12.", "⒔": "13.", "⒕": "14.", "⒖": "15.",
    "⒗": "16.", "⒘": "17.", "⒙": "18.", "⒚": "19.", "⒛": "20.",
    "０": "0", "１": "1", "２": "2", "３": "3", "４": "4",
    "５": "5", "６": "6", "７": "7", "８": "8", "９": "9",
    # whitespace: collapse runs of the characters in the class to one space
    # NOTE(review): the class presumably holds non-ASCII space characters that
    # look like plain spaces -- check the raw bytes before editing this line.
    '[  ]+': ' ',
    # blanks: fill-in-the-gap markers are rewritten to <blank ...> tags
    r"_+ *_+":  " <blank> ",
    # NOTE(review): the rule above runs first and already consumes any run of
    # two or more underscores, so this numbered form appears to match only
    # when a single underscore sits on each side of the digits -- confirm.
    r"_+ *(\d+) *_+": r" <blank text=\1> ",
    r"\[ *\]": "<blank text=parentheses> ",
    r"\( *\)": "<blank text=parentheses> ",
    # NOTE(review): this replacement is NOT a raw string, so "\1" is the
    # control character U+0001 instead of a back-reference to group 1.
    r"<blank>\((\d+)\)": "<blank text=\1>",
    # collapse the extra spaces introduced by the padded replacements above
    " +": " ",
    # specific rules
    # a double quote between two ASCII words becomes an apostrophe
    r"([A-Za-z]+)(\")([A-Za-z]+)": r"\1'\3",
    # misc: fullwidth / look-alike punctuation -> ASCII
    "∶": ":",
    "．": ".",
    "／": "/",
    "－": "--",
    '\u00AD': '', # soft hyphen, not visible
    # leftovers of a lossy decode (presumably U+FFFD replacement characters)
    "��": " ",
    "�D": "",
    "�C": "",
    r'°Ø': "'",             # Fix incorrect apostrophe encoding
    r'°∞': '"',             # Fix incorrect open quote encoding
    r'°°': ' ',             # Replace double °° with a space
    r'[Ô≥Â¥∏œ∞]': '',       # Remove random corrupted characters
    r'÷–øº': '',            # Remove other corrupted characters
    r'\"¢\"': '',           # Remove incorrect double quotes
}

def PreProcessText(text_arr, rules=None):
    """Apply an ordered set of regex substitutions to every string.

    Args:
        text_arr: Iterable of strings to clean.
        rules: Optional mapping of regex pattern -> replacement template.
            The pairs are applied in the mapping's iteration order. When
            omitted, the module-level ``substitutions`` table is used.

    Returns:
        A new list with the rewritten strings. Strings that end up exactly
        empty are dropped; whitespace-only strings are kept.
    """
    if rules is None:
        # Looked up at call time so a redefined table is picked up.
        rules = substitutions

    cleaned_lines = []
    for text in text_arr:
        # Order matters: later rules operate on the output of earlier ones.
        for pattern, replacement in rules.items():
            text = re.sub(pattern, replacement, text)
        if text != "":
            cleaned_lines.append(text)
    return cleaned_lines

In [None]:
# Directories processed by every later cell, in this order.
paths = ["zww.cn_nianji1", "zww.cn_nianji2", "zww.cn_nianji3", "zww.cn_zhongkao"]

In [None]:
# Map each source directory to the names of the JSON files it contains.
# Only ``*.json`` entries are kept: later cells json.load() every listed file
# and derive output names by removing that extension, so stray entries
# (hidden files, checkpoint folders) would crash or produce wrong names.
# Sorting makes the processing order reproducible, since os.listdir returns
# entries in arbitrary order.
files = {}

for path in paths:
    files[path] = sorted(
        filename for filename in os.listdir(path) if filename.endswith(".json")
    )

In [None]:
# Load every listed JSON file, clean its "content" entries with
# PreProcessText, and keep only files that still have at least one line.
contents = {}
for path in paths:
    contents[path] = cleaned_by_file = {}
    for filename in tqdm(files[path]):
        json_path = os.path.join(path, filename)
        with open(json_path, "r", encoding="utf-8") as handle:
            raw_content = json.load(handle)["content"]
        cleaned = PreProcessText(raw_content)
        if cleaned:
            cleaned_by_file[filename] = cleaned

In [None]:
# Write the cleaned lines of each file to zww_parsed/<path>/<name>.txt,
# one line of text per output line. Files without cleaned content are skipped.
for path in paths:
    output_dir = os.path.join('zww_parsed', path)
    os.makedirs(output_dir, exist_ok=True)
    for filename in tqdm(files[path]):
        if filename not in contents[path]:
            continue
        # splitext drops the real extension instead of assuming that it is
        # exactly five characters long (".json").
        stem = os.path.splitext(filename)[0]
        path_to_file = os.path.join(output_dir, stem + '.txt')
        with open(path_to_file, "w", encoding="utf-8") as f:
            f.writelines(line + '\n' for line in contents[path][filename])

In [None]:
# Read back every file found under zww_parsed/<path> as a list of lines
# (newline characters are kept, as returned by readlines()).
question_text_set = {}
for path in paths:
    parsed_dir = os.path.join('zww_parsed', path)
    question_text_set[path] = lines_by_file = {}
    for file in os.listdir(parsed_dir):
        with open(os.path.join(parsed_dir, file), "r", encoding="utf-8") as f:
            lines_by_file[file] = f.readlines()

# Multiple choice

In [None]:
def ExtractMultipleChoiceQs(text_arr):
    """Collect the lines that belong to multiple-choice sections.

    A section starts at a line containing one of the multiple-choice heading
    keywords and ends at the next line that looks like a Chinese heading,
    unless that line directly follows another heading-like line (it is then
    treated as an instruction line and kept).

    Args:
        text_arr: Iterable of text lines, normally newline-terminated.

    Returns:
        List of extracted lines. A line in which a question number is glued to
        the preceding text (e.g. ``"are2. She"``) is split into separate
        newline-terminated lines.
    """
    start_pattern = re.compile(r"^(.*?(单项选择|选择填空|语法选择|单项填空).*)$")
    # End pattern: a line with at least 2 Chinese characters located before
    # the first parenthesis of the line.
    end_pattern = re.compile(r"^(?=(?:[^\(\)]*[\u4e00-\u9fa5]){2,}).*$")
    # A letter or full stop immediately followed by "<digits>." marks a
    # question number glued to the previous text. The trailing dot is escaped
    # so plain numbers such as "A.5 B.6" are not split.
    glued_number = re.compile(r"([A-Za-z.])(\d+\.)")

    def looks_like_heading(text):
        # "不填" appears inside answer choices and must not count as a heading.
        return end_pattern.match(text.replace("不填", ""))

    extracting = False
    extracted_lines = []
    for line in text_arr:
        if not line.strip():
            continue
        if start_pattern.match(line):  # Found the start of the MCQ section
            extracting = True
        elif extracting and looks_like_heading(line):
            # rule: avoid the instruction line right after the question heading
            previous = extracted_lines[-1] if extracted_lines else None
            follows_heading = previous is not None and (
                looks_like_heading(previous) or start_pattern.match(previous)
            )
            if not follows_heading:
                extracting = False
        if not extracting:
            continue

        split_line = glued_number.sub(r"\1\n\2", line)
        if split_line != line:
            # Skip empty pieces: splitting a newline-terminated line would
            # otherwise append a blank line, which also breaks the
            # previous-line check above.
            extracted_lines.extend(
                piece + "\n" for piece in split_line.split("\n") if piece
            )
        else:
            extracted_lines.append(line)

    return extracted_lines

In [None]:
# Extract the multiple-choice lines of every parsed file; files that yield
# nothing are left out of the mapping.
multiple_choice_questions = {}

for path in paths:
    multiple_choice_questions[path] = extracted_by_file = {}
    for file, text_lines in question_text_set[path].items():
        mcq_lines = ExtractMultipleChoiceQs(text_lines)
        if mcq_lines:
            extracted_by_file[file] = mcq_lines

# Dump the extracted lines to zww_parsed/multiple_choice/<path>/<name>.txt.
for path in paths:
    mcq_dir = os.path.join('zww_parsed', 'multiple_choice', path)
    os.makedirs(mcq_dir, exist_ok=True)
    for file in tqdm(multiple_choice_questions[path]):
        path_to_file = os.path.join(mcq_dir, file[:-4] + '.txt')
        with open(path_to_file, "w", encoding="utf-8") as f:
            f.writelines(multiple_choice_questions[path][file])


In [None]:
# Spot check: display the lines extracted from one sample file.
# Raises KeyError if "565.txt" was not loaded for "zww.cn_zhongkao".
ExtractMultipleChoiceQs(question_text_set["zww.cn_zhongkao"]["565.txt"])

In [None]:
def _build_mcq_patterns(max_stem_lines=4):
    """Build the question regexes for stems spanning 1..max_stem_lines lines.

    For each stem length the four-choice (A-D) pattern comes before the
    three-choice (A-C) pattern. Returns a list of ``(has_d, compiled_pattern)``.
    """
    number = r"(\d+)\.\s*"            # question number, e.g. "1."
    full_stem_line = r"(.+?)\n"       # a stem line that ends with a newline
    last_stem_line = r"(.+?)\s*"      # last stem line (lazy, stops at "A.")
    choices_abc = (
        r"A\.\s*([^\n]+)\s*"          # choice A
        r"B\.\s*([^\n]+)\s*"          # choice B
        r"C\.\s*([^\n]+)"             # choice C
    )
    choice_d = r"\s*D\.\s*([^\n]+)"   # optional choice D

    patterns = []
    for stem_lines in range(1, max_stem_lines + 1):
        stem = full_stem_line * (stem_lines - 1) + last_stem_line
        for has_d in (True, False):
            source = number + stem + choices_abc + (choice_d if has_d else "")
            patterns.append((has_d, re.compile(source)))
    return patterns


# Compiled once; the order matters because results are emitted in this order.
_MCQ_PATTERNS = _build_mcq_patterns()


def ParseMCQs(text_arr):
    """Parse multiple-choice questions out of extracted text lines.

    Args:
        text_arr: Iterable of lines (newline-terminated); they are normalised
            and concatenated before matching.

    Returns:
        List of dicts with keys ``"number"`` (str), ``"question"`` (str, stem
        with whitespace collapsed) and ``"choices"`` (dict keyed "A".."C" or
        "A".."D"). Identical matched spans are reported only once.

    NOTE(review): a question that has a D choice can also be matched by the
    three-choice pattern; the matched spans differ, so both entries are kept.
    """
    normalized_lines = []
    for line in text_arr:
        # Upper-case option markers written as "a." .. "d.".
        for letter in "abcd":
            line = re.sub(r'\b' + letter + r'\.', letter.upper() + '.', line)
        # Insert the dots when option letters are written without them.
        line = re.sub(r'A([^.]*)B([^.]*)C([^.]*)D', r'A.\1 B.\2 C.\3 D.', line)
        line = re.sub(r'A([^.]*)B([^.]*)C', r'A.\1 B.\2 C.', line)
        # ",." -> "." (a replacement of r'\.' would emit a literal backslash).
        line = re.sub(r',\.', '.', line)
        normalized_lines.append(line)

    mcqs = []
    seen_spans = set()

    text_to_process = ''.join(normalized_lines)
    for has_d, pattern in _MCQ_PATTERNS:
        n_choices = 4 if has_d else 3
        for match in pattern.finditer(text_to_process):
            matched_text = match.group()
            if matched_text in seen_spans:
                continue
            seen_spans.add(matched_text)

            groups = match.groups()
            # groups = (number, stem line(s)..., choice texts...)
            stem_parts = groups[1:-n_choices]
            choice_parts = groups[-n_choices:]

            # Plain string (a trailing comma here would create a 1-tuple).
            question_text = re.sub(
                r"\s+", " ", "\n".join(part.strip() for part in stem_parts)
            )
            choices = {
                letter: re.sub(r"\s+", " ", part.strip())
                for letter, part in zip("ABCD", choice_parts)
            }

            mcqs.append({
                "number": groups[0],
                "question": question_text,
                "choices": choices,
            })

    return mcqs


In [None]:
# Parse every extracted file; successful results are stored and written as
# JSON, files that yield no question are recorded in failed_files.
parsed_mcqs = {}
failed_files = {}

for path in paths:
    parsed_mcqs[path] = {}
    failed_files[path] = []
    json_dir = os.path.join('zww_parsed', 'parsed_multiple_choice', path)
    os.makedirs(json_dir, exist_ok=True)
    pbar = tqdm(multiple_choice_questions[path])
    for file in pbar:
        pbar.set_description(f"Parsing {path}:{file}")
        result = ParseMCQs(multiple_choice_questions[path][file])
        if not result:
            failed_files[path].append(file)
            continue
        parsed_mcqs[path][file] = result
        path_to_file = os.path.join(json_dir, file[:-4] + '.json')
        with open(path_to_file, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=4)
    print(f"Failed to parse {len(failed_files[path])} files in {path}")