In [7]:
import json
import os
import re
import pypandoc
import subprocess
from docx import Document

def extract_and_convert_images(docx_path, image_dir="media"):
    """Extract embedded images from a DOCX file, converting WMF/EMF to PNG.

    Every internal relationship of the main document part whose target
    reference contains "image" is written to ``image_dir`` as
    ``image<N>.<ext>``, where ``<N>`` counts extracted images from 1 and
    ``<ext>`` is the subtype of the part's content type (e.g. ``png``,
    ``jpeg``, ``x-wmf``). Windows metafiles are then converted to PNG by
    running the external ``magick`` command; when the PNG exists afterwards
    the original metafile is deleted.

    Args:
        docx_path: Path of the DOCX file to read.
        image_dir: Directory receiving the images; created if missing.

    Returns:
        dict mapping an image filename to the filename finally kept in
        ``image_dir``. Keys are the generated ``image<N>.<ext>`` names and,
        additionally, the basename each image has inside the DOCX package
        (generated names win if the two ever collide).
    """
    # Content-type subtypes that identify WMF/EMF metafiles. "x-emf" is the
    # subtype of "image/x-emf" and was previously missing.
    metafile_exts = {"wmf", "emf", "x-wmf", "x-emf"}

    doc = Document(docx_path)
    os.makedirs(image_dir, exist_ok=True)

    images_map = {}  # filename -> final filename inside image_dir
    count = 1
    for rel in doc.part.rels.values():
        # Linked (external) images have no embedded part to extract.
        if rel.is_external or "image" not in rel.target_ref:
            continue

        image_part = rel.target_part
        ext = image_part.content_type.split("/")[-1]  # e.g. wmf, png, jpeg
        filename = f"image{count}.{ext}"
        filepath = os.path.join(image_dir, filename)

        # Always write the original bytes first; conversion works from disk.
        with open(filepath, "wb") as f:
            f.write(image_part.blob)

        final_filename = filename
        if ext in metafile_exts:
            png_filename = filename.rsplit(".", 1)[0] + ".png"
            png_path = os.path.join(image_dir, png_filename)
            try:
                subprocess.run(["magick", filepath, png_path], check=True)
                if os.path.exists(png_path):
                    final_filename = png_filename
                    # The metafile is no longer needed once the PNG exists.
                    os.remove(filepath)
            except (subprocess.SubprocessError, OSError) as e:
                # Best effort: a missing or failing `magick` keeps the
                # original file and the mapping points at it.
                print("Convert error:", e)

        images_map[filename] = final_filename
        # NOTE(review): the generated key uses the content-type subtype as
        # extension ("image1.x-wmf"), which will not equal a reference such
        # as "media/image1.wmf". Also register the name the part has inside
        # the DOCX so lookups by that basename succeed; setdefault keeps any
        # generated key untouched. Presumably pandoc references images by
        # this in-package name - confirm against the generated LaTeX.
        images_map.setdefault(os.path.basename(rel.target_ref), final_filename)
        count += 1

    return images_map

def convert_docx_to_latex(docx_path, output_tex_path):
    """
    Run pandoc (via pypandoc) to turn a DOCX file into a LaTeX file and
    return the generated LaTeX source.

    The LaTeX is written to ``output_tex_path`` and then read back as UTF-8.
    Any failure is reported on stdout and an empty string is returned
    instead of raising.
    """
    latex_source = ""
    try:
        pypandoc.convert_file(docx_path, 'latex', outputfile=output_tex_path)
        with open(output_tex_path, 'r', encoding='utf-8') as tex_file:
            latex_source = tex_file.read()
    except Exception as e:
        # Deliberately broad: callers rely on "" meaning "conversion failed".
        print(f"Error converting DOCX to LaTeX: {e}")
    return latex_source

def split_blocks(text):
    """
    Split LaTeX text into an ordered list of text / image / math blocks.

    Recognised constructs:
      * images: ``\\includegraphics[opts]{path}`` and the option-less form
        ``\\includegraphics{path}``; the block stores ``path`` as ``src``.
      * math: ``$...$``, ``\\(...\\)`` and ``\\[...\\]`` on a single line;
        the block stores the match including its delimiters.

    Everything between matches becomes a ``text`` block after stripping
    surrounding whitespace; whitespace-only stretches are dropped.

    Args:
        text: LaTeX source fragment.

    Returns:
        list of dicts, each ``{"type": "text"|"math", "content": str}`` or
        ``{"type": "image", "src": str}``, in document order.
    """
    pattern = re.compile(
        # image; the [options] group is optional
        r'\\includegraphics(?:\[.*?\])?\{(.*?)\}'
        # math; a dollar preceded by a backslash is a literal "\$", not a
        # delimiter, so it must neither open nor close inline math
        r'|((?<!\\)\$.*?(?<!\\)\$|\\\(.+?\\\)|\\\[.+?\\\])'
    )
    blocks = []

    def add_text(chunk):
        # Keep only stretches that still contain something after stripping.
        chunk = chunk.strip()
        if chunk:
            blocks.append({"type": "text", "content": chunk})

    last = 0
    for m in pattern.finditer(text):
        start, end = m.span()
        add_text(text[last:start])
        if m.group(1):  # image
            blocks.append({"type": "image", "src": m.group(1)})
        elif m.group(2):  # math
            blocks.append({"type": "math", "content": m.group(2)})
        last = end
    add_text(text[last:])
    return blocks

def parse_latex_to_json(latex_content):
    """
    Parse pandoc-generated LaTeX into a list of question dicts.

    The text is cut at every bold question heading of the form
    ``\\textbf{<label> <number>:}``; anything before the first heading is
    discarded. Within a question, the part before the first
    ``\\begin{quote}`` is the question body and the part up to the matching
    ``\\end{quote}`` holds the options. An option starts with
    ``\\textbf{A.}`` .. ``\\textbf{D.}``; when the label is additionally
    wrapped in ``\\ul{...}`` that option is recorded as the correct one.

    Args:
        latex_content: Full LaTeX source.

    Returns:
        list of ``{"id": int, "blocks": list, "options": list,
        "correct": str | None}``, with ids numbered from 1 in document
        order. ``blocks`` and each option's ``blocks`` come from
        ``split_blocks``.
    """
    # The heading label is "C\u00e2u" (C, a-circumflex, u). The file carried it
    # as the UTF-8-read-as-Latin-1 mojibake "C\u00c3\u00a2u"; accept both so the
    # pattern works regardless of how this source file was saved. Escapes
    # keep the pattern itself immune to further encoding damage.
    heading_pattern = re.compile(r'\\textbf\{(?:C\u00e2u|C\u00c3\u00a2u) \d+:\}')
    # Content runs up to the next option label (not just any \textbf, so
    # bold text inside an option no longer truncates it) or end of input.
    option_pattern = re.compile(
        r'\\textbf\{(\\ul\{)?([A-D])\.\}(.*?)'
        r'(?=\\textbf\{(?:\\ul\{)?[A-D]\.\}|\Z)',
        re.DOTALL)
    begin_quote = '\\begin{quote}'
    # Single backslash: this is a plain-string search, not a regex.
    end_quote = '\\end{quote}'

    question_blocks = heading_pattern.split(latex_content)[1:]
    questions = []

    for idx, block in enumerate(question_blocks, 1):
        question_text, _, after_begin = block.partition(begin_quote)
        # Empty when the question has no quote environment at all.
        options_block = after_begin.partition(end_quote)[0]

        blocks = split_blocks(question_text.strip())

        options = []
        correct = None
        for m in option_pattern.finditer(options_block):
            is_correct = m.group(1) is not None
            label = m.group(2)
            raw_content = m.group(3)
            # "\textbf{\ul{A.}}" closes two groups; the pattern consumes
            # only the first brace, so drop the leftover one.
            if is_correct and raw_content.startswith('}'):
                raw_content = raw_content[1:]
            opt_content = raw_content.replace('\n', ' ').strip(' .')
            opt_blocks = split_blocks(opt_content)
            options.append({"label": label, "blocks": opt_blocks})
            if is_correct:
                correct = label

        questions.append({
            "id": idx,
            "blocks": blocks,
            "options": options,
            "correct": correct
        })

    return questions

def update_image_srcs(questions, images_map, image_dir="media"):
    """
    Rewrite image block paths in place so they point at the extracted files.

    For every image block in a question body or in any of its options, the
    basename of ``src`` is looked up in ``images_map``; on a hit ``src``
    becomes ``image_dir`` joined with the mapped filename. Blocks whose
    basename is unknown are left unchanged. Returns nothing.
    """
    def _remap(block_list):
        for entry in block_list:
            if entry["type"] != "image":
                continue
            name = os.path.basename(entry["src"])
            if name in images_map:
                entry["src"] = os.path.join(image_dir, images_map[name])

    for question in questions:
        _remap(question["blocks"])
        for option in question["options"]:
            _remap(option["blocks"])

if __name__ == "__main__":
    # Fixed input/output locations for this run.
    docx_file = "test.docx"
    tex_output = "temp.tex"
    json_output = "output.json"

    # 1) Dump the embedded images (metafiles become PNG where possible).
    images_map = extract_and_convert_images(docx_file, image_dir="media")

    # 2) DOCX -> LaTeX through pandoc; "" comes back if conversion failed.
    latex_content = convert_docx_to_latex(docx_file, tex_output)

    # 3) LaTeX -> list of question dicts.
    questions = parse_latex_to_json(latex_content)

    # 4) Point image blocks at the files written in step 1 (mutates in place).
    update_image_srcs(questions, images_map, image_dir="media")

    # 5) Persist the result; ensure_ascii=False keeps non-ASCII text readable.
    with open(json_output, "w", encoding="utf-8") as json_file:
        json.dump({"questions": questions}, json_file, ensure_ascii=False, indent=4)

    print(f"Extracted {len(questions)} questions")

Extracted 50 questions
