In [1]:
import json
import re

def parse_markdown_to_jsonl(file_path):
    with open(file_path, "r") as file:
        content = file.read()

    # Split the content into questions
    questions = re.split(r'\n\n(?=\d+\s)', content.strip())

    jsonl_data = []

    for question in questions:
        # Extract question number, text, and options
        match = re.match(r'(\d+)\s(.*?)(?:\n\(1\)|\n\$\$)', question, re.DOTALL)
        if match:
            question_number = match.group(1)
            question_text = match.group(2).strip()

            # Extract options
            options = re.findall(r'\((\d+)\)\s(.*?)(?=\n\(\d+\)|\n\$\$|\Z)', question, re.DOTALL)
            
            # Extract LaTeX if present
            latex_match = re.search(r'\$\$(.*?)\$\$', question, re.DOTALL)
            latex = latex_match.group(1).strip() if latex_match else None

            # Create JSON object
            question_obj = {
                "question_number": int(question_number),
                "question_text": question_text,
                "options": [{"option_number": int(opt[0]), "option_text": opt[1].strip()} for opt in options],
                "latex": latex
            }

            jsonl_data.append(json.dumps(question_obj))

    return "\n".join(jsonl_data)

# Usage
file_path = "/Users/ob1/Desktop/PSLE/PSLE Maths 2023/proc/mmd-export.md"
jsonl_output = parse_markdown_to_jsonl(file_path)

# Write to a new file
output_file = "/Users/ob1/projects/sgllm/output/questions.jsonl"
with open(output_file, "w") as f:
    f.write(jsonl_output)

print(f"JSONL file created: {output_file}")

JSONL file created: /Users/ob1/projects/sgllm/output/questions.jsonl


In [2]:
import json
import re
from typing import Dict, List, Any

def parse_question_text(text: str) -> Dict[str, Any]:
    # Extract question number and text
    match = re.match(r"(\d+)\s+(.*?)\n", text)
    if not match:
        raise ValueError("Unable to extract question number and text")
    question_number, question_text = match.groups()

    # Extract options
    options = re.findall(r"\((\d+)\)\s+(.*?)(?=\n\(|\n$|\Z)", text, re.DOTALL)
    
    # Extract image path
    image_match = re.search(r"!\[\]\((.*?)\)", text)
    image_path = image_match.group(1) if image_match else None

    # Construct the JSON object
    question_data = {
        "question_number": int(question_number),
        "question_text": question_text.strip(),
        "options": [{"number": int(num), "text": text.strip()} for num, text in options],
        "image_path": image_path
    }

    # Construct the full JSON object
    json_object = {
        "chain_of_thought": "Let's analyze the figure and count the angles smaller than 90°.",
        "correct_answers": [],  # This should be filled by the solving process
        "question_data": question_data
    }

    return json_object

# Example usage
text = """3 How many of the marked angles in the figure are smaller than $90^{\circ}$ ?
(1) 5
(2) 2
(3) 3
(4) 4
![](https://cdn.mathpix.com/cropped/2024_09_03_e2d3ee458809490b0c4ag-1.jpg?height=359&width=504&top_left_y=1719&top_left_x=914)"""

parsed_json = parse_question_text(text)
print(json.dumps(parsed_json, indent=2))

{
  "chain_of_thought": "Let's analyze the figure and count the angles smaller than 90\u00b0.",
  "correct_answers": [],
  "question_data": {
    "question_number": 3,
    "question_text": "How many of the marked angles in the figure are smaller than $90^{\\circ}$ ?",
    "options": [
      {
        "number": 1,
        "text": "5"
      },
      {
        "number": 2,
        "text": "2"
      },
      {
        "number": 3,
        "text": "3"
      },
      {
        "number": 4,
        "text": "4\n![](https://cdn.mathpix.com/cropped/2024_09_03_e2d3ee458809490b0c4ag-1.jpg?height=359&width=504&top_left_y=1719&top_left_x=914)"
      }
    ],
    "image_path": "https://cdn.mathpix.com/cropped/2024_09_03_e2d3ee458809490b0c4ag-1.jpg?height=359&width=504&top_left_y=1719&top_left_x=914"
  }
}
