In [1]:
import json
import re

def parse_markdown_to_jsonl(file_path) -> str:
    """Convert a markdown file of numbered questions into JSONL text.

    The file is split into chunks at every blank line that is followed by
    ``<digits><whitespace>``. A chunk is treated as a question when it
    starts with ``<digits><whitespace>``; any other chunk (for example a
    title before the first question) is skipped.

    For each question the following are extracted:

    * ``question_number`` -- the leading integer.
    * ``question_text`` -- everything after the number up to the first
      ``(1)`` option line, the first ``$$`` line, or the end of the chunk,
      whichever comes first.
    * ``options`` -- every ``(<n>) <text>`` entry found in the chunk, as
      ``{"option_number": int, "option_text": str}``; may be empty.
    * ``latex`` -- the stripped contents of the first ``$$ ... $$`` block,
      or ``None`` when the chunk has none.

    Args:
        file_path: Path of the markdown file to read.

    Returns:
        One JSON object per question, joined with ``"\\n"`` (no trailing
        newline). An empty string when no question is found.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly instead of relying on the platform locale; the
    # "-sig" variant also drops a leading BOM, which would otherwise hide
    # the first question number from the regex below.
    with open(file_path, "r", encoding="utf-8-sig") as file:
        content = file.read()

    # Compiled once, outside the per-question loop.
    # ``\Z`` lets a question with neither options nor LaTeX still match
    # instead of being silently dropped.
    header_re = re.compile(r'(\d+)\s(.*?)(?:\n\(1\)|\n\$\$|\Z)', re.DOTALL)
    option_re = re.compile(
        r'\((\d+)\)\s(.*?)(?=\n\(\d+\)|\n\$\$|\Z)', re.DOTALL
    )
    latex_re = re.compile(r'\$\$(.*?)\$\$', re.DOTALL)

    jsonl_data = []

    # A new question starts at a blank line followed by "<number> ".
    for question in re.split(r'\n\n(?=\d+\s)', content.strip()):
        header = header_re.match(question)
        if header is None:
            # Not a numbered question (e.g. preamble text): skip it.
            continue

        latex_match = latex_re.search(question)

        question_obj = {
            "question_number": int(header.group(1)),
            "question_text": header.group(2).strip(),
            "options": [
                {"option_number": int(number), "option_text": text.strip()}
                for number, text in option_re.findall(question)
            ],
            "latex": latex_match.group(1).strip() if latex_match else None,
        }
        jsonl_data.append(json.dumps(question_obj))

    return "\n".join(jsonl_data)

# Driver: convert the exported markdown paper into a JSONL file on disk.

# Source markdown export and destination JSONL file.
file_path = "/Users/ob1/Desktop/PSLE/PSLE Maths 2023/proc/mmd-export.md"
output_file = "/Users/ob1/projects/sgllm/output/questions.jsonl"

# Parse before opening the destination, so a parsing failure leaves any
# existing output file untouched.
jsonl_output = parse_markdown_to_jsonl(file_path)

# Emit the text exactly as returned (no extra trailing newline).
with open(output_file, "w") as f:
    print(jsonl_output, end="", file=f)

print(f"JSONL file created: {output_file}")

JSONL file created: /Users/ob1/projects/sgllm/output/questions.jsonl
