In [8]:
import json
import re
from typing import Dict, List, Any

def parse_question_text(text: str) -> Dict[str, Any]:
    """
    Parse a single multiple-choice question from markdown text into a dictionary.

    The input is expected to start with the question number, followed by
    whitespace and the question text. Only the remainder of that first line is
    kept as the question text; later lines are scanned for options and an image.

    Args:
        text (str): Markdown text of one question.

    Returns:
        Dict[str, Any]: A dictionary with the keys
            "question_number" (int),
            "question_text" (str, first line only, stripped),
            "options" (list of {"number": int, "text": str}, possibly empty),
            "image_path" (str target of the first ``![](...)`` image, or None).

    Raises:
        ValueError: If the text does not start with a number followed by
            whitespace.
    """
    # Header: question number, whitespace, then the rest of that line.
    # The newline-terminated form is tried first so existing inputs parse
    # exactly as before; the end-of-string fallback lets a question that
    # consists of a single line (no trailing newline) parse instead of raising.
    match = re.match(r"(\d+)\s+(.*?)\n", text) or re.match(r"(\d+)\s+(.*)\Z", text)
    if not match:
        raise ValueError("Unable to extract question number and text")
    question_number, question_text = match.groups()

    # Options look like "(1) ..." and run until the next "\n(" or the end.
    options = re.findall(r"\((\d+)\)\s+(.*?)(?=\n\(|\n$|\Z)", text, re.DOTALL)

    # Only images written with empty alt text, i.e. "![](target)", are detected.
    image_match = re.search(r"!\[\]\((.*?)\)", text)
    image_path = image_match.group(1) if image_match else None

    return {
        "question_number": int(question_number),
        "question_text": question_text.strip(),
        # Loop variable is deliberately not called "text" so it cannot be
        # confused with the function parameter.
        "options": [
            {"number": int(num), "text": option_text.strip()}
            for num, option_text in options
        ],
        "image_path": image_path,
    }

# Example usage
# Raw string literal: the sample contains LaTeX commands such as \quad, and
# "\q" is not a valid escape sequence in an ordinary string literal (Python
# warns about it). The resulting string value is unchanged.
text = r"""7 The circle has centre $O$.
AOE and COF are straight lines. Which pair of lines shows its radius and diameter?
![](https://cdn.mathpix.com/snip/images/EvYLPTzoMc2_WX4ixc6XAFmtTTafTJAkVe6bmcj6niI.original.fullsize.png)
Radius $\quad$ Diameter
(1) $AE \quad OC$
(2) $AO \quad BD$
(3) $BD \quad AE$
(4) $OE \quad FC$"""

parsed_json = parse_question_text(text)
print(json.dumps(parsed_json, indent=2))

{
  "question_number": 7,
  "question_text": "The circle has centre $O$.",
  "options": [
    {
      "number": 1,
      "text": "$AE \\quad OC$"
    },
    {
      "number": 2,
      "text": "$AO \\quad BD$"
    },
    {
      "number": 3,
      "text": "$BD \\quad AE$"
    },
    {
      "number": 4,
      "text": "$OE \\quad FC$"
    }
  ],
  "image_path": "https://cdn.mathpix.com/snip/images/EvYLPTzoMc2_WX4ixc6XAFmtTTafTJAkVe6bmcj6niI.original.fullsize.png"
}


In [2]:
import json
import re
from typing import Dict, List, Any

def parse_question_text(text: str) -> Dict[str, Any]:
    """
    Parse a single question from markdown text into a dictionary.

    The input is expected to start with the question number, followed by
    whitespace and the question text. Only the remainder of that first line is
    kept as the question text; if the string holds several questions, only the
    leading one is reported.

    Args:
        text (str): Markdown text beginning with one question.

    Returns:
        Dict[str, Any]: A dictionary with the keys
            "question_number" (int),
            "question_text" (str, first line only, stripped),
            "options" (list of {"number": int, "text": str}, possibly empty),
            "image_path" (str target of the first ``![](...)`` image, or None).

    Raises:
        ValueError: If the text does not start with a number followed by
            whitespace.
    """
    # Header: question number, whitespace, then the rest of that line.
    # The newline-terminated form is tried first so existing inputs parse
    # exactly as before; the end-of-string fallback lets a question that
    # consists of a single line (no trailing newline) parse instead of raising.
    match = re.match(r"(\d+)\s+(.*?)\n", text) or re.match(r"(\d+)\s+(.*)\Z", text)
    if not match:
        raise ValueError("Unable to extract question number and text")
    question_number, question_text = match.groups()

    # Options look like "(1) ..." and run until the next "\n(" or the end.
    options = re.findall(r"\((\d+)\)\s+(.*?)(?=\n\(|\n$|\Z)", text, re.DOTALL)

    # Only images written with empty alt text, i.e. "![](target)", are detected.
    image_match = re.search(r"!\[\]\((.*?)\)", text)
    image_path = image_match.group(1) if image_match else None

    return {
        "question_number": int(question_number),
        "question_text": question_text.strip(),
        # Loop variable is deliberately not called "text" so it cannot be
        # confused with the function parameter.
        "options": [
            {"number": int(num), "text": option_text.strip()}
            for num, option_text in options
        ],
        "image_path": image_path,
    }

# Example usage
# Raw string literal: the sample contains the LaTeX command \qquad, and "\q"
# is not a valid escape sequence in an ordinary string literal (Python warns
# about it). The resulting string value is unchanged.
# The sample holds two questions; parse_question_text only reads the header at
# the start of the string, so only question 16 appears in the printed result.
qn = r"""16 Find the value of $565+39$

Ans: $\qquad$

17 Find the value of $10.12-8.99$

Ans: $\qquad$"""

parsed_json = parse_question_text(qn)
print(json.dumps(parsed_json, indent=2))

{
  "question_number": 16,
  "question_text": "Find the value of $565+39$",
  "options": [],
  "image_path": null
}


In [13]:
import re
import json
from typing import List, Dict, Any

def parse_markdown_to_jsonl(file_path: str) -> str:
    """
    Parse a markdown file containing multiple questions into a JSONL string.

    The file is read as UTF-8 and split into chunks wherever a blank line is
    followed by a line that starts with digits and whitespace. Each chunk is
    passed to ``parse_single_question``. A chunk that cannot be parsed is
    reported on stdout and skipped, so one malformed chunk does not abort the
    whole run.

    Args:
        file_path (str): Path to the markdown file.

    Returns:
        str: One JSON object per successfully parsed question, in file order,
            separated by newlines with no trailing newline. An empty string if
            the file is empty or nothing could be parsed.

    Raises:
        FileNotFoundError: If the specified file is not found.
    """
    try:
        # Explicit encoding so the result does not depend on the platform's
        # default locale encoding.
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()
    except FileNotFoundError as err:
        raise FileNotFoundError(f"The file at {file_path} was not found.") from err

    content = content.strip()
    if not content:
        # Nothing to split; avoids reporting a bogus failure for "question 1".
        return ""

    # Split the content into questions
    questions = re.split(r"\n\n(?=\d+\s)", content)

    jsonl_data: List[str] = []

    for i, question in enumerate(questions):
        try:
            question_data = parse_single_question(i, question)
        except ValueError as e:
            # Best-effort: report the bad chunk and carry on with the rest.
            print(f"Error parsing question: {e}")
        else:
            jsonl_data.append(json.dumps(question_data))

    return "\n".join(jsonl_data)

def parse_single_question(i: int, question: str) -> Dict[str, Any]:
    """
    Parse a single question from markdown format into a dictionary.

    Args:
        i (int): Zero-based position of the question among the chunks being
            parsed; used only in the error message, where it is shown as i + 1.
        question (str): The markdown text of a single question.

    Returns:
        Dict[str, Any]: A dictionary with the keys
            "question_number" (int),
            "question_text" (str; everything up to the first "(1)" line or
                "$$" line, stripped; image markdown is left in place),
            "options" (list of {"option_number": int, "option_text": str}),
            "latex" (str content of the first ``$$...$$`` block, or None),
            "image_path" (str target of the first ``![](...)`` image, or None).

    Raises:
        ValueError: If unable to extract required information from the question.
    """
    # Extract question number and text
    match = re.match(r"(\d+)\s+(.*?)(?:\n\(1\)|\n\$\$|\Z)", question, re.DOTALL)
    if not match:
        raise ValueError(f"Unable to extract question number and text for question {i+1}")
    question_number, question_text = match.groups()

    # Extract options. The marker must be the first thing on its line
    # (optionally indented): the question-text pattern above and the lookahead
    # below already treat options as line-initial, and without the anchor an
    # inline "(2) " inside the question text, e.g. "f(2) = 5", was reported as
    # an option.
    options = re.findall(
        r"^[ \t]*\((\d+)\)\s+(.*?)(?=\n\(\d+\)|\n\$\$|\Z)",
        question,
        re.DOTALL | re.MULTILINE,
    )

    # Extract LaTeX if present
    latex_match = re.search(r"\$\$(.*?)\$\$", question, re.DOTALL)
    latex = latex_match.group(1).strip() if latex_match else None

    # Extract image path if present
    image_match = re.search(r"!\[\]\((.*?)\)", question)
    image_path = image_match.group(1) if image_match else None

    # Construct the question data dictionary
    question_data: Dict[str, Any] = {
        "question_number": int(question_number),
        "question_text": question_text.strip(),
        "options": [
            {"option_number": int(num), "option_text": option_text.strip()}
            for num, option_text in options
        ],
        "latex": latex,
        "image_path": image_path,
    }

    return question_data

# Usage
# TODO: both paths are absolute locations on one machine.
file_path = "/Users/ob1/projects/sgllm/qna-proc/2021/raw_mmd.md"
output_file = "/Users/ob1/projects/sgllm/qna-proc/2021/questions.jsonl"

# Convert the raw markdown into newline-delimited JSON records.
jsonl_output = parse_markdown_to_jsonl(file_path)

# Persist the records; an existing file at output_file is overwritten.
with open(output_file, "w") as jsonl_file:
    jsonl_file.write(jsonl_output)

print(f"JSONL file created: {output_file}")

Error parsing question: Unable to extract question number and text for question 1
JSONL file created: /Users/ob1/projects/sgllm/qna-proc/2021/questions.jsonl


In [12]:
import re
import json
from typing import List, Dict, Any

def parse_openended_questions_markdown_to_jsonl(file_path: str) -> str:
    """
    Parse a markdown file of open-ended questions into a JSONL string.

    The file is read as UTF-8 and split into chunks wherever a blank line is
    followed by a line that starts with digits and whitespace. Each chunk is
    passed to ``parse_single_question``. A chunk that cannot be parsed is
    reported on stdout and skipped, so one malformed chunk does not abort the
    whole run.

    Args:
        file_path (str): Path to the markdown file.

    Returns:
        str: One JSON object per successfully parsed question, in file order,
            separated by newlines with no trailing newline. An empty string if
            the file is empty or nothing could be parsed.

    Raises:
        FileNotFoundError: If the specified file is not found.
    """
    try:
        # Explicit encoding so the result does not depend on the platform's
        # default locale encoding.
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()
    except FileNotFoundError as err:
        raise FileNotFoundError(f"The file at {file_path} was not found.") from err

    content = content.strip()
    if not content:
        # Nothing to split; avoids reporting a bogus failure for "question 1".
        return ""

    # Split the content into questions
    questions = re.split(r"\n\n(?=\d+\s)", content)

    jsonl_data: List[str] = []

    for i, question in enumerate(questions):
        try:
            question_data = parse_single_question(i, question)
        except ValueError as e:
            # Best-effort: report the bad chunk and carry on with the rest.
            print(f"Error parsing question: {e}")
        else:
            jsonl_data.append(json.dumps(question_data))

    return "\n".join(jsonl_data)

def parse_single_question(i: int, question: str) -> Dict[str, Any]:
    """
    Parse a single open-ended question from markdown format into a dictionary.

    Unlike a multiple-choice parser, this variant does not extract options or
    display LaTeX. It removes the "Ans: ..." line and the image markdown from
    the question text.

    Args:
        i (int): Zero-based position of the question among the chunks being
            parsed; used only in the error message, where it is shown as i + 1.
        question (str): The markdown text of a single question.

    Returns:
        Dict[str, Any]: A dictionary with the keys
            "question_number" (int),
            "question_text" (str, with the answer line and image removed),
            "image_path" (str target of the first ``![](...)`` image, or None).

    Raises:
        ValueError: If unable to extract required information from the question.
    """
    # Extract question number and text
    match = re.match(r"(\d+)\s+(.*?)(?:\n\(1\)|\n\$\$|\Z)", question, re.DOTALL)
    if not match:
        raise ValueError(f"Unable to extract question number and text for question {i+1}")
    question_number, question_text = match.groups()

    # Remove the answer placeholder (e.g. "Ans: $\\qquad$") from the question
    # text. The exact matched span is removed rather than a rebuilt
    # "Ans: <value>" string, so it is stripped whatever whitespace follows
    # "Ans:", and nothing is searched for when there is no answer line.
    answer_match = re.search(r"Ans:\s+(.*)", question)
    if answer_match:
        question_text = question_text.replace(answer_match.group(0), "")
    question_text = question_text.strip()

    # Extract image path if present and drop the image markdown from the text.
    image_match = re.search(r"!\[\]\((.*?)\)", question)
    image_path = image_match.group(1) if image_match else None
    if image_match:
        question_text = question_text.replace(image_match.group(0), "")

    # Construct the question data dictionary
    question_data: Dict[str, Any] = {
        "question_number": int(question_number),
        "question_text": question_text.strip(),
        "image_path": image_path,
    }

    return question_data

# Usage
# TODO: both paths are absolute locations on one machine.
file_path = "/Users/ob1/projects/sgllm/qna-proc/2021/p1-bookB/rawmd.md"
output_file = "/Users/ob1/projects/sgllm/qna-proc/2021/p1-bookB/questions.jsonl"

# Convert the raw markdown into newline-delimited JSON records.
jsonl_output = parse_openended_questions_markdown_to_jsonl(file_path)

# Persist the records; an existing file at output_file is overwritten.
with open(output_file, "w") as jsonl_file:
    jsonl_file.write(jsonl_output)

print(f"JSONL file created: {output_file}")

Error parsing question: Unable to extract question number and text for question 1
JSONL file created: /Users/ob1/projects/sgllm/qna-proc/2021/p1-bookB/questions.jsonl
