In [5]:
import os
import json
import base64
from mistralai import Mistral
from dotenv import load_dotenv, find_dotenv

# 1. Look for a .env file; the code treats a falsy result from find_dotenv() as "not found".
env_file = find_dotenv()
if not env_file:
    print("‚ö†Ô∏è Warning: No .env file found.")
else:
    load_dotenv(env_file)
    print(f"‚úÖ Loaded .env from: {env_file}")

# 2. Read the API key from the environment and stop early when it is unset or empty.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")
if MISTRAL_API_KEY is None or MISTRAL_API_KEY == "":
    raise ValueError("‚ùå MISTRAL_API_KEY is missing. Please check your .env file.")

# 3. Build the module-level client that the later cells call.
client = Mistral(api_key=MISTRAL_API_KEY)
print("‚úÖ Mistral Client Initialized Successfully")

✅ Loaded .env from: c:\Users\sprin\Desktop\Thendral\ML Portfolio Projects\math-tutor-ocr\.env
✅ Mistral Client Initialized Successfully


In [6]:
def extract_exam_data(pdf_path):
    """Run Mistral OCR on a PDF and ask a chat model to restructure the text as JSON.

    The PDF is read from disk, base64-encoded and sent inline as a data URI to
    the OCR endpoint. The markdown of every returned page is concatenated, each
    page preceded by a ``--- Page <index> ---`` marker, and embedded in a prompt
    that asks the chat model for a JSON object of questions.

    Uses the module-level ``client``.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        The decoded JSON value returned by the chat model.

    Raises:
        OSError: If the PDF cannot be read.
        ValueError: If the chat model returns no text content, or text that is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    # --- STEP 1: OCR (Get the text & layout) ---
    print(f"üöÄ Reading {pdf_path} with Mistral OCR...")

    with open(pdf_path, "rb") as f:
        pdf_data = base64.b64encode(f.read()).decode("utf-8")

    # Only page.index and page.markdown are read below, so the base64 image
    # payloads are not requested; this keeps the OCR response small.
    ocr_response = client.ocr.process(
        model="mistral-ocr-latest",
        document={
            "type": "document_url",
            "document_url": f"data:application/pdf;base64,{pdf_data}"
        },
        include_image_base64=False
    )

    # Combine all pages into one large text block for analysis
    full_markdown = "".join(
        f"\n\n--- Page {page.index} ---\n\n{page.markdown}"
        for page in ocr_response.pages
    )

    print(f"‚úÖ OCR Complete. Extracted {len(full_markdown)} characters.")

    # --- STEP 2: JSON STRUCTURING ---
    print("üß† Analyzing text with Mistral Large to extract structured questions...")

    prompt = f"""
    You are an expert math teacher. 
    I have the raw text of a Leaving Cert Math Exam Paper below.

    YOUR TASK:
    Extract the math questions into a perfectly formatted JSON object.
    
    REQUIRED JSON STRUCTURE:
    {{
        "2024": [
            {{
                "id": "2024_q1_a",
                "topic": "Algebra", 
                "text": "The full question text here. Use LaTeX for math, wrapped in single dollar signs like $ x^2 $."
            }},
            {{
                "id": "2024_q1_b",
                "topic": "Complex Numbers", 
                "text": "Solve for z: $ z^2 + 4 = 0 $"
            }}
        ]
    }}

    RULES:
    1. "topic": Infer the topic (e.g., Algebra, Calculus, Statistics) based on the question.
    2. "text": Ensure LaTeX is correct. Keep it simple.
    3. If there are diagrams mentioned, just include the text description for now.

    INPUT TEXT:
    {full_markdown}
    """

    chat_response = client.chat.complete(
        model="mistral-large-latest",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )

    # Fail with a clear message instead of the opaque TypeError json.loads
    # raises when the reply carries no text.
    content = chat_response.choices[0].message.content
    if not isinstance(content, str) or not content.strip():
        raise ValueError("The chat model returned no text content; cannot parse questions as JSON.")

    return json.loads(content)

In [8]:
# Configuration
PDF_PATH = "test.pdf"  # <--- CHECK THIS FILENAME
OUTPUT_JSON_PATH = "../data/questions.json"

# Run the extraction only when the input PDF is present; otherwise explain
# exactly where the file was expected.
if os.path.exists(PDF_PATH):
    try:
        # 1. Run Extraction
        new_data = extract_exam_data(PDF_PATH)

        # 2. Save Data
        # dirname() is "" when the output path is a bare filename, and
        # os.makedirs("") raises, so only create a directory when there is one.
        output_dir = os.path.dirname(OUTPUT_JSON_PATH)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(OUTPUT_JSON_PATH, "w") as f:
            json.dump(new_data, f, indent=4)

        print(f"üéâ Success! Data saved to: {OUTPUT_JSON_PATH}")
        # Show only the first 500 characters so the cell output stays short.
        print("Preview:", json.dumps(new_data, indent=2)[:500] + "...")

    except Exception as e:
        # Best-effort cell: report the failure instead of aborting the notebook run.
        print(f"‚ùå Error: {str(e)}")
else:
    print(f"‚ö†Ô∏è PDF not found at: {os.path.abspath(PDF_PATH)}")
    # PDF_PATH is resolved against the working directory, not the data folder,
    # so point the user at the path printed above.
    print("Please place your PDF at that path, or update PDF_PATH to point to it.")

🚀 Reading test.pdf with Mistral OCR...
✅ OCR Complete. Extracted 20365 characters.
🧠 Analyzing text with Mistral Large to extract structured questions...
🎉 Success! Data saved to: ../data/questions.json
Preview: {
  "2025": [
    {
      "id": "2025_q1_a",
      "topic": "Algebra",
      "text": "Solve the following inequality for $x \\in \\mathbb{R}$: $$|x - 3| \\leq 12$$"
    },
    {
      "id": "2025_q1_b",
      "topic": "Algebra",
      "text": "Multiply out and simplify: $$(4x - 10\\sqrt{x}) \\quad (2x + 5\\sqrt{x} - 7)$$"
    },
    {
      "id": "2025_q1_c",
      "topic": "Algebra",
      "text": "$(2x + 3)$ is a factor of $4x^{3} - 12x^{2} - 7x + 30$. Use this information to find the three so...


In [12]:
import os
import json
import base64
import re
from mistralai import Mistral
from dotenv import load_dotenv, find_dotenv
from pydantic import BaseModel, Field
from typing import List, Optional

# --- 1. SETUP & AUTH ---
load_dotenv(find_dotenv())
api_key = os.getenv("MISTRAL_API_KEY")
# Fail fast with a clear message when the key is unset or empty, rather than
# constructing a client that can only fail later on its first request.
if not api_key:
    raise ValueError("‚ùå MISTRAL_API_KEY is missing. Please check your .env file.")
client = Mistral(api_key=api_key)

# Configuration
PDF_PATH = "test.pdf"  # Input exam paper, resolved against the working directory
OUTPUT_JSON_PATH = "../data/questions.json"  # Where the structured questions are written
IMAGES_DIR = "../data/images"  # Where we will save extracted graphs/diagrams

# --- 2. DEFINE THE SCHEMA (The Structure You Want) ---
# Schema for one extracted exam question. Each field's `description` states
# what the field is expected to hold.
class MathQuestion(BaseModel):
    id: str = Field(description="Unique ID like '2024_q1_a'")
    topic: str = Field(description="The mathematical topic (e.g. Algebra, Calculus)")
    text: str = Field(description="The question text with LaTeX math wrapped in $...$")
    # Explicit default: without it the key stays required even though the type
    # is Optional, so a question with no image_id key would fail validation.
    image_id: Optional[str] = Field(default=None, description="The ID of any diagram/graph mentioned (e.g., 'img-123'), or null if none")

# Container model pairing a year with its list of MathQuestion entries.
# NOTE(review): no code in the visible cells instantiates or validates against
# this model — confirm whether it is used elsewhere or can be wired into the
# extraction step.
class ExamPaper(BaseModel):
    year: int  # presumably the exam year (cf. IDs like '2024_q1_a') — TODO confirm
    questions: List[MathQuestion]  # the questions extracted from the paper

# --- 3. HELPER: Save Base64 Image to Disk ---
def save_image_from_ocr(image_base64, image_id, output_dir):
    """Decode a base64 image payload and write the raw bytes into ``output_dir``.

    The bytes are written exactly as decoded; no image format conversion takes
    place. The file name is derived from ``image_id``: an ID that already ends
    in a common image extension (e.g. ``img-10.jpeg``) is used unchanged,
    otherwise ``.png`` is appended.

    Args:
        image_base64: Base64 text, either bare or as a data URI
            (``data:image/...;base64,<data>``); only the part after the last
            comma is decoded.
        image_id: Identifier used to build the file name. Directory components
            are stripped so the file always lands inside ``output_dir``.
        output_dir: Target directory; created if it does not exist.

    Returns:
        The path of the written file.

    Raises:
        ValueError: If ``image_base64`` is empty/None or no file name can be
            derived from ``image_id``.
    """
    if not image_base64:
        raise ValueError(f"No base64 data supplied for image {image_id!r}.")

    # Keep only the final path component so an unexpected ID cannot write outside output_dir.
    file_name = os.path.basename(str(image_id).replace("\\", "/"))
    if not file_name:
        raise ValueError(f"Cannot derive a file name from image ID {image_id!r}.")

    # IDs such as "img-10.jpeg" already carry their extension; appending ".png"
    # to them would produce misleading names like "img-10.jpeg.png".
    known_extensions = (".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tif", ".tiff")
    if not file_name.lower().endswith(known_extensions):
        file_name += ".png"

    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, file_name)

    # Drop any "data:...;base64," prefix by decoding only what follows the last comma.
    img_data = base64.b64decode(image_base64.split(",")[-1])

    with open(file_path, "wb") as f:
        f.write(img_data)
    return file_path

# --- 4. MAIN EXTRACTION FUNCTION ---
import time
from PIL import Image
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from mistralai import SDKError

# --- HELPER 1: PRE-FILTER TINY IMAGES ---
def is_likely_junk(image_path):
    """
    Local screening of an image file, based purely on its pixel dimensions.

    Returns True when either side is under 50 pixels, when the width/height
    ratio falls outside the 0.1-10 range, or when the file cannot be opened
    or inspected at all. Returns False otherwise.
    """
    try:
        with Image.open(image_path) as picture:
            w, h = picture.size
            # Anything below 50 px on a side is treated as noise (bullets, icons).
            too_small = w < 50 or h < 50
            # The ratio is only evaluated when both sides are >= 50, so h is never 0 here.
            # A ratio outside [0.1, 10] is a very thin strip, e.g. a separator line.
            return too_small or not (0.1 <= w / h <= 10)
    except Exception:
        # Unreadable files are classified as junk.
        return True

# --- HELPER 2: ROBUST API CALL WITH RETRY ---
# Retries on SDKError with exponential backoff clamped between 2s and 30s,
# for at most 5 attempts in total. Permanent client-side rejections are
# converted inside the function so they are NOT retried (see below).
@retry(
    retry=retry_if_exception_type(SDKError), # Catch API errors
    wait=wait_exponential(multiplier=1, min=2, max=30),
    stop=stop_after_attempt(5)
)
def call_vision_model(client, base64_image):
    """Ask the vision model whether an exam image is a useful diagram.

    Args:
        client: Object exposing ``chat.complete`` (the Mistral client).
        base64_image: Base64-encoded image bytes; sent as an ``image/jpeg`` data URI.

    Returns:
        The raw chat completion response. The prompt asks the model to answer
        with only 'YES' or 'NO'.

    Raises:
        RuntimeError: When the API rejects the request with a 4xx status other
            than 408/429. Such errors will not succeed on retry, so they are
            re-raised as a type the retry decorator does not match.
        Exception: Whatever tenacity raises once all attempts are exhausted
            (presumably ``tenacity.RetryError`` — confirm against the installed version).
    """
    prompt = """
    Look at this image from a math exam.
    Return ONLY the word 'YES' or 'NO'.

    Is this a useful diagram (like a graph, function, shape, geometry, or data chart)?
    
    Answer NO if it is:
    - A blank grid
    - Empty lined paper
    - An empty box
    - A logo, barcode, or small icon
    - Just text instructions
    """

    try:
        return client.chat.complete(
            model="pixtral-12b-2409",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": f"data:image/jpeg;base64,{base64_image}"}
                    ]
                }
            ]
        )
    except SDKError as err:
        # Rate limits (429), timeouts (408), 5xx and errors without a usable
        # status are re-raised unchanged so the decorator retries them.
        # Other 4xx responses (bad request, auth failure, ...) are permanent:
        # retrying them only burns the backoff time.
        status = getattr(err, "status_code", None)
        if isinstance(status, int) and 400 <= status < 500 and status not in (408, 429):
            raise RuntimeError(f"Vision request rejected with HTTP {status}; not retrying.") from err
        raise

# --- HELPER 3: THE VISION CHECK ---
def is_useful_image(client, image_path):
    """
    Decide whether a saved image should be kept as a diagram.

    Runs the local is_likely_junk() screening first and returns False when it
    flags the file. Otherwise the file is base64-encoded, sent through
    call_vision_model() after a one-second pause, and the result is True when
    the upper-cased reply contains 'YES'. Any exception during the API stage
    is printed and the image is kept (returns True).
    """
    # Stage 1: cheap local screening, no network traffic.
    if is_likely_junk(image_path):
        print(f"   Skipping tiny/junk image: {os.path.basename(image_path)}")
        return False

    # Stage 2: ask the vision model (slower, costs an API call).
    try:
        with open(image_path, "rb") as image_file:
            raw_bytes = image_file.read()
        encoded_image = base64.b64encode(raw_bytes).decode('utf-8')

        # Short pause before every request to stay under the API rate limit.
        time.sleep(1)

        reply = call_vision_model(client, encoded_image)
        verdict = reply.choices[0].message.content.strip().upper()
        if "YES" in verdict:
            return True
        return False

    except Exception as e:
        print(f"   ‚ö†Ô∏è Vision check failed for {image_path}: {e}")
        # Fail open: when the check itself breaks, keep the image rather than lose a diagram.
        return True

# --- MAIN FUNCTION ---
def _questions_from_llm_json(raw_json):
    """Return the question dicts found in the chat model's decoded JSON reply."""
    if isinstance(raw_json, list):
        candidates = raw_json
    elif isinstance(raw_json, dict):
        candidates = raw_json.get("questions")
        if not isinstance(candidates, list):
            # No usable "questions" key: fall back to the first list-valued
            # entry, which covers replies shaped like {"2025": [...]}.
            candidates = next(
                (value for value in raw_json.values() if isinstance(value, list)),
                [],
            )
    else:
        candidates = []
    # Entries that are not JSON objects cannot carry question fields; drop them.
    return [entry for entry in candidates if isinstance(entry, dict)]


def extract_exam_with_schema(pdf_path):
    """OCR a PDF exam, keep only useful diagrams, and return structured questions.

    Steps:
      A. Send the PDF (inline base64 data URI) to Mistral OCR, collect the
         per-page markdown and save every returned image under ``IMAGES_DIR``.
      B. Run ``is_useful_image`` on each saved image and keep the ones it accepts.
      C. Ask the chat model to extract the questions as JSON, telling it the
         expected structure and which image IDs it may reference.
      D. Replace each question's ``image_id`` with ``image_url``: the local
         path of the image when the ID is one of the kept images, else None.

    Uses the module-level ``client``, ``IMAGES_DIR``, ``save_image_from_ocr``
    and ``is_useful_image``.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        ``{"questions": [...]}`` where each item is a question dict from the
        model with ``image_id`` removed and ``image_url`` added.

    Raises:
        OSError: If the PDF cannot be read.
        ValueError: If the chat model returns no text content or invalid JSON.
    """
    print(f"üöÄ Reading {pdf_path} with Mistral OCR...")

    with open(pdf_path, "rb") as f:
        pdf_data = base64.b64encode(f.read()).decode("utf-8")

    # A. OCR PROCESS
    ocr_response = client.ocr.process(
        model="mistral-ocr-latest",
        document={
            "type": "document_url",
            "document_url": f"data:application/pdf;base64,{pdf_data}"
        },
        include_image_base64=True
    )

    markdown_parts = []
    saved_images_map = {}

    print("üì∏ Saving raw images...")
    for page in ocr_response.pages:
        markdown_parts.append(f"\n\n--- Page {page.index} ---\n\n{page.markdown}")

        for img in (page.images or []):
            if not img.image_base64:
                # Nothing to decode for this image; skip it instead of crashing the whole run.
                print(f"   Skipping image without data: {img.id}")
                continue
            local_path = save_image_from_ocr(img.image_base64, img.id, IMAGES_DIR)
            saved_images_map[img.id] = local_path

    full_markdown = "".join(markdown_parts)

    # B. VISION FILTER (Updated with Retry & Pre-check)
    print("üïµÔ∏è‚Äç‚ôÄÔ∏è Running Smart Vision Filter...")

    valid_images_map = {}
    for img_id, img_path in saved_images_map.items():
        if is_useful_image(client, img_path):
            valid_images_map[img_id] = img_path
            print(f"   ‚úÖ Kept useful image: {img_id}")

    # C. LLM STRUCTURING
    print("üß† Structuring data with Mistral Large...")

    valid_ids_str = ", ".join(valid_images_map.keys())

    # The structure is spelled out explicitly: the model only sees this prompt,
    # so merely referring to "the defined JSON structure" leaves it guessing.
    prompt = f"""
    Analyze the math exam text below.
    Extract all questions into the JSON structure defined here.

    ### REQUIRED JSON STRUCTURE:
    {{
        "year": 2024,
        "questions": [
            {{
                "id": "2024_q1_a",
                "topic": "Algebra",
                "text": "The question text with LaTeX math wrapped in $...$",
                "image_id": "The ID of any diagram/graph mentioned, or null if none"
            }}
        ]
    }}

    ### IMAGE RULES:
    The ONLY valid image IDs available to use are: [{valid_ids_str}].
    If a question references an ID NOT in this list, set 'image_id': null.

    TEXT CONTENT:
    {full_markdown}
    """

    chat_response = client.chat.complete(
        model="mistral-large-latest",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )

    # D. POST-PROCESSING
    content = chat_response.choices[0].message.content
    if not isinstance(content, str) or not content.strip():
        raise ValueError("The chat model returned no text content; cannot parse questions as JSON.")
    raw_json = json.loads(content)

    final_questions = []
    for q in _questions_from_llm_json(raw_json):
        image_id = q.pop("image_id", None)
        # Only a string can be a known image ID; the isinstance guard also
        # avoids a TypeError on unhashable values such as lists.
        q["image_url"] = valid_images_map.get(image_id) if isinstance(image_id, str) else None
        final_questions.append(q)

    return { "questions": final_questions }
# --- 5. EXECUTION ---
# Guard first: without the input PDF there is nothing to run.
if not os.path.exists(PDF_PATH):
    print("‚ö†Ô∏è PDF not found.")
else:
    try:
        data = extract_exam_with_schema(PDF_PATH)

        # Create the output folder if needed, then write the result as indented JSON.
        output_folder = os.path.dirname(OUTPUT_JSON_PATH)
        os.makedirs(output_folder, exist_ok=True)
        with open(OUTPUT_JSON_PATH, "w") as f:
            json.dump(data, f, indent=4)

        print(f"üéâ Success! Extracted {len(data['questions'])} questions.")
        print(f"üìÇ Images saved to: {IMAGES_DIR}")

    except Exception as e:
        # Any failure in extraction or saving is reported as one line; the cell does not raise.
        print(f"‚ùå Error: {e}")

🚀 Reading test.pdf with Mistral OCR...
📸 Saving raw images...
🕵️‍♀️ Running Smart Vision Filter...
   ✅ Kept useful image: img-10.jpeg
   ✅ Kept useful image: img-12.jpeg
   ✅ Kept useful image: img-31.jpeg
   ✅ Kept useful image: img-39.jpeg
   ✅ Kept useful image: img-42.jpeg
   ✅ Kept useful image: img-53.jpeg
   ✅ Kept useful image: img-54.jpeg
🧠 Structuring data with Mistral Large...
🎉 Success! Extracted 10 questions.
📂 Images saved to: ../data/images
