In [32]:
import os
import requests
import json
from pathlib import Path
from dotenv import load_dotenv
import time

In [33]:
# Set DocuMagnetIR as the root folder
root_dir = Path.cwd().parent  # Assuming notebook is at DocuMagnetIR/Notebooks/
if root_dir.name != "DocuMagnetIR":
    # Navigate up until we find the root folder
    while root_dir.name != "DocuMagnetIR" and root_dir != root_dir.parent:
        root_dir = root_dir.parent
    
    if root_dir.name != "DocuMagnetIR":
        raise Exception("Could not find DocuMagnetIR root directory")

# Define file paths
pdf_path = root_dir / "data" / "sample_papers" / "midterm-old.pdf"

# Check if the file exists
if not pdf_path.exists():
    raise FileNotFoundError(f"PDF file not found at {pdf_path}")

In [34]:
env_path = root_dir / ".env"  # Adjust if your .env is in a different location
load_dotenv(dotenv_path=env_path)

True

In [35]:
# Load API credentials from environment variables or a secure config file
# For demonstration, we'll use environment variables
app_id = os.environ.get("MATHPIX_APP_ID")
app_key = os.environ.get("MATHPIX_APP_KEY")
if not app_id or not app_key:
    raise ValueError("Mathpix API credentials not found. Please set MATHPIX_APP_ID and MATHPIX_APP_KEY environment variables.")


In [1]:
import os
import json
import requests
import time
from pathlib import Path
from dotenv import load_dotenv

# Set DocuMagnetIR as the root folder
root_dir = Path.cwd().parent  # Assuming notebook is at DocuMagnetIR/Notebooks/
if root_dir.name != "DocuMagnetIR":
    # Navigate up until we find the root folder
    while root_dir.name != "DocuMagnetIR" and root_dir != root_dir.parent:
        root_dir = root_dir.parent
    
    if root_dir.name != "DocuMagnetIR":
        raise Exception("Could not find DocuMagnetIR root directory")

# Load environment variables from .env file
env_path = root_dir / ".env"
load_dotenv(dotenv_path=env_path)

# Get API credentials
app_id = os.environ.get("MATHPIX_APP_ID")
app_key = os.environ.get("MATHPIX_APP_KEY")

# Define file path
pdf_path = root_dir / "data" / "sample_papers" / "ps1_updated.pdf"

# Comprehensive options with all parameters enabled for experimentation
options = {
    # Core parameters
    "streaming": True,  # Enable streaming for the request
    "metadata": {
        "document_id": "midterm-old",
        "source": "experiment",
        "category": "academic"
    },
    "alphabets_allowed": {
        "en": True,     # English
    },
    "rm_spaces": False,  # Remove extra white space from equations
    "rm_fonts": False,   # Remove font commands from equations
    "idiomatic_eqn_arrays": False,  # Use aligned, gathered, or cases instead of array
    "include_equation_tags": True,  # Include equation number tags
    "include_smiles": True,  # Enable chemistry diagram OCR
    "include_chemistry_as_image": True,  # Return image crop with SMILES in alt-text
    "include_diagram_text": True,  # Enable text extraction from diagrams
    "numbers_default_to_math": True,  # Numbers are always treated as math
    
    # Delimiter settings
    "math_inline_delimiters": ["$", "$"],  # Inline math delimiters
    "math_display_delimiters": ["$$", "$$"],  # Display math delimiters
    
    # Page settings
    "page_ranges": "1-",  # Process all pages
    
    # Processing options
    "enable_spell_check": True,  # Enable predictive mode for English handwriting
    "auto_number_sections": True,  # Automatically number sections
    "remove_section_numbering": False,  # Don't remove existing section numbering
    "preserve_section_numbering": False,  # Keep existing section numbering
    "enable_tables_fallback": True,  # Enable advanced table processing
    "fullwidth_punctuation": False,  # Use halfwidth Unicode for punctuation
    
    # Conversion formats
    "conversion_formats": {
        "md": True,       # Markdown
        "tex.zip": True,   # LaTeX with images in ZIP
    }
}

print("Starting Mathpix PDF processing with all parameters enabled...")

# Step 1: Upload the PDF file
print(f"Uploading PDF: {pdf_path}")
with open(pdf_path, "rb") as file:
    upload_response = requests.post(
        "https://api.mathpix.com/v3/pdf",
        headers={
            "app_id": app_id,
            "app_key": app_key
        },
        data={
            "options_json": json.dumps(options)
        },
        files={
            "file": file
        }
    )

# Process upload response
if upload_response.status_code != 200:
    print(f"Error uploading PDF: {upload_response.text}")
else:
    response_data = upload_response.json()
    pdf_id = response_data.get("pdf_id")
    print(f"PDF uploaded successfully with ID: {pdf_id}")
    print(json.dumps(response_data, indent=4))

    # Step 2: Check processing status
    print("\nChecking processing status...")
    processing_complete = False
    max_attempts = 1
    attempt = 0
    wait_time = 5  # seconds between status checks

    while not processing_complete and attempt < max_attempts:
        status_response = requests.get(
            f"https://api.mathpix.com/v3/pdf/{pdf_id}",
            headers={
                "app_id": app_id,
                "app_key": app_key
            }
        )
        
        if status_response.status_code != 200:
            print(f"Error checking status: {status_response.text}")
            break
            
        status_data = status_response.json()
        status = status_data.get("status")
        
        print(f"Attempt {attempt + 1}/{max_attempts}: Status = {status}")
        
        if status == "completed":
            processing_complete = True
            print("PDF processing completed successfully!")
            print(json.dumps(status_data, indent=4))
        elif status == "error":
            print("Error in PDF processing:")
            print(json.dumps(status_data, indent=4))
            break
        else:
            # Still processing, wait and try again
            print(f"Processing in progress ({status})... waiting {wait_time} seconds")
            time.sleep(wait_time)
            
        attempt += 1

    if not processing_complete:
        print("Timed out waiting for processing to complete")
    else:
        # Step 3: Download the processed results
        print("\nDownloading processed results...")
        results_dir = root_dir / "results" / pdf_id
        results_dir.mkdir(parents=True, exist_ok=True)

        # Get available conversion formats from the status response
        conversion_formats = status_data.get("conversion_formats", {})

        for format_type, format_info in conversion_formats.items():
            if isinstance(format_info, dict) and "url" in format_info:
                download_url = format_info["url"]
                output_path = results_dir / f"result.{format_type.replace('.', '_')}"
                
                print(f"Downloading {format_type} result...")
                
                download_response = requests.get(download_url)
                
                if download_response.status_code == 200:
                    with open(output_path, "wb") as f:
                        f.write(download_response.content)
                    print(f"Downloaded {format_type} to {output_path}")
                else:
                    print(f"Error downloading {format_type}: {download_response.status_code}")

        # Step 4: Retrieve additional data like lines.json (for detailed structure)
        print("\nRetrieving detailed document structure...")
        lines_response = requests.get(
            f"https://api.mathpix.com/v3/pdf/{pdf_id}/lines.json",
            headers={
                "app_id": app_id,
                "app_key": app_key
            }
        )
        
        if lines_response.status_code == 200:
            lines_data = lines_response.json()
            lines_path = results_dir / "lines.json"
            with open(lines_path, "w", encoding="utf-8") as f:
                json.dump(lines_data, f, indent=2)
            print(f"Document structure saved to {lines_path}")
        else:
            print(f"Error retrieving document structure: {lines_response.status_code}")

        print("\nMathpix PDF processing pipeline completed successfully!")
        print(f"All results saved to {results_dir}")

Starting Mathpix PDF processing with all parameters enabled...
Uploading PDF: d:\MSE_DS_JHU\Semester1\Information_Retrieval_Web_Agents\Git\DocuMagnetIR\data\sample_papers\ps1_updated.pdf
PDF uploaded successfully with ID: 2025_04_22_a6cc747d324096981d68g
{
    "pdf_id": "2025_04_22_a6cc747d324096981d68g"
}

Checking processing status...
Attempt 1/1: Status = loaded
Processing in progress (loaded)... waiting 5 seconds
Timed out waiting for processing to complete


In [None]:
pdf_id = "2025_04_22_a6cc747d324096981d68g"
headers = {
  "app_key": app_key,
  "app_id": app_id
}

# get mmd response
url = "https://api.mathpix.com/v3/pdf/" + pdf_id + ".mmd"
response = requests.get(url, headers=headers)
with open(pdf_id + ".md", "w") as f:
    f.write(response.text)

In [3]:
import httpx
import asyncio
import json
import os
import time
import traceback
from pathlib import Path

BASE_URL = "https://api.mathpix.com/v3/pdf"
APP_KEY = "YOUR-APP-KEY"  # Replace with your actual app key
MAX_RETRIES = 5
RETRY_DELAY = 5  # seconds

async def upload_pdf_file(file_path):
    """
    Uploads a PDF file from the local filesystem for processing and retrieves the `pdf_id`.
    """
    print(f"Uploading PDF: {file_path}")
    
    headers = {"app_key": APP_KEY}
    
    # For file uploads, we need to use multipart/form-data
    files = {"file": open(file_path, "rb")}
    
    # Add streaming parameter
    data = {"options_json": json.dumps({"streaming": True})}
    
    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                BASE_URL, 
                headers=headers, 
                files=files, 
                data=data
            )
            
            if response.status_code == 200:
                data = response.json()
                print(f"PDF uploaded successfully with ID: {data.get('pdf_id')}")
                print(data)
                return data.get("pdf_id")
            else:
                print(f"Failed to upload PDF: {response.status_code}, {response.text}")
                return None
    except Exception as e:
        print(f"Error uploading PDF: {e}")
        print(traceback.format_exc())
        return None
    finally:
        # Make sure to close the file
        if 'files' in locals() and 'file' in files:
            files['file'].close()

async def check_processing_status(pdf_id):
    """
    Checks the processing status of a PDF.
    """
    url = f"{BASE_URL}/{pdf_id}"
    headers = {"app_key": APP_KEY}
    
    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(url, headers=headers)
            if response.status_code == 200:
                return response.json()
            else:
                print(f"Failed to check status: {response.status_code}, {response.text}")
                return None
    except Exception as e:
        print(f"Error checking status: {e}")
        return None

async def wait_for_processing(pdf_id, max_attempts=12, delay=5):
    """
    Waits for the PDF processing to complete by polling the status.
    Returns True if processing completed, False if timed out.
    """
    print("Checking processing status...")
    
    for attempt in range(1, max_attempts + 1):
        print(f"Attempt {attempt}/{max_attempts}: ", end="")
        
        status_data = await check_processing_status(pdf_id)
        
        if not status_data:
            print("Failed to get status")
            await asyncio.sleep(delay)
            continue
        
        status = status_data.get("status")
        print(f"Status = {status}")
        
        if status == "completed":
            print("Processing completed successfully!")
            return True
        elif status in ["error", "failed"]:
            print(f"Processing failed with status: {status}")
            if "error" in status_data:
                print(f"Error details: {status_data['error']}")
            return False
        else:
            print(f"Processing in progress ({status})... waiting {delay} seconds")
            await asyncio.sleep(delay)
    
    print("Timed out waiting for processing to complete")
    return False

async def stream_pdf(pdf_id):
    """
    Streams the processed PDF data using the `pdf_id`.
    """
    url = f"{BASE_URL}/{pdf_id}/stream"
    headers = {"app_key": APP_KEY}
    
    print(f"Starting streaming for PDF ID: {pdf_id}")
    results = []
    
    try:
        async with httpx.AsyncClient(timeout=httpx.Timeout(timeout=None)) as client:
            async with client.stream("GET", url, headers=headers) as response:
                if response.status_code == 200:
                    print("Connected to the stream!")
                    async for line in response.aiter_lines():
                        if line.strip():  # Ignore empty lines
                            try:
                                data = json.loads(line)
                                # Store the complete result
                                results.append(data)
                                
                                # Print a preview (only first 50 chars of text if available)
                                preview = data.copy()
                                if 'text' in preview and isinstance(preview['text'], str) and len(preview['text']) > 50:
                                    preview['text'] = preview['text'][:50] + "..."
                                print(f"Received chunk: {preview}")
                            except json.JSONDecodeError:
                                print(f"Failed to decode line: {line}")
                else:
                    print(f"Failed to connect to stream: {response.status_code}, {response.text}")
        
        return results
    except Exception as e:
        print(f"Streaming error: {e}")
        print(traceback.format_exc())
        return []

async def save_results(results, output_file):
    """
    Saves the results to a file.
    """
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)
        print(f"Results saved to {output_file}")
        return True
    except Exception as e:
        print(f"Error saving results: {e}")
        print(traceback.format_exc())
        return False

async def process_pdf(file_path, output_dir="results"):
    """
    Main function to process a PDF file with streaming.
    """
    print("Starting Mathpix PDF processing with streaming enabled...")
    
    # 1. Upload the PDF
    pdf_id = await upload_pdf_file(file_path)
    if not pdf_id:
        return False
    
    # 2. Stream the results (no need to wait for processing completion)
    results = await stream_pdf(pdf_id)
    
    if not results:
        print("No results were streamed. Trying to check status...")
        # If streaming didn't work, fall back to waiting for processing
        processing_completed = await wait_for_processing(pdf_id)
        if not processing_completed:
            print("PDF processing failed")
            return False
    
    # 3. Save the results
    file_name = Path(file_path).stem
    output_file = os.path.join(output_dir, f"{file_name}_results.json")
    success = await save_results(results, output_file)
    
    return success

async def main():
    # Replace with your PDF file path
    pdf_path = r"d:\MSE_DS_JHU\Semester1\Information_Retrieval_Web_Agents\Git\DocuMagnetIR\data\sample_papers\ps1_updated.pdf"
    
    success = await process_pdf(pdf_path)
    if success:
        print("PDF processing completed successfully")
    else:
        print("PDF processing failed")

if __name__ == "__main__":
    asyncio.run(main())

RuntimeError: asyncio.run() cannot be called from a running event loop