### Experiment 1: Mistral OCR + SLM/LLM (OpenRouter) + LLM-as-a-judge
- Maintainer: Shivargha Bandopadhyay
- Date: 10/02/2026
- Modules: Mistral OCR, LangChain

In [31]:
import os
from time import time
from mistralai import Mistral 
from langchain_core.messages import HumanMessage,SystemMessage,AIMessage
from langchain_core.prompts import ChatPromptTemplate,HumanMessagePromptTemplate,SystemMessagePromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langgraph.graph import StateGraph, END, START

from dotenv import load_dotenv
from pydantic import BaseModel, Field
from typing import List,Annotated, Literal,TypedDict,Optional


# Call load_dotenv() before reading the keys (presumably this pulls values
# from a local .env file into the environment -- confirm against python-dotenv).
load_dotenv()

# Either key is None when the corresponding variable is not set.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")

In [25]:
## MISTRAL OCR ###
'''
Local files must first be uploaded to Mistral file storage (purpose "ocr").
The signed URL of the uploaded file is then passed to the Mistral OCR API to extract its text.
'''
# Shared Mistral API client, used by upload_file() and get_ocr_response() below.
# MISTRAL_API_KEY is read from the environment and is None if it was not set.
client = Mistral(api_key=MISTRAL_API_KEY)
def upload_file(file_path:str) -> str:
    """Upload a local file to Mistral file storage and return its signed URL.

    Args:
        file_path: Path of the local file to upload. Backslash and
            forward-slash separators are both accepted.

    Returns:
        The signed URL of the uploaded file, as returned by
        ``client.files.get_signed_url``.

    Raises:
        OSError: If ``file_path`` cannot be opened for reading.
    """
    # Normalise separators before taking the base name so the uploaded file
    # name is just the last path component, whichever separator style is used.
    filename = os.path.basename(file_path.replace("\\", "/"))

    ## Upload File to Mistral OCR Storage ##
    # The context manager closes the file handle once the upload call returns,
    # including when the upload raises.
    with open(file_path, "rb") as file_handle:
        uploaded_file = client.files.upload(
            file={
                "file_name": filename,
                "content": file_handle,
            },
            purpose="ocr",
        )

    signed_url = client.files.get_signed_url(file_id=uploaded_file.id)
    return signed_url.url

def get_ocr_response(file_url:str):
    """Run the ``mistral-ocr-latest`` model on the document at ``file_url``.

    Args:
        file_url: URL of the document to process (e.g. the signed URL
            returned by ``upload_file``).

    Returns:
        The response object from ``client.ocr.process``, unmodified.
    """
    # Describe the input as a URL-referenced document.
    document_source = {"type": "document_url", "document_url": file_url}

    # include_image_base64=True presumably asks the API to embed extracted
    # images as base64 in the response -- confirm against the Mistral OCR docs.
    return client.ocr.process(
        model="mistral-ocr-latest",
        document=document_source,
        include_image_base64=True,
    )

# Sample two-column resume used for this experiment (machine-specific path).
resume_path = "D:\\Tvarah\\resume_extraction\\data\\twocolumn\\Praseetha.k.pdf"

# Upload the file, then run OCR on its signed URL.
url = upload_file(resume_path)
ocr_response = get_ocr_response(url)

# Bare expression so the notebook displays the OCR result.
ocr_response


OCRResponse(pages=[OCRPageObject(index=0, markdown='Praseetha K\nDevSecOps Engineer\n\n- praseetha.k76@gmail.com\n- +91-7899674842\n- Bangalore, India\n- linkedIn\n\nKEY SKILLS\n\n- AWS | Jenkins |\n- Terraform | Nagios |\n- Shell Scripting / Python |\n- Azure | Kubernetes |\n- Docker | Linux |\n- Ansible | security Testing |\n- Git | Akamai |\n- mesos marathon |\n- Krutrim Cloud |\n- prometheus | grafana\n\nCERTIFICATES\n\n- Certified Ethical Hacker — ECC5920436718 |\n- RHCS &amp; RHCE Certification | CompTIA Security+ — COMP001021988350 |\n- Azure Administrator | Associate\n\nSUMMARY\n\nDevOps Engineer with 10+ years of experience in architecture design, automation, and scalable system deployments. Expertise in containers, microservices, and embedding security best practices throughout the development lifecycle."\n\nRELEVANT PROFESSIONAL EXPERIENCE\n\n## DevOps Engineer\n\nOLA\n\n07/2024 – Present | Bangalore, India\n\nIn this role, I delivered end-to-end infrastructure solutions, au

In [33]:
### Prompt Templates ###
# System message: the fixed extraction instructions given to the model
# (truthfulness, date normalisation, name and work-experience rules, edge cases).
# The template text contains no `{...}` placeholders, so it takes no inputs.
system_prompt = SystemMessagePromptTemplate.from_template(
    """
You are an expert AI Resume Parser & Data Extraction Specialist. 
Your goal is to extract structured candidate data from raw Resume Markdown text with 100% precision.
### CORE EXTRACTION RULES:
1.  **Truthfulness:** Extract ONLY what is explicitly stated in the text. Do not infer or hallucinate data. If a field (like "LinkedIn URL" or "End Date") is missing, leave it as null/None. Do not invent "N/A" or placeholders.
2.  **Date Normalization:** * Convert all dates to `YYYY-MM` format (e.g., "August 2022" -> "2022-08").
    * If a candidate writes "Present", "Current", or "Till Date", set the `end_date` to None.
3.  **Name Extraction:**
    * If the resume has a header like "Resume of John Doe", extract only "John Doe".
    * Do not include titles like "Mr.", "Dr.", or suffixes like "PMP" in the `full_name` field.
4.  **Work Experience:**
    * Split "Role" and "Company" if they appear on the same line (e.g., "Software Engineer | Google" -> Role: "Software Engineer", Company: "Google").
    * If a description contains "References available upon request", ignore that line.

### HANDLING EDGE CASES:
1. **Multi-Role:** If a candidate held multiple roles at the same company, treat them as separate objects in `work_experience`.
2. **Education:** Every new university/college in the education section should be treated as a separate object in `education`.
3. If you dont find any data for a field, leave it as None.

You will receive the Resume Markdown below. Populate the data model accurately.
"""   
)

# Human message: wraps the resume text in <resume_markdown> tags.
# `{resume_markdown}` is the only placeholder, so callers must supply a
# "resume_markdown" value when invoking the prompt.
human_prompt = HumanMessagePromptTemplate.from_template(
    """
    Here is the Resume Markdown:
    <resume_markdown>
    {resume_markdown}
    </resume_markdown>
"""
)

# Full chat prompt: the system instructions first, then the resume message.
prompt_template = ChatPromptTemplate.from_messages([system_prompt, human_prompt])


In [32]:
### Pydantic Structures ###
# Structured-output schema for a candidate's personal details.
# The field descriptions are part of the schema handed to the model, so they
# are kept verbatim.
class personal_info(BaseModel):
    # Required fields (no default).
    full_name: str = Field(description="Full name of the candidate")
    first_name: str = Field(description="First name of the candidate")

    # Optional fields: default to None when absent.
    middle_name: Optional[str] = Field(None, description="Middle name of the candidate,if not present then return None")
    last_name: Optional[str] = Field(None, description="Last name of the candidate, if not present then return None")
    date_of_birth: Optional[str] = Field(None, description="Date of birth of the candidate, if not present then return None")
    gender: Optional[str] = Field(None, description="Gender of the candidate, if not present then return None")
    nationality: Optional[str] = Field(None, description="Nationality of the candidate, if not present then return None")
    work_authorization: Optional[str] = Field(None, description="Work authorization of the candidate, if not present then return None")


In [36]:
# Chat model reached through OpenRouter's endpoint using the OpenRouter key.
llm_model = ChatOpenAI(
    base_url = "https://openrouter.ai/api/v1",
    api_key = OPENROUTER_API_KEY,
    model = "mistralai/ministral-14b-2512",
)

# Prompt -> model, with the model output parsed into a `personal_info` object.
chain = prompt_template | llm_model.with_structured_output(personal_info)

# Send the markdown of every OCR page, not just the first one, so that
# multi-page resumes are not truncated. For a single-page document this is
# identical to `ocr_response.pages[0].markdown`.
resume_markdown = "\n\n".join(page.markdown for page in ocr_response.pages)

# Bare expression so the notebook displays the extracted record.
chain.invoke({"resume_markdown": resume_markdown})

personal_info(full_name='Praseetha K', first_name='Praseetha', middle_name=None, last_name='K', date_of_birth=None, gender=None, nationality='Indian', work_authorization=None)