### Experiment1: Resume Parsing -  MistralOCR + SLM/LLM (OpenRouter) + LLM_as_a_judge 
- Maintainer : Shivargha Bandopadhyay
- Date : 10/02/2026
- Modules : MistralOCR, Langchain  
- Pipeline : MistralOCR -> LLM (Ministral 14B) -> LLM_as_a_judge (Phi4 14B) -> Loop(on Failure, to MistralOCR)

In [49]:
import os
from time import time
from mistralai import Mistral 
from langchain_core.messages import HumanMessage,SystemMessage,AIMessage
from langchain_core.prompts import ChatPromptTemplate,HumanMessagePromptTemplate,SystemMessagePromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langgraph.graph import StateGraph, END, START

from dotenv import load_dotenv
import json
from pydantic import BaseModel, Field
from typing import List,Annotated, Literal,TypedDict,Optional


# Read variables from a .env file into the process environment.
load_dotenv()
# API keys for OpenRouter (LLM calls) and Mistral (OCR).
# os.getenv returns None when the variable is not set.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")

In [38]:
## MISTRAL OCR ###
'''
Local files must first be uploaded to Mistral file storage (purpose: OCR).
The Mistral OCR API then extracts text from the signed URL of the uploaded file.
'''
# Module-level Mistral SDK client, authenticated with MISTRAL_API_KEY.
client = Mistral(api_key=MISTRAL_API_KEY)
def upload_file(file_path:str) -> str:
    """Upload a local file to Mistral storage and return a signed URL for it.

    Args:
        file_path: Path of the local file to upload. Windows-style and
            POSIX-style separators are both accepted.

    Returns:
        The signed URL of the uploaded file.
    """
    # ntpath splits on both backslash and forward slash on every host OS, so
    # existing Windows paths keep working and "/"-separated paths no longer
    # leak their directory part into the uploaded file name.
    import ntpath

    filename = ntpath.basename(file_path)

    ## Upload File to Mistral OCR Storage ##
    # The context manager closes the handle once the upload call returns;
    # previously the file object was never closed.
    with open(file_path, "rb") as file_handle:
        uploaded_file = client.files.upload(
            file={
                "file_name": filename,
                "content": file_handle,
            },
            purpose="ocr",
        )

    signed_url = client.files.get_signed_url(file_id=uploaded_file.id)
    return signed_url.url

def get_ocr_response(file_url:str):
    """Run the Mistral OCR model on the document located at ``file_url``.

    Args:
        file_url: URL of the document to process.

    Returns:
        The response object returned by ``client.ocr.process``.
    """
    # Describe the input as a URL-addressed document.
    document_spec = {
        "type": "document_url",
        "document_url": file_url,
    }

    return client.ocr.process(
        model="mistral-ocr-latest",
        document=document_spec,
        include_image_base64=True,
    )

# Upload one sample resume (hard-coded local Windows path) and run OCR on it.
url = upload_file("D:\\Tvarah\\resume_extraction\\data\\DE_HARSHRAJ[3y_0m]_Good Resume.pdf")
ocr_response = get_ocr_response(url)
# Bare expression: shows the OCR response when it is the last line of a notebook cell.
ocr_response 


OCRResponse(pages=[OCRPageObject(index=0, markdown='Harsh Raj\n\nNoida, Uttar Pradesh, India | rajharsh5450@gmail.com | (+91) 8340643631 | LinkedIn\n\n# EXPERIENCE\n\nData Engineer 1, MAQ Software - Noida, Uttar Pradesh, India\nJuly 2023 - Present\n\n- Design, Build, and Optimise Data Pipelines: Implemented a Medallion Architecture in Azure Databricks for a Sales and Revenue Organisation, migrating and processing data from SQL Server and on-premises CSV files. Designed and optimised pipelines by focussing on data quality, lineage, and scalability, enabling integration of over 80,000 sales-related records daily.\n- Modernised ETL Architecture: Redesigned and optimized a Microsoft Fabric architecture for an ISV client to process 500 GB of data reducing their end-to-end data refresh duration to 70 minutes from 4 hours. Enabled end-to-end ingestion, processing, and reporting across 3 workspaces, improving scaling and data quality.\n- CI/CD Pipeline for Microsoft Fabric Deployment: Develope

In [None]:
### Prompt Templates ###
# System message holding the extraction rules for the parser LLM.
# It contains no template placeholders; the resume text is supplied by the
# human message.
# NOTE: the trailing backslash in edge case 4 is a line continuation inside
# the string, so that rule and the line after it form a single line of text.
system_prompt = SystemMessagePromptTemplate.from_template(
    """
You are an expert AI Resume Parser & Data Extraction Specialist. 
Your goal is to extract structured candidate data from raw Resume Markdown text with 100% precision.
### CORE EXTRACTION RULES:
1.  **Truthfulness:** Extract ONLY what is explicitly stated in the text. Do not infer or hallucinate data. If a field (like "LinkedIn URL" or "End Date") is missing, leave it as null/None. Do not invent "N/A" or placeholders.
2.  **Date Normalization:** * Convert all dates to `YYYY-MM` format (e.g., "August 2022" -> "2022-08").
    * If a candidate writes "Present", "Current", or "Till Date", set the `end_date` to None.
3.  **Name Extraction:**
    * If the resume has a header like "Resume of John Doe", extract only "John Doe".
    * Do not include titles like "Mr.", "Dr.", or suffixes like "PMP" in the `full_name` field.
4.  **Work Experience:**
    * Split "Role" and "Company" if they appear on the same line (e.g., "Software Engineer | Google" -> Role: "Software Engineer", Company: "Google").
    * If a description contains "References available upon request", ignore that line.

### HANDLING EDGE CASES:
1. **Multi-Role:** If a candidate held multiple roles at the same company, treat them as separate objects in `work_experience`.
2. **Education:** Every new university/college in the education section should be treated as a separate object in `education`.
3. If you dont find any data for a field, leave it as None.
4. For City, Country, State - Analyze based on the current work location, if current location comes out remote,\
    try to find out current location, current country, current state from previous work experiences, if not found then return None for all three.

You will receive the Resume Markdown below. Populate the data model accurately.
"""   
)

# Human message: wraps the resume text, supplied through the
# `resume_markdown` template variable, in <resume_markdown> tags.
human_prompt = HumanMessagePromptTemplate.from_template(
    """
    Here is the Resume Markdown:
    <resume_markdown>
    {resume_markdown}
    </resume_markdown>
"""
)

# Full chat prompt: the system rules first, then the human message carrying
# the resume text.
prompt_template = ChatPromptTemplate.from_messages([system_prompt, human_prompt])


In [83]:
### Pydantic Structures ###

## Personal Information Class ##
# Identity details of the candidate. `full_name` and `first_name` are
# required; every other field is optional and defaults to None.
class personalInfo(BaseModel):
    full_name: str = Field(
        ...,
        description="Full name of the candidate",
    )
    first_name: str = Field(
        ...,
        description="First name of the candidate",
    )
    middle_name: str | None = Field(
        default=None,
        description="Middle name of the candidate,if not present then return None",
    )
    last_name: str | None = Field(
        default=None,
        description="Last name of the candidate, if not present then return None",
    )
    date_of_birth: str | None = Field(
        default=None,
        description="Date of birth of the candidate, if not present then return None",
    )
    gender: str | None = Field(
        default=None,
        description="Gender of the candidate, if not present then return None",
    )
    nationality: str | None = Field(
        default=None,
        description="Nationality of the candidate, if not present then return None",
    )
    work_authorization: str | None = Field(
        default=None,
        description="Work authorization of the candidate, if not present then return None",
    )

## Contact Information Class ##
# Email, phone and location details of the candidate. Every field is optional
# and defaults to None.
class contactInfo(BaseModel):
    primary_email: str | None = Field(
        default=None,
        description="Primary email address of the candidate, if not present then return None",
    )
    secondary_email: str | None = Field(
        default=None,
        description="Secondary email address of the candidate, if not present then return None",
    )
    primary_phone_number: str | None = Field(
        default=None,
        description="Primary phone number of the candidate, if not present then return None",
    )
    secondary_phone_number: str | None = Field(
        default=None,
        description="Secondary phone number of the candidate, if not present then return None",
    )
    country_code: str | None = Field(
        default=None,
        description="Country code of the candidate, if not present then return None",
    )
    current_city: str | None = Field(
        default=None,
        description="Current city of the candidate, if not present then return None",
    )
    current_state: str | None = Field(
        default=None,
        description="Current state of the candidate, if not present then return None",
    )
    current_country: str | None = Field(
        default=None,
        description="Current country of the candidate, if not present then return None",
    )
    postal_address: str | None = Field(
        default=None,
        description="Address of the candidate, if not present then return None",
    )

### Education class ###
# One entry per institution attended. Every field is optional and defaults to
# None.
# NOTE: the backslash continuations inside the descriptions keep the next
# line's leading spaces as part of the description text, so that indentation
# must not be changed.
class educationInfo(BaseModel):
    # --- Institution ---
    institution_name: str | None = Field(
        default=None,
        description="Institution name of the candidate, if not present then return None",
    )
    institution_type: str | None = Field(
        default=None,
        description="Institution type (University,College,School,Bootcamp,etc), if not present then return None",
    )
    institution_country: str | None = Field(
        default=None,
        description="Country of the institution, if not present then return None",
    )

    # --- Qualification ---
    degree: str | None = Field(
        default=None,
        description="Degree of the candidate with respect to the institution,\
        example: B.E, B.Tech, M.E, M.Tech, Ph.D, etc. if not present then return None",
    )
    field_of_study: str | None = Field(
        default=None,
        description="Field of study of the candidate with respect to the institution, \
        example: Computer Science, Information Technology, etc. if not present then return None",
    )
    specialisation: str | None = Field(
        default=None,
        description="Specialisation of the candidate with respect to the institution, \
        example: Artificial Intelligence, Marketing, Finance etc. if not present then return None",
    )
    education_level: str | None = Field(
        default=None,
        description="Education level of the candidate with respect to the institution, \
        example: high_school / bachelors / masters / phd / diploma/bootcamp/certificate, if not present then return None",
    )

    # --- Timeline ---
    start_date: str | None = Field(
        default=None,
        description="Start date at the institution, if not present then return None",
    )
    end_date: str | None = Field(
        default=None,
        description="End date of the institution, \
        If mentioned - Present or Ongoing, return None",
    )
    is_current: str | None = Field(
        default=None,
        description="Yes, if the candidate is currently pursuing the degree, \
        No, if the candidate has completed the degree, if not present then return None",
    )

    # --- Result and mode of study ---
    grade_or_gpa: str | None = Field(
        default=None,
        description="Grade or GPA of the candidate with respect to the institution, \
        if not present then return None",
    )
    mode: str | None = Field(
        default=None,
        description="Mode of the institution, \
        example: full_time, part_time, online, etc. if not present then return None",
    )


### Work experience class ###
# One entry per role held. `company_name` and `role_description` are required;
# the remaining fields are optional and default to None.
# NOTE: the backslash continuations inside the descriptions keep the next
# line's leading spaces as part of the description text.
class workExperienceInfo(BaseModel):
    # --- Employer and role ---
    company_name: str = Field(
        ...,
        description="Company name of the candidate",
    )
    company_location: str | None = Field(
        default=None,
        description="Location of the company, if not present then return None",
    )
    job_title: str | None = Field(
        default=None,
        description="Job title of the candidate, if not present then return None",
    )
    employment_type: str | None = Field(
        default=None,
        description="Employment type of the candidate with respect to the company,\
        example: full_time, part_time, contract, temporary,internship etc. if not present then return None",
    )

    # --- Timeline ---
    start_date: str | None = Field(
        default=None,
        description="Start date of the role, if not present then return None",
    )
    end_date: str | None = Field(
        default=None,
        description="End date of the role, if not present then return None",
    )
    is_current_role: str | None = Field(
        default=None,
        description="Yes, if the candidate is currently working in the role, if not present then return None",
    )

    # --- Summary ---
    role_description: str = Field(
        ...,
        description="Role description of the role, \
        This should be a brief summary of the role and responsibilities,not more than 200words.",
    )

    # Fields below are currently disabled.
    #team_or_department: str|None = Field(default=None,description="Team or department of the role, if not present then return None")
    #tools_mentioned: list[str] = Field(default=[],description="Tools the candidate has worked on with respect to the role,\
    #Ex:  if not present then return None")
    #technologies_mentioned: list[str] = Field(default=[],description="Technologies the candidate has worked on with respect to the role.\
    # if not present then return None")

# Skills of the candidate, grouped by category. Every field is a list of
# strings that defaults to an empty list.
# The descriptions ask for an empty list when nothing is found: the fields are
# typed `list[str]`, so a None value would not validate.
# NOTE: the backslash continuations inside the descriptions keep the next
# line's leading spaces as part of the description text.
class skillsInfo(BaseModel):
    #technical_skills: list[str] = Field(default=[],description="Technical skills of the candidate, if not present then return None")
    programming_languages: list[str] = Field(default=[],description="Programming languages the candidate has worked on,\
        Ex: Python,C,C++ etc if not present then return an empty list")
    frameworks_and_libraries: list[str] = Field(default=[],description="Frameworks and libraries the candidate has worked on, \
        Ex: React,Angular,Django,Flask,Pandas,Pytorch etc if not present then return an empty list")
    tools_and_platforms: list[str] = Field(default=[],description="Tools and platforms the candidate has worked on,\
        Ex: Git,Docker,Kubernetes,Jenkins,Jira,etc. if not present then return an empty list")
    databases: list[str] = Field(default=[],description="Databases the candidate has worked on, \
        Ex: MySQL,PostgreSQL,MongoDB,Oracle,etc. if not present then return an empty list")
    cloud_and_infra: list[str] = Field(default=[],description="Cloud and infra the candidate has worked on,\
        Ex: AWS,Azure,GCP,etc. if not present then return an empty list")
    soft_skills: list[str] = Field(default=[],description="Soft skills of the candidate, \
        Ex: Communication,Teamwork,Leadership,Problem-solving,etc. if not present then return an empty list")
    domain_skills: list[str] = Field(default=[],description="Domain skills of the candidate, \
        Ex: Finance,Healthcare,Education,etc. if not present then return an empty list")
    certified_skills: list[str] = Field(default=[],description="Certified skills of the candidate, if not present then return an empty list")
    
# Top-level structured result for one resume; all five sections are required.
class ResumeData(BaseModel):
    personal_info: personalInfo = Field(
        ...,
        description="Personal information of the candidate",
    )
    contact_info: contactInfo = Field(
        ...,
        description="Contact information of the candidate",
    )
    education_info: list[educationInfo] = Field(
        ...,
        description="Education information of the candidate",
    )
    work_experience_info: list[workExperienceInfo] = Field(
        ...,
        description="Work experience information of the candidate",
    )
    skills_info: skillsInfo = Field(
        ...,
        description="Skills information of the candidate",
    )

In [84]:
# Extraction LLM, reached through the OpenRouter endpoint with the OpenRouter key.
llm_model = ChatOpenAI(
    base_url = "https://openrouter.ai/api/v1",
    api_key = OPENROUTER_API_KEY,
    model = "mistralai/ministral-14b-2512",
)
# Prompt -> LLM, with the output structured as ResumeData.
chain = prompt_template | llm_model.with_structured_output(ResumeData)
# Send every OCR page, not only the first one, so that multi-page resumes are
# parsed in full. For a single-page document the text is unchanged.
resume_markdown = "\n\n".join(page.markdown for page in ocr_response.pages)
response = chain.invoke({"resume_markdown": resume_markdown})
# Bare expression: shows the parsed result when it is the last line of a notebook cell.
response

ResumeData(personal_info=personalInfo(full_name='Harsh Raj', first_name='Harsh', middle_name=None, last_name='Raj', date_of_birth=None, gender=None, nationality=None, work_authorization=None), contact_info=contactInfo(primary_email='rajharsh5450@gmail.com', secondary_email=None, primary_phone_number='+918340643631', secondary_phone_number=None, country_code='IN', current_city='Noida', current_state='Uttar Pradesh', current_country='India', postal_address=None), education_info=[educationInfo(institution_name='Indian Institute of Information Technology, Kalyani', institution_type='University', institution_country='India', degree='Bachelor of Technology', field_of_study='Computer Science and Engineering', specialisation=None, education_level='bachelors', start_date='2019-07', end_date='2023-06', is_current='No', grade_or_gpa='8.85/10', mode=None), educationInfo(institution_name='Pt.B.P.Saraswati Vidya Mandir, Jharkhand, India', institution_type='School', institution_country='India', degre

In [86]:
# Print the parsed resume as JSON with 4-space indentation.
print(response.model_dump_json(indent=4))

{
    "personal_info": {
        "full_name": "Harsh Raj",
        "first_name": "Harsh",
        "middle_name": null,
        "last_name": "Raj",
        "date_of_birth": null,
        "gender": null,
        "nationality": null,
        "work_authorization": null
    },
    "contact_info": {
        "primary_email": "rajharsh5450@gmail.com",
        "secondary_email": null,
        "primary_phone_number": "+918340643631",
        "secondary_phone_number": null,
        "country_code": "IN",
        "current_city": "Noida",
        "current_state": "Uttar Pradesh",
        "current_country": "India",
        "postal_address": null
    },
    "education_info": [
        {
            "institution_name": "Indian Institute of Information Technology, Kalyani",
            "institution_type": "University",
            "institution_country": "India",
            "degree": "Bachelor of Technology",
            "field_of_study": "Computer Science and Engineering",
            "specialisation"