### Experiment 2: JD Parsing - MistralOCR + SLM/LLM (OpenRouter) + LLM-as-a-judge
- Maintainer : Shivargha Bandopadhyay
- Date : 12/02/2026
- Modules : MistralOCR, Langchain  
- Pipeline : MistralOCR -> LLM (Ministral 14B) -> LLM judge (Phi-4)

In [2]:
import os
from time import time
from mistralai import Mistral 
from langchain_core.messages import HumanMessage,SystemMessage,AIMessage
from langchain_core.prompts import ChatPromptTemplate,HumanMessagePromptTemplate,SystemMessagePromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langgraph.graph import StateGraph, END, START

from dotenv import load_dotenv
import json
from pydantic import BaseModel, Field
from typing import List,Annotated, Literal,TypedDict,Optional


# Load variables from a .env file (if one is found) into the environment, then
# read the two API keys. Each key is None when its variable is not set.
load_dotenv()
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")

In [3]:
## MISTRAL OCR ###
# Workflow for this cell:
#   1. A local file is first uploaded to Mistral's file storage.
#   2. The Mistral OCR API is then run against the URL of the uploaded file.
# Shared API client used by the helper functions below.
client = Mistral(api_key=MISTRAL_API_KEY)
def upload_file(file_path: str) -> str:
    """Upload a local file to Mistral file storage and return a signed URL for it.

    Args:
        file_path: Path of the local file to upload. Backslash and
            forward-slash separators are both accepted.

    Returns:
        The ``url`` attribute of the object returned by
        ``client.files.get_signed_url`` for the uploaded file.
    """
    # Take the last path component regardless of separator style, so the
    # uploaded name is not the entire path when "/" separators are used.
    filename = file_path.replace("\\", "/").rsplit("/", 1)[-1]

    ## Upload File to Mistral OCR Storage ##
    # The context manager closes the handle as soon as the upload call
    # returns; the handle was previously left open.
    with open(file_path, "rb") as file_handle:
        uploaded_file = client.files.upload(
            file={
                "file_name": filename,
                "content": file_handle,
            },
            purpose="ocr",
        )

    signed_url = client.files.get_signed_url(file_id=uploaded_file.id)
    return signed_url.url

def get_ocr_response(
    file_url: str,
    *,
    model: str = "mistral-ocr-latest",
    include_image_base64: bool = True,
):
    """Run Mistral OCR on a document URL and return the raw OCR response.

    Args:
        file_url: URL of the document to process (for example the signed URL
            of a previously uploaded file).
        model: Name of the OCR model to use. Defaults to the value that was
            previously hard-coded.
        include_image_base64: Forwarded to ``client.ocr.process``. Defaults to
            ``True`` as before; pass ``False`` when only the page markdown is
            needed.

    Returns:
        Whatever ``client.ocr.process`` returns for the request.
    """
    ## Get OCR Response ##
    ocr_response = client.ocr.process(
        model=model,
        document={
            "type": "document_url",
            "document_url": file_url,
        },
        include_image_base64=include_image_base64,
    )

    return ocr_response

# Upload the sample job-description PDF, then OCR it via the signed URL.
url = upload_file(
    file_path="D:\\Tvarah\\resume_extraction\\data\\Job Description\\Data Science\\Lead Data Scientist.pdf"
)
ocr_response = get_ocr_response(file_url=url)
# Bare expression so the notebook displays the OCR result for inspection.
ocr_response


OCRResponse(pages=[OCRPageObject(index=0, markdown='# Lead Data Scientist\n\nBengaluru\n\nWork Experience: 8+ years\n\n## Mandate Skills:\n\n- Experience in Programming (Python, R, SQL, NoSQL, Spark) with ML tools &amp; Cloud Technology (AWS, Azure, GCP)\n- Experience in Python libraries such as numpy, pandas, scikit-learn, tensor-flow, scapy, scrapy, BERT etc.\n- Good understanding in statistics, and ability to design statistical hypothesis testing to aid formal decision making.\n- Develops predictive models using Machine Learning algorithms (SVM, Random Forest, Neural Network, Decision Tree, Logistic Regression, K-mean Clustering, linear regression, PCA etc.)\n- Engaging with clients, understanding complex problem statements, and offering solutions in the domains of Retail, Pharma, Banking, Insurance, etc.\n- Contribute to internal product development initiatives related to data science.\n- Develop data science roadmap, and guide data scientist to meet their deliverables.\n- Handling

In [11]:
# Concatenate the per-page markdown into one document. Pages are separated by a
# blank line so that a heading or list item at the top of a page still starts
# on its own line instead of being glued to the end of the previous page.
markdown_combined = "\n\n".join(page.markdown for page in ocr_response.pages)

In [12]:
### Prompt Templates ###
# System message text: defines the recruiter role and the extraction rules.
_system_template_text = """
### ROLE
You are a Technical Recruiter AI. Your job is to extract structured matching criteria from a messy Job Description.

### INSTRUCTIONS
1. **Ignore the Fluff:** Do not extract text from the "About Us" or "Culture" sections unless it specifies a hard requirement (e.g., "Must work EST timezone").
2. **Distinguish Requirements:**
   - If a skill says "Required", "Must have", or "Proficient in", put it in `mandatory_skills`.
   - If a skill says "Bonus", "Plus", "Good to have", or "Familiarity with", put it in `optional_skills`.
3. **Formatting:**
   - Skills should be single keywords (e.g., "Python", "AWS", "React"). Do not write sentences.
"""

# Human message text: wraps the JD markdown, supplied at invoke time through
# the `job_description_markdown` variable.
_human_template_text = """
    Here is the JobDescription Markdown:
    <job_description>
    {job_description_markdown}
    </job_description>
"""

system_prompt = SystemMessagePromptTemplate.from_template(_system_template_text)
human_prompt = HumanMessagePromptTemplate.from_template(_human_template_text)

# Full chat prompt: system instructions first, then the job description.
prompt_template = ChatPromptTemplate.from_messages([system_prompt, human_prompt])


In [17]:
class skillsInfo(BaseModel):
    # Technical skills grouped by category. Every field is a list of strings
    # that defaults to an empty list.
    #
    # The descriptions ask for an empty list (not None) when a category is
    # absent, because None does not validate against `list[str]`. They are
    # built from adjacent string literals so that no source indentation ends
    # up inside the description text.
    # (Documented with comments rather than a docstring so that the schema
    # generated from this model is not given an extra description.)
    programming_languages: list[str] = Field(
        default=[],
        description=(
            "Programming languages as required, "
            "Ex: Python,C,C++ etc. If not present then return an empty list"
        ),
    )
    frameworks_and_libraries: list[str] = Field(
        default=[],
        description=(
            "Frameworks and libraries as required, "
            "Ex: React,Angular,Django,Flask,Pandas,Pytorch etc. "
            "If not present then return an empty list"
        ),
    )
    tools: list[str] = Field(
        default=[],
        description=(
            "Tools as required, "
            "Ex: Git,Docker,Kubernetes,Jenkins,Jira,etc. "
            "If not present then return an empty list"
        ),
    )
    databases: list[str] = Field(
        default=[],
        description=(
            "Databases as required, "
            "Ex: MySQL,PostgreSQL,MongoDB,Oracle,etc. "
            "If not present then return an empty list"
        ),
    )
    cloud_and_infra: list[str] = Field(
        default=[],
        description=(
            "Cloud and infra as required, "
            "Ex: AWS,Azure,GCP,etc. If not present then return an empty list"
        ),
    )

class JobDescription(BaseModel):
    # Structured result of parsing one job description. Documented with
    # comments rather than a docstring so the generated schema is unchanged.

    # ---- Metadata shown in the UI ----
    role_title: str = Field(
        ...,
        description="The standard job title, e.g., 'Senior Backend Engineer'",
    )
    company_name: Optional[str] = Field(default=None, description="Name of the company")
    salary_range: Optional[str] = Field(default=None, description="e.g. '$120k - $160k'")

    # ---- Must-have technical skills, grouped by category ----
    mandatory_skills: skillsInfo = Field(
        ...,
        description="Technical skills explicitly marked as required",
    )

    # ---- Nice-to-have skills (flat list) ----
    optional_skills: list[str] = Field(
        default=[],
        description="Skills listed as 'preferred', 'bonus', or 'plus'",
    )

    # ---- Experience requirement, normalized to a whole number of years ----
    min_years_experience: Optional[int] = Field(
        default=None,
        description="Integer value of required years. e.g. '5+ years' -> 5",
    )

    # ---- Education requirement ----
    degree_required: Optional[str] = Field(
        default=None,
        description="e.g. 'Bachelor in CS', 'PhD'",
    )

    # ---- Short human-readable summary of the role ----
    summary_responsibilities: list[str] = Field(
        ...,
        description="Top 3-5 core responsibilities summarized",
    )

In [None]:
# Extraction model, reached through OpenRouter's endpoint.
llm_model = ChatOpenAI(
    model="mistralai/ministral-14b-2512",
    api_key=OPENROUTER_API_KEY,
    base_url="https://openrouter.ai/api/v1",
)

# Prompt -> model, with the reply returned as a JobDescription instance.
chain = prompt_template | llm_model.with_structured_output(JobDescription)

# Run the extraction on the combined OCR markdown and pretty-print the result.
response = chain.invoke({"job_description_markdown": markdown_combined})
print(response.model_dump_json(indent=4))

{
    "role_title": "Lead Data Scientist",
    "company_name": null,
    "salary_range": null,
    "mandatory_skills": {
        "programming_languages": [
            "Python",
            "R",
            "SQL",
            "NoSQL"
        ],
        "frameworks_and_libraries": [
            "numpy",
            "pandas",
            "scikit-learn",
            "tensorflow",
            "scapy",
            "scrapy",
            "BERT",
            "Spark"
        ],
        "tools": [],
        "databases": [],
        "cloud_and_infra": [
            "AWS",
            "Azure",
            "GCP"
        ]
    },
    "optional_skills": [
        "statistics",
        "statistical hypothesis testing",
        "SVM",
        "Random Forest",
        "Neural Network",
        "Decision Tree",
        "Logistic Regression",
        "K-means Clustering",
        "linear regression",
        "PCA",
        "large scale data processing",
        "distributed computing",
        "project ma

In [25]:
### LLM as judge ##
# Second model, also reached through OpenRouter, used to grade the extraction.
llm_model_judge = ChatOpenAI(
    model="microsoft/phi-4",
    api_key=OPENROUTER_API_KEY,
    base_url="https://openrouter.ai/api/v1",
)

class judgeJson(BaseModel):
    # Verdict returned by the judge model. The field names here are the ones
    # the structured-output schema enforces, so the system prompt below must
    # refer to these same names.
    grade: str = Field(..., description='"PASS" or "FAIL"')
    grade_summary: str = Field(...,description="Summary of what things are right or what is wrong with the extraction")

# System prompt for the judge. The OUTPUT FORMAT section names the fields
# `grade` / `grade_summary` so that it agrees with the judgeJson schema; it
# previously asked for `verdict` / `reason`, which the schema does not define.
judge_system_prompt = SystemMessagePromptTemplate.from_template(
    '''
    ### ROLE
    You are a strict QA Auditor for a Resume & JD Parsing pipeline. 
    Your job is to compare the SOURCE_TEXT (Markdown) with the EXTRACTED_JSON.

    ### TASK
    Verify the JSON against the text for three specific errors:
    1. **Hallucinations:** Did the JSON invent a skill, job, or degree not present in the text?
    2. **Date Errors:** Are the start/end dates in the JSON supported by the text?
    3. **Missing Critical Data:** Did the JSON return 'null' for a name or email that is clearly visible in the text?

    ### OUTPUT FORMAT
    Return valid JSON with two fields:
    - "grade": "PASS" or "FAIL"
    - "grade_summary": "Short explanation of the error (if FAIL), otherwise 'looks good'. in about a single line (100 words)"

    ### CONSTRAINTS
    - Ignore minor formatting differences (e.g., "Software Eng." vs "Software Engineer" is acceptable).
    - Be strict about Dates and Numbers.
     
    '''
)

# Human message text for the judge: the OCR markdown and the extracted JSON,
# each wrapped in its own tag and supplied at invoke time through the
# `markdown` and `jsondata` variables.
_judge_human_template_text = '''
    Here is the markdown extraction:
    <markdown>
    {markdown}
    </markdown>

    Here is the Json extraction:
    <jsondata>
    {jsondata}
    </jsondata>
    '''

human_prompt_judge = HumanMessagePromptTemplate.from_template(_judge_human_template_text)

# Full judge prompt: auditor instructions first, then the material to compare.
prompt_template_judge = ChatPromptTemplate.from_messages([judge_system_prompt, human_prompt_judge])


In [26]:
# Grade the extraction with the judge model.
# Dedicated names are used so that `chain` and `response` are not rebound
# here: re-running this cell would otherwise pass the judge's own verdict
# back in as `jsondata`.
judge_chain = prompt_template_judge | llm_model_judge.with_structured_output(judgeJson)
judge_response = judge_chain.invoke(
    {
        "markdown": markdown_combined,
        "jsondata": response.model_dump_json(indent=4),
    }
)
print(judge_response.model_dump_json(indent=4))

{
    "grade": "PASS",
    "grade_summary": "The JSON reflects the information present in the Markdown source text accurately without any hallucination of skills or degrees. It correctly extracts the specified work experience and proficiency in various programming language and machine learning libraries. The extraction omits company names but includes the location 'Bengaluru,' which while not ideal, does not contain any critical errors as company names are not explicitly stated in the source text. The desired role title 'Lead Data Scientist' aligns with the role title extracted from the source text. Dates and numbers are consistent, with no 'null' values present for any critical data such as names or emails, which are not mentioned in the source text but do not contribute to errors as no 'null' critical information is missing. Overall, the extracted JSON accurately reflects key contents, adjusting slightly due to formatting expectations from the source text, consistent with what a stri