### Experiment 2: JD Parsing - MistralOCR + SLM/LLM (OpenRouter) + LLM-as-a-judge
- Maintainer: Shivargha Bandopadhyay
- Date: 12/02/2026
- Modules: MistralOCR, LangChain
- Pipeline: MistralOCR -> LLM (Ministral 14B)

In [2]:
import os
from time import time
from mistralai import Mistral 
from langchain_core.messages import HumanMessage,SystemMessage,AIMessage
from langchain_core.prompts import ChatPromptTemplate,HumanMessagePromptTemplate,SystemMessagePromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langgraph.graph import StateGraph, END, START

from dotenv import load_dotenv
import json
from pydantic import BaseModel, Field
from typing import List,Annotated, Literal,TypedDict,Optional


# Load variables via python-dotenv first, then read the two API keys from the
# process environment. Either key is None when its variable is not set.
load_dotenv()
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")

In [3]:
## MISTRAL OCR ###
# NOTE: the triple-quoted string below is a bare expression used as a block
# comment; it is evaluated and discarded, and has no effect on the cell.
'''
Local Files Need to be uploaded to Mistral OCR - storage
Use Mistral OCR API to extract text from the uploaded file URL Link
'''
# Module-level Mistral SDK client built from MISTRAL_API_KEY. The helper
# functions in this cell (upload_file, get_ocr_response) read it as a global.
client = Mistral(api_key=MISTRAL_API_KEY)
def upload_file(file_path: str) -> str:
    """Upload a local file to Mistral file storage and return its signed URL.

    The file is uploaded with purpose ``"ocr"`` through the module-level
    ``client``; the signed URL of the stored file is then requested and
    returned so it can be handed to the OCR endpoint.

    Args:
        file_path: Path of the local file to upload. Both backslash and
            forward-slash separators are accepted.

    Returns:
        The signed URL (string) of the uploaded file.

    Raises:
        OSError: If the local file cannot be opened for reading.
    """
    # Function-scope import keeps this cell runnable on its own.
    from pathlib import PureWindowsPath

    # PureWindowsPath splits on both "\" and "/" regardless of the host OS, so
    # only the final path component is sent as the remote file name.
    filename = PureWindowsPath(file_path).name

    ## Upload File to Mistral OCR Storage ##
    # The context manager closes the handle even if the upload raises; the
    # original left the file object open for the life of the kernel.
    with open(file_path, "rb") as file_handle:
        uploaded_file = client.files.upload(
            file={
                "file_name": filename,
                "content": file_handle,
            },
            purpose="ocr",
        )

    signed_url = client.files.get_signed_url(file_id=uploaded_file.id)
    return signed_url.url

def get_ocr_response(
    file_url: str,
    *,
    model: str = "mistral-ocr-latest",
    include_image_base64: bool = True,
):
    """Run Mistral OCR on a document URL and return the raw OCR response.

    Args:
        file_url: URL of the document to process (for example the signed URL
            returned by ``upload_file``).
        model: OCR model identifier passed to the API. Defaults to the value
            that was previously hard-coded.
        include_image_base64: Forwarded to the API as-is. Defaults to ``True``
            (the previous hard-coded value); callers that only need the page
            markdown can pass ``False``.

    Returns:
        Whatever ``client.ocr.process`` returns; the rest of this notebook
        reads its ``pages`` attribute and each page's ``markdown``.
    """
    ## Get OCR Response ##
    return client.ocr.process(
        model=model,
        document={
            "type": "document_url",
            "document_url": file_url,
        },
        include_image_base64=include_image_base64,
    )

# Run the OCR stage on one sample job-description PDF: upload it, OCR the
# signed URL, then leave the response as the cell's last expression so the
# notebook displays it.
jd_pdf_path = "D:\\Tvarah\\resume_extraction\\data\\Job Description\\Data Science\\Lead Data Scientist.pdf"
url = upload_file(jd_pdf_path)
ocr_response = get_ocr_response(url)
ocr_response


OCRResponse(pages=[OCRPageObject(index=0, markdown='# Lead Data Scientist\n\nBengaluru\n\nWork Experience: 8+ years\n\n## Mandate Skills:\n\n- Experience in Programming (Python, R, SQL, NoSQL, Spark) with ML tools &amp; Cloud Technology (AWS, Azure, GCP)\n- Experience in Python libraries such as numpy, pandas, scikit-learn, tensor-flow, scapy, scrapy, BERT etc.\n- Good understanding in statistics, and ability to design statistical hypothesis testing to aid formal decision making.\n- Develops predictive models using Machine Learning algorithms (SVM, Random Forest, Neural Network, Decision Tree, Logistic Regression, K-mean Clustering, linear regression, PCA etc.)\n- Engaging with clients, understanding complex problem statements, and offering solutions in the domains of Retail, Pharma, Banking, Insurance, etc.\n- Contribute to internal product development initiatives related to data science.\n- Develop data science roadmap, and guide data scientist to meet their deliverables.\n- Handling

In [11]:
# Stitch the per-page OCR markdown into a single document. Pages are separated
# by a blank line instead of a single space: with a space, a heading or list
# item that opens a page would be glued onto the last line of the previous
# page and lose its markdown block structure.
markdown_combined = '\n\n'.join(page.markdown for page in ocr_response.pages)

In [12]:
### Prompt Templates ###
# Raw text of the system message: the role plus the extraction rules. It has
# no template variables.
_system_prompt_text = """
### ROLE
You are a Technical Recruiter AI. Your job is to extract structured matching criteria from a messy Job Description.

### INSTRUCTIONS
1. **Ignore the Fluff:** Do not extract text from the "About Us" or "Culture" sections unless it specifies a hard requirement (e.g., "Must work EST timezone").
2. **Distinguish Requirements:**
   - If a skill says "Required", "Must have", or "Proficient in", put it in `mandatory_skills`.
   - If a skill says "Bonus", "Plus", "Good to have", or "Familiarity with", put it in `optional_skills`.
3. **Formatting:**
   - Skills should be single keywords (e.g., "Python", "AWS", "React"). Do not write sentences.
"""

# Raw text of the human message; {job_description_markdown} is the single
# template variable and is filled in when the chain is invoked.
_human_prompt_text = """
    Here is the JobDescription Markdown:
    <job_description>
    {job_description_markdown}
    </job_description>
"""

system_prompt = SystemMessagePromptTemplate.from_template(_system_prompt_text)
human_prompt = HumanMessagePromptTemplate.from_template(_human_prompt_text)

# Chat prompt: system message first, then the human message.
prompt_template = ChatPromptTemplate.from_messages([system_prompt, human_prompt])


In [14]:
class JobDescription(BaseModel):
    # Structured result of parsing one job description. This model is handed
    # to `with_structured_output`, so field names and descriptions are part of
    # the schema the LLM is asked to fill in.
    # NOTE: documented with comments rather than a class docstring so that the
    # generated schema stays exactly as it was.

    # --- METADATA (For the UI) ---
    # Required: job title.
    role_title: str = Field(
        description="The standard job title, e.g., 'Senior Backend Engineer'",
    )
    # Optional: defaults to None when not provided.
    company_name: Optional[str] = Field(default=None)
    # Optional: free-text salary range.
    salary_range: Optional[str] = Field(
        default=None,
        description="e.g. '$120k - $160k'",
    )

    # Hard Skills (The "Must Haves") - required list.
    mandatory_skills: List[str] = Field(
        description="Technical skills explicitly marked as required",
    )

    # Soft Skills / Bonus (The "Nice to Haves") - defaults to an empty list.
    optional_skills: List[str] = Field(
        default=[],
        description="Skills listed as 'preferred', 'bonus', or 'plus'",
    )

    # Experience Constraint (Normalize this!) - integer years, or None.
    min_years_experience: Optional[int] = Field(
        default=None,
        description="Integer value of required years. e.g. '5+ years' -> 5",
    )

    # Education Constraint - free-text degree, or None.
    degree_required: Optional[str] = Field(
        default=None,
        description="e.g. 'Bachelor in CS', 'PhD'",
    )

    # --- THE SUMMARY (For the Human) ---
    # Required list of summarized responsibilities.
    summary_responsibilities: List[str] = Field(
        description="Top 3-5 core responsibilities summarized",
    )

In [16]:
# Chat model reached through OpenRouter's endpoint using the ChatOpenAI client.
llm_model = ChatOpenAI(
    model="mistralai/ministral-14b-2512",
    api_key=OPENROUTER_API_KEY,
    base_url="https://openrouter.ai/api/v1",
)

# Bind the JobDescription schema to the model.
structured_llm = llm_model.with_structured_output(JobDescription)

# Prompt -> structured LLM; run it on the combined OCR markdown.
chain = prompt_template | structured_llm
response = chain.invoke({"job_description_markdown": markdown_combined})

# Pretty-print the extracted fields as JSON.
print(response.model_dump_json(indent=4))

{
    "role_title": "Lead Data Scientist",
    "company_name": null,
    "salary_range": null,
    "mandatory_skills": [
        "Python",
        "R",
        "SQL",
        "NoSQL",
        "Spark",
        "AWS",
        "Azure",
        "GCP",
        "numpy",
        "pandas",
        "scikit-learn",
        "tensorflow",
        "statistics",
        "hypothesis testing",
        "SVM",
        "Random Forest",
        "Neural Networks",
        "Decision Trees",
        "Logistic Regression",
        "K-means Clustering",
        "linear regression",
        "PCA",
        "client engagement",
        "project management",
        "effort estimation",
        "data science roadmap",
        "AI analytics programs",
        "large dataset analysis",
        "business insights",
        "stakeholder management",
        "technical strategy",
        "team leadership",
        "process optimization",
        "project audits",
        "utilization tracking"
    ],
    "optional_skil