In [1]:
# Use the %pip magic rather than the !pip shell escape so the packages are
# installed into the environment of the kernel that runs this notebook.
%pip install -q transformers accelerate bitsandbytes einops
%pip install -q sentence-transformers pdfplumber pymupdf


In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Imported here so this script block stays self-contained: 4-bit loading is
# requested through a BitsAndBytesConfig instead of the deprecated
# `load_in_4bit=True` keyword of `from_pretrained`.
from transformers import BitsAndBytesConfig

# `token=True` is the current spelling of the deprecated `use_auth_token=True`
# (use the Hugging Face token stored by a previous login).
tokenizer = AutoTokenizer.from_pretrained(model_id, token=True)

# Only 4-bit loading is requested; every other quantization setting keeps its
# library default, exactly as with the former `load_in_4bit=True` keyword.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# NOTE: the section-detection cell later rebinds the global name `model` to a
# SentenceTransformer; `pipe` below keeps its own reference to this causal LM.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=quantization_config,
)

# return_full_text=False: the pipeline returns only the newly generated text,
# not the prompt followed by the completion.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, return_full_text=False)


2025-07-28 10:11:00.647619: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753697460.663466    4457 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753697460.668363    4457 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-28 10:11:00.684115: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. 

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

Device set to use cuda:0


In [2]:
# -------------------------------
# 🧩 Section Detection (Your Code)
# -------------------------------
import pdfplumber
import re
from sentence_transformers import SentenceTransformer, util

# Step 1: Text Extraction
def extract_text_from_pdf(path):
    """Return the text of the PDF at `path`, page by page.

    Each page that yields text contributes that text followed by a newline;
    pages for which `extract_text()` returns nothing (None or "") are skipped.
    """
    page_chunks = []
    with pdfplumber.open(path) as document:
        for page in document.pages:
            extracted = page.extract_text()
            if not extracted:
                continue
            page_chunks.append(extracted + "\n")
    return "".join(page_chunks)

# Step 2: Known Titles per Category
# Maps each canonical section name to the French/English headings that identify it.
# All language headings share the single "languages" key so that a language
# section is always reported under the same name.
SECTION_KEYWORDS = {
    "experience": [
        "expérience", "expériences professionnelles", "work experience", "professional experience", "career history"
    ],
    "education": [
        "formation", "études", "éducation", "academic background", "qualifications", "diplômes"
    ],
    "skills": [
        "compétences", "skills", "technical skills", "technologies", "outils"
    ],
    "languages": [
        "langues", "languages", "spoken languages", "language", "les langues"
    ],
    "certifications": [
        "certifications", "certification", "certified"
    ],
    "summary": [
        "profil", "summary", "about me", "présentation", "professional summary"
    ],
    "contact": [
        "contact", "informations personnelles", "coordonnées", "personal information", "personal informations", "les informations personnelles", "contact details",
    ],
}

# Step 3: Match Fuzzy Titles
# Sentence-embedding model whose `encode` output is compared against the
# section keywords in detect_section_title.
# NOTE(review): this rebinds the global name `model`, which the LLM-loading cell
# also assigns; `pipe` was built from that earlier object — confirm nothing else
# expects `model` to still be the causal LM after this line runs.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# keyword (lower-cased) -> embedding. Filled lazily so every keyword is encoded
# once instead of once per CV line. Re-running this definition resets the
# cache, which is what you want after rebinding `model` to another encoder.
_KEYWORD_EMBEDDINGS = {}


def _keyword_embedding(keyword):
    """Return the embedding of a section keyword, encoding it on first use only."""
    key = keyword.lower()
    if key not in _KEYWORD_EMBEDDINGS:
        _KEYWORD_EMBEDDINGS[key] = model.encode(key, convert_to_tensor=True)
    return _KEYWORD_EMBEDDINGS[key]


def detect_section_title(line, threshold=0.7):
    """Return the section whose keywords best match `line`, or None.

    The stripped, lower-cased line is embedded with the global `model` and
    compared (cosine similarity) with every keyword in SECTION_KEYWORDS.

    Args:
        line: One line of CV text.
        threshold: Similarity a keyword must strictly exceed for the line to
            count as a section heading (default 0.7).

    Returns:
        The SECTION_KEYWORDS key of the most similar keyword above the
        threshold (the first one encountered wins ties), or None when no
        keyword is similar enough or the line is blank.
    """
    line_clean = line.strip().lower()
    if not line_clean:
        # A blank line cannot be a heading; skip the encoder call entirely.
        return None

    line_emb = model.encode(line_clean, convert_to_tensor=True)
    best_score = threshold
    best_section = None

    for section, keywords in SECTION_KEYWORDS.items():
        for kw in keywords:
            sim = util.pytorch_cos_sim(line_emb, _keyword_embedding(kw)).item()
            # Strict comparison: a later keyword must beat, not equal, the best so far.
            if sim > best_score:
                best_score = sim
                best_section = section

    return best_section

# Step 4: Segment Text
def segment_cv_text(text, preamble_section=None):
    """Split raw CV text into named sections.

    Every non-blank line is passed to detect_section_title. A line recognised
    as a heading starts a new section and is itself dropped; any other line is
    added to the section currently open. Sections that appear several times
    are merged in order of appearance.

    Args:
        text: Full text of the CV.
        preamble_section: Optional section name under which to keep the lines
            found before the first recognised heading (often the candidate's
            name and contact details). With the default None those lines are
            discarded.

    Returns:
        dict mapping section name to its lines joined with newlines and
        stripped. A heading with no content maps to an empty string.
    """
    sections = {}
    current_section = None
    pending = []

    def flush():
        # Attach the pending lines to the section they were collected for.
        if current_section:
            sections.setdefault(current_section, []).extend(pending)
        elif preamble_section is not None and pending:
            sections.setdefault(preamble_section, []).extend(pending)

    for line in text.splitlines():
        if not line.strip():
            continue

        heading = detect_section_title(line)
        if heading:
            flush()
            pending = []
            current_section = heading
        else:
            pending.append(line)

    # The last open section (or an unclaimed preamble) is still pending.
    flush()

    return {name: "\n".join(lines).strip() for name, lines in sections.items()}

# Wrapper
def process_cv(path_to_pdf):
    """Extract the text of the PDF at `path_to_pdf` and return it split into sections."""
    return segment_cv_text(extract_text_from_pdf(path_to_pdf))


In [6]:
# -------------------------------
# 🤖 Prompt LLM with Each Section
# -------------------------------
def _extract_json_text(response):
    """Return the first JSON value embedded in `response`, as text.

    Parsing starts at the first `{` or `[`, whichever comes first, so both
    objects and arrays are supported, and stops where that value ends, so
    trailing text such as a closing Markdown fence is dropped. If the value
    does not parse (e.g. truncated generation) everything from the opening
    bracket is returned; if there is no bracket, `response` is returned as is.
    """
    import json  # function-scope import keeps this cell self-contained

    starts = [pos for pos in (response.find("{"), response.find("[")) if pos != -1]
    if not starts:
        return response

    start = min(starts)
    try:
        _, end = json.JSONDecoder().raw_decode(response, start)
    except ValueError:  # json.JSONDecodeError is a ValueError
        return response[start:]
    return response[start:end]


def prompt_section_to_json(section_name, section_text, max_new_tokens=8192, temperature=0.4):
    """Ask the LLM (global `pipe`) to turn one CV section into JSON text.

    Args:
        section_name: Section label inserted into the prompt.
        section_text: Raw text of the section.
        max_new_tokens: Generation budget passed to the pipeline.
        temperature: Sampling temperature passed to the pipeline.

    Returns:
        The JSON value found in the model's answer, as a string (see
        _extract_json_text for the fallbacks). An empty or whitespace-only
        section returns "{}" without calling the model, since there is
        nothing to extract.

    Side effects:
        Prints a header line and the raw model response.
    """
    if not section_text or not section_text.strip():
        return "{}"

    # Prompt fixes: the skills bullet uses "-" like its siblings, and the
    # contact field is spelled "address" so the model does not copy a typo
    # into the JSON keys.
    prompt = f"""
Below is the content of a resume's {section_name} section:

{section_text}

Please extract all relevant structured information  if exists in JSON format. Include one of the followings based section name:
- For experience section: job title, company, start_date, end_date, description
- For education section: degree, institution, start_date, end_date, field
- For profile section: summary
- For certifications section: name, institution, date
- For languages: language, level
- For Skills: skill
- For contact: address, email, name.

Return only JSON of one the options above .
"""
    print(f"""Below is the content of a resume's {section_name} section:""")
    response = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
    )[0]["generated_text"]
    print(response)
    return _extract_json_text(response)


In [7]:
# -------------------------------
# 🚀 Full Pipeline Execution
# -------------------------------
pdf_path = "yb.pdf"  # Replace this with your actual PDF path

# Split the CV into labelled sections, then collect the LLM's JSON text for each one.
sections = process_cv(pdf_path)
results = {}

for section in sections:
    content = sections[section]
    print(f"\n\n=== {section.upper()} ===\n")
    json_data = prompt_section_to_json(section, content)
    results[section] = json_data


Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.




=== SUMMARY ===

Below is the content of a resume's summary section:


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



{
  "profile": {
    "summary": "Full-Stack Software Engineer with experience building scalable, AI-powered web applications using Java, JavaScript, React, Spring Boot, FastAPI, and AWS. Eager to contribute in dynamic environments."
  },
  "skills": [
    "Java",
    "JavaScript",
    "React",
    "Spring Boot",
    "FastAPI",
    "AWS"
  ]
}


=== EXPERIENCE ===

Below is the content of a resume's experience section:


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



[
  {
    "job_title": "Full Stack Software Engineer",
    "company": "Expersi",
    "start_date": "December 2023",
    "end_date": "Present",
    "description": "Contributed to the development and maintenance of scalable backend solutions and REST APIs using Python (FastAPI) and Java (Spring Boot), and assisted in building front-end components with ReactJS, Selenium for data ingestion of 5 platforms and pipeline support and AWS. Participated in the implementation of an internal AI platform integrating LLMs and RAG pipelines, leveraging AWS services such as Bedrock, Redshift, Lambda, API Gateway, and S3. Fine-tuning (4 times) and deploying custom machine learning models into production to enable search and data retrieval features. Fixed bugs and implemented new features while following clean code principles and established design patterns. Participated in Agile Scrum ceremonies, including sprint planning and retrospectives, and collaborated in code reviews to support high development 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



[
  {
    "degree": "Master’s Degree in Engineering – Internet of Things: Software and Analytics",
    "institution": "ENSIAS (National School of Computer Science and Systems Analysis)",
    "start_date": "2023",
    "field": "Internet of Things: Software and Analytics",
    "minor": "Software and Analytics"
  },
  {
    "degree": "Bachelor's Degree in Computer and Mathematical Sciences",
    "institution": "Ibn Tofail University",
    "start_date": "2021",
    "field": "Computer and Mathematical Sciences",
    "minor": "Computer and Mathematical Sciences"
  }
]


=== CERTIFICATIONS ===

Below is the content of a resume's certifications section:


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



[
  {
    "certifications": [
      {
        "name": "AWS Certified Solutions Architect - Associate",
        "institution": "Amazon Web Services Training and Certification",
        "date": "2024"
      },
      {
        "name": "AWS Certified Cloud Practitioner",
        "institution": "Amazon Web Services Training and Certification",
        "date": "2023"
      }
    ]
  }
]


=== SKILLS ===

Below is the content of a resume's skills section:

[
  {
    "skills": [
      {
        "name": "Typescript",
        "type": "Front End"
      },
      {
        "name": "Javascript",
        "type": "Front End"
      },
      {
        "name": "React",
        "type": "Front End"
      },
      {
        "name": "Java",
        "type": "Backend"
      },
      {
        "name": "Spring Boot",
        "type": "Backend"
      },
      {
        "name": "Python",
        "type": "Backend"
      },
      {
        "name": "FastAPI",
        "type": "Backend"
      },
      {
        "name":

In [8]:
# -------------------------------
# 🚀 Full Pipeline Execution
# -------------------------------
pdf_path = "ah.pdf"  # Replace this with your actual PDF path

# Split the CV into labelled sections, then collect the LLM's JSON text for each one.
sections = process_cv(pdf_path)
results = {}

for section in sections:
    content = sections[section]
    print(f"\n\n=== {section.upper()} ===\n")
    json_data = prompt_section_to_json(section, content)
    results[section] = json_data


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.




=== CONTACT ===

Below is the content of a resume's contact section:


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



{
  "experience": [
    {
      "job_title": "Directeur et Chef de projet IT",
      "company": "Achraf HADJ TAIEB",
      "start_date": null,
      "end_date": null,
      "description": "Au niveau Sqaud - Chapters de compétences qui regroupent les collaborateurs en fonction de leurs compétences (mobile, front-end, back-end, sharepoint, UX/UI, Scrum, QA/tests, Architecture)\nAssurer la mise en place de nouvelles pratiques visant à créer de la valeur : story mapping, design thinking, user research\nValider les spécifications fonctionnelles : Epic et les Users stories.\nChallenger la solution technique et garantir une architecture micro-services.\nAssurer le bon déroulement du sprint avec le Scrum master.\nAssister à la rétro et être force de proposition.\nContribuer aux comités d’architecture, stratégie et schéma directeur IT.\nAssurer la mise en place du devops et migration cloud\nMise en place de pratiques visant à améliorer la qualité (TDD, BDD, revue de code, automatisation des te

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



{
  "experience": [
    {
      "job_title": "Director IT and Project Manager",
      "company": "Startup",
      "start_date": "April 2019",
      "end_date": "Dec 2020",
      "description": "Provide a solution of invoicing and payment online (Web and Mobile) to respond to the legal obligations of the dematerialized invoice based on the Blockchain technology. Assure the board meeting (CODIR), Define the strategic objectives of the solution and the company, Estimate the charges, Allocate the resources, Estimate the annual budget necessary, Ensure the coordination between the Development, Marketing, Finance, Sales and Compliance teams, Validate the new features (functionalities), Prepare the reports, Identify the gaps and anticipate the risks, Prepare the delivery strategy (GO TO Market), Ensure the coordination between the RH, OPS, ETL, Legal, Security, Editors of solutions and suppliers teams, Ensure the passage of the projects to the different committees of validation."
    },
    

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



{
  "experience": [
    {
      "job_title": "Manager - Digitalisation des processus RH",
      "company": "AXA Assurance",
      "start_date": "Janv 2021",
      "end_date": "Novembre 2021",
      "description": "Sous La direction du Head of IT je suis en charge des projets transverses SIRH : Norkom, Pleiades, Smart working, Workflow- Self-service, Interfaces API …"
    },
    {
      "job_title": "IT Manager",
      "company": "Ville de Bruxelles",
      "start_date": "mars 2017",
      "end_date": "Avril. 2019",
      "description": "Digital Transformation Program"
    },
    {
      "job_title": "Chef de Projet",
      "company": "La poste courrier",
      "start_date": "Janv. 2014",
      "end_date": "Mars 2017",
      "description": "MaBox RH : Portail web et applications Mobiles"
    },
    {
      "job_title": "Assistant Chef de projet",
      "company": "Vinci Energie",
      "start_date": "Janv. 2013",
      "end_date": "Dec.2013",
      "description": "Projet de migration"


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



{
  "experience": [
    {
      "job_title": "Director of Project",
      "company": "Allianz Trade - Euler Hermes",
      "start_date": "Nov 2021",
      "end_date": "Dec 2024",
      "description": "Director of project on the Qirin program (cloud migration, microservices AWS)."
    },
    {
      "job_title": "Digital RH Project Manager",
      "company": "AXA Assurance",
      "start_date": "unknown",
      "end_date": "unknown",
      "description": "Digitalization of HR processes and management of transverse projects (API, workflows)."
    },
    {
      "job_title": "Digital Transformation Manager",
      "company": "Ville de Bruxelles",
      "start_date": "unknown",
      "end_date": "unknown",
      "description": "Digital transformation and management of critical applications in transverse data architecture."
    },
    {
      "job_title": "IT Project Manager",
      "company": "Finbill (Startup)",
      "start_date": "unknown",
      "end_date": "unknown",
      "descripti

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



[
  {
    "experience": [
      {
        "job_title": "Chef d’équipe de 3 consultants",
        "company": "Groupe SOS",
        "start_date": null,
        "end_date": null,
        "description": "Aider à tester et à gérer les améliorations\nDéveloppement des processus guidés pour la gestion administrative\nElaboration de la fiche de paie"
      }
    ]
  }
]


=== LANGUAGES ===

Below is the content of a resume's languages section:

{
	"languages": [
		{
			"language": "Anglais",
			"level": "Bonne maitrise"
		}
	]
}


In [9]:
# -------------------------------
# 🚀 Full Pipeline Execution
# -------------------------------
pdf_path = "ybfr.pdf"  # Replace this with your actual PDF path

# Split the CV into labelled sections, then collect the LLM's JSON text for each one.
sections = process_cv(pdf_path)
results = {}

for section in sections:
    content = sections[section]
    print(f"\n\n=== {section.upper()} ===\n")
    json_data = prompt_section_to_json(section, content)
    results[section] = json_data


Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.




=== SUMMARY ===

Below is the content of a resume's summary section:


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



```json
{
  "profile": {
    "summary": "Ingénieur Logiciel Full Stack avec une expérience dans le développement d’applications web évolutives et basées sur l’intelligence artificielle, en utilisant Java, JavaScript, React, Spring Boot, FastAPI et AWS. Motivé à contribuer dans des environnements dynamiques."
  }
}
```


=== EXPERIENCE ===

Below is the content of a resume's experience section:


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.




{
  "experience": [
    {
      "job_title": "Ingénieur Full Stack Java/React/AWS",
      "company": "Expersi",
      "start_date": "December 2023",
      "end_date": "Present",
      "description": "Développement et maintenance de solutions backend évolutives et d’API REST en Java (Spring Boot) et Python (FastAPI). Contribution à la création de composants front-end avec ReactJS, et utilisation de Selenium pour l’ingestion de données issues de 5 plateformes, le support des pipelines, ainsi que l’intégration sur AWS. Participation à la mise en œuvre d’une plateforme d’intelligence artificielle interne intégrant des LLMs et des pipelines RAG, en exploitant les services AWS tels que Bedrock, Redshift, Lambda, API Gateway et S3. Réalisation de 4 fine-tunings et déploiement de modèles de machine learning personnalisés en production, permettant des fonctionnalités avancées de recherche et de récupération de données. Correction de bugs et développement de nouvelles fonctionnalités dans le r

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



{
  "experience": [
    {
      "job_title": "Ingénieur Full Stack Java/React",
      "company": "Zsoft Consulting",
      "start_date": "March 2023",
      "end_date": "December 2023",
      "description": "Participation au développement d'un système d'information des ressources humaines (SIRH) basé sur Spring Boot, ReactJS, Liquibase et des services AWS tels que S3 et RDS. Identification et résolution de dizaines de bugs, ainsi que mise en œuvre de nouvelles fonctionnalités dans le respect des principes du clean code et des design patterns établis. Maintenance et amélioration d'outils internes pour la gestion des employés et des processus métier, contribuant à une meilleure efficacité opérationnelle (2 preuves de concept réalisées). Mise en place d'un système centralisé de journalisation et de supervision via la stack ELK, afin de renforcer l'observabilité et la détection précoce des incidents. Participation active aux rituels Agile (daily stand-ups, planification de sprint), garant

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



JSON:
{
  "education": [
    {
      "degree": "Master’s Degree in Engineering – Internet of Things: Software and Analytics",
      "institution": "ENSIAS (National School of Computer Science and Systems Analysis)",
      "start_date": "2023",
      "field": "Internet of Things: Software and Analytics",
      "minor": "Software and Analytics"
    },
    {
      "degree": "Bachelor’s Degree",
      "institution": "Ibn Tofail University",
      "start_date": "2021",
      "field": "Computer and Mathematical Sciences",
      "minor": "Computer and Mathematical Sciences"
    }
  ]
}


=== CERTIFICATIONS ===

Below is the content of a resume's certifications section:

{
  "certifications": [
    {
      "name": "AWS Certified Solutions Architect – Associate",
      "institution": "Amazon Web Services Training and Certification",
      "date": "2024"
    },
    {
      "name": "AWS Certified Cloud Practitioner",
      "institution": "Amazon Web Services Training and Certification",
      "d