In [1]:
!pip install -q transformers accelerate bitsandbytes einops
!pip install -q pymupdf


In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch

# Hugging Face model identifier, passed to both the tokenizer and the model loader.
model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# bitsandbytes quantization settings: 4-bit loading with the "nf4" quant type,
# double quantization enabled, and float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the causal LM with the quantization config defined above.
# NOTE(review): device_map="auto" presumably lets accelerate decide weight
# placement across the available devices — confirm for the target hardware.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)

# Shared text-generation pipeline; every later cell calls `pipe(prompt)`.
# - max_new_tokens=8192 caps the length of each generation.
# - do_sample=False requests deterministic (non-sampling) decoding.
# - return_full_text=False: the printed outputs below contain only the
#   generated continuation, not the prompt.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=8192,
    do_sample=False,
    return_full_text=False
)


2025-07-22 23:16:41.463949: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753226201.489751    2980 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753226201.499609    2980 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-22 23:16:41.708990: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [2]:
def preprocess_cv_text(text):
    """Normalise the whitespace of raw CV text.

    Every run of newlines is collapsed to a single newline, every run of
    spaces/tabs is collapsed to a single space, and leading/trailing
    whitespace is removed from the result.
    """
    import re

    single_newlines = re.sub(r'\n+', '\n', text)
    single_spaces = re.sub(r'[ \t]+', ' ', single_newlines)
    return single_spaces.strip()

In [3]:
import fitz

def extract_text_from_pdf(pdf_path):
    """Return the whitespace-normalised text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        The page texts joined with newlines and cleaned by
        ``preprocess_cv_text``.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text() for page in doc)
    # preprocess_cv_text strips the result, so no separate strip() is needed.
    return preprocess_cv_text(text)


In [4]:
def build_prompt(cv_text):
    """Build the French instruction prompt asking for a JSON extraction of a CV.

    The prompt lists the expected JSON fields, then appends ``cv_text``
    wrapped in triple double-quotes under a ``CV:`` heading.
    """
    instructions = """
Tu es un extracteur d'informations structuré. À partir du texte du CV ci-dessous, fournis un objet JSON valide contenant les champs suivants :

- name
- profile
- phone
- email
- address
- experience (liste de: company, title, start_date, end_date, description)
- education (liste de: university, degree, start_date, end_date, description)
- certifications (liste de: title, organization, date, description)
- skills (liste de chaînes)

Donne uniquement un JSON valide avec tout le text du CV n’oublie aucun mot, sans commentaire ni texte en plus.

CV:
"""
    # Formatting through an f-string keeps the original conversion of
    # non-string inputs.
    quoted_cv = f'"""{cv_text}"""'
    return instructions + quoted_cv + "\n"


In [5]:

# Path of the CV PDF to process (no directory component, so it is resolved
# against the notebook's working directory).
pdf_path = "CV_Achraf_HT_Directeur et Chef de projet IT_2025.pdf"

# Extract the cleaned CV text and wrap it in the JSON-extraction prompt.
cv_text = extract_text_from_pdf(pdf_path)
prompt = build_prompt(cv_text)

# The pipeline result is indexed as [0]["generated_text"] to obtain the
# generated string, which is then printed as-is.
output = pipe(prompt)[0]["generated_text"]
print(output)


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



```json
{
  "name": "Achraf HADJ TAIEB",
  "profile": "Directeur IT et Chef de projet expérimenté, certifié Agile Scrum PSPO I, avec plus de 16 ans d’expérience dans la gestion de projets stratégiques et la conduite de transformations digitales.",
  "phone": "+33 6 44 18 07 72",
  "email": "achraf.ht@gmail.com",
  "address": {
    "location": "Paris",
    "carbonne": "9.78 g CO₂e"
  },
  "experience": [
    {
      "company": "Allianz Trade",
      "title": "Directeur et Chef de projet IT",
      "start_date": "Nov 2021",
      "end_date": "Dec 2024",
      "description": "Programme Qirin - Document Services"
    },
    {
      "company": "AXA Assurance",
      "title": "Squad RH de AXA France",
      "start_date": "Janv 2021",
      "end_date": "Novembre 2021",
      "description": "Digitalisation des processus RH"
    },
    {
      "company": "Finbill",
      "title": "Project Manager",
      "start_date": "Avril. 2019",
      "end_date": "Dec 2020",
      "description": "Stratup d

In [6]:
def build_prompt_position(cv_text):
    """Build the English prompt that asks for entity boundaries in a CV.

    The model is asked to report, for each field, the first and last word
    that delimit it in the CV text, and to answer in the JSON layout shown
    in the embedded example.

    The example is wrapped in a properly closed Markdown code fence so that
    the CV text that follows is clearly separated from it.

    NOTE(review): the field list names the section ``experience`` while the
    example JSON uses the key ``experiences``, and ``address`` has no example
    entry — confirm which spelling downstream consumers expect.

    Args:
        cv_text: The CV text to embed at the end of the prompt.

    Returns:
        The full prompt string.
    """
    return f"""
You are an information extractor. Do not generate or summarize. Your job is to locate and extract named entities and sections from a CV **exactly as they appear in the text**.

Your goal is to extract the following entities by identifying the **first word and last word** that mark the boundaries of each entity or section. Do not infer, rephrase, or invent content. Use only what is present in the CV.

### Extract the following fields:

- name: first word, last word  
- email: first word, last word  
- phone: first word, last word  
- address: first word, last word (if available)  
- profile_description: first word, last word  
- education (for each degree): first word, last word  
- experience (for each job):  
    - company_name: first word, last word  
    - position_title: first word, last word  
    - start_date: first word, last word  
    - end_date: first word, last word  
    - description: first word, last word  
- projects (for each project):  
    - title: first word, last word  
    - description: first word, last word  

### Output Format:
Return your result in this **valid JSON** format:

```json
{{
  "name": {{"start": "Achraf", "end": "TAIEB"}},
  "email": {{"start": "achraf.ht", "end": ".com"}},
  "phone": {{"start": "+33", "end": "72"}},
  "profile_description": {{"start": "Directeur", "end": "conformité."}},
  "education": [
    {{"start": "Maitrise", "end": "Sfax"}},
    {{"start": "Master", "end": "Sfax"}},
    {{"start": "Formation", "end": "PSPO1"}}
  ],
  "experiences": [
    {{
      "company_name": {{"start": "Allianz", "end": "Trade"}},
      "position_title": {{"start": "Manager", "end": "owner"}},
      "start_date": {{"start": "Nov", "end": "2021"}},
      "end_date": {{"start": "Dec", "end": "2024"}},
      "description": {{"start": "Allianz", "end": "production"}}
    }}
  ],
  "projects": [
    {{
      "title": {{"start": "Projet", "end": "Audit"}},
      "description": {{"start": "Audit", "end": "pilotage"}}
    }}
  ]
}}
```

CV:
\"\"\"{cv_text}\"\"\"
"""


In [None]:

pdf_path = "CV_Achraf_HT_Directeur et Chef de projet IT_2025.pdf"


# Build the prompt from the text extracted in this cell rather than from
# `cv_text`, which only exists if an earlier cell has been run; this keeps the
# cell independent of hidden kernel state.
cv_text1 = extract_text_from_pdf(pdf_path)
prompt = build_prompt_position(cv_text1)

# The pipeline result is indexed as [0]["generated_text"] to obtain the
# generated string.
output = pipe(prompt)[0]["generated_text"]
print(output)


In [None]:
!pip install PyPDF2

In [7]:
import json
import re
import time
from typing import List, Dict

from PyPDF2 import PdfReader

# 1. Extract and clean text from PDF
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract and whitespace-normalise the text of a PDF using PyPDF2.

    Note: this definition reuses the name of the PyMuPDF-based function
    defined in an earlier cell and replaces it once this cell has run.

    Args:
        pdf_path: Path of the PDF file, passed to ``PdfReader``.

    Returns:
        The non-empty page texts joined with newlines, with runs of newlines
        collapsed to one newline, runs of spaces/tabs collapsed to one space,
        and surrounding whitespace stripped.
    """
    reader = PdfReader(pdf_path)
    # Extract each page exactly once; pages yielding no text are skipped.
    page_texts = (page.extract_text() for page in reader.pages)
    text = "\n".join(page_text for page_text in page_texts if page_text)
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    return text.strip()

# 2. Chunking with overlap
def chunk_text(text: str, chunk_size: int = 800, overlap: int = 100) -> List[str]:
    """Split text into chunks of whitespace-separated words with overlap.

    Args:
        text: Text to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk (must be > 0).
        overlap: Number of trailing words of a chunk repeated at the start of
            the next one (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        The chunks, each a single-space-joined string of words. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = min(start + chunk_size, len(words))
        chunks.append(" ".join(words[start:end]))
        if end == len(words):
            # Last chunk emitted. Stepping back by `overlap` here would keep
            # `start` below len(words) forever and never terminate.
            break
        start = end - overlap  # overlap to avoid cutting useful info
    return chunks

# 3. Classify which sections a chunk contains
def classify_chunk(chunk: str) -> List[str]:
    """Ask the model which CV sections a chunk of text contains.

    Args:
        chunk: A piece of CV text.

    Returns:
        The section names parsed from the model's JSON answer. An empty list
        is returned when the answer is not valid JSON or is not a JSON list;
        non-string list items are dropped.
    """
    prompt = f"""
This is a chunk of a CV:
\"\"\"{chunk}\"\"\"
Which of the following sections does it contain?
Choose from this list only: ["profile", "contact", "experience", "education", "projects", "skills", "none"]

Return a JSON list, like: ["experience", "projects"]
"""
    output = pipe(prompt)[0]['generated_text']
    # NOTE(review): the reason for this pause is not visible in this file
    # (presumably throttling) — confirm it is still needed.
    time.sleep(2)

    # The model may wrap its answer in a Markdown code fence (```json ... ```);
    # parse the fenced payload when one is present.
    fenced = re.search(r"```(?:json)?\s*(.*?)```", output, re.DOTALL)
    payload = fenced.group(1) if fenced else output
    try:
        sections = json.loads(payload.strip())
    except json.JSONDecodeError:
        # Unparseable answer: treat the chunk as containing no known section.
        return []
    if not isinstance(sections, list):
        return []
    return [section for section in sections if isinstance(section, str)]

# 4. Generate extraction prompt per section
def get_extraction_prompt(section: str, chunk: str) -> str:
    """Return the extraction prompt for a CV section, or None if unsupported.

    Supported sections are "experience", "profile", "contact" (same prompt as
    "profile"), "education" and "projects". Every prompt ends with the chunk
    wrapped in triple double-quotes under a ``CV chunk:`` heading.
    """
    # (section names, instruction text) pairs; the shared tail is added below.
    templates = (
        (
            ("experience",),
            "Extract experiences from this CV chunk. For each, return:\n"
            "- company_name: first word, last word\n"
            "- position_title: first word, last word\n"
            "- start_date: first word, last word\n"
            "- end_date: first word, last word\n"
            "- description: first word, last word\n"
            "Only extract from visible text (no guessing or summarizing). Return JSON only.\n",
        ),
        (
            ("profile", "contact"),
            "Extract profile and contact info from this CV chunk. Return:\n"
            "- name: first word, last word\n"
            "- email: first word, last word\n"
            "- phone: first word, last word\n"
            "- address (if any): first word, last word\n"
            "- profile_description: first word, last word\n"
            "Return JSON only.\n",
        ),
        (
            ("education",),
            "Extract education items from this CV chunk. For each, return:\n"
            "- start word\n"
            "- end word\n"
            "Only return what exists in text. JSON only.\n",
        ),
        (
            ("projects",),
            "Extract projects from this CV chunk. For each, return:\n"
            "- title: first word, last word\n"
            "- description: first word, last word\n"
            "Only extract based on exact visible text. Return JSON only.\n",
        ),
    )

    for section_names, instructions in templates:
        if section in section_names:
            return f'\n{instructions}CV chunk:\n"""{chunk}"""\n'
    # Any other section (e.g. "skills", "none") has no extraction prompt.
    return None

# 5. Merge extracted results
def merge_results(aggregate: Dict, new_data: Dict) -> Dict:
    """Merge one extraction result into the running aggregate, in place.

    Merge rules per key of ``new_data``:
      * list value: appended to the list already stored under the key. If
        nothing is stored a new list is created; if a non-list is stored it
        becomes the first element of the new list, so no data is lost.
      * dict value: replaces whatever is stored under the key.
      * any other value: stored under the key, unless it is ``None`` or an
        empty string, so an empty answer does not erase an earlier one.

    Args:
        aggregate: Accumulated results; mutated and returned.
        new_data: Newly parsed results to merge in.

    Returns:
        The same ``aggregate`` object.
    """
    for key, value in new_data.items():
        if isinstance(value, list):
            existing = aggregate.get(key)
            if existing is None:
                # Copy so later extends do not mutate the caller's list.
                aggregate[key] = list(value)
            elif isinstance(existing, list):
                existing.extend(value)
            else:
                aggregate[key] = [existing, *value]
        elif isinstance(value, dict):
            aggregate[key] = value  # overwrite: the latest dict wins
        elif value not in (None, ""):
            # Scalars (e.g. a plain string name) are kept, not dropped.
            aggregate[key] = value
    return aggregate

# 6. Process the full CV
def process_cv(pdf_path: str):
    """Run the chunked extraction pipeline over one CV PDF.

    The PDF text is split into overlapping chunks. Each chunk is classified
    into sections, and for every section that has an extraction prompt the
    model is queried and its JSON answer merged into a single result.

    Args:
        pdf_path: Path of the CV PDF.

    Returns:
        A dict with the merged extraction results of all chunks. Answers that
        cannot be parsed or merged are reported on stdout and skipped.
    """
    text = extract_text_from_pdf(pdf_path)
    chunks = chunk_text(text)
    final_result = {}

    for i, chunk in enumerate(chunks):
        print(f"\n--- Processing chunk {i+1}/{len(chunks)} ---")
        section_list = classify_chunk(chunk)

        # Several sections can map to the same prompt (e.g. "profile" and
        # "contact"). Run each distinct prompt once per chunk so the same
        # items are not generated and merged twice.
        seen_prompts = set()
        for section in section_list:
            prompt = get_extraction_prompt(section, chunk)
            if not prompt or prompt in seen_prompts:
                continue
            seen_prompts.add(prompt)

            output = pipe(prompt)[0]['generated_text']
            # The model may wrap its JSON in a Markdown code fence
            # (```json ... ```); parse the fenced payload when present.
            fenced = re.search(r"```(?:json)?\s*(.*?)```", output, re.DOTALL)
            payload = fenced.group(1) if fenced else output
            try:
                parsed = json.loads(payload.strip())
                if not isinstance(parsed, dict):
                    parsed = {section: parsed}  # wrap if it's just a list
                final_result = merge_results(final_result, parsed)
            except Exception as e:
                # Best effort: report the failure and carry on with the
                # remaining sections/chunks.
                print(f"⚠️ Failed to parse JSON for section '{section}' in chunk {i+1}: {e}")
                print("Raw output:", output)

    return final_result


In [None]:
import pprint

# Run the chunked pipeline on the CV and pretty-print the merged result.
result = process_cv("CV_Achraf_HT_Directeur et Chef de projet IT_2025.pdf")
pprint.pprint(result)