In [3]:
# -------------------------------
# text extraction
# -------------------------------
# Step 1: Text Extraction
def extract_text_from_pdf(path):
    """Return the concatenated text of every page of the PDF at ``path``.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are skipped. The text of each remaining page is followed by a
    single newline. Relies on ``pdfplumber`` being imported at module level.
    """
    with pdfplumber.open(path) as document:
        per_page = (page.extract_text() for page in document.pages)
        # The generator is consumed by join() while the PDF is still open.
        return "".join(chunk + "\n" for chunk in per_page if chunk)


In [23]:
# -------------------------------
# 🧩 Section Detection
# -------------------------------
import pdfplumber
import re
from sentence_transformers import SentenceTransformer, util



# Step 2: Known Titles per Category
# Maps each canonical section name (the key used in the segmentation result)
# to the French and English headings that should be recognised as that section.
# Every heading must belong to exactly one category: a heading listed under two
# categories would make the resulting section name depend on dict order.
SECTION_KEYWORDS = {
    "experience": [
        "expérience", "expériences professionnelles", "work experience", "professional experience", "career history"
    ],
    "education": [
        "formation", "études", "éducation", "academic background", "qualifications", "diplômes"
    ],
    "skills": [
        "compétences", "skills", "technical skills", "technologies", "outils"
    ],
    # Single category for language sections. The former duplicate "langues"
    # category was merged in here so that "Langues", "Language" and
    # "Les langues" all produce the same result key.
    "languages": [
        "langues", "languages", "spoken languages", "language", "les langues"
    ],
    "certifications": [
        "certifications", "certification", "certified"
    ],
    "summary": [
        "profil", "summary", "about me", "présentation", "professional summary"
    ],
    "contact": [
        # "personal informations" is kept on purpose: it is a common misspelling in CVs.
        "contact", "informations personnelles", "coordonnées", "personal information", "personal informations", "les informations personnelles", "contact details",
    ],
}

# Step 3: Match Fuzzy Titles
# Sentence-embedding model shared by the section-detection code below: CV lines
# and known section titles are encoded with it and compared by cosine similarity.
# NOTE(review): the model name suggests multilingual (French/English) coverage — confirm.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Keyword embeddings are constant for a given model, so they are computed once
# and reused for every CV line instead of being re-encoded on each call.
_keyword_embedding_cache = {"model": None, "embeddings": {}}


def _keyword_embedding(keyword):
    """Return the embedding of a lower-cased section keyword, encoding it at most once per model."""
    if _keyword_embedding_cache["model"] is not model:
        # The global model was (re)loaded: embeddings from the previous one are stale.
        _keyword_embedding_cache["model"] = model
        _keyword_embedding_cache["embeddings"] = {}
    embeddings = _keyword_embedding_cache["embeddings"]
    key = keyword.lower()
    if key not in embeddings:
        embeddings[key] = model.encode(key, convert_to_tensor=True)
    return embeddings[key]


def detect_section_title(line, threshold=0.9):
    """Return the section category whose known title best matches ``line``.

    The line is stripped and lower-cased, embedded with the global ``model``,
    and compared by cosine similarity with every keyword in
    ``SECTION_KEYWORDS``.

    Args:
        line: One line of CV text.
        threshold: Similarity a keyword must strictly exceed for the line to
            count as a section title. Defaults to 0.9.

    Returns:
        The key of ``SECTION_KEYWORDS`` owning the most similar keyword, or
        ``None`` when the line is blank or no keyword scores above
        ``threshold``. On equal scores the first keyword encountered wins.
    """
    line_clean = line.strip().lower()
    if not line_clean:
        # Nothing to compare; skip the model call entirely.
        return None

    line_emb = model.encode(line_clean, convert_to_tensor=True)
    best_score = threshold
    best_section = None

    for section, keywords in SECTION_KEYWORDS.items():
        for kw in keywords:
            sim = util.pytorch_cos_sim(line_emb, _keyword_embedding(kw)).item()
            if sim > best_score:
                best_score = sim
                best_section = section

    return best_section

def segment_cv_text(text):
    """Split CV text into sections keyed by detected section name.

    Pass 1 scans every non-blank line with ``detect_section_title`` and records
    the line index of the first heading found for each section; later headings
    of an already-seen section are ignored. Pass 2 assigns to each section the
    lines between its heading and the next recorded heading (or the end of the
    text), joined with newlines and stripped.

    Text located before the first recorded heading is not part of any section.
    Progress is printed to stdout as headings and section contents are found.

    Returns:
        dict mapping section name -> section content, in order of appearance.
    """
    all_lines = text.splitlines()

    # Pass 1: locate the first heading of each section.
    headings = []
    already_found = set()
    for position, raw_line in enumerate(all_lines):
        stripped = raw_line.strip()
        if stripped == "":
            continue
        label = detect_section_title(raw_line)
        if not label or label in already_found:
            continue
        print(f"\n🔹 Première occurrence de la section : '{label}' à la ligne {position} ➤ {stripped}")
        headings.append((position, label))
        already_found.add(label)

    # Pass 2: each section runs until the next recorded heading or the end of the text.
    stops = [position for position, _ in headings[1:]] + [len(all_lines)]
    sections = {}
    for (start, label), stop in zip(headings, stops):
        body = "\n".join(all_lines[start + 1:stop]).strip()
        sections[label] = body

        # Show what was captured under this heading.
        print(f"📄 Contenu de la section '{label}':\n{body}\n{'-'*60}")

    return sections

    
def process_cv(path_to_pdf):
    """Extract the text of the PDF at ``path_to_pdf`` and return its sections.

    Chains ``extract_text_from_pdf`` and ``segment_cv_text``; the return value
    is whatever ``segment_cv_text`` produces for the extracted text.
    """
    return segment_cv_text(extract_text_from_pdf(path_to_pdf))


In [20]:
# -------------------------------
# 🚀 Full Pipeline Execution
# -------------------------------
# Run text extraction + section segmentation on one CV. The path is relative,
# so it is resolved against the kernel's current working directory.
pdf_path = "../yb.pdf"

# `sections` maps each detected section name to the text found under its heading.
sections = process_cv(pdf_path)



Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats



🔹 Première occurrence de la section : 'summary' à la ligne 2 ➤ SUMMARY

🔹 Première occurrence de la section : 'experience' à la ligne 5 ➤ EXPERIENCE

🔹 Première occurrence de la section : 'education' à la ligne 36 ➤ EDUCATION

🔹 Première occurrence de la section : 'certifications' à la ligne 42 ➤ CERTIFICATIONS

🔹 Première occurrence de la section : 'skills' à la ligne 53 ➤ SKILLS
📄 Contenu de la section 'summary':
Full-Stack Software Engineer with experience building scalable, AI-powered web applications using Java, JavaScript, React, Spring Boot, FastAPI, and
AWS. Eager to contribute in dynamic environments.
------------------------------------------------------------
📄 Contenu de la section 'experience':
Full Stack Software Engineer|Expersi|Paris|December 2023 - Present
•Contributed to the development and maintenance of scalable backend solutions and REST APIs using Python (FastAPI) and Java (Spring Boot), and
assisted in building front-end components with ReactJS, Selenium for dat

In [24]:
# -------------------------------
# 🚀 Full Pipeline Execution
# -------------------------------
# Same pipeline on a second CV. This rebinds `pdf_path` and `sections`, so any
# result previously stored under those names is replaced.
pdf_path = "../ah.pdf"

# `sections` maps each detected section name to the text found under its heading.
sections = process_cv(pdf_path)




🔹 Première occurrence de la section : 'summary' à la ligne 5 ➤ Présentation

🔹 Première occurrence de la section : 'skills' à la ligne 13 ➤ Compétences clés

🔹 Première occurrence de la section : 'experience' à la ligne 53 ➤ Expériences professionnelles

🔹 Première occurrence de la section : 'languages' à la ligne 380 ➤ Langues
📄 Contenu de la section 'summary':
16 ans d’expérience
Directeur IT et Chef de projet expérimenté, certifié Agile Scrum PSPO I, avec plus de 16 ans d’expérience dans la
gestion de projets stratégiques et la conduite de transformations digitales. Reconnu pour ma rigueur, ma capacité à
piloter des équipes pluridisciplinaires et multiculturelles, j’interviens avec efficacité sur des environnements
complexes, en garantissant un haut niveau de qualité, de performance et de conformité.
Information
Localisation : Paris - Impact Carbonne : 9.78 g CO₂e TJM : 700€ Disponibilité : Immédiate
------------------------------------------------------------
📄 Contenu de la secti