In [3]:
# -------------------------------
# text extraction
# -------------------------------
# Step 1: Text Extraction
def extract_text_from_pdf(path):
    """Return the text of every page of the PDF at ``path`` as one string.

    Each page that yields text contributes its text followed by a newline.
    Pages for which ``extract_text()`` returns nothing (``None`` or an empty
    string) are skipped, so a PDF without extractable text gives ``""``.
    """
    chunks = []
    with pdfplumber.open(path) as document:
        for current_page in document.pages:
            extracted = current_page.extract_text()
            if not extracted:
                # Nothing extractable on this page.
                continue
            chunks.append(extracted + "\n")
    return "".join(chunks)


In [23]:
# -------------------------------
# 🧩 Section Detection
# -------------------------------
import pdfplumber
import re
from sentence_transformers import SentenceTransformer, util



# Step 2: Known Titles per Category
# Maps each canonical section name to the header phrasings (French and English)
# that identify it. Keywords are compared case-insensitively by the matcher,
# so they are written in lower case here.
#
# Notes:
# - Accented French keywords are stored as real Unicode characters; mis-encoded
#   variants (e.g. "expÃ©rience") would not match correctly encoded CV text.
# - Language headers live under the single "languages" category. A second
#   "langues" category with overlapping keywords made the label of the same CV
#   section depend on the exact spelling of its header.
SECTION_KEYWORDS = {
    "experience": [
        "expérience", "expériences professionnelles", "work experience", "professional experience", "career history"
    ],
    "education": [
        "formation", "études", "éducation", "academic background", "qualifications", "diplômes"
    ],
    "skills": [
        "compétences", "skills", "technical skills", "technologies", "outils"
    ],
    "languages": [
        "langues", "les langues", "language", "languages", "spoken languages"
    ],
    "certifications": [
        "certifications", "certification", "certified"
    ],
    "summary": [
        "profil", "summary", "about me", "présentation", "professional summary"
    ],
    "contact": [
        "contact", "informations personnelles", "coordonnées", "personal information", "personal informations", "les informations personnelles", "contact details",
    ],
}

# Step 3: Match Fuzzy Titles
# Sentence-embedding model shared by the section matcher; the checkpoint name
# is kept in a constant so it can be changed in one place.
EMBEDDING_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Embeddings of section keywords, keyed by the lower-cased keyword text.
# A keyword's embedding does not depend on the line being classified, so each
# keyword is encoded at most once instead of once per CV line. The cache is
# emptied whenever this cell is re-executed.
_KEYWORD_EMBEDDING_CACHE = {}


def _keyword_embedding(keyword):
    """Return the embedding of a section keyword, encoding it on first use."""
    key = keyword.lower()
    if key not in _KEYWORD_EMBEDDING_CACHE:
        _KEYWORD_EMBEDDING_CACHE[key] = model.encode(key, convert_to_tensor=True)
    return _KEYWORD_EMBEDDING_CACHE[key]


def detect_section_title(line, threshold=0.9):
    """Classify a line of CV text as a section header, if it looks like one.

    The stripped, lower-cased line is embedded with ``model`` and compared
    (cosine similarity) with every keyword of every category in
    ``SECTION_KEYWORDS``.

    Args:
        line: One line of text.
        threshold: Similarity a keyword must strictly exceed for the line to
            count as a header. Defaults to 0.9.

    Returns:
        The category name of the most similar keyword whose similarity is
        above ``threshold``, or ``None`` when no keyword qualifies or the line
        is blank. On equal scores the category encountered first wins.
    """
    line_clean = line.strip().lower()
    if not line_clean:
        # A blank line can never be a header; skip the model call entirely.
        return None

    line_emb = model.encode(line_clean, convert_to_tensor=True)
    best_score = threshold
    best_section = None

    for section, keywords in SECTION_KEYWORDS.items():
        for kw in keywords:
            sim = util.pytorch_cos_sim(line_emb, _keyword_embedding(kw)).item()
            # Strict comparison: a later keyword must beat the current best.
            if sim > best_score:
                best_score = sim
                best_section = section

    return best_section

def segment_cv_text(text, verbose=True):
    """Split raw CV text into sections keyed by detected section name.

    Every non-blank line is passed to ``detect_section_title``. The first line
    recognised for a given section marks where that section starts; later
    lines recognised as an already-seen section are treated as ordinary
    content. A section's content is everything between its header line and
    the next section's header line (or the end of the text), stripped of
    surrounding whitespace. Text before the first header is not returned.

    Args:
        text: Full CV text, lines separated by newlines.
        verbose: When true (default), print each detected header and the
            content of each section. Pass ``False`` to keep CV content out of
            the notebook output.

    Returns:
        dict mapping section name to its content string, in order of
        appearance. Empty when no header is detected.
    """
    lines = text.splitlines()

    # Step 1: locate the first header line of each section.
    section_indices = []
    seen_sections = set()
    for idx, line in enumerate(lines):
        if not line.strip():
            continue
        section = detect_section_title(line)
        if section and section not in seen_sections:
            if verbose:
                print(f"\n🔹 Première occurrence de la section : '{section}' à la ligne {idx} ➤ {line.strip()}")
            section_indices.append((idx, section))
            seen_sections.add(section)

    # Step 2: a section ends where the next detected header starts; the last
    # section runs to the end of the text.
    end_indices = [start for start, _ in section_indices[1:]] + [len(lines)]

    sections = {}
    for (start_idx, section_name), end_idx in zip(section_indices, end_indices):
        content = "\n".join(lines[start_idx + 1:end_idx]).strip()
        sections[section_name] = content

        if verbose:
            # Show the content found under each header.
            print(f"📄 Contenu de la section '{section_name}':\n{content}\n{'-'*60}")

    return sections

    
def process_cv(path_to_pdf):
    """Run the full pipeline on one CV: extract the PDF text, then segment it.

    Returns whatever ``segment_cv_text`` returns for the extracted text.
    """
    return segment_cv_text(extract_text_from_pdf(path_to_pdf))


In [20]:
# -------------------------------
# 🚀 Full Pipeline Execution
# -------------------------------
# CV to analyse; the path is relative to the notebook's working directory.
pdf_path = "../yb.pdf"  

# Extract the PDF text and split it into sections; `sections` receives the
# value returned by process_cv.
sections = process_cv(pdf_path)



Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats



ðŸ”¹ PremiÃ¨re occurrence de la section : 'summary' Ã  la ligne 2 âž¤ SUMMARY

ðŸ”¹ PremiÃ¨re occurrence de la section : 'experience' Ã  la ligne 5 âž¤ EXPERIENCE

ðŸ”¹ PremiÃ¨re occurrence de la section : 'education' Ã  la ligne 36 âž¤ EDUCATION

ðŸ”¹ PremiÃ¨re occurrence de la section : 'certifications' Ã  la ligne 42 âž¤ CERTIFICATIONS

ðŸ”¹ PremiÃ¨re occurrence de la section : 'skills' Ã  la ligne 53 âž¤ SKILLS
ðŸ“„ Contenu de la section 'summary':
Full-Stack Software Engineer with experience building scalable, AI-powered web applications using Java, JavaScript, React, Spring Boot, FastAPI, and
AWS. Eager to contribute in dynamic environments.
------------------------------------------------------------
ðŸ“„ Contenu de la section 'experience':
Full Stack Software Engineer|Expersi|Paris|December 2023 - Present
â€¢Contributed to the development and maintenance of scalable backend solutions and REST APIs using Python (FastAPI) and Java (Spring Boot), and
assisted in building front-en

In [24]:
# -------------------------------
# 🚀 Full Pipeline Execution
# -------------------------------
# Second CV to analyse; the path is relative to the notebook's working directory.
pdf_path = "../ah.pdf"  

# Extract the PDF text and split it into sections; this rebinds `pdf_path`
# and `sections`, replacing any values they held before this cell ran.
sections = process_cv(pdf_path)




ðŸ”¹ PremiÃ¨re occurrence de la section : 'summary' Ã  la ligne 5 âž¤ PrÃ©sentation

ðŸ”¹ PremiÃ¨re occurrence de la section : 'skills' Ã  la ligne 13 âž¤ CompÃ©tences clÃ©s

ðŸ”¹ PremiÃ¨re occurrence de la section : 'experience' Ã  la ligne 53 âž¤ ExpÃ©riences professionnelles

ðŸ”¹ PremiÃ¨re occurrence de la section : 'languages' Ã  la ligne 380 âž¤ Langues
ðŸ“„ Contenu de la section 'summary':
16 ans dâ€™expÃ©rience
Directeur IT et Chef de projet expÃ©rimentÃ©, certifiÃ© Agile Scrum PSPO I, avec plus de 16 ans dâ€™expÃ©rience dans la
gestion de projets stratÃ©giques et la conduite de transformations digitales. Reconnu pour ma rigueur, ma capacitÃ© Ã 
piloter des Ã©quipes pluridisciplinaires et multiculturelles, jâ€™interviens avec efficacitÃ© sur des environnements
complexes, en garantissant un haut niveau de qualitÃ©, de performance et de conformitÃ©.
Information
Localisation : Paris - Impact Carbonne : 9.78 g COâ‚‚e TJM : 700â‚¬ DisponibilitÃ© : ImmÃ©diate
-----------------------