In [None]:
import pdfplumber
import pandas as pd
import re
import os
 
# === Input PDF and output workbook locations ===
input_path = r"C:\Users\inputs\Releve-des-services-agrees-pour-personnes-agees-ACC.pdf"
output_path = r"C:\Users\outputs\structures_hebergement_formatted.xlsx"

# Leading entry number (the "N" of an "N.M" entry) -> section title, numbered from 1.
section_map = dict(enumerate(
    (
        "Structures d'hébergement",
        "Logements encadrés",
        "Centres de jour",
        "Clubs Aktiv Plus",
        "Services Aide et Soins à domicile",  # section 5: skipped by the extraction loop
        "Services repas sur roues",
        "Services téléalarme",
        "Services activités seniors",
    ),
    start=1,
))

# Label of the most recently detected heading ("Hébergement" or "Centre de jour");
# stays "Unknown" until one of them is seen.
current_type = "Unknown"

# One dict per extracted entry, appended in reading order.
data = []
 
# === Extraction ===
# Walks every page of the PDF, splits the page text into numbered entries
# ("1.1", "2.3", ...) and appends one record per entry to `data`.

# Patterns are compiled once because they are applied to every page / entry.
_CENTRE_DE_JOUR_RE = re.compile(r'Centres?\s+de\s+jour', flags=re.IGNORECASE)
_HEBERGEMENT_RE = re.compile(r'Structures?\s+d[’\'`]?hébergement', flags=re.IGNORECASE)
_ENTRY_SPLIT_RE = re.compile(r'\n(?=\d+\.\d+\s+)')
_NUMERO_RE = re.compile(r'^(\d+)\.(\d+)')
_NAME_RE = re.compile(r'^\d+\.\d+\s+([A-ZÉÈÀ].+)')
# The city token accepts every Latin-1 accented letter; with only "ÉÈÀ" allowed,
# a name such as "Pétange" would be cut at its first lowercase accented letter.
_POSTCODE_CITY_RE = re.compile(r'L-\d{4}\s+([A-Za-zÀ-ÖØ-öø-ÿ\-]+)')
_EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+')
_WEBSITE_RE = re.compile(r'(www\.[\w\.-]+)')
_CAPACITY_LABELLED_RE = re.compile(
    r'capacit[ée]\s*d[’\'`]?accueil\s*[:\-]?\s*(\d+\s*(?:chambres?|lits?|places?|chaises?))',
    flags=re.IGNORECASE
)
_CAPACITY_ANY_RE = re.compile(r'(\d+\s*(?:chambres?|lits?|places?|chaises?))', flags=re.IGNORECASE)
_LOGEMENTS_RE = re.compile(r'Nombre\s+de\s+logements\s*[:\-]?\s*(.+)')


def _normalize_page_text(raw_text):
    """Return page text with typographic characters simplified and mojibake repaired.

    Curly apostrophes, en/em dashes and non-breaking spaces are replaced by
    their plain ASCII counterparts. The text is then round-tripped through
    Latin-1 -> UTF-8 in strict mode: this repairs UTF-8 that was mis-decoded
    as Latin-1 (e.g. "hÃ©bergement") and leaves everything else untouched.

    Args:
        raw_text: Non-empty text of a single page.

    Returns:
        The cleaned text, with accented letters preserved.
    """
    cleaned = (
        raw_text.replace('\u2019', "'")
                .replace('\u2013', '-')
                .replace('\u2014', '-')
                .replace('\xa0', ' ')
    )
    try:
        return cleaned.encode('latin1').decode('utf-8')
    except UnicodeError:
        # The text is already correct Unicode. Forcing the round-trip with
        # errors='ignore' would silently delete every accented letter and
        # break the accent-dependent patterns above ("hébergement", "capacité").
        return cleaned


with pdfplumber.open(input_path) as pdf:
    for page_num, page in enumerate(pdf.pages, start=1):
        text = page.extract_text()
        if not text:
            continue

        text = _normalize_page_text(text)

        # Detect section type (based on headings). The value is kept for the
        # following pages until another heading is found.
        if _CENTRE_DE_JOUR_RE.search(text):
            current_type = "Centre de jour"
        elif _HEBERGEMENT_RE.search(text):
            current_type = "Hébergement"

        # Split each entry by numbering pattern like "1.1", "2.3", "3.1"
        for entry in _ENTRY_SPLIT_RE.split(text):
            entry = entry.strip()

            # Chunks that do not start with "N.M" (page headers, intro text) are not entries.
            numero_match = _NUMERO_RE.match(entry)
            if not numero_match:
                continue

            section_number = int(numero_match.group(1))
            if section_number == 5:  # Omit Section 5
                continue

            numero = numero_match.group(0).strip()
            section_name = section_map.get(section_number, "")

            # === Nom ===
            name_match = _NAME_RE.match(entry)
            name = name_match.group(1).split('\n')[0].strip() if name_match else ""

            # Remove city prefix if exists: "City - Name" -> "Name"
            if " - " in name:
                name = name.split(" - ", 1)[1].strip()

            # === Ville ===
            ville_match = _POSTCODE_CITY_RE.search(entry)
            ville = ville_match.group(1).strip() if ville_match else ""

            # === Adresse ===
            # First line starting with "L-dddd City", prefixed by the line
            # just above it when there is one.
            lines = entry.split('\n')
            address = ""
            for i, line in enumerate(lines):
                stripped = line.strip()
                if _POSTCODE_CITY_RE.match(stripped):
                    address = f"{lines[i - 1].strip()}, {stripped}" if i > 0 else stripped
                    break

            # === Email ===
            email_match = _EMAIL_RE.search(entry)
            email = email_match.group(0).strip() if email_match else ""

            # === Site web ===
            web_match = _WEBSITE_RE.findall(entry)
            website = "; ".join(web_match) if web_match else ""

            # === Capacité ===
            # Prefer the explicitly labelled capacity; otherwise fall back to
            # the first "<number> chambres/lits/places/chaises" in the entry.
            cap_match = _CAPACITY_LABELLED_RE.search(entry) or _CAPACITY_ANY_RE.search(entry)
            capacity = cap_match.group(1).strip() if cap_match else ""

            # === Nombre de logements (optional) ===
            rooms_match = _LOGEMENTS_RE.search(entry)
            rooms = rooms_match.group(1).split('\n')[0].strip() if rooms_match else ""

            # === Append extracted info ===
            data.append({
                "Section": section_name,
                "Type": current_type,
                "Numero": numero,
                "Ville": ville,
                "Nom": name,
                "Adresse": address,
                "Email": email,
                "Site web": website,
                "Capacité": capacity,
                "Logements": rooms,
                "Page": page_num
            })
 
# === Convert to DataFrame ===
# The explicit column list fixes the column order of the exported sheet.
df = pd.DataFrame(data, columns=[
    "Section", "Type", "Numero", "Ville", "Nom", "Adresse", "Email",
    "Site web", "Capacité", "Logements", "Page"
])

# === Save to Excel ===
# os.path.dirname() returns "" for a bare file name and os.makedirs("") raises
# FileNotFoundError, so the folder is only created when there is one to create.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_excel(output_path, index=False)

print(f"{len(df)} structures extracted and saved to {output_path}")