In [None]:
import pdfplumber
import pandas as pd
import re
import os
import googlemaps
from tqdm import tqdm
from time import sleep

# === CONFIGURATION ===
# The Google Maps API key is read from the GOOGLE_MAPS_API_KEY environment
# variable so that no credential is stored in this file. Set it before running:
#     export GOOGLE_MAPS_API_KEY="<your key>"
# NOTE(review): a key was previously committed on this line; treat it as
# compromised and revoke/rotate it in the Google Cloud console.
# When the variable is unset, API_KEY is an empty string; googlemaps.Client is
# expected to reject an empty key with an error — confirm against the
# installed googlemaps version.
API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY", "")

# === OUTPUT LOCATION ===
# Folder that receives the generated files. It is created up front
# (including missing parents); an already existing folder is not an error.
output_folder = r"../assets"
os.makedirs(name=output_folder, exist_ok=True)

# === INPUT DOCUMENT ===
# The Ministry of Family data is unfortunately only available as a PDF
# document. This might change in the future.
input_pdf = r"../manual_data_imports/Releve-des-services-agrees-pour-personnes-agees-ACC.pdf"

# === SECTION MAPPING ===
# Section titles in document order; position N (1-based) is the title of
# section number N, i.e. the integer before the dot in an entry number.
_section_titles = (
    "Structures d'hébergement",
    "Logements encadrés",
    "Centres de jour",
    "Clubs Aktiv Plus",
    "Services Aide et Soins à domicile",  # entries of this section are skipped during extraction
    "Services repas sur roues",
    "Services téléalarme",
    "Services activités seniors",
)
section_map = dict(enumerate(_section_titles, start=1))

# Accumulator for the extracted rows (one dict per entry), and the section
# type most recently detected while reading pages (Hébergement, Centre de
# jour, ...); "Unknown" until a page matches one of the detection patterns.
data, current_type = [], "Unknown"

# === EXTRACT TEXT FROM PDF ===
# Patterns applied to every page/entry, compiled once up front.
_ENTRY_SPLIT_RE = re.compile(r'\n(?=\d+\.\d+\s+)')  # split before lines like "1.1 ", "2.3 "
_NUMERO_RE = re.compile(r'^(\d+)\.(\d+)')
_NAME_RE = re.compile(r'^\d+\.\d+\s+([A-ZÉÈÀ].+)')
# Postal code followed by the first word of the locality. The locality is any
# run of Unicode letters and hyphens, so names containing accented lowercase
# letters (e.g. "Pétange") are captured whole instead of being cut at the accent.
_POSTCODE_CITY_RE = re.compile(r'L-\d{4}\s+((?:[^\W\d_]|-)+)')
_EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+')
_WEBSITE_RE = re.compile(r'(www\.[\w\.-]+)')
_CAPACITY_LABELLED_RE = re.compile(
    r'capacit[ée]\s*d[’\'`]?accueil\s*[:\-]?\s*(\d+\s*(?:chambres?|lits?|places?|chaises?))',
    flags=re.IGNORECASE,
)
_CAPACITY_ANY_RE = re.compile(
    r'(\d+\s*(?:chambres?|lits?|places?|chaises?))',
    flags=re.IGNORECASE,
)
_ROOMS_RE = re.compile(r'Nombre\s+de\s+logements\s*[:\-]?\s*(.+)')


def _normalize_page_text(text):
    """Return *text* with mojibake repaired (if any) and punctuation simplified.

    The Latin-1 -> UTF-8 round-trip is done strictly: it only succeeds for
    pure ASCII text or for text whose UTF-8 bytes were mis-decoded as Latin-1.
    For correctly decoded text containing accents the round-trip raises and
    the text is kept unchanged. (Running it with ``errors='ignore'`` deletes
    every correctly decoded accented character, e.g. "Hébergement" becomes
    "Hbergement".) The repair runs before the replacements below so that a
    '\\xa0' that is part of a mis-decoded sequence is still intact.
    """
    try:
        text = text.encode('latin1').decode('utf-8')
    except UnicodeError:
        pass  # not Latin-1 mojibake: keep the text as extracted

    return (
        text.replace('\u2019', "'")
            .replace('\u2013', '-')
            .replace('\u2014', '-')
            .replace('\xa0', ' ')
    )


def _parse_entry(entry, current_type, page_num):
    """Parse one numbered entry into a row dict.

    Args:
        entry: Text chunk produced by splitting a page before "N.M " lines.
        current_type: Section type label to store in the "Type" field.
        page_num: 1-based page number to store in the "Page" field.

    Returns:
        A dict with the keys Section, Type, Numero, Ville, Nom, Adresse,
        Email, Site web, Capacité, Logements and Page, or None when the chunk
        does not start with an "N.M" number or belongs to section 5.
    """
    entry = entry.strip()
    numero_match = _NUMERO_RE.match(entry)
    if not numero_match:
        return None  # page header or continuation text, not an entry

    section_number = int(numero_match.group(1))
    if section_number == 5:
        return None  # Skip section 5

    # === Name === (first line after the number; keep only the part after " - ")
    name_match = _NAME_RE.match(entry)
    name = name_match.group(1).split('\n')[0].strip() if name_match else ""
    if " - " in name:
        name = name.split(" - ", 1)[1].strip()

    # === City === (first postal code + locality found anywhere in the entry)
    ville_match = _POSTCODE_CITY_RE.search(entry)
    ville = ville_match.group(1).strip() if ville_match else ""

    # === Address === (first line starting with a postal code, prefixed by
    # the line directly above it when there is one)
    address = ""
    lines = entry.split('\n')
    for i, raw_line in enumerate(lines):
        line = raw_line.strip()
        if _POSTCODE_CITY_RE.match(line):
            address = f"{lines[i - 1].strip()}, {line}" if i > 0 else line
            break

    # === Email ===
    email_match = _EMAIL_RE.search(entry)
    email = email_match.group(0).strip() if email_match else ""

    # === Website === (all "www." hosts, joined with "; ")
    website = "; ".join(_WEBSITE_RE.findall(entry))

    # === Capacity === (prefer the labelled form, else any "<n> lits/places/...")
    cap_match = _CAPACITY_LABELLED_RE.search(entry) or _CAPACITY_ANY_RE.search(entry)
    capacity = cap_match.group(1).strip() if cap_match else ""

    # === Logements ===
    rooms_match = _ROOMS_RE.search(entry)
    rooms = rooms_match.group(1).split('\n')[0].strip() if rooms_match else ""

    return {
        "Section": section_map.get(section_number, ""),
        "Type": current_type,
        "Numero": numero_match.group(0).strip(),
        "Ville": ville,
        "Nom": name,
        "Adresse": address,
        "Email": email,
        "Site web": website,
        "Capacité": capacity,
        "Logements": rooms,
        "Page": page_num,
    }


with pdfplumber.open(input_pdf) as pdf:
    for page_num, page in enumerate(pdf.pages, start=1):
        text = page.extract_text()
        if not text:
            continue

        text = _normalize_page_text(text)

        # Detect current section type. The check is per page and the value
        # carries over to following pages until another pattern matches.
        if re.search(r'Centres?\s+de\s+jour', text, flags=re.IGNORECASE):
            current_type = "Centre de jour"
        elif re.search(r'Structures?\s+d[’\'`]?hébergement', text, flags=re.IGNORECASE):
            current_type = "Hébergement"

        # Split entries like "1.1", "2.3", etc. and keep the parsable ones.
        for entry in _ENTRY_SPLIT_RE.split(text):
            record = _parse_entry(entry, current_type, page_num)
            if record is not None:
                data.append(record)

# === CREATE DATAFRAME ===
# One row per extracted entry; columns are the keys of the dicts in `data`.
df = pd.DataFrame(data)

# === ADD FULL ADDRESS (for better geocoding accuracy) ===
# Join only the non-empty parts of "Adresse" and "Ville". Concatenating them
# unconditionally produces ", " for rows with no address data (which is not
# an empty string, so such rows would still be sent to the geocoder) and a
# dangling separator when only one of the two parts is present.
df["Full_Address"] = [
    ", ".join(part for part in (adresse, ville) if part.strip())
    for adresse, ville in zip(df["Adresse"].fillna(''), df["Ville"].fillna(''))
]

# === GOOGLE MAPS GEOCODING ===
# Fills the "lat" and "long" columns of df from each row's "Full_Address",
# using the first result returned by the Google Maps geocoder. Rows that
# cannot be geocoded keep None in both columns.
gmaps = googlemaps.Client(key=API_KEY)
df["lat"] = None
df["long"] = None

# Completed lookups keyed by address text, so an address shared by several
# rows is requested only once. The value is a (lat, lng) tuple, or None when
# the geocoder returned no result. Failed requests are not cached.
_geocode_cache = {}

for i, row in tqdm(df.iterrows(), total=len(df), desc="Geocoding addresses"):
    full_address = row["Full_Address"]
    # Skip missing addresses, and addresses that consist only of whitespace
    # and separators (e.g. ", " for a row without street and city).
    if pd.isna(full_address) or not full_address.strip(" ,"):
        continue

    if full_address not in _geocode_cache:
        try:
            geocode_result = gmaps.geocode(full_address)
            coords = None
            if geocode_result:
                location = geocode_result[0]["geometry"]["location"]
                coords = (location["lat"], location["lng"])
        except Exception as e:
            # Best effort: report the problem, back off briefly and leave
            # this row without coordinates.
            print(f"Error for '{full_address}': {e}")
            sleep(1)
            continue
        _geocode_cache[full_address] = coords
        sleep(0.1)  # small pause between API requests

    coords = _geocode_cache[full_address]
    if coords is not None:
        df.at[i, "lat"] = coords[0]
        df.at[i, "long"] = coords[1]

# === SPLIT INTO MULTIPLE CSV FILES BY CHAPTER (1., 2., 3., etc.) ===
def _chapter_of(numero):
    """Return the text before the first '.' of *numero*, or None.

    None is returned when *numero* is not a string or contains no '.'.
    """
    if isinstance(numero, str) and '.' in numero:
        return numero.split('.')[0]
    return None


df["Chapter"] = df["Numero"].apply(_chapter_of)

# Chapter number (as text) -> name of the CSV file written for that chapter.
# Chapter 5 is skipped during extraction, so it never appears here. Any
# chapter without an entry (for example 6) is written to "chapter_<n>.csv".
chapter_filename_map = dict([
    ("1", "hebergements_latlon.csv"),       # Structures d'hébergement
    ("2", "logementsEncadres_latlon.csv"),  # Logements encadrés
    ("3", "centresJour_latlon.csv"),        # Centres de jour
    ("4", "activePlus_latlon.csv"),         # Clubs Aktiv Plus
    ("7", "alarmes_latlon.csv"),            # Services téléalarme
    ("8", "activities_latlon.csv"),         # Services activités seniors
])

# One file per chapter: ';'-separated, UTF-8 with BOM, every field quoted
# (quoting=1 is the numeric value of csv.QUOTE_ALL).
for chapter, subdf in df.groupby("Chapter"):
    if chapter in chapter_filename_map:
        filename = chapter_filename_map[chapter]
    else:
        filename = f"chapter_{chapter}.csv"  # default if unmapped
    output_csv = os.path.join(output_folder, filename)
    subdf.to_csv(output_csv, sep=';', quoting=1, encoding="utf-8-sig", index=False)
    print(f"💾 Saved {output_csv} ({len(subdf)} rows)")

print("\n✅ All CSV files created successfully in:")
print(output_folder)
