In [35]:
import pandas as pd
import re
from typing import List, Dict, Union

## Configuration

In [36]:
# Scene-level attributes searched for in a résumé. Every regex has exactly one
# capture group: the alternative that matched is the value stored for that key.
combined_patterns = dict(
    road_configuration="(" + "|".join((
        "En Y",
        "En T",
        "En X",
        "Hors intersection",
        "Place",
        "A plus de 4 branches",
    )) + ")",
    lighting="(" + "|".join((
        "Plein jour",
        "Crépuscule ou aube",
        "Nuit avec éclairage public allumé",
        "Nuit sans éclairage public",
    )) + ")",
    # Weather and surface values are anchored on the literal label preceding them.
    weather="météo (" + "|".join((
        "Normale",
        "Pluie légère",
        "Temps couvert",
        "Pluie forte",
    )) + ")",
    surface="surface chaussée : (" + "|".join((
        "Normale",
        "Mouillée",
        "Non renseigné",
        "Autre",
        "Corps gras - huile",
        "Enneigée",
        "Flaques",
    )) + ")",
)

# Maps a vehicle label to the regex that detects it in a résumé.
# Regex metacharacters that occur in the labels ("(", ")", "+") are escaped.
vehicle_patterns = {
    "Cyclomoteur <=50 cm3": r"Cyclomoteur <=50 cm3",
    "Véhicule de tourisme (VT)": r"Véhicule de tourisme \(VT\)",
    "Moto ou sidecar > 50 <= 125 cm3": r"Moto ou sidecar > 50 <= 125 cm3",
    "Moto ou sidecar > 125 cm3": r"Moto ou sidecar > 125 cm3",
    "Scooter <= 50 cm3": r"Scooter <= 50 cm3",
    "Scooter > 125 cm3": r"Scooter > 125 cm3",
    "Scooter > 50 <= 125 cm3": r"Scooter > 50 <= 125 cm3",
    "VU seul 1,5T < PTAC <=3,5T": r"VU seul 1,5T < PTAC <=3,5T",
    "PL > 3,5T + remorque": r"PL > 3,5T \+ remorque",
    "PL seul 3,5T <=": r"PL seul 3,5T <=",
    "Autocar": r"Autocar",
    "Autobus": r"Autobus",
    "Bicyclette": r"Bicyclette",
    "EDP-m": r"EDP-m",
    "Voiturette": r"Voiturette",
    "Quad léger <= 50 cm3": r"Quad léger <= 50 cm3",
    "Tramway": r"Tramway",
    "Autre véhicule": r"Autre véhicule",
    "Tracteur routier + semi-remorque": r"Tracteur routier \+ semi-remorque",
    "PL seul PTAC > 7,5T": r"PL seul PTAC > 7,5T",
    "3 RM > 125 cm3": r"3 RM > 125 cm3",
    "3 RM <= 50 cm3": r"3 RM <= 50 cm3",
    "Vélo par assistance électrique": r"Vélo par assistance électrique",
    "Engin spécial": r"Engin spécial",
    "EDP-sm": r"EDP-sm",
    "Quad lourd > 50 cm3": r"Quad lourd > 50 cm3",
    "Tracteur agricole": r"Tracteur agricole",
    # NOTE(review): same regex as the full-length label at the end of this dict,
    # so both labels are always reported together — confirm the alias is intended.
    "EDP sans moteur": r"Autre engin de déplacement personnel \(EDP\) sans moteur",
    "Indéterminable": r"Indéterminable",
    "Piéton": r"Piéton",
    "PL seul 3,5T <PTAC <= 7,5T": r"PL seul 3,5T <PTAC <= 7,5T",
    "Nouvel engin de déplacement personnel (EDP) à moteur": r"Nouvel engin de déplacement personnel \(EDP\) à moteur",
    # The bare label is a prefix of "Tracteur routier + semi-remorque"; the
    # negative lookahead stops a semi-trailer rig from also being reported
    # as a plain road tractor.
    "Tracteur routier": r"Tracteur routier(?!\s*\+\s*semi-remorque)",
    "PL seul PTAC <= 7,5T": r"PL seul PTAC <= 7,5T",
    "3 RM > 50 <= 125 cm3": r"3 RM > 50 <= 125 cm3",
    "Autre engin de déplacement personnel (EDP) sans moteur": r"Autre engin de déplacement personnel \(EDP\) sans moteur"
}

# Regexes that detect a "1 usager <sex>" party who is NOT a passenger.
# The optional age ("de NN ans") and parenthesised note sit INSIDE the negative
# lookahead on purpose: when they were consumed before a trailing
# `(?!\spassager)`, the regex engine could backtrack, skip those optional
# groups and satisfy the lookahead, so "... de 25 ans (…) passager" still
# matched. Only the presence of a match is meaningful; the matched span now
# ends right after the sex word.
parties_sex = {
    "Usager Masculin": r"\b1 usager Masculin(?!(?: de \d+ ans)?(?:\s\(.+?\))?\spassager)",
    "Usager Féminin": r"\b1 usager Feminin(?!(?: de \d+ ans)?(?:\s\(.+?\))?\spassager)",
}

## Functions

In [37]:
def extract_accident_details(resume: str) -> Dict[str, Union[str, List[str]]]:
    """Parse one accident résumé into a flat dict of extracted fields.

    Args:
        resume: Free-text résumé. Anything that is not a ``str`` (``None``,
            NaN, numbers, ...) is treated as missing.

    Returns:
        ``{}`` for a non-string input. Otherwise a dict holding, in order:
        one entry per key of ``combined_patterns`` (the captured group, or
        ``None`` when the pattern is absent), ``"vehicle_types"`` (labels from
        ``vehicle_patterns`` found in the text, "Piéton" excluded) and
        ``"driver_sex"`` ("Masculin", "Feminin" or ``None``).
    """
    # A str is never NA for pandas, so the type check alone covers missing values.
    if not isinstance(resume, str):
        return {}

    details = {}

    # Scene attributes: keep capture group 1 of each pattern, None when absent.
    for field, regex in combined_patterns.items():
        hit = re.search(regex, resume)
        details[field] = hit.group(1) if hit else None

    # Vehicle labels in dictionary order; pedestrians are not vehicles.
    details["vehicle_types"] = [
        label
        for label, regex in vehicle_patterns.items()
        if label != "Piéton" and re.search(regex, resume)
    ]

    # Patterns are tried in dictionary order and the first hit wins; this is
    # the order of ``parties_sex``, not the order of appearance in the text.
    sex = None
    for label, regex in parties_sex.items():
        if re.search(regex, resume):
            sex = "Masculin" if label == "Usager Masculin" else "Feminin"
            break
    details["driver_sex"] = sex

    return details

In [38]:
def enrich_data_from_resume(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame enriched with fields parsed from the 'Résumé' column.

    Every 'Résumé' value is passed to ``extract_accident_details``; the keys of
    the returned dicts become new columns, with ``vehicle_types`` flattened to
    a comma-separated string. The columns listed in ``columns_to_drop`` are
    then removed when present.

    Args:
        df: Frame containing a 'Résumé' column. It is not modified.

    Returns:
        A new DataFrame: the input columns plus the extracted ones, minus the
        dropped columns, on the same index as ``df``.

    Raises:
        KeyError: if ``df`` has no 'Résumé' column.
    """
    # Apply extraction
    details = df["Résumé"].apply(extract_accident_details)
    extracted_df = pd.json_normalize(details.tolist())
    # json_normalize numbers its rows 0..n-1, while concat(axis=1) aligns on
    # index labels: reuse df's index so each row stays with its own résumé
    # even when df was filtered, sorted or otherwise re-indexed.
    extracted_df.index = df.index

    # Convert vehicle_types to a comma-separated string
    if "vehicle_types" in extracted_df.columns:
        extracted_df["vehicle_types"] = extracted_df["vehicle_types"].apply(
            lambda x: ", ".join(x) if isinstance(x, list) else ""
        )
    else:
        # No row produced any extraction (e.g. every résumé is missing).
        extracted_df["vehicle_types"] = ""

    # Concatenate
    df_enriched = pd.concat([df, extracted_df], axis=1)

    # Drop unwanted columns
    columns_to_drop = [
        "Résumé",
        "Coordonnées",
        "Nom arrondissement",
        "arronco",
        "arrondgeo",
        "Coordonnées.1",
        "Champ13",
        "PV",
        "IdUsager",
        "Nom arrondissement.1",
        # The comma after this entry matters: without it Python concatenates
        # the two adjacent literals into "Id accidentlocation_context" and
        # neither column is dropped.
        "Id accident",
        # Also remove location_context if it got created as a column
        "location_context",
    ]
    # errors="ignore" because some of these columns may not exist
    return df_enriched.drop(columns=columns_to_drop, errors="ignore")

## Load the data and enrich the dataframe

In [39]:
if __name__ == "__main__":
    # Read the semicolon-separated accident export.
    df = pd.read_csv("../data/accidents.csv", sep=";", encoding="utf-8")

    def _collapse_whitespace(value):
        """Trim a string and squeeze whitespace runs to one space; pass other values through."""
        if isinstance(value, str):
            return re.sub(r"\s+", " ", value.strip())
        return value

    # Normalise the résumé text before the patterns are applied to it.
    df["Résumé"] = df["Résumé"].apply(_collapse_whitespace)

    # Enrich, then persist the result without the index column.
    df_enriched = enrich_data_from_resume(df)
    df_enriched.to_csv("../data/accidents-enriched.csv", index=False)
    print("Enriched data saved to 'accidents-enriched.csv'")

Enriched data saved to 'accidents-enriched.csv'
