In [1]:
import pandas as pd
import re
from typing import List, Dict, Union
import numpy as n

Define a function to extract accident details from a résumé text.
This function uses improved regular expressions to identify patterns in the text, such as severity, location context, vehicle types, and parties involved.
It returns a dictionary of these extracted details.

In [2]:
def extract_accident_details_improved(resume: str) -> Dict[str, Union[str, bool, List[str]]]:
    """
    Extracts accident details from a résumé string using improved regular expressions.

    This function takes a string (representing a resume or text containing accident information)
    and uses regular expressions to find and extract specific details about an accident.
    It returns a dictionary where keys are the types of details extracted (e.g., severity,
    location, vehicle types) and values are the extracted information.

    Args:
        resume: A string containing the accident details.

    Returns:
        A dictionary containing the extracted accident details. If no details are found
        or if the input is not a string, an empty dictionary is returned.
    """
    

    # Initialize an empty dictionary to store the extracted accident details.
    result = {}

    # Define regular expression patterns for different accident characteristics.
    # Each key in this dictionary represents a piece of information we want to extract,
    # and the value is the corresponding regular expression pattern to find it.
    # The parentheses in the regex define "capture groups", which extract the specific
    # part of the text we're interested in.
    combined_patterns = {
        "severity": r"Accident (Léger non mortel|Grave non mortel|Mortel)",
        "location_context": r"(En agglomération|Hors agglomération)",
        "road_configuration": r"(En Y|En T|En X|Hors intersection|Place|A plus de 4 branches)",
        "lighting": r"(Plein jour|Crépuscule ou aube|Nuit avec éclairage public allumé|Nuit sans éclairage public)",
        "weather": r"météo (Normale|Pluie légère|Temps couvert|Pluie forte)",
        "surface": r"surface chaussée : (Normale|Mouillée|Non renseigné|Autre|Corps gras - huile|Enneigée|Flaques)"
    }

    # Iterate through each pattern and try to find a match in the resume.
    for key, pattern in combined_patterns.items():
        # Search for the pattern in the resume string.
        match = re.search(pattern, resume)
        # If a match is found, extract the captured group (the specific detail) and
        # store it in the result dictionary. If no match is found, store None.
        result[key] = match.group(1) if match else None

    # Define regular expression patterns for different types of vehicles involved in the accident.
    # The keys are descriptive names for each vehicle type, and the values are the
    # exact strings to search for in the resume.
    vehicle_patterns = {
        "Cyclomoteur <=50 cm3": r"Cyclomoteur <=50 cm3",
        "Véhicule de tourisme (VT)": r"Véhicule de tourisme \(VT\)",
        "Moto ou sidecar > 50 <= 125 cm3": r"Moto ou sidecar > 50 <= 125 cm3",
        "Moto ou sidecar > 125 cm3": r"Moto ou sidecar > 125 cm3",
        "Scooter <= 50 cm3": r"Scooter <= 50 cm3",
        "Scooter > 125 cm3": r"Scooter > 125 cm3",
        "Scooter > 50 <= 125 cm3": r"Scooter > 50 <= 125 cm3",
        "VU seul 1,5T < PTAC <=3,5T": r"VU seul 1,5T < PTAC <=3,5T",
        "PL > 3,5T + remorque": r"PL > 3,5T \+ remorque",
        "PL seul 3,5T <=": r"PL seul 3,5T <=",
        "Autocar": r"Autocar",
        "Autobus": r"Autobus",
        "Bicyclette": r"Bicyclette",
        "EDP-m": r"EDP-m",
        "Voiturette": r"Voiturette",
        "Quad léger <= 50 cm3": r"Quad léger <= 50 cm3",
        "Tramway": r"Tramway",
        "Autre véhicule": r"Autre véhicule",
        "Tracteur routier + semi-remorque": r"Tracteur routier \+ semi-remorque",
        "PL seul PTAC > 7,5T" : r"PL seul PTAC > 7,5T",
        "3 RM > 125 cm3": r"3 RM > 125 cm3",
        "3 RM <= 50 cm3": r"3 RM <= 50 cm3",
        "Vélo par assistance électrique": r"Vélo par assistance électrique",
        "Engin spécial": r"Engin spécial",
        "EDP-sm": r"EDP-sm",
        "Quad lourd > 50 cm3": r"Quad lourd > 50 cm3",
        "Tracteur agricole": r"Tracteur agricole",
        "EDP sans moteur": r"Autre engin de déplacement personnel \(EDP\) sans moteur",
        "Indéterminable" : r"Indéterminable",
        "Piéton": r"Piéton",
        "PL seul 3,5T <PTAC <= 7,5T": r"PL seul 3,5T <PTAC <= 7,5T",
        "Nouvel engin de déplacement personnel \(EDP\) à moteur": r"Nouvel engin de déplacement personnel \(EDP\) à moteur",
        "Tracteur routier": r"Tracteur routier",
        "PL seul PTAC <= 7,5T": r"PL seul PTAC <= 7,5T",
        "3 RM > 50 <= 125 cm3": r"3 RM > 50 <= 125 cm3",
        "Autre engin de déplacement personnel \(EDP\) sans moteur": r"Autre engin de déplacement personnel \(EDP\) sans moteur",
    }
    # Combine all the vehicle patterns into a single regular expression.
    # The "|" acts as an "OR" operator, so it will find any of the vehicle types.
    all_vehicle_patterns_regex = "|".join(vehicle_patterns.values())
    # Find all occurrences of the vehicle types in the resume.
    result["vehicle_types"] = re.findall(all_vehicle_patterns_regex, resume)

    # Define regular expression patterns for different parties involved in the accident.
    parties_patterns = {
        "Piéton Féminin": r"\b1 Piéton Feminin\b",
        "Piéton Masculin": r"\b1 Piéton Masculin\b",
        "Piéton": r"\b1 Piéton(?!\s(?:Feminin|Masculin))\b", # Matches '1 Piéton' but not followed by Feminin or Masculin
        "Usager Masculin": r"\b1 usager Masculin(?: de \d+ ans)?(?:\s\(.+?\))?(?!\spassager)", # Avoid matching passengers
        "Usager Féminin": r"\b1 usager Feminin(?: de \d+ ans)?(?:\s\(.+?\))?(?!\spassager)", # Avoid matching passengers
        "Passager Masculin": r"avec \d+ passager(?:s)? Masculin",
        "Passager Féminin": r"avec \d+ passager(?:s)? Feminin",
        "Passager": r"avec (\d+) passager", # Captures the number of passengers
        "Bicyclette": r"\b1\s(?:Bic|Bicyclette)\b",
        "Véhicule de tourisme (VT)": r"heurte 1 Véhicule de tourisme \(VT\)",
    }

    # Initialize an empty list to store the parties involved.
    result["parties_involved"] = []
    # Iterate through each party pattern and try to find matches in the resume.
    for key, pattern in parties_patterns.items():
        # Find all occurrences of the pattern in the resume.
        matches = re.findall(pattern, resume)
        # If matches are found, process them based on the type of party.
        if matches:
            # For "Passager", the regex captures the number of passengers, so we add
            # a descriptive string including the count.
            if key == "Passager":
                for count in matches:
                    result["parties_involved"].append(f"avec {count} passager{'s' if int(count) > 1 else ''}")
            # For other party types, we simply add the party type to the list for each match found.
            else:
                result["parties_involved"].extend([key] * len(matches))

    return result

# --- Main part of the script ---
if __name__ == "__main__":
    # Load the CSV file containing accident data.
    # Specify the encoding and separator used in the file.
    df = pd.read_csv("./data/accidents.csv", encoding='utf-8', sep=";")

    # Normalize spaces in the 'Résumé' column to ensure consistency.
    # This replaces multiple spaces with a single space and removes leading/trailing spaces.
    df['Résumé'] = df['Résumé'].apply(lambda x: re.sub(r'\s+', ' ', x.strip()) if isinstance(x, str) else x)

    # Apply the improved extraction function to each 'Résumé' in the DataFrame.
    # This creates a new column 'Résumé_Details_Improved' containing the extracted details as dictionaries.
    df['Résumé_Details_Improved'] = df['Résumé'].apply(extract_accident_details_improved)

    # Create new columns in the DataFrame from the extracted details in the 'Résumé_Details_Improved' column.
    # Each key in the extracted dictionaries becomes a new column.
    df_improved = pd.concat([df, df['Résumé_Details_Improved'].apply(pd.Series)], axis=1)

    # Remove the original 'Résumé_Details_Improved' and 'Résumé_Details' columns as they are no longer needed.
    # The 'errors='ignore'' argument ensures that the code doesn't break if these columns don't exist.
    df_improved = df_improved.drop(['Résumé_Details_Improved'], axis=1, errors='ignore')
    df_improved = df_improved.drop(['Résumé_Details'], axis=1, errors='ignore')

    # Display the first 15 rows of the DataFrame with the extracted information in a Markdown table format.
    # This helps to easily view the results.
    print(df_improved[['Résumé', 'severity', 'location_context', 'road_configuration','lighting', 'weather','surface','vehicle_types','parties_involved']].head(15).to_markdown(index=False))

    # Save the enriched DataFrame to a new CSV file.
    # The 'index=False' argument prevents writing the DataFrame index to the CSV file.
    df_improved.to_csv('accidents-enriched-improved.csv', index = False)

  "Nouvel engin de déplacement personnel \(EDP\) à moteur": r"Nouvel engin de déplacement personnel \(EDP\) à moteur",
  "Autre engin de déplacement personnel \(EDP\) sans moteur": r"Autre engin de déplacement personnel \(EDP\) sans moteur",
  "Nouvel engin de déplacement personnel \(EDP\) à moteur": r"Nouvel engin de déplacement personnel \(EDP\) à moteur",
  "Autre engin de déplacement personnel \(EDP\) sans moteur": r"Autre engin de déplacement personnel \(EDP\) sans moteur",


TypeError: expected string or bytes-like object, got 'float'

In [None]:
# Check if the input is valid. If it's not a string or is missing, return an empty dictionary.
    if not isinstance(resume, str) or pd.isna(resume):
        return {}