In [None]:
import pandas as pd
import os
import re
import json

def create_department_csv(department_code):
    """
    Extract the communes of one department into their own CSV file.

    The national file 'communes-france-2025.csv' is read from the current
    working directory, the rows whose 'dep_code' equals the requested code
    are kept, and the result is written (without the index) to
    'departements/communes-departement<code>.csv'. The 'departements'
    directory is created when missing.

    Args:
        department_code (str): The department code (e.g., '06', '75', '2A').
            It is converted with str() before being compared to 'dep_code'.

    Returns:
        str: The path of the created file
    """
    # Code-like columns are loaded as text so that leading zeros are kept
    # and pandas does not append a ".0" suffix to them.
    code_columns = (
        'code_insee',
        'dep_code',
        'code_postal',
        'codes_postaux',
        'reg_code',
        'canton_code',
        'epci_code',
    )
    communes = pd.read_csv(
        'communes-france-2025.csv',
        dtype=dict.fromkeys(code_columns, str),
        low_memory=False,
    )

    # Keep only the rows belonging to the requested department
    wanted_code = str(department_code)
    department_rows = communes.loc[communes['dep_code'] == wanted_code]

    # Write the per-department file, creating the target directory on demand
    os.makedirs('departements', exist_ok=True)
    output_file = f'departements/communes-departement{department_code}.csv'
    department_rows.to_csv(output_file, index=False)

    print(f"CSV created: {output_file} ({len(department_rows)} communes)")

    return output_file


def create_department_json(department_code):
    """
    Extract the gas stations of one department into their own JSON file.

    'prix-carburants-quotidien.json' is loaded from the current working
    directory and iterated as a sequence of station records; the records
    whose 'dep_code' entry equals the requested code are written to
    'departements/stations-departement<code>.json' (UTF-8, non-ASCII
    characters kept as-is). The 'departements' directory is created when
    missing.

    Args:
        department_code (str): The department code (e.g., '06', '75', '2A').
            It is converted with str() before being compared to 'dep_code'.

    Returns:
        str: The path of the created file
    """
    # Load every station from the national file
    with open('prix-carburants-quotidien.json', 'r', encoding='utf-8') as source:
        all_stations = json.load(source)

    # Keep the stations of the requested department; records without a
    # 'dep_code' key yield None from .get() and are therefore skipped.
    wanted_code = str(department_code)
    department_stations = []
    for station in all_stations:
        if station.get('dep_code') == wanted_code:
            department_stations.append(station)

    # Write the per-department file, creating the target directory on demand
    os.makedirs('departements', exist_ok=True)
    output_file = f'departements/stations-departement{department_code}.json'
    with open(output_file, 'w', encoding='utf-8') as target:
        json.dump(department_stations, target, ensure_ascii=False)

    print(f"JSON created: {output_file} ({len(department_stations)} stations)")

    return output_file


def create_department_files(department_code):
    """
    Generate both per-department input files (communes CSV and stations JSON).

    Args:
        department_code (str): The department code (e.g., '06', '75', '2A')

    Returns:
        tuple: (csv_file_path, json_file_path)
    """
    # Tuple elements are evaluated left to right, so the CSV is produced
    # before the JSON.
    return (
        create_department_csv(department_code),
        create_department_json(department_code),
    )

In [None]:
import subprocess
import sys
import os
import re

def _mtime_ns(path):
    """Return the modification time of `path` in nanoseconds, or None if it cannot be stat'ed."""
    try:
        return os.stat(path).st_mtime_ns
    except OSError:
        return None


def run_mapping(department_code):
    """
    Runs the RML mapping for a specific department.

    Steps:
      1. Create the department CSV and JSON files (create_department_files).
      2. Copy 'mapping-optimized.ttl' to 'mappings/mapping_<code>.ttl', with
         every `rml:source "….csv" ;` / `rml:source "….json" ;` statement
         pointing at the department files created in step 1.
      3. Run the 'rmlio/rmlmapper-java:latest' Docker image with the current
         working directory mounted at /data, writing Turtle to
         'outputs/output_<code>.ttl'.

    Problems in step 3 (Docker missing, non-zero exit code, output file not
    produced) are reported with print() and not raised. Errors in steps 1
    and 2 (e.g. a missing source or mapping file) propagate to the caller.

    Args:
        department_code (str): The department code.
    """
    # 1. Create the department CSV and JSON files
    csv_file_path, json_file_path = create_department_files(department_code)

    # 2. Update the mapping file
    mapping_file = 'mapping-optimized.ttl'

    # Ensure mappings directory exists
    os.makedirs('mappings', exist_ok=True)
    temp_mapping_file = f'mappings/mapping_{department_code}.ttl'

    with open(mapping_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # The replacements are given as callables so that the paths are inserted
    # verbatim: a replacement *string* would have backslashes and \g<...>
    # sequences interpreted by re.sub.

    # Replace CSV source file (communes)
    new_content = re.sub(
        r'rml:source\s+"[^"]+\.csv"\s*;',
        lambda _match: f'rml:source "{csv_file_path}" ;',
        content
    )

    # Replace JSON source file (stations)
    new_content = re.sub(
        r'rml:source\s+"[^"]+\.json"\s*;',
        lambda _match: f'rml:source "{json_file_path}" ;',
        new_content
    )

    with open(temp_mapping_file, 'w', encoding='utf-8') as f:
        f.write(new_content)

    print(f"Mapping created: {temp_mapping_file}")

    # 3. Run the Docker command
    # Ensure outputs directory exists
    os.makedirs('outputs', exist_ok=True)
    output_file = f'outputs/output_{department_code}.ttl'

    # Remember the state of a possible output left by a previous run, so that
    # a stale file is not mistaken for the result of this run.
    previous_mtime = _mtime_ns(output_file)

    print(f"Running RML mapper for department {department_code}...")

    # Get absolute path for volume mount
    current_dir = os.getcwd()

    # Docker command; the mapping and output paths are relative and are
    # resolved inside the container against the /data working directory.
    cmd = [
        "docker", "run", "--rm",
        "-v", f"{current_dir}:/data",
        "-w", "/data",
        "rmlio/rmlmapper-java:latest",
        "--mappingfile", temp_mapping_file,
        "--outputfile", output_file,
        "--serialization", "turtle"
    ]

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            encoding='utf-8',
            # Undecodable bytes in the mapper's log must not turn a
            # successful run into a reported failure.
            errors='replace'
        )

        return_code = result.returncode

        if return_code == 0:
            current_mtime = _mtime_ns(output_file)
            if current_mtime is None:
                print("Error: Docker command succeeded but output file was not created.")
            elif current_mtime == previous_mtime:
                print("Error: Docker command succeeded but output file was not updated "
                      "(stale file from a previous run).")
            else:
                print(f"Mapping completed: {output_file}")
        else:
            print(f"Docker execution failed with return code: {return_code}")
            print("Error output:")
            print(result.stderr)

    except Exception as e:
        # Best effort: e.g. the docker executable is not installed.
        print(f"An error occurred during execution: {e}")

In [None]:
import time
import os
from tqdm import tqdm

# Load departments list
# dep_code is read as text, as create_department_csv does: without an explicit
# dtype pandas may parse (part of) the column as integers, and '01' would then
# become '1' and no longer match the codes filtered on when the files are built.
df_communes = pd.read_csv(
    'communes-france-2025.csv',
    dtype={'dep_code': str},
    low_memory=False
)
# Get unique department codes and sort them; rows without a code are skipped,
# otherwise a bogus 'nan' department would be processed.
departments = sorted(df_communes['dep_code'].dropna().astype(str).unique().tolist())

# Dictionary to store results: dep_code -> (time_taken, line_count)
results = {}

# Run for all departments. For a quick demonstration, restrict the run to a
# subset instead, e.g.:
# target_departments = departments[:5]
target_departments = departments

print(f"Running benchmark for {len(target_departments)} departments...")

for dep in tqdm(target_departments):

    # Time the whole pipeline for this department (file creation + mapping)
    start_time = time.perf_counter()

    # Run the mapping
    run_mapping(dep)

    end_time = time.perf_counter()
    execution_time = end_time - start_time

    # Count lines in output file (0 when the mapping produced no file)
    output_file = f'outputs/output_{dep}.ttl'
    line_count = 0
    if os.path.exists(output_file):
        try:
            with open(output_file, 'r', encoding='utf-8') as f:
                line_count = sum(1 for _ in f)
        except Exception as e:
            print(f"Error counting lines: {e}")

    results[dep] = (execution_time, line_count)
    print(f"Department {dep}: {execution_time:.4f}s, {line_count} lines")

print("\nBenchmark completed.")

In [None]:
# Create a DataFrame from the results (one row per department)
data = []
for dep, (exec_time, lines) in results.items():
    data.append({
        'Department': dep,
        'Time (s)': exec_time,
        'Lines': lines
    })

# The columns are named explicitly so that the frame has them even when
# `results` is empty; otherwise sorting by 'Time (s)' raises a KeyError.
df_results = pd.DataFrame(data, columns=['Department', 'Time (s)', 'Lines'])

# Sort by Time (slowest department first)
df_results_sorted = df_results.sort_values(by='Time (s)', ascending=False)

# Display the sorted table
print("Results sorted by execution time:")
try:
    # Rich display when running under IPython/Jupyter
    display(df_results_sorted)
except NameError:
    # Plain Python: `display` is not defined, fall back to text output
    print(df_results_sorted)

In [2]:
from pathlib import Path
import os
def merge_ttl_files(output_dir='outputs', merged_file='output_merged.ttl'):
    """
    Merge all TTL files from output directory into one file.

    Every 'output_*.ttl' file found directly in `output_dir` is read line by
    line, in sorted file order. '@prefix' lines are de-duplicated and written
    first (sorted), followed by a blank line, then every other non-empty line
    in the order it was read. A blank line is written after each line that
    ends with '.'.

    Lines are classified by their content only, so triples are kept even when
    a file has no blank line between its prefixes and its triples.

    NOTE(review): the merge is purely line-based. Blank lines are dropped and
    added, and blank node labels are not renamed, so multi-line literals or
    blank node labels shared between files would not survive unchanged.
    Presumably neither occurs in the mapper output - confirm.

    Args:
        output_dir (str): Directory containing the 'output_*.ttl' files.
        merged_file (str): Path of the merged file to write. If it lies in
            `output_dir` and matches 'output_*.ttl', it is not used as input.

    Returns:
        None. Progress and a summary are printed.
    """
    # Get all .ttl files in the outputs directory, leaving out the merge
    # target itself so that a re-run never merges a previous result.
    output_path = Path(output_dir)
    merged_path = Path(merged_file).resolve()
    ttl_files = sorted(
        candidate
        for candidate in output_path.glob('output_*.ttl')
        if candidate.resolve() != merged_path
    )

    if not ttl_files:
        print("No output files found to merge.")
        return

    print(f"Found {len(ttl_files)} files to merge...")

    # Collect prefixes and triples
    prefixes = set()
    triples = []

    for ttl_file in ttl_files:
        print(f"Processing {ttl_file.name}...")
        with open(ttl_file, 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    continue

                # Guarantee exactly one trailing newline: the last line of a
                # file may lack it and would otherwise be glued to the first
                # line of the next file.
                normalized = line.rstrip('\n') + '\n'

                if stripped.startswith('@prefix'):
                    prefixes.add(normalized)
                else:
                    triples.append(normalized)

    # Write merged file
    with open(merged_file, 'w', encoding='utf-8') as f:
        # Write unique prefixes first
        f.writelines(sorted(prefixes))

        # Add blank line separator
        f.write('\n')

        # Write all triples, with a blank line after each line ending in '.'
        for triple in triples:
            f.write(triple)
            if triple.endswith('.\n'):
                f.write('\n')

    print(f"\nMerge completed!")
    print(f"Output file: {merged_file}")
    print(f"Total prefixes: {len(prefixes)}")
    print(f"Total triple lines: {len(triples)}")

    # Count total file size
    file_size = os.path.getsize(merged_file)
    print(f"File size: {file_size:,} bytes ({file_size / 1024 / 1024:.2f} MB)")

# Execute merge with the default arguments: every outputs/output_*.ttl file
# is combined into output_merged.ttl in the current working directory.
merge_ttl_files()

Found 101 files to merge...
Processing output_01.ttl...
Processing output_02.ttl...
Processing output_03.ttl...
Processing output_04.ttl...
Processing output_05.ttl...
Processing output_06.ttl...
Processing output_07.ttl...
Processing output_08.ttl...
Processing output_09.ttl...
Processing output_10.ttl...
Processing output_11.ttl...
Processing output_12.ttl...
Processing output_13.ttl...
Processing output_14.ttl...
Processing output_15.ttl...
Processing output_16.ttl...
Processing output_17.ttl...
Processing output_18.ttl...
Processing output_19.ttl...
Processing output_21.ttl...
Processing output_22.ttl...
Processing output_23.ttl...
Processing output_24.ttl...
Processing output_25.ttl...
Processing output_26.ttl...
Processing output_27.ttl...
Processing output_28.ttl...
Processing output_29.ttl...
Processing output_2A.ttl...
Processing output_2B.ttl...
Processing output_30.ttl...
Processing output_31.ttl...
Processing output_32.ttl...
Processing output_33.ttl...
Processing output_34