# In [None]:
#!/usr/bin/env python3

import pandas as pd
import geopandas as gpd
import numpy as np
import os
import sys
import warnings
from datetime import datetime
from scipy import stats
from pathlib import Path

# Suppress every warning category for the whole process.
# NOTE(review): this also hides library warnings (e.g. geopandas/pandas
# deprecation or CRS warnings) — confirm that is intended.
warnings.filterwarnings('ignore')

# Input and output locations. These are raw strings that start with a
# backslash, i.e. they carry no drive letter or project root.
# NOTE(review): presumably resolved against the current drive root on
# Windows — confirm, or make them relative to a configurable base folder.
EXCEL_PATH = r"\assessment_of_wells_chile\data\DGA\DGA_dataset_analysis_output\DGA_Data_Clean_With_Spatial_Joins.xlsx"
OUTPUT_FOLDER = r"\assessment_of_wells_chile\data\Census\Census_Analysis_Output"
GDB_PATH = r"\assessment_of_wells_chile\arcgis\assessment_of_wells_chile\Default.gdb"
# Layer names inside GDB_PATH (presumably the 2024 and 2017 census layers).
LAYER_CENSO_2024 = "Censo_24_Merge"
LAYER_CENSO_2017 = "Microdatos_Censo_2017_Merge"
SHP_CUENCAS = r"\assessment_of_wells_chile\data\Basins\Cuencas_BNA\Cuencas_BNA.shp"
SHP_SHAC = r"\assessment_of_wells_chile\data\Aquifers\INV_ACUIFEROS_SHAC_202302\INV_ACUIFEROS_SHAC.shp"

# CRS that load_reference_layer() reprojects every reference layer to.
TARGET_CRS = "EPSG:4326"
# Buffer distance for SHAC polygons.
# NOTE(review): presumably meters, matching the default of
# buffer_shac_for_spatial_join(); not referenced in the visible part of the file.
SHAC_BUFFER_DISTANCE = 200

# Polygon reference layers, one dict per layer, in the shape consumed by
# load_reference_layer():
#   path       - file or geodatabase to read
#   layer_name - layer inside the geodatabase (only passed when is_gdb is True)
#   prefix     - output columns are renamed to '<prefix>_Name' / '<prefix>_Code'
#   name_col   - source column holding the unit name
#   code_col   - source column holding the unit code
#   native_crs - CRS assigned only when the file itself declares none
#   is_gdb     - True to read `layer_name` from `path`, False to read `path` directly
REFERENCE_LAYERS = [
    {
        'path': GDB_PATH,
        'layer_name': 'CHL_Municipalities',
        'prefix': 'Muni',
        'name_col': 'NAME',
        'code_col': 'Code_Muni',
        'native_crs': 'EPSG:3857',
        'is_gdb': True
    },
    {
        'path': GDB_PATH,
        'layer_name': 'CHL_Regions',
        'prefix': 'Region',
        'name_col': 'NAME',
        'code_col': 'ID',
        'native_crs': 'EPSG:3857',
        'is_gdb': True
    },
    {
        'path': SHP_CUENCAS,
        'layer_name': None,
        'prefix': 'Cuenca',
        'name_col': 'NOM_CUEN',
        'code_col': 'COD_CUEN',
        'native_crs': 'EPSG:32719',
        'is_gdb': False
    },
    {
        'path': SHP_SHAC,
        'layer_name': None,
        'prefix': 'SHAC',
        'name_col': 'SHAC',
        'code_col': 'COD_SHAC',
        'native_crs': 'EPSG:32719',
        'is_gdb': False
    }
]

# Source column names in the census layers; the _17 / _24 suffix presumably
# distinguishes the 2017 and 2024 census schemas.
# NOTE(review): none of these are referenced in the visible part of the file —
# verify their meaning against the code that reads the census layers.
COL_AREA_17 = 'AREA'
COL_AREA_24 = 'AREA_C'
COL_PERS_17 = 'TOTAL_PERS'
COL_PERS_24 = 'n_per'
COL_VIVIENDAS_17 = 'VIV_OCUPA_'
COL_VIVIENDAS_24 = 'n_vp_ocupada'
COL_VIVIENDAS_TOTALES_17 = 'TOTAL_VIVI'
COL_VIVIENDAS_TOTALES_24 = 'n_vp'

# Catalogue of household water sources.
#
# The statistics/report functions in this file read two fields per entry:
#   'name'       - human-readable label printed in report tables
#   'name_short' - token used to build column names such as
#                  '<name_short>_<suffix>', 'Total_<name_short>_<suffix>',
#                  'Pct_<name_short>_<suffix>' and 'Cambio_<name_short>'
# The identify_* functions hard-code columns derived from these tokens
# (e.g. 'Cambio_Pozo', 'Cambio_Camion', 'Cambio_Rio'), so changing a
# 'name_short' here requires updating them as well.
#
# NOTE(review): 'col_2017' / 'col_2024' are presumably the raw census column
# names for each year, and 'description' / 'priority' are not read in the
# visible part of the file — confirm against the loading code.
WATER_SOURCES = {
    'red_publica': {
        'name': 'Red Publica',
        'name_short': 'RedPublica',
        'col_2017': 'VIV_AGUA_R',
        'col_2024': 'n_fuente_agua_publica',
        'description': 'Viviendas con origen del agua por red publica',
        'priority': 1
    },
    'pozo': {
        'name': 'Pozo o Noria',
        'name_short': 'Pozo',
        'col_2017': 'VIV_AGUA_P',
        'col_2024': 'n_fuente_agua_pozo',
        'description': 'Viviendas con origen del agua por pozo o noria',
        'priority': 2
    },
    'camion': {
        'name': 'Camion Aljibe',
        'name_short': 'Camion',
        'col_2017': 'VIV_AGUA_C',
        'col_2024': 'n_fuente_agua_camion',
        'description': 'Viviendas con origen del agua por camion aljibe',
        'priority': 3
    },
    'rio': {
        'name': 'Rio/Vertiente/Estero',
        'name_short': 'Rio',
        'col_2017': 'VIV_AGUA_1',
        'col_2024': 'n_fuente_agua_rio',
        'description': 'Viviendas con origen del agua por rio, vertiente, estero, canal, lago, etc.',
        'priority': 4
    }
}

def create_output_folder(path):
    """Create the output root and its fixed set of sub-directories.

    Missing parent directories are created as needed and directories that
    already exist are left untouched, so the call is safe to repeat.

    Args:
        path: Root output directory (string or path-like).
    """
    root = Path(path)
    root.mkdir(parents=True, exist_ok=True)
    for child in ('Excel', 'Shapefiles', 'Reportes', 'Hotspots',
                  'Statistics', 'Water_Analysis', 'Census_Data'):
        (root / child).mkdir(parents=True, exist_ok=True)

def load_reference_layer(layer_config):
    """Read one reference polygon layer and reduce it to geometry + name + code.

    The layer is read with geopandas, given `native_crs` if it declares no
    CRS, and reprojected to TARGET_CRS when its CRS string differs. If the
    configured name/code column is absent, the first column whose lowercase
    name contains 'name'/'nom' (respectively 'cod'/'id') is used instead.
    The retained columns are renamed to '<prefix>_Name' and '<prefix>_Code'.

    Args:
        layer_config: Dict with keys 'path', 'prefix', 'native_crs',
            'is_gdb', 'name_col', 'code_col' and optionally 'layer_name'.
            A missing mandatory key raises KeyError (read before the try).

    Returns:
        (GeoDataFrame, prefix) on success, or (None, prefix) if anything
        fails while reading or transforming; the error is printed.
    """
    source = layer_config['path']
    prefix = layer_config['prefix']
    fallback_crs = layer_config['native_crs']
    from_gdb = layer_config['is_gdb']
    layer = layer_config.get('layer_name')
    name_col = layer_config['name_col']
    code_col = layer_config['code_col']

    try:
        gdf = gpd.read_file(source, layer=layer) if from_gdb else gpd.read_file(source)

        # Only trust the configured CRS when the file does not declare one.
        if gdf.crs is None:
            gdf = gdf.set_crs(fallback_crs)
        if gdf.crs.to_string() != TARGET_CRS:
            gdf = gdf.to_crs(TARGET_CRS)

        columns = gdf.columns.tolist()

        # Heuristic fallbacks: first column whose name looks like a name / code.
        if name_col not in columns:
            name_col = next(
                (c for c in columns if 'name' in c.lower() or 'nom' in c.lower()),
                name_col,
            )
        if code_col not in columns:
            code_col = next(
                (c for c in columns if 'cod' in c.lower() or 'id' in c.lower()),
                code_col,
            )

        # Insertion order (name first, then code) fixes the output column order.
        renames = {}
        if name_col in gdf.columns:
            renames[name_col] = f'{prefix}_Name'
        if code_col in gdf.columns and code_col != name_col:
            renames[code_col] = f'{prefix}_Code'

        gdf = gdf[['geometry', *renames]].copy()
        gdf = gdf.rename(columns=renames)

        return gdf, prefix

    except Exception as e:
        print(f"ERROR loading {prefix}: {str(e)}")
        return None, prefix

def perform_spatial_join(gdf_points, gdf_polygons, prefix):
    """Left-join polygon attributes onto features that lie within them.

    Args:
        gdf_points: GeoDataFrame of features to enrich (left side).
        gdf_polygons: GeoDataFrame of polygons (right side); reprojected to
            the CRS of `gdf_points` when the two differ.
        prefix: Unused; kept for signature compatibility.

    Returns:
        The joined GeoDataFrame with a fresh 0..n-1 index and without any
        'index_*' helper columns. `gdf_points` is returned unchanged when
        either input is None or empty. If the join itself raises, the error
        is printed and the cleaned, re-indexed points are returned.
    """
    for layer in (gdf_points, gdf_polygons):
        if layer is None or len(layer) == 0:
            return gdf_points

    if gdf_points.crs != gdf_polygons.crs:
        gdf_polygons = gdf_polygons.to_crs(gdf_points.crs)

    def _without_join_index(frame):
        # 'index_*' columns are left behind by earlier sjoin calls and would
        # clash with the ones the next sjoin wants to create.
        leftovers = [c for c in frame.columns if c.startswith('index_')]
        return frame.drop(columns=leftovers) if leftovers else frame

    gdf_points = _without_join_index(gdf_points).reset_index(drop=True)
    gdf_polygons = gdf_polygons.reset_index(drop=True)

    try:
        joined = gpd.sjoin(gdf_points, gdf_polygons, how='left', predicate='within')
        return _without_join_index(joined).reset_index(drop=True)
    except Exception as e:
        print(f"ERROR in spatial join: {str(e)}")
        return gdf_points

def buffer_shac_for_spatial_join(gdf_shac, buffer_distance_meters=200):
    """Return a copy of `gdf_shac` whose geometries are buffered outward.

    The buffer is computed after reprojecting to EPSG:32719 and the result
    is reprojected back to the CRS of the input.

    Args:
        gdf_shac: GeoDataFrame to buffer; must have a CRS for reprojection.
        buffer_distance_meters: Distance passed to `geometry.buffer` in the
            units of EPSG:32719.

    Returns:
        A new GeoDataFrame in the input CRS with buffered geometries.
    """
    source_crs = gdf_shac.crs

    metric = gdf_shac.to_crs('EPSG:32719')
    buffered = metric.copy()
    buffered['geometry'] = metric.geometry.buffer(buffer_distance_meters)

    return buffered.to_crs(source_crs)

def get_centroid_gdf(gdf):
    """Return a copy of `gdf` whose 'geometry' column holds centroids.

    The previous geometries are kept in an '_original_geometry' column so
    that restore_original_geometry() can put them back later. The input is
    not modified.
    """
    result = gdf.copy()
    shapes = result.geometry
    result['_original_geometry'] = shapes
    result['geometry'] = shapes.centroid
    return result

def restore_original_geometry(gdf):
    """Undo get_centroid_gdf(): put '_original_geometry' back into 'geometry'.

    When the helper column is absent the input object is returned as is.
    Otherwise the 'geometry' column of the *input* is overwritten in place
    and a new frame without the helper column is returned.
    """
    if '_original_geometry' not in gdf.columns:
        return gdf

    # In-place overwrite on the caller's frame, then drop on a new object.
    gdf['geometry'] = gdf['_original_geometry']
    return gdf.drop(columns=['_original_geometry'])

def add_centroid_coordinates(gdf):
    """Return a copy of `gdf` with 'Centroid_Lon' / 'Centroid_Lat' columns.

    The values are the x and y of each geometry's centroid, expressed in
    whatever CRS `gdf` is in; the input is not modified.
    """
    out = gdf.copy()
    center = out.geometry.centroid
    out['Centroid_Lon'] = center.x
    out['Centroid_Lat'] = center.y
    return out

def calculate_water_statistics(gdf, groupby_col, year_suffix, area_filter=None):
    """Aggregate block-level census counts per value of `groupby_col`.

    Args:
        gdf: Frame with 'Personas_<suffix>' and 'Viviendas_<suffix>' columns
            and, optionally, 'Is_Rural_<suffix>' and one
            '<name_short>_<suffix>' column per WATER_SOURCES entry.
        groupby_col: Column to group by; rows where it is null are dropped.
        year_suffix: Suffix appended to every input and output column name.
        area_filter: 'Rural' keeps rows with Is_Rural == 1, 'Urban' rows with
            Is_Rural == 0. Any other value, or a missing Is_Rural column,
            means no filtering.

    Returns:
        One row per group with block/person/household totals, per-source
        totals, rural/urban block counts, average persons per household and
        each source's share of households in percent (both rounded to 2
        decimals; NaN where the household total is 0). An empty DataFrame is
        returned when `groupby_col` is missing or no rows remain.
    """
    if groupby_col not in gdf.columns:
        return pd.DataFrame()

    rural_flag = f'Is_Rural_{year_suffix}'
    blocks_col = f'N_Blocks_{year_suffix}'
    persons_col = f'Total_Personas_{year_suffix}'
    households_col = f'Total_Viviendas_{year_suffix}'
    rural_blocks_col = f'Rural_Blocks_{year_suffix}'

    subset = gdf[gdf[groupby_col].notna()].copy()

    if area_filter is not None and rural_flag in subset.columns:
        if area_filter == 'Rural':
            subset = subset[subset[rural_flag] == 1]
        elif area_filter == 'Urban':
            subset = subset[subset[rural_flag] == 0]

    if len(subset) == 0:
        return pd.DataFrame()

    # Named aggregations; insertion order defines the output column order.
    named_aggs = {
        blocks_col: (groupby_col, 'count'),
        persons_col: (f'Personas_{year_suffix}', 'sum'),
        households_col: (f'Viviendas_{year_suffix}', 'sum'),
    }
    short_names = [info['name_short'] for info in WATER_SOURCES.values()]
    for short in short_names:
        source_col = f"{short}_{year_suffix}"
        if source_col in subset.columns:
            named_aggs[f'Total_{short}_{year_suffix}'] = (source_col, 'sum')
    if rural_flag in subset.columns:
        named_aggs[rural_blocks_col] = (rural_flag, 'sum')

    summary = subset.groupby(groupby_col).agg(**named_aggs).reset_index()

    # Without a rural flag every block is counted as urban.
    summary[f'Urban_Blocks_{year_suffix}'] = summary[blocks_col] - summary.get(rural_blocks_col, 0)

    # A zero household total becomes NaN so the ratios yield NaN, not inf.
    households = summary[households_col].replace(0, np.nan)
    summary[f'Avg_Persons_Per_Household_{year_suffix}'] = (
        summary[persons_col] / households
    ).round(2)

    for short in short_names:
        total_col = f"Total_{short}_{year_suffix}"
        if total_col in summary.columns:
            summary[f"Pct_{short}_{year_suffix}"] = (
                summary[total_col] / households * 100
            ).round(2)

    return summary

def calculate_national_water_statistics(gdf, year_suffix, area_filter=None):
    """Compute the same totals as calculate_water_statistics() over all rows.

    Args:
        gdf: Frame with 'Personas_<suffix>' and 'Viviendas_<suffix>' columns
            and, optionally, 'Is_Rural_<suffix>' and one
            '<name_short>_<suffix>' column per WATER_SOURCES entry.
        year_suffix: Suffix appended to every input and output key.
        area_filter: 'Rural' keeps rows with Is_Rural == 1, 'Urban' rows with
            Is_Rural == 0. Any other value, or a missing Is_Rural column,
            means no filtering.

    Returns:
        Dict of totals keyed like the grouped statistics columns. Unlike the
        grouped version, ratios are not rounded and are 0 (not NaN) when the
        household total is not positive. Rural/urban block counts are only
        present when the Is_Rural column exists.
    """
    rural_flag = f'Is_Rural_{year_suffix}'
    persons_key = f'Total_Personas_{year_suffix}'
    households_key = f'Total_Viviendas_{year_suffix}'

    subset = gdf.copy()
    if area_filter is not None and rural_flag in subset.columns:
        if area_filter == 'Rural':
            subset = subset[subset[rural_flag] == 1]
        elif area_filter == 'Urban':
            subset = subset[subset[rural_flag] == 0]

    n_blocks = len(subset)
    results = {
        f'N_Blocks_{year_suffix}': n_blocks,
        persons_key: subset[f'Personas_{year_suffix}'].sum(),
        households_key: subset[f'Viviendas_{year_suffix}'].sum(),
    }

    if rural_flag in subset.columns:
        rural_blocks = subset[rural_flag].sum()
        results[f'Rural_Blocks_{year_suffix}'] = rural_blocks
        results[f'Urban_Blocks_{year_suffix}'] = n_blocks - rural_blocks

    short_names = [info['name_short'] for info in WATER_SOURCES.values()]
    for short in short_names:
        source_col = f"{short}_{year_suffix}"
        if source_col in subset.columns:
            results[f'Total_{short}_{year_suffix}'] = subset[source_col].sum()

    households = results[households_key]
    has_households = households > 0

    if has_households:
        results[f'Avg_Persons_Per_Household_{year_suffix}'] = results[persons_key] / households
    else:
        results[f'Avg_Persons_Per_Household_{year_suffix}'] = 0

    for short in short_names:
        total_key = f"Total_{short}_{year_suffix}"
        if total_key in results:
            share = results[total_key] / households * 100 if has_households else 0
            results[f"Pct_{short}_{year_suffix}"] = share

    return results

def calculate_change_metrics(df, suffix_17='17', suffix_24='24'):
    """Add absolute and relative change columns between two census years.

    For each metric whose '<metric>_<suffix_17>' and '<metric>_<suffix_24>'
    columns both exist, adds 'Cambio_<label>' (later minus earlier) and
    'Cambio_<label>_Pct' (change relative to the earlier value, in percent;
    NaN when the earlier value is 0). Covers block, household and person
    totals, the average household size (absolute change only) and every
    WATER_SOURCES entry, for which the change in household share
    ('Cambio_PctViv_<name_short>', percentage points) is added as well.

    Args:
        df: Merged statistics frame holding the columns of both years.
        suffix_17: Column suffix of the earlier year.
        suffix_24: Column suffix of the later year.

    Returns:
        A copy of `df` with the change columns appended.
    """
    out = df.copy()

    def _both_present(first, second):
        return first in out.columns and second in out.columns

    # (label used in the output column, base name of the input columns)
    base_metrics = (
        ('Blocks', 'N_Blocks'),
        ('Viviendas', 'Total_Viviendas'),
        ('Personas', 'Total_Personas'),
    )
    for label, base in base_metrics:
        before, after = f'{base}_{suffix_17}', f'{base}_{suffix_24}'
        if _both_present(before, after):
            out[f'Cambio_{label}'] = out[after] - out[before]
            out[f'Cambio_{label}_Pct'] = (
                out[f'Cambio_{label}'] / out[before].replace(0, np.nan)
            ) * 100

    avg_before = f'Avg_Persons_Per_Household_{suffix_17}'
    avg_after = f'Avg_Persons_Per_Household_{suffix_24}'
    if _both_present(avg_before, avg_after):
        out['Cambio_Avg_Persons_Per_Household'] = out[avg_after] - out[avg_before]

    for info in WATER_SOURCES.values():
        short = info['name_short']
        before, after = f"Total_{short}_{suffix_17}", f"Total_{short}_{suffix_24}"
        if not _both_present(before, after):
            continue

        out[f'Cambio_{short}'] = out[after] - out[before]
        out[f'Cambio_{short}_Pct'] = (
            out[f'Cambio_{short}'] / out[before].replace(0, np.nan)
        ) * 100

        share_before = f'Pct_{short}_{suffix_17}'
        share_after = f'Pct_{short}_{suffix_24}'
        if _both_present(share_before, share_after):
            out[f'Cambio_PctViv_{short}'] = out[share_after] - out[share_before]

    return out

def identify_water_scarcity_hotspots(df, name_col):
    """Label each row with a water-scarcity hotspot category and score.

    Adds 'Hotspot_Type', 'Hotspot_Score' and 'Hotspot_Description'. The
    category follows from the sign of the change in well ('Cambio_Pozo'),
    water-truck ('Cambio_Camion') and surface-water ('Cambio_Rio')
    households; the five categories are mutually exclusive. NaN changes
    compare as False, i.e. count as "no increase" and "no decrease".

    Rows still 'Normal' are marked 'High_Alt_Source_Dependency' (score 1)
    when the 2024 shares exceed 30% (well), 10% (truck) or 20% (surface
    water), provided all three 'Pct_*_24' columns exist. Finally, any row
    whose 'Cambio_Camion_Pct' exceeds 50 gets one extra score point.

    Args:
        df: Statistics frame, normally the output of
            calculate_change_metrics().
        name_col: Unused; kept for signature compatibility.

    Returns:
        A copy of `df` with the three hotspot columns. When any of the three
        required change columns is missing, the defaults
        ('Normal', 0, '') are returned for every row.
    """
    df = df.copy()

    df['Hotspot_Type'] = 'Normal'
    df['Hotspot_Score'] = 0
    df['Hotspot_Description'] = ''

    required_cols = ['Cambio_Pozo', 'Cambio_Camion', 'Cambio_Rio']
    if not all(col in df.columns for col in required_cols):
        return df

    camion_up = df['Cambio_Camion'] > 0
    pozo_down = df['Cambio_Pozo'] < 0
    rio_down = df['Cambio_Rio'] < 0

    # Each rule fixes all three signals, so no row matches more than one.
    rules = [
        (camion_up & pozo_down & rio_down,
         'Critical_Water_Stress', 5,
         'Pozo decreases, Camion increases, Rio decreases - Severe water stress'),
        (camion_up & pozo_down & ~rio_down,
         'Potential_Aquifer_Drought', 4,
         'Well access decreased, water truck increased - Groundwater depletion'),
        (camion_up & ~pozo_down & rio_down,
         'Surface_Water_Stress', 3,
         'Camion increases, Rio decreases - Surface water stress'),
        (~camion_up & pozo_down & rio_down,
         'General_Water_Decline', 3,
         'Both Pozo and Rio decrease - General water availability decline'),
        (camion_up & ~pozo_down & ~rio_down,
         'Emerging_Water_Gap', 2,
         'Camion increases while other sources stable - Infrastructure gap'),
    ]
    for mask, hotspot_type, score, description in rules:
        df.loc[mask, 'Hotspot_Type'] = hotspot_type
        df.loc[mask, 'Hotspot_Score'] = score
        df.loc[mask, 'Hotspot_Description'] = description

    share_cols = ['Pct_Pozo_24', 'Pct_Camion_24', 'Pct_Rio_24']
    if all(col in df.columns for col in share_cols):
        high_dependency = (
            (df['Pct_Pozo_24'] > 30) |
            (df['Pct_Camion_24'] > 10) |
            (df['Pct_Rio_24'] > 20)
        )
        # Only rows no rule above has classified are eligible.
        unclassified = high_dependency & (df['Hotspot_Type'] == 'Normal')
        df.loc[unclassified, 'Hotspot_Type'] = 'High_Alt_Source_Dependency'
        df.loc[unclassified, 'Hotspot_Score'] = 1
        df.loc[unclassified, 'Hotspot_Description'] = 'High dependency on alternative water sources'

    if 'Cambio_Camion_Pct' in df.columns:
        sharp_camion_rise = df['Cambio_Camion_Pct'] > 50
        df.loc[sharp_camion_rise, 'Hotspot_Score'] = df.loc[sharp_camion_rise, 'Hotspot_Score'] + 1

    return df

def identify_well_increase_areas(df, name_col):
    """Classify each row by the trend in well-dependent households.

    Adds 'Well_Trend', 'Well_Trend_Score' and 'Well_Trend_Description'
    based on 'Cambio_Pozo' (absolute change) and 'Cambio_Pozo_Pct'
    (relative change in percent):

        change > 0 and pct > 20        -> Significant_Well_Increase (3)
        change > 0 and 10 < pct <= 20  -> Moderate_Well_Increase    (2)
        change > 0 and pct <= 10       -> Slight_Well_Increase      (1)
        change < 0                     -> Well_Decrease             (-1)
        otherwise                      -> Stable                    (0)

    NOTE(review): rows with a positive change but a NaN percentage (e.g. a
    2017 base of 0, for which calculate_change_metrics() yields NaN) match
    none of the increase rules and stay 'Stable' — confirm this is intended.

    Args:
        df: Statistics frame, normally the output of
            calculate_change_metrics().
        name_col: Unused; kept for signature compatibility.

    Returns:
        A copy of `df` with the three trend columns. When either required
        column is missing the defaults ('Stable', 0, '') are returned for
        every row instead of raising KeyError.
    """
    df = df.copy()

    df['Well_Trend'] = 'Stable'
    df['Well_Trend_Score'] = 0
    df['Well_Trend_Description'] = ''

    # Both columns are read below, so both must be present (same guard style
    # as the sibling identify_* functions).
    required_cols = ['Cambio_Pozo', 'Cambio_Pozo_Pct']
    if not all(col in df.columns for col in required_cols):
        return df

    change = df['Cambio_Pozo']
    pct = df['Cambio_Pozo_Pct']
    increased = change > 0

    # The rules are mutually exclusive, so assignment order does not matter.
    rules = [
        (increased & (pct > 20),
         'Significant_Well_Increase', 3,
         'Significant increase in well-dependent households (>20%)'),
        (increased & (pct > 10) & (pct <= 20),
         'Moderate_Well_Increase', 2,
         'Moderate increase in well-dependent households (10-20%)'),
        (increased & (pct <= 10),
         'Slight_Well_Increase', 1,
         'Slight increase in well-dependent households (<10%)'),
        (change < 0,
         'Well_Decrease', -1,
         'Decrease in well-dependent households'),
    ]
    for mask, trend, score, description in rules:
        df.loc[mask, 'Well_Trend'] = trend
        df.loc[mask, 'Well_Trend_Score'] = score
        df.loc[mask, 'Well_Trend_Description'] = description

    return df

def identify_well_to_truck_transition(df, name_col):
    """Flag rows where households shift from wells to water trucks.

    Adds 'Transition_Type', 'Transition_Score' and 'Transition_Description':

        well households down and truck households up, 2017 well share > 15%
            -> Well_to_Truck_Critical (5)
        well households down and truck households up, otherwise
            -> Well_to_Truck_Moderate (3)
        not classified above and 2024 truck share > 20%
            -> High_Truck_Dependency (2)
        otherwise
            -> No_Transition (0)

    When 'Pct_Pozo_17' is absent every row is treated as formerly
    well-based; when 'Pct_Camion_24' is absent no row is flagged as
    truck-dependent.

    Args:
        df: Statistics frame, normally the output of
            calculate_change_metrics(). Any index is supported.
        name_col: Unused; kept for signature compatibility.

    Returns:
        A copy of `df` with the three transition columns. When
        'Cambio_Pozo' or 'Cambio_Camion' is missing, the defaults
        ('No_Transition', 0, '') are returned for every row.
    """
    df = df.copy()

    df['Transition_Type'] = 'No_Transition'
    df['Transition_Score'] = 0
    df['Transition_Description'] = ''

    required_cols = ['Cambio_Pozo', 'Cambio_Camion']
    if not all(col in df.columns for col in required_cols):
        return df

    # Fallback masks must share df's index: a default RangeIndex would be
    # aligned label-wise by `&` and silently turn every match into False
    # whenever df is not indexed 0..n-1.
    if 'Pct_Pozo_17' in df.columns:
        was_well_based = df['Pct_Pozo_17'] > 15
    else:
        was_well_based = pd.Series(True, index=df.index)

    if 'Pct_Camion_24' in df.columns:
        truck_dependent = df['Pct_Camion_24'] > 20
    else:
        truck_dependent = pd.Series(False, index=df.index)

    shifting = (df['Cambio_Pozo'] < 0) & (df['Cambio_Camion'] > 0)

    critical = was_well_based & shifting
    df.loc[critical, 'Transition_Type'] = 'Well_to_Truck_Critical'
    df.loc[critical, 'Transition_Score'] = 5
    df.loc[critical, 'Transition_Description'] = 'Was primarily well-based, now shifting to water truck - POTENTIAL AQUIFER DROUGHT'

    moderate = ~was_well_based & shifting
    df.loc[moderate, 'Transition_Type'] = 'Well_to_Truck_Moderate'
    df.loc[moderate, 'Transition_Score'] = 3
    df.loc[moderate, 'Transition_Description'] = 'Well access decreasing, water truck increasing - Groundwater stress'

    # Only rows not classified as a transition above are eligible.
    dependency_only = truck_dependent & (df['Transition_Type'] == 'No_Transition')
    df.loc[dependency_only, 'Transition_Type'] = 'High_Truck_Dependency'
    df.loc[dependency_only, 'Transition_Score'] = 2
    df.loc[dependency_only, 'Transition_Description'] = 'High dependency on water trucks (>20%)'

    return df

def generate_water_analysis_report(stats_17, stats_24, stats_merged, level_name, name_col, report_lines, area_type='All'):
    """Append the water-source report section for one aggregation level.

    Sections written, in order: summary totals, per-source comparison table,
    three top-10 rankings (absolute well increase, relative well increase
    among units with more than 10 well households in 2017, absolute
    water-truck increase), up to 15 well-to-truck transitions, and a count
    of non-'Normal' hotspot types. Sections whose input columns are missing
    only get their title line.

    Args:
        stats_17: Unused; kept for signature compatibility.
        stats_24: Unused; kept for signature compatibility.
        stats_merged: Per-unit statistics with change and classification
            columns.
        level_name: Label of the aggregation level, used in titles and as
            the name-column header.
        name_col: Column of `stats_merged` holding the unit name.
        report_lines: List of strings that is extended in place.
        area_type: Added to the section title unless it is 'All'.

    Returns:
        The same `report_lines` list.
    """
    columns = stats_merged.columns
    add = report_lines.append

    def column_total(col):
        # Missing columns contribute 0 instead of raising.
        return stats_merged[col].sum() if col in columns else 0

    def pct_growth(after, before):
        return ((after / before) - 1) * 100 if before > 0 else 0

    def share_of(part, whole):
        return (part / whole) * 100 if whole > 0 else 0

    def add_ranking(ranked, source_label):
        # Shared layout of the three top-10 tables.
        title_17 = f"{source_label} 17"
        title_24 = f"{source_label} 24"
        add(f"{'Rank':<5} {level_name:<35} {title_17:>12} {title_24:>12} {'Change':>12} {'%':>10}")
        add("-"*90)
        for rank, (_, row) in enumerate(ranked.iterrows(), 1):
            name = str(row[name_col])[:33] if pd.notna(row.get(name_col)) else "N/A"
            before = row.get(f'Total_{source_label}_17', 0)
            after = row.get(f'Total_{source_label}_24', 0)
            delta = row.get(f'Cambio_{source_label}', 0)
            pct = row.get(f'Cambio_{source_label}_Pct', 0)
            add(f"{rank:<5} {name:<35} {before:>12,.0f} {after:>12,.0f} {delta:>+12,.0f} {pct:>+9.1f}%")

    area_label = "" if area_type == 'All' else f" - {area_type}"
    banner = "="*100

    add("\n" + banner)
    add(f"WATER SOURCE ANALYSIS - {level_name.upper()}{area_label}")
    add(banner)

    add("\n--- SUMMARY STATISTICS ---")
    add(f"Total units analyzed: {len(stats_merged)}")

    total_viv_17 = column_total('Total_Viviendas_17')
    total_viv_24 = column_total('Total_Viviendas_24')
    total_pers_17 = column_total('Total_Personas_17')
    total_pers_24 = column_total('Total_Personas_24')

    pers_growth = pct_growth(total_pers_24, total_pers_17)
    viv_growth = pct_growth(total_viv_24, total_viv_17)
    add(f"\nPopulation: {total_pers_17:,.0f} (2017) -> {total_pers_24:,.0f} (2024) | Change: {total_pers_24-total_pers_17:+,.0f} ({pers_growth:+.1f}%)")
    add(f"Households: {total_viv_17:,.0f} (2017) -> {total_viv_24:,.0f} (2024) | Change: {total_viv_24-total_viv_17:+,.0f} ({viv_growth:+.1f}%)")

    add("\n--- WATER SOURCES COMPARISON ---")
    add(f"{'Source':<25} {'2017':>15} {'2024':>15} {'Change':>15} {'%':>10} {'%Viv 17':>10} {'%Viv 24':>10}")
    add("-"*100)

    for info in WATER_SOURCES.values():
        val_17 = column_total(f"Total_{info['name_short']}_17")
        val_24 = column_total(f"Total_{info['name_short']}_24")
        change = val_24 - val_17
        pct_change = pct_growth(val_24, val_17)
        pct_viv_17 = share_of(val_17, total_viv_17)
        pct_viv_24 = share_of(val_24, total_viv_24)
        add(f"{info['name']:<25} {val_17:>15,.0f} {val_24:>15,.0f} {change:>+15,.0f} {pct_change:>+9.1f}% {pct_viv_17:>9.2f}% {pct_viv_24:>9.2f}%")

    add("\n--- TOP 10: HIGHEST WELL (POZO) INCREASE ---")
    if 'Cambio_Pozo' in columns:
        gainers = stats_merged[stats_merged['Cambio_Pozo'] > 0].nlargest(10, 'Cambio_Pozo')
        add_ranking(gainers, 'Pozo')

    add("\n--- TOP 10: HIGHEST WELL (POZO) PERCENTAGE INCREASE ---")
    if 'Cambio_Pozo_Pct' in columns:
        # Tiny 2017 bases are excluded so they do not dominate the ranking.
        eligible = stats_merged[(stats_merged['Total_Pozo_17'] > 10) & (stats_merged['Cambio_Pozo_Pct'].notna())]
        add_ranking(eligible.nlargest(10, 'Cambio_Pozo_Pct'), 'Pozo')

    add("\n--- TOP 10: HIGHEST WATER TRUCK (CAMION) INCREASE ---")
    if 'Cambio_Camion' in columns:
        truck_gainers = stats_merged[stats_merged['Cambio_Camion'] > 0].nlargest(10, 'Cambio_Camion')
        add_ranking(truck_gainers, 'Camion')

    add("\n--- CRITICAL: WELL TO WATER TRUCK TRANSITION (Potential Aquifer Drought) ---")
    if 'Transition_Type' in columns:
        is_transition = stats_merged['Transition_Type'].isin(['Well_to_Truck_Critical', 'Well_to_Truck_Moderate'])
        transitions = stats_merged[is_transition].sort_values('Transition_Score', ascending=False)
        if len(transitions) > 0:
            add(f"{'Rank':<5} {level_name:<30} {'Type':<25} {'Pozo Chg':>12} {'Camion Chg':>12}")
            add("-"*90)
            for rank, (_, row) in enumerate(transitions.head(15).iterrows(), 1):
                name = str(row[name_col])[:28] if pd.notna(row.get(name_col)) else "N/A"
                trans_type = row.get('Transition_Type', '')[:23]
                pozo_chg = row.get('Cambio_Pozo', 0)
                camion_chg = row.get('Cambio_Camion', 0)
                add(f"{rank:<5} {name:<30} {trans_type:<25} {pozo_chg:>+12,.0f} {camion_chg:>+12,.0f}")
        else:
            add("  No critical well-to-truck transitions identified.")

    add("\n--- WATER SCARCITY HOTSPOTS SUMMARY ---")
    if 'Hotspot_Type' in columns:
        for htype, count in stats_merged['Hotspot_Type'].value_counts().items():
            if htype != 'Normal':
                add(f"  {htype}: {count}")

    return report_lines

def generate_rural_urban_water_analysis(gdf_17, gdf_24, stats_all, level_name, name_col, report_lines):
    """Append the national rural-versus-urban comparison to the report.

    Totals come from calculate_national_water_statistics() called on each
    census frame with the 'Rural' and the 'Urban' filter. The section
    contains a block/population/household table, a per-source table for
    each area type, and a rural well / water-truck summary with a warning
    when rural well households fell while water-truck households rose.

    Args:
        gdf_17: Block-level frame for the '17' suffix.
        gdf_24: Block-level frame for the '24' suffix.
        stats_all: Unused; kept for signature compatibility.
        level_name: Label used in the section title.
        name_col: Unused; kept for signature compatibility.
        report_lines: List of strings that is extended in place.

    Returns:
        The same `report_lines` list.
    """
    add = report_lines.append
    banner = "="*100

    add("\n" + banner)
    add(f"RURAL vs URBAN WATER ANALYSIS - {level_name.upper()}")
    add(banner)

    rural_17 = calculate_national_water_statistics(gdf_17, '17', 'Rural')
    urban_17 = calculate_national_water_statistics(gdf_17, '17', 'Urban')
    rural_24 = calculate_national_water_statistics(gdf_24, '24', 'Rural')
    urban_24 = calculate_national_water_statistics(gdf_24, '24', 'Urban')

    def pct_growth(after, before):
        return ((after / before) - 1) * 100 if before > 0 else 0

    add("\n--- NATIONAL RURAL vs URBAN COMPARISON ---")
    add(f"\n{'Metric':<35} {'Rural 17':>15} {'Rural 24':>15} {'Urban 17':>15} {'Urban 24':>15}")
    add("-"*100)

    # (row label, statistics key without suffix, number format)
    metric_rows = (
        ('Census Blocks', 'N_Blocks', ',.0f'),
        ('Population', 'Total_Personas', ',.0f'),
        ('Households', 'Total_Viviendas', ',.0f'),
        ('Avg Persons/Household', 'Avg_Persons_Per_Household', '.2f'),
    )
    for label, key, spec in metric_rows:
        cells = (
            rural_17.get(f'{key}_17', 0),
            rural_24.get(f'{key}_24', 0),
            urban_17.get(f'{key}_17', 0),
            urban_24.get(f'{key}_24', 0),
        )
        add(f"{label:<35} " + " ".join(format(value, f">15{spec}") for value in cells))

    add("\n--- WATER SOURCES BY AREA TYPE ---")

    for area_type, before, after in (('RURAL', rural_17, rural_24), ('URBAN', urban_17, urban_24)):
        add(f"\n  {area_type} AREAS:")
        add(f"  {'Source':<23} {'2017':>12} {'2024':>12} {'Change':>12} {'%':>10} {'%Viv 17':>10} {'%Viv 24':>10}")
        add("  " + "-"*90)

        total_viv_17 = before.get('Total_Viviendas_17', 1)
        total_viv_24 = after.get('Total_Viviendas_24', 1)

        for info in WATER_SOURCES.values():
            val_17 = before.get(f"Total_{info['name_short']}_17", 0)
            val_24 = after.get(f"Total_{info['name_short']}_24", 0)
            change = val_24 - val_17
            pct_change = pct_growth(val_24, val_17)
            pct_viv_17 = (val_17 / total_viv_17) * 100 if total_viv_17 > 0 else 0
            pct_viv_24 = (val_24 / total_viv_24) * 100 if total_viv_24 > 0 else 0

            add(f"  {info['name']:<23} {val_17:>12,.0f} {val_24:>12,.0f} {change:>+12,.0f} {pct_change:>+9.1f}% {pct_viv_17:>9.2f}% {pct_viv_24:>9.2f}%")

    add("\n--- KEY INSIGHT: RURAL WATER ACCESS CHANGES ---")

    rural_pozo_17 = rural_17.get('Total_Pozo_17', 0)
    rural_pozo_24 = rural_24.get('Total_Pozo_24', 0)
    rural_camion_17 = rural_17.get('Total_Camion_17', 0)
    rural_camion_24 = rural_24.get('Total_Camion_24', 0)

    insight_blocks = (
        ('Rural Well (Pozo) Access', rural_pozo_17, rural_pozo_24),
        ('Rural Water Truck (Camion) Dependency', rural_camion_17, rural_camion_24),
    )
    for title, val_17, val_24 in insight_blocks:
        add(f"\n  {title}:")
        add(f"    2017: {val_17:,.0f} households")
        add(f"    2024: {val_24:,.0f} households")
        add(f"    Change: {val_24-val_17:+,.0f} ({pct_growth(val_24, val_17):+.1f}%)")

    # Fewer well households together with more truck households is the
    # pattern this report treats as a groundwater-stress signal.
    if rural_camion_24 > rural_camion_17 and rural_pozo_24 < rural_pozo_17:
        add("\n  ⚠️ WARNING: Rural areas show well decrease with water truck increase - Potential groundwater stress")

    return report_lines

def generate_national_summary(gdf_17, gdf_24, report_lines):
    """Append the nationwide 2017 vs 2024 comparison to ``report_lines``.

    National totals for each census come from
    ``calculate_national_water_statistics``. They are rendered as sections
    for census blocks, population, households, persons per household and
    rural/urban block counts, followed by one table row per entry of
    ``WATER_SOURCES``.

    Args:
        gdf_17: Census 2017 data, forwarded to
            ``calculate_national_water_statistics`` with suffix ``'17'``.
        gdf_24: Census 2024 data, forwarded with suffix ``'24'``.
        report_lines: List of report strings; it is extended in place.

    Returns:
        Tuple ``(report_lines, nat_17, nat_24)`` where the last two items are
        the national statistics computed for each census.
    """
    def share(part, whole):
        # Percentage of ``whole``; reported as 0 when the base is not positive.
        return (part / whole) * 100 if whole > 0 else 0

    rule = "=" * 100
    report_lines.append("\n" + rule)
    report_lines.append("NATIONAL SUMMARY - CHILE CENSUS 2017 vs 2024")
    report_lines.append(rule)

    nat_17 = calculate_national_water_statistics(gdf_17, '17')
    nat_24 = calculate_national_water_statistics(gdf_24, '24')

    # Census block counts.
    report_lines.append("\n--- CENSUS BLOCKS ---")
    blocks_17 = nat_17['N_Blocks_17']
    blocks_24 = nat_24['N_Blocks_24']
    block_delta = blocks_24 - blocks_17
    report_lines.extend([
        f"  2017: {blocks_17:,}",
        f"  2024: {blocks_24:,}",
        f"  Change: {block_delta:+,} ({share(block_delta, blocks_17):+.1f}%)",
    ])

    # Population totals.
    report_lines.append("\n--- POPULATION ---")
    people_17 = nat_17['Total_Personas_17']
    people_24 = nat_24['Total_Personas_24']
    people_delta = people_24 - people_17
    report_lines.extend([
        f"  2017: {people_17:,.0f}",
        f"  2024: {people_24:,.0f}",
        f"  Change: {people_delta:+,.0f} ({share(people_delta, people_17):+.1f}%)",
    ])

    # Household totals; also the denominators of the water-source table below.
    report_lines.append("\n--- HOUSEHOLDS ---")
    homes_17 = nat_17['Total_Viviendas_17']
    homes_24 = nat_24['Total_Viviendas_24']
    homes_delta = homes_24 - homes_17
    report_lines.extend([
        f"  2017: {homes_17:,.0f}",
        f"  2024: {homes_24:,.0f}",
        f"  Change: {homes_delta:+,.0f} ({share(homes_delta, homes_17):+.1f}%)",
    ])

    # Household size.
    report_lines.append("\n--- AVERAGE PERSONS PER HOUSEHOLD ---")
    avg_17 = nat_17['Avg_Persons_Per_Household_17']
    avg_24 = nat_24['Avg_Persons_Per_Household_24']
    report_lines.extend([
        f"  2017: {avg_17:.2f}",
        f"  2024: {avg_24:.2f}",
        f"  Change: {avg_24 - avg_17:+.2f}",
    ])

    # Rural/urban block counts default to 0 when the statistic is absent.
    n_rural_17 = nat_17.get('Rural_Blocks_17', 0)
    n_urban_17 = nat_17.get('Urban_Blocks_17', 0)
    n_rural_24 = nat_24.get('Rural_Blocks_24', 0)
    n_urban_24 = nat_24.get('Urban_Blocks_24', 0)
    report_lines.extend([
        "\n--- RURAL vs URBAN BLOCKS ---",
        f"  Rural 2017: {n_rural_17:,.0f} | Urban 2017: {n_urban_17:,.0f}",
        f"  Rural 2024: {n_rural_24:,.0f} | Urban 2024: {n_urban_24:,.0f}",
        f"  Rural Change: {n_rural_24-n_rural_17:+,.0f} | Urban Change: {n_urban_24-n_urban_17:+,.0f}",
    ])

    # One table row per configured water source.
    report_lines.extend([
        "\n--- NATIONAL WATER SOURCES SUMMARY ---",
        f"{'Source':<25} {'2017':>15} {'2024':>15} {'Change':>15} {'%':>10} {'%Viv 17':>10} {'%Viv 24':>10}",
        "-" * 100,
    ])
    for src_info in WATER_SOURCES.values():
        short = src_info['name_short']
        val_17 = nat_17.get(f"Total_{short}_17", 0)
        val_24 = nat_24.get(f"Total_{short}_24", 0)
        change = val_24 - val_17
        if val_17 > 0:
            pct_change = ((val_24 / val_17) - 1) * 100
        else:
            pct_change = 0
        pct_viv_17 = share(val_17, homes_17)
        pct_viv_24 = share(val_24, homes_24)

        report_lines.append(f"{src_info['name']:<25} {val_17:>15,.0f} {val_24:>15,.0f} {change:>+15,.0f} {pct_change:>+9.1f}% {pct_viv_17:>9.2f}% {pct_viv_24:>9.2f}%")

    return report_lines, nat_17, nat_24

def _load_census_layer(layer_name):
    """Read one census layer from GDB_PATH, reprojecting to TARGET_CRS when it differs."""
    gdf = gpd.read_file(GDB_PATH, layer=layer_name)
    if gdf.crs != TARGET_CRS:
        gdf = gdf.to_crs(TARGET_CRS)
    print(f"  Loaded {len(gdf)} census blocks")
    return gdf


def _add_numeric_column(gdf, source_col, target_col, census_year, warn_if_missing=True):
    """Store ``source_col`` coerced to numbers (NaN -> 0) in ``target_col``; use 0 if it is absent."""
    if source_col in gdf.columns:
        gdf[target_col] = pd.to_numeric(gdf[source_col], errors='coerce').fillna(0)
        return
    gdf[target_col] = 0
    if warn_if_missing:
        print(f"  WARNING: Column {source_col} not found in Census {census_year}")


def _add_census_measures(gdf, suffix, census_year, pers_col, viv_col):
    """Add the Personas/Viviendas columns and one column per WATER_SOURCES entry, in that order."""
    _add_numeric_column(gdf, pers_col, f'Personas_{suffix}', census_year)
    # A missing household column is filled silently, without a warning.
    _add_numeric_column(gdf, viv_col, f'Viviendas_{suffix}', census_year, warn_if_missing=False)
    for src_info in WATER_SOURCES.values():
        _add_numeric_column(
            gdf,
            src_info[f'col_{census_year}'],
            f"{src_info['name_short']}_{suffix}",
            census_year,
        )


def _join_reference_layers(gdf_census, reference_gdfs):
    """Join census centroids to every reference layer, then restore geometry and add centroid coordinates."""
    centroids = get_centroid_gdf(gdf_census)
    for prefix, ref_gdf in reference_gdfs.items():
        print(f"  Joining with {prefix}...")
        if prefix == 'SHAC':
            # Only the SHAC layer is buffered before the join.
            print(f"    Applying {SHAC_BUFFER_DISTANCE}m buffer for SHAC spatial join...")
            ref_gdf = buffer_shac_for_spatial_join(ref_gdf, SHAC_BUFFER_DISTANCE)
        centroids = perform_spatial_join(centroids, ref_gdf, prefix)
    joined = restore_original_geometry(centroids)
    return add_centroid_coordinates(joined)


def _merge_census_stats(stats_17, stats_24, name_col, with_well_flags=True):
    """Outer-merge the two census tables on ``name_col`` and add change and flag columns."""
    merged = stats_17.merge(stats_24, on=name_col, how='outer').fillna(0)
    merged = calculate_change_metrics(merged)
    merged = identify_water_scarcity_hotspots(merged, name_col)
    if with_well_flags:
        merged = identify_well_increase_areas(merged, name_col)
        merged = identify_well_to_truck_transition(merged, name_col)
    return merged


def _flagged_rows(merged):
    """Return the hotspot, well-increase and transition subsets of a merged stats table."""
    return {
        'hotspots': merged[merged['Hotspot_Type'] != 'Normal'],
        'well_increase': merged[merged['Well_Trend'].str.contains('Increase', na=False)],
        'transitions': merged[merged['Transition_Type'] != 'No_Transition'],
    }


def _write_level_workbook(output_path, level_stats):
    """Write the per-level workbook: full tables, optional rural/urban sheets and flagged subsets."""
    merged = level_stats['merged']
    flagged = _flagged_rows(merged)
    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        merged.to_excel(writer, sheet_name='All_Areas', index=False)
        level_stats['stats_17'].to_excel(writer, sheet_name='Census_2017', index=False)
        level_stats['stats_24'].to_excel(writer, sheet_name='Census_2024', index=False)

        if len(level_stats['merged_rural']) > 0:
            level_stats['merged_rural'].to_excel(writer, sheet_name='Rural_Areas', index=False)
        if len(level_stats['merged_urban']) > 0:
            level_stats['merged_urban'].to_excel(writer, sheet_name='Urban_Areas', index=False)

        flagged['hotspots'].sort_values('Hotspot_Score', ascending=False).to_excel(
            writer, sheet_name='Water_Hotspots', index=False)
        flagged['well_increase'].sort_values('Well_Trend_Score', ascending=False).to_excel(
            writer, sheet_name='Well_Increase_Areas', index=False)
        flagged['transitions'].sort_values('Transition_Score', ascending=False).to_excel(
            writer, sheet_name='Well_to_Truck_Transitions', index=False)


def _save_combined_flag_workbooks(all_stats, levels):
    """Concatenate flagged rows from every analysed level and write the three 'Hotspots' workbooks."""
    hotspots_all = []
    well_increase_all = []
    transitions_all = []

    for level_prefix, name_col in levels:
        if level_prefix not in all_stats:
            continue
        df = all_stats[level_prefix]['merged'].copy()
        df['Level'] = level_prefix
        df['Unit_Name'] = df[name_col]

        flagged = _flagged_rows(df)
        hotspots_all.append(flagged['hotspots'].copy())
        well_increase_all.append(flagged['well_increase'].copy())
        transitions_all.append(flagged['transitions'].copy())

    if hotspots_all:
        hotspots_combined = pd.concat(hotspots_all, ignore_index=True)
        hotspots_path = os.path.join(OUTPUT_FOLDER, 'Hotspots', 'All_Water_Hotspots.xlsx')
        with pd.ExcelWriter(hotspots_path, engine='openpyxl') as writer:
            hotspots_combined.sort_values('Hotspot_Score', ascending=False).to_excel(
                writer, sheet_name='All_Hotspots', index=False)
            for htype in hotspots_combined['Hotspot_Type'].unique():
                if htype != 'Normal':
                    filtered = hotspots_combined[hotspots_combined['Hotspot_Type'] == htype]
                    # Sheet names are cut to 31 characters.
                    filtered.to_excel(writer, sheet_name=htype[:31], index=False)
        print(f"  Saved {hotspots_path}")

    if well_increase_all:
        well_combined = pd.concat(well_increase_all, ignore_index=True)
        well_path = os.path.join(OUTPUT_FOLDER, 'Hotspots', 'Well_Increase_Areas.xlsx')
        well_combined.sort_values('Well_Trend_Score', ascending=False).to_excel(well_path, index=False)
        print(f"  Saved {well_path}")

    if transitions_all:
        trans_combined = pd.concat(transitions_all, ignore_index=True)
        trans_path = os.path.join(OUTPUT_FOLDER, 'Hotspots', 'Well_to_Truck_Transitions.xlsx')
        with pd.ExcelWriter(trans_path, engine='openpyxl') as writer:
            trans_combined.sort_values('Transition_Score', ascending=False).to_excel(
                writer, sheet_name='All_Transitions', index=False)
            critical = trans_combined[trans_combined['Transition_Type'] == 'Well_to_Truck_Critical']
            if len(critical) > 0:
                critical.to_excel(writer, sheet_name='Critical_Aquifer_Drought', index=False)
        print(f"  Saved {trans_path}")


def _save_national_workbook(nat_17, nat_24):
    """Write the national water-source table and the overview sheet to the 'Statistics' workbook."""
    national_summary = []
    for src_info in WATER_SOURCES.values():
        val_17 = nat_17.get(f"Total_{src_info['name_short']}_17", 0)
        val_24 = nat_24.get(f"Total_{src_info['name_short']}_24", 0)

        national_summary.append({
            'Water_Source': src_info['name'],
            'Households_2017': val_17,
            'Households_2024': val_24,
            'Change_Absolute': val_24 - val_17,
            'Change_Percent': ((val_24 / val_17) - 1) * 100 if val_17 > 0 else 0,
            'Pct_Households_2017': (val_17 / nat_17['Total_Viviendas_17']) * 100 if nat_17['Total_Viviendas_17'] > 0 else 0,
            'Pct_Households_2024': (val_24 / nat_24['Total_Viviendas_24']) * 100 if nat_24['Total_Viviendas_24'] > 0 else 0
        })

    national_df = pd.DataFrame(national_summary)
    national_path = os.path.join(OUTPUT_FOLDER, 'Statistics', 'National_Water_Summary.xlsx')

    with pd.ExcelWriter(national_path, engine='openpyxl') as writer:
        national_df.to_excel(writer, sheet_name='Water_Sources', index=False)

        overview = pd.DataFrame({
            'Metric': ['Census Blocks', 'Population', 'Households', 'Avg Persons per Household',
                       'Rural Blocks', 'Urban Blocks'],
            '2017': [nat_17['N_Blocks_17'], nat_17['Total_Personas_17'], nat_17['Total_Viviendas_17'],
                     nat_17['Avg_Persons_Per_Household_17'], nat_17.get('Rural_Blocks_17', 0), nat_17.get('Urban_Blocks_17', 0)],
            '2024': [nat_24['N_Blocks_24'], nat_24['Total_Personas_24'], nat_24['Total_Viviendas_24'],
                     nat_24['Avg_Persons_Per_Household_24'], nat_24.get('Rural_Blocks_24', 0), nat_24.get('Urban_Blocks_24', 0)]
        })
        base = overview['2017']
        has_base = base > 0
        overview['Change'] = overview['2024'] - base
        # A 2017 value of 0 (e.g. the default rural/urban block count) would
        # otherwise divide by zero and put inf/NaN in the sheet. Report 0 in
        # that case, matching the "if base > 0 else 0" rule used for every
        # other percentage in this script.
        ratio = overview['2024'] / base.where(has_base)
        overview['Change_Pct'] = np.where(has_base, (ratio - 1) * 100, 0)
        overview.to_excel(writer, sheet_name='Overview', index=False)

    print(f"  Saved {national_path}")


def main():
    """Run the Census 2017 vs 2024 water-source analysis end to end.

    Steps, in order:
      1. Create the output folder and load the layers in REFERENCE_LAYERS.
      2. Load both census layers from GDB_PATH and derive the rural flag,
         population, household and per-water-source columns.
      3. Spatially join census centroids to each reference layer and export
         the joined tables as Excel workbooks and shapefiles.
      4. Build the national summary and the national rural/urban analysis.
      5. Compute statistics per Region, Muni, Cuenca and SHAC (all, rural,
         urban), append their report sections and save the workbooks.
      6. Save the combined hotspot workbooks, the national workbook and the
         text report, then echo the first 100 report lines.

    Takes no arguments and returns None; results are written under
    OUTPUT_FOLDER and progress is printed to stdout.
    """
    banner = "=" * 100
    print("\n" + banner)
    print("CENSUS 2017 vs 2024 WATER SOURCE ANALYSIS")
    print("With Spatial Joins to Reference Layers")
    print(banner)
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    create_output_folder(OUTPUT_FOLDER)

    report_lines = [
        banner,
        "CENSUS 2017 vs 2024 WATER SOURCE ANALYSIS REPORT",
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        banner,
    ]

    print("\nLoading reference layers...")
    reference_gdfs = {}
    for layer_config in REFERENCE_LAYERS:
        gdf, prefix = load_reference_layer(layer_config)
        if gdf is not None:
            reference_gdfs[prefix] = gdf
            print(f"  Loaded {prefix}: {len(gdf)} features")

    print(f"\nLoaded {len(reference_gdfs)} reference layers: {list(reference_gdfs.keys())}")

    print("\nLoading Census 2017...")
    gdf_censo_2017 = _load_census_layer(LAYER_CENSO_2017)

    print("\nLoading Census 2024...")
    gdf_censo_2024 = _load_census_layer(LAYER_CENSO_2024)

    print("\nProcessing Census 2017...")
    # The 2017 area column is compared with the numeric code 2 to flag rural blocks.
    if COL_AREA_17 in gdf_censo_2017.columns:
        gdf_censo_2017['Is_Rural_17'] = np.where(gdf_censo_2017[COL_AREA_17] == 2, 1, 0)
    else:
        gdf_censo_2017['Is_Rural_17'] = 0
    _add_census_measures(gdf_censo_2017, '17', '2017', COL_PERS_17, COL_VIVIENDAS_17)

    print("\nProcessing Census 2024...")
    # The 2024 area column is compared as text, ignoring case and surrounding whitespace.
    if COL_AREA_24 in gdf_censo_2024.columns:
        gdf_censo_2024['Is_Rural_24'] = np.where(
            gdf_censo_2024[COL_AREA_24].astype(str).str.upper().str.strip() == 'RURAL', 1, 0
        )
    else:
        gdf_censo_2024['Is_Rural_24'] = 0
    _add_census_measures(gdf_censo_2024, '24', '2024', COL_PERS_24, COL_VIVIENDAS_24)

    print("\nPerforming spatial joins for Census 2017...")
    gdf_censo_2017_joined = _join_reference_layers(gdf_censo_2017, reference_gdfs)

    print("\nPerforming spatial joins for Census 2024...")
    gdf_censo_2024_joined = _join_reference_layers(gdf_censo_2024, reference_gdfs)

    print("\nSaving Census data with spatial joins...")
    joined_by_year = (('2017', gdf_censo_2017_joined), ('2024', gdf_censo_2024_joined))

    # Attribute tables go to Excel without the geometry column.
    excel_paths = []
    for year, joined in joined_by_year:
        excel_path = os.path.join(OUTPUT_FOLDER, 'Census_Data', f'Census_{year}_With_Spatial_Joins.xlsx')
        pd.DataFrame(joined.drop(columns='geometry')).to_excel(excel_path, index=False)
        excel_paths.append(excel_path)
    for excel_path in excel_paths:
        print(f"  Saved: {excel_path}")

    shp_paths = []
    for year, joined in joined_by_year:
        shp_path = os.path.join(OUTPUT_FOLDER, 'Shapefiles', f'Census_{year}_With_Spatial_Joins.shp')
        joined.to_file(shp_path, driver='ESRI Shapefile')
        shp_paths.append(shp_path)
    for shp_path in shp_paths:
        print(f"  Saved Shapefile: {shp_path}")

    print("\nGenerating National Summary...")
    report_lines, nat_17, nat_24 = generate_national_summary(gdf_censo_2017, gdf_censo_2024, report_lines)

    report_lines = generate_rural_urban_water_analysis(
        gdf_censo_2017, gdf_censo_2024,
        None, 'National', 'National', report_lines
    )

    print("\nCalculating statistics at multiple levels...")

    all_stats = {}

    levels = [
        ('Region', 'Region_Name'),
        ('Muni', 'Muni_Name'),
        ('Cuenca', 'Cuenca_Name'),
        ('SHAC', 'SHAC_Name')
    ]

    for level_prefix, name_col in levels:
        print(f"\n  Processing {level_prefix} level...")

        stats_17 = calculate_water_statistics(gdf_censo_2017_joined, name_col, '17')
        stats_24 = calculate_water_statistics(gdf_censo_2024_joined, name_col, '24')

        stats_17_rural = calculate_water_statistics(gdf_censo_2017_joined, name_col, '17', 'Rural')
        stats_17_urban = calculate_water_statistics(gdf_censo_2017_joined, name_col, '17', 'Urban')
        stats_24_rural = calculate_water_statistics(gdf_censo_2024_joined, name_col, '24', 'Rural')
        stats_24_urban = calculate_water_statistics(gdf_censo_2024_joined, name_col, '24', 'Urban')

        # A level is analysed only when both censuses produced statistics for it.
        if len(stats_17) == 0 or len(stats_24) == 0:
            continue

        stats_merged = _merge_census_stats(stats_17, stats_24, name_col)

        stats_merged_rural = pd.DataFrame()
        stats_merged_urban = pd.DataFrame()

        if len(stats_17_rural) > 0 and len(stats_24_rural) > 0:
            stats_merged_rural = _merge_census_stats(stats_17_rural, stats_24_rural, name_col)

        if len(stats_17_urban) > 0 and len(stats_24_urban) > 0:
            # Urban tables get change metrics and hotspot flags only.
            stats_merged_urban = _merge_census_stats(
                stats_17_urban, stats_24_urban, name_col, with_well_flags=False)

        all_stats[level_prefix] = {
            'merged': stats_merged,
            'stats_17': stats_17,
            'stats_24': stats_24,
            'merged_rural': stats_merged_rural,
            'merged_urban': stats_merged_urban,
        }

        print(f"    {level_prefix}: {len(stats_merged)} units analyzed")

        report_lines = generate_water_analysis_report(
            stats_17, stats_24, stats_merged,
            level_prefix, name_col, report_lines, 'All'
        )

        if len(stats_merged_rural) > 0:
            report_lines = generate_water_analysis_report(
                stats_17_rural, stats_24_rural, stats_merged_rural,
                level_prefix, name_col, report_lines, 'Rural'
            )

        if len(stats_merged_urban) > 0:
            report_lines = generate_water_analysis_report(
                stats_17_urban, stats_24_urban, stats_merged_urban,
                level_prefix, name_col, report_lines, 'Urban'
            )

    print("\nSaving outputs...")

    for level_prefix, name_col in levels:
        if level_prefix in all_stats:
            output_path = os.path.join(OUTPUT_FOLDER, 'Water_Analysis', f'Water_Analysis_{level_prefix}.xlsx')
            _write_level_workbook(output_path, all_stats[level_prefix])
            print(f"  Saved {output_path}")

    _save_combined_flag_workbooks(all_stats, levels)
    _save_national_workbook(nat_17, nat_24)

    report_lines.append("\n" + banner)
    report_lines.append("ANALYSIS COMPLETE")
    report_lines.append(f"Finished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report_lines.append(f"NOTE: SHAC spatial join uses {SHAC_BUFFER_DISTANCE}m buffer distance")
    report_lines.append(banner)

    report_path = os.path.join(OUTPUT_FOLDER, 'Reportes', f'Water_Analysis_Report_{datetime.now().strftime("%Y%m%d_%H%M%S")}.txt')
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report_lines))
    print(f"\nReport saved: {report_path}")

    # Echo only the first 100 report lines to the console.
    print("\n" + "\n".join(report_lines[:100]))

    print("\n" + banner)
    print("ANALYSIS COMPLETE")
    print(banner)
    print(f"Finished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Output folder: {OUTPUT_FOLDER}")

# Run the analysis only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()