# Procesando los datos de la DGA

In [None]:
import os
import re
import unicodedata
import numpy as np
import pandas as pd
import geopandas as gpd
from pyproj import Transformer
from shapely.geometry import Point
from shapely.prepared import prep

# Input workbook read by main() (column labels are taken from row index 6).
INPUT_XLSX = r"\assessment_of_wells_chile\data\DGA\DGA_consolidado_nacional\DerechosConcedidos.xlsx"

# Destination of the filtered table written by main(); the XLSX twin shares
# the same base name and only differs in its extension.
OUTPUT_CSV_FILTERED = r"\assessment_of_wells_chile\data\DGA\DGA_consolidado_nacional\DerechosConcedidos_edit_Subterraneo.csv"
OUTPUT_XLSX_FILTERED = OUTPUT_CSV_FILTERED.replace(".csv", ".xlsx")

# File geodatabase and layer that main() loads to build the polygon used by
# its point-in-polygon check.
GDB_PATH = r"\assessment_of_wells_chile\arcgis\assessment_of_wells_chile\Default.gdb"
LAYER_NAME = "CHL_Country"

# Folder that receives every diagnostic/report file produced by main().
# NOTE: the directory is created as an import-time side effect.
OUTPUTS_FOLDER = r"\assessment_of_wells_chile\data\DGA\DGA_consolidado_nacional\outputs"
os.makedirs(OUTPUTS_FOLDER, exist_ok=True)

def normalize_columns(cols: pd.Index) -> pd.Index:
    """Return the column labels as tidy single-line strings.

    Every label is cast to ``str``; newlines and tabs become spaces, runs of
    whitespace collapse to one space, and leading/trailing whitespace is
    removed.
    """
    labels = cols.astype(str)
    for separator in ("\n", "\t"):
        labels = labels.str.replace(separator, " ", regex=False)
    collapsed = labels.str.replace(r"\s+", " ", regex=True)
    return collapsed.str.strip()

def strip_accents(s: str) -> str:
    """Return *s* with combining marks removed after NFKD decomposition."""
    decomposed = unicodedata.normalize("NFKD", s)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)

def norm_text(s) -> str:
    """Return a comparison-friendly form of *s*.

    Missing values (per ``pd.isna``) yield ``""``. Anything else is cast to
    ``str``, trimmed, stripped of accents, case-folded, and has internal
    whitespace runs collapsed to a single space.
    """
    if pd.isna(s):
        return ""
    folded = strip_accents(str(s).strip()).casefold()
    return re.sub(r"\s+", " ", folded)

def clean_illegal_characters(val):
    """Remove control characters from a string cell.

    Non-string or missing values are returned untouched. For strings, the
    control ranges matched by the pattern below are deleted (tab, line feed
    and carriage return are not in those ranges), carriage returns are then
    turned into spaces, and the result is trimmed.
    """
    leave_untouched = pd.isna(val) or not isinstance(val, str)
    if leave_untouched:
        return val

    cleaned = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]", "", val)
    # "\r\n" must be handled before the lone "\r" so it maps to one space.
    for line_break in ("\r\n", "\r"):
        cleaned = cleaned.replace(line_break, " ")
    return cleaned.strip()

def clean_number(val):
    """Parse a loosely formatted numeric cell into a float.

    Handling, in order:
      * missing values and blank strings -> NaN;
      * tabs and spaces are removed before parsing;
      * text containing letters is rejected unless it contains an "E"
        (so scientific notation such as "1e3" still reaches ``float``);
      * "." together with "," -> dots are thousands separators and the comma
        is the decimal mark ("1.234,5" -> 1234.5);
      * only "," -> the comma is the decimal mark ("3,5" -> 3.5);
      * only dots, two or more, in strict three-digit groups
        ("1.234.567") -> the dots are thousands separators. Previously such
        values failed ``float`` and were silently turned into NaN.

    A single dot is still read as a decimal point ("1.234" -> 1.234) because
    that case is ambiguous.

    Returns:
        The parsed float, or ``np.nan`` when the value cannot be parsed.
    """
    if pd.isna(val):
        return np.nan
    s = str(val).strip().replace("\t", "").replace(" ", "")
    if s == "":
        return np.nan

    if re.search(r"[a-zA-Z]", s) and "E" not in s.upper():
        return np.nan

    if "." in s and "," in s:
        s = s.replace(".", "").replace(",", ".")
    elif "," in s:
        s = s.replace(",", ".")
    elif re.fullmatch(r"[+-]?\d{1,3}(?:\.\d{3}){2,}", s):
        # Several dot-separated 3-digit groups cannot be a decimal number.
        s = s.replace(".", "")

    try:
        return float(s)
    except ValueError:
        return np.nan

def parse_dms_compact(val):
    """Convert a coordinate to negative decimal degrees.

    Two encodings are accepted:
      * magnitude below 90 -> already decimal degrees; returned as
        ``-abs(value)``;
      * otherwise -> compact ``DDMMSS`` / ``DDDMMSS`` digits (any fractional
        part is discarded), converted to decimal degrees.

    The result is always negative regardless of the input sign. The sign is
    now dropped *before* the digits are split: previously a negative compact
    value such as ``-703015`` kept the sign only in the degrees part, so
    minutes/seconds were added with the wrong sign and the final result came
    out positive.

    Returns:
        Negative decimal degrees, or ``np.nan`` for missing, zero,
        non-numeric input, or when minutes/seconds are outside ``[0, 60)``.
    """
    if pd.isna(val):
        return np.nan
    try:
        v = float(val)
    except (TypeError, ValueError, OverflowError):
        return np.nan

    if v == 0:
        return np.nan

    magnitude = abs(v)
    if magnitude < 90:
        return -magnitude

    # Integer part only, left-padded so the MM and SS slices always exist.
    digits = "{:.6f}".format(magnitude).split(".")[0].zfill(6)

    try:
        deg = float(digits[:-4])
        minu = float(digits[-4:-2])
        sec = float(digits[-2:])
    except ValueError:
        # Non-finite input formats as "inf"/"nan", which has no digit slices.
        return np.nan

    if not (0 <= minu < 60) or not (0 <= sec < 60):
        return np.nan

    decimal = deg + (minu / 60.0) + (sec / 3600.0)
    return -decimal

def is_plausible_chile_bbox(lat, lon) -> bool:
    """Tell whether (lat, lon) lies inside the fixed bounding box.

    The box spans latitudes -56.8..-16.0 and longitudes -76.8..-66.0
    (inclusive). Missing or non-numeric inputs yield ``False``.
    """
    if pd.isna(lat) or pd.isna(lon):
        return False
    try:
        lat_f, lon_f = float(lat), float(lon)
    except Exception:
        return False

    lat_in_range = -56.8 <= lat_f <= -16.0
    lon_in_range = -76.8 <= lon_f <= -66.0
    return lat_in_range and lon_in_range

def pick_col(df: pd.DataFrame, candidates, required=True):
    """Return the first column of *df* that satisfies any candidate.

    Columns are scanned in DataFrame order; for each column every candidate
    is tried in turn. A callable candidate matches when it returns a truthy
    value (exceptions raised by it count as "no match"); any other candidate
    matches by equality with the column label.

    Raises:
        KeyError: when nothing matches and *required* is true. With
            ``required=False`` the function returns ``None`` instead.
    """

    def _satisfies(column, candidate) -> bool:
        if not callable(candidate):
            return column == candidate
        try:
            return bool(candidate(column))
        except Exception:
            return False

    for column in df.columns:
        if any(_satisfies(column, candidate) for candidate in candidates):
            return column

    if not required:
        return None
    raise KeyError(f"No se encontr√≥ columna. Candidatos: {candidates}")

def save_df_both(df: pd.DataFrame, base_path_no_ext: str, index=False):
    """Write *df* next to each other as ``<base>.xlsx`` and ``<base>.csv``.

    The CSV is encoded as UTF-8 with BOM (``utf-8-sig``).

    Returns:
        Tuple ``(xlsx_path, csv_path)`` of the files written.
    """
    xlsx_path, csv_path = (base_path_no_ext + ext for ext in (".xlsx", ".csv"))
    df.to_excel(xlsx_path, index=index)
    df.to_csv(csv_path, index=index, encoding="utf-8-sig")
    return xlsx_path, csv_path

def save_multisheet_with_csvs(sheets: dict, xlsx_path: str, csv_folder: str):
    """Write a multi-sheet workbook plus one CSV and one XLSX per sheet.

    Args:
        sheets: mapping of sheet name -> DataFrame.
        xlsx_path: path of the combined workbook (openpyxl engine).
        csv_folder: folder (created if needed) for the per-sheet files,
            named ``<sheet name>.csv`` / ``<sheet name>.xlsx``.

    Sheet names inside the combined workbook are cut to 31 characters; the
    standalone file names use the full key.
    """
    os.makedirs(csv_folder, exist_ok=True)
    with pd.ExcelWriter(xlsx_path, engine="openpyxl") as workbook:
        for sheet_key in sheets:
            frame = sheets[sheet_key]
            frame.to_excel(workbook, sheet_name=str(sheet_key)[:31], index=False)

            standalone_csv = os.path.join(csv_folder, f"{sheet_key}.csv")
            standalone_xlsx = os.path.join(csv_folder, f"{sheet_key}.xlsx")

            frame.to_csv(standalone_csv, index=False, encoding="utf-8-sig")
            frame.to_excel(standalone_xlsx, index=False)

# Module-level memo of pyproj Transformer objects. Keys are either an integer
# EPSG code (UTM -> WGS84 transformers) or a (source, target) CRS tuple
# (geographic -> WGS84 transformers).
_transformers_cache = {}

def parse_datum_label(datum_val):
    """Map a free-text datum description to "1984", "1956" or "1969".

    The text is normalized with ``norm_text`` and searched for year or
    acronym tokens. Returns ``None`` for empty or unrecognized text.
    """
    text = norm_text(datum_val)
    if not text:
        return None

    # Order matters: "psad" contains "sad", so 1956 must be tested first.
    token_rules = (
        ("1984", ("1984", "wgs")),
        ("1956", ("1956", "psad")),
        ("1969", ("1969", "sad")),
    )
    for label, tokens in token_rules:
        if any(token in text for token in tokens):
            return label
    return None

def get_transformer_utm_to_wgs84(datum_val, huso_val):
    """Return a cached UTM -> EPSG:4326 transformer, or ``None``.

    The datum text is classified with ``parse_datum_label`` and the zone is
    rounded to an integer. ``None`` is returned when the datum is not
    recognized, the zone is not numeric, or the datum/zone pair has no EPSG
    code in the table below. Transformers use ``always_xy=True`` (input is
    easting, northing; output is lon, lat).
    """
    datum_code = parse_datum_label(datum_val)
    if datum_code is None:
        return None
    try:
        zone = int(round(float(huso_val)))
    except Exception:
        return None

    epsg_by_datum = {
        "1984": {17: 32717, 18: 32718, 19: 32719, 20: 32720},
        "1956": {17: 24877, 18: 24878, 19: 24879, 20: None},
        "1969": {17: 29187, 18: 29188, 19: 29189, 20: None},
    }
    epsg = epsg_by_datum.get(datum_code, {}).get(zone)
    if epsg is None:
        return None

    transformer = _transformers_cache.get(epsg)
    if transformer is None:
        transformer = Transformer.from_crs(f"EPSG:{epsg}", "EPSG:4326", always_xy=True)
        _transformers_cache[epsg] = transformer
    return transformer

def get_transformer_geog_to_wgs84(datum_val):
    """Return a cached geographic -> EPSG:4326 transformer, or ``None``.

    Only the "1956" (EPSG:4248) and "1969" (EPSG:4618) datum labels have a
    source CRS; an unrecognized datum or "1984" (no shift needed) gives
    ``None``. Transformers use ``always_xy=True``.
    """
    datum_code = parse_datum_label(datum_val)
    source_crs = {"1956": "EPSG:4248", "1969": "EPSG:4618"}.get(datum_code)
    if source_crs is None:
        return None

    cache_key = (source_crs, "EPSG:4326")
    transformer = _transformers_cache.get(cache_key)
    if transformer is None:
        transformer = Transformer.from_crs(source_crs, "EPSG:4326", always_xy=True)
        _transformers_cache[cache_key] = transformer
    return transformer

def choose_datum_cols(df, utm_n_col, utm_e_col, lat_col, lon_col):
    """Pick which "datum*" column goes with the UTM pair and with lat/lon.

    Columns whose normalized name starts with "datum" are candidates. With
    none, ``(None, None)`` is returned; with one, it serves both purposes.
    Otherwise each candidate is scored by how many rows have it filled
    together with both UTM coordinates (resp. both lat/lon values), and the
    highest-scoring column wins each role (first candidate on ties).

    Returns:
        Tuple ``(datum_col_for_utm, datum_col_for_latlon)``.
    """
    datum_cols = [c for c in df.columns if norm_text(c).startswith("datum")]
    if not datum_cols:
        return None, None
    if len(datum_cols) == 1:
        only = datum_cols[0]
        return only, only

    if utm_n_col and utm_e_col:
        has_utm = df[utm_n_col].notna() & df[utm_e_col].notna()
    else:
        has_utm = pd.Series(False, index=df.index)

    if lat_col and lon_col:
        has_latlon = df[lat_col].notna() & df[lon_col].notna()
    else:
        has_latlon = pd.Series(False, index=df.index)

    def _overlap(column, mask) -> int:
        return int((df[column].notna() & mask).sum())

    utm_scores = {c: _overlap(c, has_utm) for c in datum_cols}
    latlon_scores = {c: _overlap(c, has_latlon) for c in datum_cols}

    best_for_utm = max(utm_scores, key=utm_scores.get)
    best_for_latlon = max(latlon_scores, key=latlon_scores.get)
    return best_for_utm, best_for_latlon

def utm_inputs_valid(norte, este, huso, datum) -> bool:
    """Check that a UTM record is complete, in range and transformable.

    Requires all four values to be present, northing/easting/zone to be
    numeric, zone in 17..20, easting in 100000..900000, northing in
    1000000..10000000, and a transformer to exist for the datum/zone pair.
    """
    if any(pd.isna(value) for value in (norte, este, huso, datum)):
        return False

    try:
        northing = float(norte)
        easting = float(este)
        zone = int(round(float(huso)))
    except Exception:
        return False

    if not (17 <= zone <= 20):
        return False
    if not (100000 <= easting <= 900000):
        return False
    if not (1000000 <= northing <= 10000000):
        return False

    transformer = get_transformer_utm_to_wgs84(datum, huso)
    return transformer is not None

def latlon_inputs_valid(lat_raw, lon_raw) -> bool:
    """Return True only when both raw coordinates are present (not NA)."""
    either_missing = pd.isna(lat_raw) or pd.isna(lon_raw)
    return not either_missing

def main():
    """Run the whole groundwater-record pipeline and write every report.

    Steps, in execution order:
      1. Read INPUT_XLSX, normalize column labels and tag rows with ids.
      2. Keep only rows whose water-nature column normalizes to
         "subterranea".
      3. Locate coordinate / flow columns and coerce them to numbers.
      4. Derive WGS84 lat/lon per row (UTM first, lat/lon columns second).
      5. Export the filtered table (CSV + XLSX).
      6. Classify points against the country polygon read from GDB_PATH,
         retrying neighbouring UTM zones for UTM points that fall outside.
      7. Write duplicate-code, inside/outside, density and negative-flow
         reports into OUTPUTS_FOLDER.

    Raises:
        KeyError: from ``pick_col`` when a required column is not found.
    """
    # Column labels come from sheet row index 6 (0-based).
    df = pd.read_excel(INPUT_XLSX, header=6, engine="openpyxl")
    df.columns = normalize_columns(df.columns)

    # Stable identifiers assigned before any filtering. The +8 offset
    # presumably maps the 0-based position to the spreadsheet row number
    # (header on row 7, first record on row 8) - confirm against the workbook.
    df["_row_id"] = np.arange(len(df), dtype=int)
    df["_excel_row"] = df["_row_id"] + 8

    col_nat = pick_col(
        df,
        candidates=[
            "Naturaleza del Agua",
            lambda c: ("naturaleza" in norm_text(c) and "agua" in norm_text(c)),
        ],
        required=True,
    )

    # Keep only rows whose normalized value is exactly "subterranea".
    mask_sub = df[col_nat].apply(norm_text).eq("subterranea")
    df_sub = df.loc[mask_sub].copy()

    col_expediente = pick_col(
        df_sub,
        candidates=[
            "C√≥digo de Expediente",
            lambda c: ("codigo" in norm_text(c) and "expediente" in norm_text(c)),
        ],
        required=True,
    )

    # Optional coordinate columns; each stays None when no label matches.
    col_utm_n = pick_col(
        df_sub,
        [lambda c: ("utm" in norm_text(c) and "norte" in norm_text(c) and "capt" in norm_text(c))],
        required=False,
    )
    col_utm_e = pick_col(
        df_sub,
        [lambda c: ("utm" in norm_text(c) and "este" in norm_text(c) and "capt" in norm_text(c))],
        required=False,
    )
    col_huso = pick_col(df_sub, [lambda c: norm_text(c) == "huso"], required=False)

    col_lat_cap = pick_col(
        df_sub, [lambda c: ("latitud" in norm_text(c) and "capt" in norm_text(c))], required=False
    )
    col_lon_cap = pick_col(
        df_sub, [lambda c: ("longitud" in norm_text(c) and "capt" in norm_text(c))], required=False
    )

    datum_utm_col, datum_ll_col = choose_datum_cols(df_sub, col_utm_n, col_utm_e, col_lat_cap, col_lon_cap)

    months = [
        "Enero", "Febrero", "Marzo", "Abril", "Mayo", "Junio",
        "Julio", "Agosto", "Septiembre", "Octubre", "Noviembre", "Diciembre"
    ]
    month_cols = [c for c in df_sub.columns if c in months]

    col_caudal_anual_prom = pick_col(
        df_sub,
        candidates=[
            "Caudal Anual Prom",
            lambda c: ("caudal" in norm_text(c) and "anual" in norm_text(c) and "prom" in norm_text(c)),
        ],
        required=False,
    )

    # Collect every column that must be coerced to float with clean_number.
    numeric_cols = []
    numeric_cols += month_cols
    if col_caudal_anual_prom:
        numeric_cols.append(col_caudal_anual_prom)
    for c in [col_utm_n, col_utm_e, col_huso, col_lat_cap, col_lon_cap]:
        if c:
            numeric_cols.append(c)

    numeric_cols = sorted(set(numeric_cols))
    for c in numeric_cols:
        df_sub[c] = df_sub[c].apply(clean_number)

    # Scrub control characters from the remaining object-typed columns.
    for c in df_sub.select_dtypes(include=["object"]).columns:
        df_sub[c] = df_sub[c].apply(clean_illegal_characters)

    # Result columns, pre-created with "no coordinate" defaults.
    df_sub["lat_wgs84"] = np.nan
    df_sub["lon_wgs84"] = np.nan
    df_sub["Metodo_Coordenadas"] = ""
    df_sub["Coord_Valida_Chile_BBox"] = False
    df_sub["Huso_Usado_En_Calculo"] = np.nan

    def calculate_wgs84(row):
        # Returns (lat, lon, method, bbox_ok, utm_zone_used) for one row.
        # First choice: UTM coordinates, when all supporting columns exist.
        if col_utm_n and col_utm_e and col_huso and datum_utm_col:
            norte = row[col_utm_n]
            este = row[col_utm_e]
            huso = row[col_huso]
            datum_utm = row[datum_utm_col]

            if utm_inputs_valid(norte, este, huso, datum_utm):
                tr = get_transformer_utm_to_wgs84(datum_utm, huso)
                try:
                    lon, lat = tr.transform(float(este), float(norte))

                    if is_plausible_chile_bbox(lat, lon):
                        return lat, lon, "UTM", True, float(round(float(huso), 0))
                    # Same pair with the axes swapped, in case the transform
                    # result only fits the bounding box that way round.
                    if is_plausible_chile_bbox(lon, lat):
                        return lon, lat, "UTM", True, float(round(float(huso), 0))

                except Exception:
                    pass

        # Fallback: the latitude/longitude columns.
        if col_lat_cap and col_lon_cap:
            lat_raw = row[col_lat_cap]
            lon_raw = row[col_lon_cap]
            datum_ll = row[datum_ll_col] if datum_ll_col else np.nan

            if latlon_inputs_valid(lat_raw, lon_raw):
                lat = parse_dms_compact(lat_raw)
                lon = parse_dms_compact(lon_raw)

                if is_plausible_chile_bbox(lat, lon):
                    # Apply a datum shift when the datum has a transformer;
                    # otherwise (or if the shifted point leaves the bounding
                    # box) keep the unshifted values.
                    trg = get_transformer_geog_to_wgs84(datum_ll)
                    if trg is not None:
                        try:
                            lon2, lat2 = trg.transform(float(lon), float(lat))
                            if is_plausible_chile_bbox(lat2, lon2):
                                return lat2, lon2, "LatLon", True, np.nan
                        except Exception:
                            pass
                    return lat, lon, "LatLon", True, np.nan

        return np.nan, np.nan, "", False, np.nan

    # NOTE(review): when df_sub has no rows, apply(..., result_type="expand")
    # may not yield five columns and the rename below could fail - confirm.
    latlons = df_sub.apply(calculate_wgs84, axis=1, result_type="expand")
    latlons.columns = ["lat_wgs84", "lon_wgs84", "Metodo_Coordenadas", "Coord_Valida_Chile_BBox", "Huso_Usado_En_Calculo"]
    df_sub[latlons.columns] = latlons

    # NOTE(review): this export runs before the UTM-zone rescue further down,
    # so rescued coordinates are not reflected in these two files.
    df_sub.to_csv(OUTPUT_CSV_FILTERED, index=False, encoding="utf-8-sig")
    df_sub.to_excel(OUTPUT_XLSX_FILTERED, index=False)

    # Country polygon, reprojected to EPSG:4326 when its CRS string differs.
    gdf_chile = gpd.read_file(GDB_PATH, layer=LAYER_NAME)
    if str(gdf_chile.crs).upper() != "EPSG:4326":
        gdf_chile = gdf_chile.to_crs("EPSG:4326")

    # Repair invalid geometries; buffer(0) is the fallback if make_valid()
    # raises.
    try:
        gdf_chile["geometry"] = gdf_chile.geometry.make_valid()
    except Exception:
        gdf_chile["geometry"] = gdf_chile.geometry.buffer(0)

    # Merge all features into one geometry and prepare it for the repeated
    # covers() tests below.
    chile_geom = gdf_chile.geometry.unary_union
    chile_prepared = prep(chile_geom)

    def point_in_chile(lon, lat) -> bool:
        # True when the point is covered by the merged country geometry;
        # missing or non-numeric coordinates count as outside.
        if pd.isna(lon) or pd.isna(lat):
            return False
        try:
            p = Point(float(lon), float(lat))
        except Exception:
            return False
        return bool(chile_prepared.covers(p))

    df_con_coords = df_sub.dropna(subset=["lat_wgs84", "lon_wgs84"]).copy()
    df_sin_coords = df_sub[df_sub[["lat_wgs84", "lon_wgs84"]].isna().any(axis=1)].copy()

    # Diagnostic rows for the zone rescue; a list here, rebound to a
    # DataFrame later when it is non-empty.
    df_zonefix_diag = []

    def try_fix_zone(norte, este, huso, datum):
        # Re-project with the recorded zone and its two neighbours (limited
        # to 17..20) and return (lat, lon, zone) for the first candidate that
        # lands inside the country polygon, else None.
        if pd.isna(norte) or pd.isna(este) or pd.isna(huso) or pd.isna(datum):
            return None
        try:
            n = float(norte)
            e = float(este)
            h0 = int(round(float(huso)))
        except Exception:
            return None

        candidates = [h0, h0 - 1, h0 + 1]
        candidates = [h for h in candidates if 17 <= h <= 20]

        for h in candidates:
            tr = get_transformer_utm_to_wgs84(datum, h)
            if tr is None:
                continue
            try:
                lon, lat = tr.transform(e, n)
            except Exception:
                continue
            if point_in_chile(lon, lat):
                return lat, lon, h
        return None

    # First classification pass, used only to find rescue candidates.
    if len(df_con_coords) > 0:
        inside_mask_initial = df_con_coords.apply(lambda r: point_in_chile(r["lon_wgs84"], r["lat_wgs84"]), axis=1)
    else:
        inside_mask_initial = pd.Series([], dtype=bool)

    df_inside_initial = df_con_coords.loc[inside_mask_initial].copy()
    df_outside_geo_initial = df_con_coords.loc[~inside_mask_initial].copy()

    # Zone rescue: only UTM-derived points that fell outside the polygon.
    if len(df_outside_geo_initial) > 0 and col_utm_n and col_utm_e and col_huso and datum_utm_col:
        mask_utm_outside = df_outside_geo_initial["Metodo_Coordenadas"].eq("UTM")
        df_candidates = df_outside_geo_initial.loc[mask_utm_outside].copy()

        for idx, row in df_candidates.iterrows():
            res = try_fix_zone(
                norte=row[col_utm_n],
                este=row[col_utm_e],
                huso=row[col_huso],
                datum=row[datum_utm_col],
            )
            # One diagnostic record per attempt, successful or not.
            df_zonefix_diag.append({
                "_row_id": row.get("_row_id", np.nan),
                "_excel_row": row.get("_excel_row", np.nan),
                "Codigo_Expediente": row.get(col_expediente, ""),
                "huso_original": row.get(col_huso, np.nan),
                "datum_utm": row.get(datum_utm_col, np.nan),
                "lat_pre": row.get("lat_wgs84", np.nan),
                "lon_pre": row.get("lon_wgs84", np.nan),
                "fixed": res is not None,
                "huso_fixed": (res[2] if res else np.nan),
                "lat_post": (res[0] if res else np.nan),
                "lon_post": (res[1] if res else np.nan),
            })

            if res is None:
                continue

            lat_new, lon_new, huso_new = res

            # Write the rescued values back into the master table by index.
            df_sub.at[idx, "lat_wgs84"] = lat_new
            df_sub.at[idx, "lon_wgs84"] = lon_new
            df_sub.at[idx, "Metodo_Coordenadas"] = f"UTM_zone_fixed_to_{huso_new}"
            df_sub.at[idx, "Huso_Usado_En_Calculo"] = float(huso_new)

    if df_zonefix_diag:
        df_zonefix_diag = pd.DataFrame(df_zonefix_diag)
        diag_base = os.path.join(OUTPUTS_FOLDER, "Diagnostico_Rescate_Huso_UTM")
        save_df_both(df_zonefix_diag, diag_base, index=False)

    # Second, final classification pass on the (possibly rescued) table.
    df_con_coords = df_sub.dropna(subset=["lat_wgs84", "lon_wgs84"]).copy()
    df_sin_coords = df_sub[df_sub[["lat_wgs84", "lon_wgs84"]].isna().any(axis=1)].copy()

    if len(df_con_coords) > 0:
        inside_mask = df_con_coords.apply(lambda r: point_in_chile(r["lon_wgs84"], r["lat_wgs84"]), axis=1)
    else:
        inside_mask = pd.Series([], dtype=bool)

    df_inside = df_con_coords.loc[inside_mask].copy()
    df_outside_geo = df_con_coords.loc[~inside_mask].copy()

    df_inside["Estado_Ubicacion"] = "Validado en Chile"
    df_outside_geo["Estado_Ubicacion"] = "Fuera del limite"
    df_sin_coords["Estado_Ubicacion"] = "Sin coordenadas"

    # "Outside" group = points outside the polygon + rows without coordinates.
    df_outside_final = pd.concat([df_outside_geo, df_sin_coords], ignore_index=True)

    def norm_code_series(s: pd.Series) -> pd.Series:
        # Trimmed string form of the record codes; missing values become "".
        return s.fillna("").astype(str).str.strip()

    df_inside["_exp_norm"] = norm_code_series(df_inside[col_expediente]) if len(df_inside) else ""
    df_outside_final["_exp_norm"] = norm_code_series(df_outside_final[col_expediente]) if len(df_outside_final) else ""
    df_sub["_exp_norm"] = norm_code_series(df_sub[col_expediente])

    inside_codes = set(pd.Series(df_inside["_exp_norm"]).replace("", np.nan).dropna().unique()) if len(df_inside) else set()

    # Drop from the outside group every row whose (non-empty) code also
    # appears in the inside group.
    mask_remove_outside = df_outside_final["_exp_norm"].isin(inside_codes) & df_outside_final["_exp_norm"].ne("")
    df_outside_removed = df_outside_final.loc[mask_remove_outside].copy()
    df_outside_removed["Motivo_Eliminacion"] = "Expediente existe en grupo DENTRO"

    df_outside_final_clean = df_outside_final.loc[~mask_remove_outside].copy()

    # All rows of the filtered table that share a non-empty code.
    dup_mask_all = df_sub["_exp_norm"].ne("") & df_sub["_exp_norm"].duplicated(keep=False)
    df_duplicates_all = df_sub.loc[dup_mask_all].copy()

    codes_removed = sorted(set(df_outside_removed["_exp_norm"].replace("", np.nan).dropna().unique()))
    df_dup_pairs = pd.concat(
        [
            df_inside[df_inside["_exp_norm"].isin(codes_removed)].assign(Grupo="DENTRO"),
            df_outside_removed.assign(Grupo="FUERA_ELIMINADO"),
        ],
        ignore_index=True,
    )

    dup_summary = pd.DataFrame(
        {
            "metric": [
                "n_subterranea_total",
                "n_inside",
                "n_outside_final_antes",
                "n_outside_eliminados_por_duplicado_con_inside",
                "n_outside_final_despues",
                "n_codigos_con_eliminacion",
                "n_registros_con_codigo_duplicado_en_subterranea",
            ],
            "value": [
                int(len(df_sub)),
                int(len(df_inside)),
                int(len(df_outside_final)),
                int(len(df_outside_removed)),
                int(len(df_outside_final_clean)),
                int(len(codes_removed)),
                int(len(df_duplicates_all)),
            ],
        }
    )

    dup_xlsx = os.path.join(OUTPUTS_FOLDER, "Duplicados_Codigo_de_Expediente.xlsx")
    dup_csv_folder = os.path.join(OUTPUTS_FOLDER, "Duplicados_Codigo_de_Expediente_CSVs")
    save_multisheet_with_csvs(
        sheets={
            "Resumen": dup_summary,
            "Fuera_Eliminados": df_outside_removed.drop(columns=["_exp_norm"], errors="ignore"),
            "Pares_Dentro_vs_Fuera": df_dup_pairs.drop(columns=["_exp_norm"], errors="ignore"),
            "Duplicados_Todos_Subterranea": df_duplicates_all.drop(columns=["_exp_norm"], errors="ignore"),
        },
        xlsx_path=dup_xlsx,
        csv_folder=dup_csv_folder,
    )

    # Remove the helper column before the main exports. df_sub keeps its
    # "_exp_norm" column, so it also appears in the negative-flow record
    # sheet derived from df_sub below.
    for _df in [df_inside, df_outside_final_clean]:
        if "_exp_norm" in _df.columns:
            _df.drop(columns=["_exp_norm"], inplace=True)

    inside_base = os.path.join(OUTPUTS_FOLDER, "Registros_En_Chile")
    outside_base = os.path.join(OUTPUTS_FOLDER, "Registros_Fuera_Chile")

    save_df_both(df_inside, inside_base, index=False)
    save_df_both(df_outside_final_clean, outside_base, index=False)

    region_col = pick_col(df_sub, ["Regi√≥n", "Region", lambda c: norm_text(c) == "region"], required=False)
    prov_col = pick_col(df_sub, ["Provincia", lambda c: norm_text(c) == "provincia"], required=False)
    com_col = pick_col(df_sub, ["Comuna", lambda c: norm_text(c) == "comuna"], required=False)

    # Density report: count of out-of-polygon points per administrative unit
    # (rows without coordinates are excluded by the status filter).
    df_outside_only_geo_clean = df_outside_final_clean[
        df_outside_final_clean["Estado_Ubicacion"].eq("Fuera del limite")
    ].copy()

    if len(df_outside_only_geo_clean) > 0 and all([region_col, prov_col, com_col]):
        dens = (
            df_outside_only_geo_clean
            .groupby([region_col, prov_col, com_col], dropna=False)
            .size()
            .reset_index(name="n_puntos_fuera")
            .sort_values("n_puntos_fuera", ascending=False)
        )
    else:
        # Empty table with the expected headers so the file is still written.
        dens = pd.DataFrame(columns=[region_col or "Regi√≥n", prov_col or "Provincia", com_col or "Comuna", "n_puntos_fuera"])

    dens_base = os.path.join(OUTPUTS_FOLDER, "Analisis_Densidad_Fuera_Chile")
    save_df_both(dens, dens_base, index=False)

    neg_xlsx = os.path.join(OUTPUTS_FOLDER, "Analisis_Caudal_Anual_Prom_Negativo.xlsx")
    neg_csv_folder = os.path.join(OUTPUTS_FOLDER, "Analisis_Caudal_Anual_Prom_Negativo_CSVs")

    # Negative-flow report; an "Error" sheet is written instead when the
    # annual-average column was not found.
    if col_caudal_anual_prom and col_caudal_anual_prom in df_sub.columns:
        # _row_id -> location status lookups built from the final groups.
        map_inside = dict(zip(df_inside.get("_row_id", []), df_inside.get("Estado_Ubicacion", [])))
        map_out = dict(zip(df_outside_final_clean.get("_row_id", []), df_outside_final_clean.get("Estado_Ubicacion", [])))

        def get_state(rid):
            # Rows removed from the outside group as duplicates are in
            # neither map and end up as "No clasificado".
            if rid in map_inside:
                return map_inside[rid]
            if rid in map_out:
                return map_out[rid]
            return "No clasificado"

        df_sub_state = df_sub.copy()
        df_sub_state["Estado_Ubicacion"] = df_sub_state["_row_id"].apply(get_state)

        df_neg = df_sub_state[
            df_sub_state[col_caudal_anual_prom].notna() & (df_sub_state[col_caudal_anual_prom] < 0)
        ].copy()

        resumen_general = pd.DataFrame(
            {
                "metric": [
                    "registros_total_subterranea",
                    "registros_con_caudal_anual_prom_no_nulo",
                    "registros_caudal_anual_prom_negativo",
                ],
                "value": [
                    int(len(df_sub_state)),
                    int(df_sub_state[col_caudal_anual_prom].notna().sum()),
                    int(len(df_neg)),
                ],
            }
        )

        resumen_por_estado = (
            df_neg.groupby("Estado_Ubicacion", dropna=False)
            .size()
            .reset_index(name="n_negativos")
            .sort_values("n_negativos", ascending=False)
        )

        if all([region_col, prov_col, com_col]):
            resumen_por_loc = (
                df_neg.groupby([region_col, prov_col, com_col, "Estado_Ubicacion"], dropna=False)
                .size()
                .reset_index(name="n_negativos")
                .sort_values("n_negativos", ascending=False)
            )
        else:
            resumen_por_loc = pd.DataFrame()

        save_multisheet_with_csvs(
            sheets={
                "Resumen_General": resumen_general,
                "Resumen_por_Estado": resumen_por_estado,
                "Resumen_por_Region_Prov_Com": resumen_por_loc,
                "Registros_Negativos": df_neg,
            },
            xlsx_path=neg_xlsx,
            csv_folder=neg_csv_folder,
        )
    else:
        save_multisheet_with_csvs(
            sheets={"Error": pd.DataFrame({"error": ["No se encontr√≥ la columna 'Caudal Anual Prom' (o equivalente)."]})},
            xlsx_path=neg_xlsx,
            csv_folder=neg_csv_folder,
        )

    print("OK. Archivos principales:")
    print(" - Filtrado Subterranea:", OUTPUT_XLSX_FILTERED, "y", OUTPUT_CSV_FILTERED)
    print(" - Outputs:", OUTPUTS_FOLDER)

# Run the pipeline only when this code is executed as a script.
if __name__ == "__main__":
    main()

# Downloading data

In [None]:
import os
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from queue import Queue
import sys
import re

# Run-mode switches. SHOW_BROWSER=True disables headless mode in
# crear_driver(); TEST_MODE is not read in the code visible here -
# presumably it limits the run size, TODO confirm.
TEST_MODE = False
SHOW_BROWSER = False

# Browser executable assigned to Options.binary_location in crear_driver().
BRAVE_PATH = r"\AppData\Local\BraveSoftware\Brave-Browser\Application\brave.exe"

# Root folder under which all scraper outputs are written.
RUTA_BASE = r"\assessment_of_wells_chile\data\DGA"

# Spreadsheets passed to the code reader; both are outputs of the processing
# cell above.
ARCHIVO_EXCEL_1 = r"\assessment_of_wells_chile\data\DGA\DGA_consolidado_nacional\outputs\Analisis_Caudal_Anual_Prom_Negativo_CSVs\Registros_Negativos.xlsx"
ARCHIVO_EXCEL_2 = r"\assessment_of_wells_chile\data\DGA\DGA_consolidado_nacional\outputs\Registros_Fuera_Chile.xlsx"

# Output locations: two folders plus the JSON logs of successes and errors.
RUTAS = {
    'txt_salida': os.path.join(RUTA_BASE, "output_revisados", "txt"),
    'excel_salida': os.path.join(RUTA_BASE, "output_revisados", "excel"),
    'log_exitos': os.path.join(RUTA_BASE, "output_revisados", "log_exitos.json"),
    'log_errores': os.path.join(RUTA_BASE, "output_revisados", "log_errores.json"),
}

# Create the output folders at import time (the JSON logs live in the parent).
for key in ['txt_salida', 'excel_salida']:
    os.makedirs(RUTAS[key], exist_ok=True)
os.makedirs(os.path.join(RUTA_BASE, "output_revisados"), exist_ok=True)

# Not read in the code visible here - presumably the thread-pool size and the
# number of results between intermediate saves, TODO confirm.
NUM_WORKERS = 15
BATCH_SAVE_SIZE = 50

# One lock per shared resource: each JSON log file and stdout.
lock_log_exitos = threading.Lock()
lock_log_errores = threading.Lock()
lock_print = threading.Lock()

def cargar_log(ruta_log):
    """Load a JSON log file and return its content.

    Args:
        ruta_log: path of the JSON log.

    Returns:
        The decoded JSON value, or ``{}`` when the file does not exist,
        cannot be read, or does not contain valid JSON.

    Only I/O and decoding errors are treated as "empty log"; the previous
    bare ``except:`` also swallowed KeyboardInterrupt and SystemExit.
    """
    if not os.path.exists(ruta_log):
        return {}
    try:
        with open(ruta_log, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return {}

def guardar_log(ruta_log, datos):
    """Write *datos* to *ruta_log* as pretty-printed UTF-8 JSON.

    The data is serialized to a sibling ``<ruta_log>.tmp`` file that then
    replaces the target with ``os.replace``. Writing in place truncated the
    log before serialization, so a failed or interrupted dump left an
    empty/partial file that ``cargar_log`` would read back as ``{}``,
    losing the whole history.

    Raises:
        TypeError / ValueError: when *datos* is not JSON-serializable.
        OSError: when the file cannot be written or replaced.
    """
    tmp_path = f"{ruta_log}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(datos, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, ruta_log)
    except BaseException:
        # Leave the previous log untouched and drop the partial temp file.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

def registrar_exito(codigo, datos):
    """Record *codigo* as successfully processed in the success log.

    The log is re-read, updated and rewritten while holding
    ``lock_log_exitos``. The *datos* argument is accepted but not stored.
    """
    with lock_log_exitos:
        ruta = RUTAS['log_exitos']
        registros = cargar_log(ruta)
        marca_tiempo = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        registros[codigo] = dict(fecha=marca_tiempo, estado='exitoso')
        guardar_log(ruta, registros)

def registrar_error(codigo, msg):
    """Record a failure for *codigo* in the error log.

    The log is re-read, updated and rewritten while holding
    ``lock_log_errores``; *msg* is stored as its string form.
    """
    with lock_log_errores:
        ruta = RUTAS['log_errores']
        registros = cargar_log(ruta)
        marca_tiempo = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        registros[codigo] = dict(fecha=marca_tiempo, error=str(msg))
        guardar_log(ruta, registros)

def print_safe(msg):
    """Print *msg* while holding ``lock_print`` so lines do not interleave."""
    lock_print.acquire()
    try:
        print(msg)
    finally:
        lock_print.release()

def encontrar_columna_expediente(df):
    """Guess which column of *df* holds the record ("expediente") code.

    Patterns are tried from most to least specific. For each pattern an
    exact match on the lower-cased, trimmed column name wins, then the first
    column whose name contains the pattern.

    Column labels are converted with ``str()`` before lower-casing, so
    non-string labels (e.g. integer headers) no longer raise
    ``AttributeError``.

    Returns:
        The original column label; the first column when no pattern matches;
        ``None`` when *df* has no columns.
    """
    patrones = ['c√≥digo de expediente', 'codigo de expediente', 'expediente', 'id', 'c√≥digo', 'codigo']
    columnas_lower = {str(col).lower().strip(): col for col in df.columns}

    for patron in patrones:
        if patron in columnas_lower:
            return columnas_lower[patron]
        for nombre, original in columnas_lower.items():
            if patron in nombre:
                return original

    return df.columns[0] if len(df.columns) > 0 else None

def leer_codigos_excel(ruta_archivo, nombre_archivo=""):
    """Read the record codes from one spreadsheet.

    The code column is chosen with ``encontrar_columna_expediente``. Values
    are converted to trimmed strings, and empty strings, the text "nan" and
    strings of two characters or fewer are discarded.

    Returns:
        List of code strings; empty when the file is missing or cannot be
        read (a message is printed in both cases).
    """
    if not os.path.exists(ruta_archivo):
        print(f"   ‚ùå Archivo no existe: {ruta_archivo}")
        return []

    codigos = []
    try:
        print(f"   üìÑ Leyendo: {nombre_archivo}")
        tabla = pd.read_excel(ruta_archivo)
        col_exp = encontrar_columna_expediente(tabla)
        if col_exp:
            serie = tabla[col_exp].dropna().astype(str).str.strip()
            codigos = [
                codigo
                for codigo in serie.tolist()
                if codigo and codigo.lower() != 'nan' and len(codigo) > 2
            ]
            print(f"      ‚úÖ Registros encontrados: {len(codigos)}")
    except Exception as e:
        print(f"      ‚ùå Error leyendo: {e}")
    return codigos

def crear_driver(headless=True):
    """Build a Selenium Chrome driver that launches the Brave binary.

    Headless mode is enabled only when *headless* is true and the module
    flag SHOW_BROWSER is false. The chromedriver binary is obtained via
    ``ChromeDriverManager().install()``.

    Returns:
        The started ``webdriver.Chrome`` instance.
    """
    opciones = Options()
    opciones.binary_location = BRAVE_PATH

    argumentos = []
    if headless and not SHOW_BROWSER:
        argumentos.append('--headless=new')
    argumentos.extend([
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--window-size=1920,1080',
        '--log-level=3',
        '--disable-blink-features=AutomationControlled',
        '--disable-infobars',
        '--disable-extensions',
    ])
    for argumento in argumentos:
        opciones.add_argument(argumento)

    opciones_experimentales = (
        ('excludeSwitches', ['enable-automation']),
        ('useAutomationExtension', False),
    )
    for clave, valor in opciones_experimentales:
        opciones.add_experimental_option(clave, valor)

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=opciones,
    )
    # Override navigator.webdriver on the initial page.
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    return driver

class DriverPool:
    """Fixed-size pool of browser drivers shared through a queue.

    Drivers are created eagerly in the constructor; a driver that fails to
    start is reported and skipped, so the pool may hold fewer than *size*
    drivers. ``get_driver`` blocks until a driver is available.
    """

    def __init__(self, size):
        self.drivers = Queue()
        print(f"ü¶Å Iniciando {size} navegador(es) Brave...")
        for i in range(size):
            try:
                driver = crear_driver(headless=not SHOW_BROWSER)
                self.drivers.put(driver)
                print(f"   ‚úÖ Brave #{i+1} listo")
            except Exception as e:
                print(f"   ‚ùå Error iniciando Brave #{i+1}: {e}")

    def get_driver(self):
        """Take a driver out of the pool, waiting if none is free."""
        return self.drivers.get()

    def return_driver(self, driver):
        """Put *driver* back into the pool."""
        self.drivers.put(driver)

    def close_all(self):
        """Quit every driver currently in the pool (best effort).

        A driver whose ``quit()`` fails is skipped so the rest still get
        closed. Only ``Exception`` is suppressed: the previous bare
        ``except:`` also swallowed KeyboardInterrupt/SystemExit.
        """
        print("üîí Cerrando navegadores...")
        while not self.drivers.empty():
            try:
                self.drivers.get().quit()
            except Exception:
                # Best-effort shutdown: keep closing the remaining drivers.
                continue

def extraer_ubicacion_detalle_solicitud(driver, verbose=False):
    """Scrape the location fields from the page currently loaded in *driver*.

    Flow:
      1. Click the "Detalle de la solicitud" tab (by text, then by tab
         selectors). If no tab is found, return the empty result.
      2. Click the capture-points section (by text, then by button text).
      3. Read the first displayed table whose headers mention utm, latitud
         or huso, and map the first row with at least six cells onto the
         result by header name (positional fallback if nothing matched).
      4. If still empty, fall back to regular expressions on the page text.

    Args:
        driver: Selenium driver with the record page already loaded.
        verbose: accepted for interface compatibility; not used here.

    Returns:
        Dict with keys 'Latitud', 'Longitud', 'Datum', 'UTM norte',
        'UTM este', 'Huso', 'Datum UTM', 'Unidad UTM'; values are the scraped
        strings or ``None``.

    Scraping stays best-effort (errors yield partial or empty results), but
    the handlers now catch ``Exception`` instead of using bare ``except:``,
    so KeyboardInterrupt/SystemExit are no longer swallowed.
    """
    ubicacion = {
        'Latitud': None, 'Longitud': None, 'Datum': None,
        'UTM norte': None, 'UTM este': None, 'Huso': None,
        'Datum UTM': None, 'Unidad UTM': None
    }
    try:
        # Step 1: open the request-detail tab.
        tab_encontrado = False
        try:
            tabs = driver.find_elements(By.XPATH, "//*[contains(text(), 'Detalle de la solicitud')]")
            for tab in tabs:
                if tab.is_displayed():
                    driver.execute_script("arguments[0].click();", tab)
                    tab_encontrado = True
                    time.sleep(2)
                    break
        except Exception:
            pass

        if not tab_encontrado:
            try:
                mat_tabs = driver.find_elements(By.CSS_SELECTOR, ".mat-tab-label, [role='tab']")
                for tab in mat_tabs:
                    if 'solicitud' in tab.text.lower():
                        driver.execute_script("arguments[0].click();", tab)
                        tab_encontrado = True
                        time.sleep(2)
                        break
            except Exception:
                pass

        if not tab_encontrado:
            return ubicacion

        # Step 2: expand the capture-points section.
        puntos_encontrado = False
        try:
            elems = driver.find_elements(By.XPATH, "//*[contains(text(), 'Puntos de captaci√≥n')]")
            for elem in elems:
                if elem.is_displayed():
                    driver.execute_script("arguments[0].click();", elem)
                    puntos_encontrado = True
                    time.sleep(2)
                    break
        except Exception:
            pass

        if not puntos_encontrado:
            try:
                btns = driver.find_elements(By.CSS_SELECTOR, "button, mat-expansion-panel-header")
                for btn in btns:
                    if 'capta' in btn.text.lower():
                        driver.execute_script("arguments[0].click();", btn)
                        puntos_encontrado = True
                        time.sleep(2)
                        break
            except Exception:
                pass

        # Step 3: read the coordinates table.
        time.sleep(1)
        tablas = driver.find_elements(By.TAG_NAME, "table")
        for tabla in tablas:
            try:
                if not tabla.is_displayed():
                    continue
                headers = [th.text.strip().lower() for th in tabla.find_elements(By.TAG_NAME, "th")]
                if any(h in ' '.join(headers) for h in ['utm', 'latitud', 'huso']):
                    filas = tabla.find_elements(By.TAG_NAME, "tr")
                    for fila in filas:
                        celdas = fila.find_elements(By.TAG_NAME, "td")
                        if len(celdas) >= 6:
                            vals = [c.text.strip() for c in celdas]
                            # NOTE(review): empty headers are dropped before
                            # pairing with cell positions; if the table has
                            # blank header cells the pairing may shift.
                            headers_limpios = [h for h in headers if h]
                            for idx, h in enumerate(headers_limpios):
                                if idx < len(vals):
                                    v = vals[idx]
                                    if v and v != '-':
                                        if 'latitud' in h:
                                            ubicacion['Latitud'] = v
                                        elif 'longitud' in h:
                                            ubicacion['Longitud'] = v
                                        elif 'datum' in h and 'utm' not in h:
                                            ubicacion['Datum'] = v
                                        elif 'norte' in h:
                                            ubicacion['UTM norte'] = v
                                        elif 'este' in h:
                                            ubicacion['UTM este'] = v
                                        elif 'huso' in h:
                                            ubicacion['Huso'] = v
                                        elif 'datum utm' in h:
                                            ubicacion['Datum UTM'] = v
                                        elif 'unidad' in h:
                                            ubicacion['Unidad UTM'] = v

                            # Positional fallback when no header matched.
                            if not any(ubicacion.values()):
                                orden = ['Latitud', 'Longitud', 'Datum', 'UTM norte', 'UTM este', 'Huso', 'Datum UTM', 'Unidad UTM']
                                for i, k in enumerate(orden):
                                    if i < len(vals) and vals[i] != '-':
                                        ubicacion[k] = vals[i]
                            # Only the first data row is used.
                            break
                    # Only the first matching table is used.
                    break
            except Exception:
                continue

        # Step 4: last resort, regex over the visible page text.
        if not any(ubicacion.values()):
            body = driver.find_element(By.TAG_NAME, "body").text
            patrones = {
                'UTM norte': r'UTM\s*norte[:\s]+([0-9.,]+)', 'UTM este': r'UTM\s*este[:\s]+([0-9.,]+)',
                'Huso': r'Huso[:\s]+(\d+)', 'Latitud': r'Latitud[:\s]+([-0-9.,]+)', 'Longitud': r'Longitud[:\s]+([-0-9.,]+)'
            }
            for k, p in patrones.items():
                m = re.search(p, body, re.IGNORECASE)
                if m and not ubicacion.get(k):
                    ubicacion[k] = m.group(1)

    except Exception:
        # Best-effort scraping: return whatever was collected so far.
        pass
    return ubicacion

def _click_primer_visible(driver, xpath, espera):
    """Click the first displayed element matching *xpath* via JavaScript.

    After the click the function sleeps *espera* seconds. Returns True when an
    element was clicked and False when no displayed element matched. Any
    exception raised by the driver propagates to the caller.
    """
    for elemento in driver.find_elements(By.XPATH, xpath):
        if elemento.is_displayed():
            driver.execute_script("arguments[0].click();", elemento)
            time.sleep(espera)
            return True
    return False


def extraer_datos_dga(url, driver, verbose=False):
    """Load an expediente detail page and collect its data into a dict.

    Args:
        url: Detail-page URL; its last path segment is used as the expediente
            id, with a trailing ``-1`` removed.
        driver: Browser driver used to load and inspect the page.
        verbose: Forwarded to ``extraer_ubicacion_detalle_solicitud``; when
            true, failures of the best-effort steps are also printed.

    Returns:
        A dict with the keys ``expediente``, ``datos_solicitud`` (left empty by
        this function), ``caudal_concedido``, ``caudal_mensual`` and
        ``ubicacion``.

    Raises:
        Whatever ``driver.get`` or ``extraer_ubicacion_detalle_solicitud``
        raise. Failures while opening the resolution detail or reading the
        granted-flow section are best-effort: they leave the corresponding
        entries as they were and do not abort the extraction.
    """
    raw_id = url.split('/')[-1]
    expediente_limpio = raw_id[:-2] if raw_id.endswith('-1') else raw_id

    resultado = {
        'expediente': expediente_limpio,
        'datos_solicitud': {}, 'caudal_concedido': {}, 'caudal_mensual': {}, 'ubicacion': {}
    }

    driver.get(url)
    time.sleep(3)

    resultado['ubicacion'] = extraer_ubicacion_detalle_solicitud(driver, verbose)

    # Best effort: the page is still parsed if this section cannot be opened.
    try:
        _click_primer_visible(driver, "//*[contains(text(), 'Detalle de la resoluci√≥n')]", 2)
    except Exception as exc:
        if verbose:
            print(f"[{expediente_limpio}] fallo al abrir el detalle de la resolucion: {exc}")

    # Best effort: whatever was collected before a failure is kept.
    try:
        _click_primer_visible(driver, "//*[contains(text(), 'Caudal concedido')]", 1)

        body = driver.find_element(By.TAG_NAME, "body").text
        lines = body.split('\n')
        campos = {'Ejercicio del derecho': None, 'Tipo de distribuci√≥n': None, 'Caudal': None, 'Volumen': None}

        for i, l in enumerate(lines):
            lc = l.strip()
            if lc not in campos:
                continue
            # The value is the first usable line among the (up to) four lines
            # after the label; separator rows and other labels are skipped.
            for siguiente in lines[i + 1:i + 5]:
                val = siguiente.strip()
                if val and '---' not in val and val not in campos:
                    campos[lc] = val
                    break
        resultado['caudal_concedido'] = campos

        meses = ['Enero', 'Febrero', 'Marzo', 'Abril']
        m_names = ['Enero', 'Febrero', 'Marzo', 'Abril', 'Mayo', 'Junio', 'Julio', 'Agosto',
                   'Septiembre', 'Octubre', 'Noviembre', 'Diciembre', 'Unidad']
        for t in driver.find_elements(By.TAG_NAME, "table"):
            if not t.is_displayed():
                continue
            hs = [x.text.strip() for x in t.find_elements(By.TAG_NAME, "th")]
            if not any(m in hs for m in meses):
                continue
            for r in t.find_elements(By.TAG_NAME, "tr"):
                cs = [x.text.strip() for x in r.find_elements(By.TAG_NAME, "td")]
                if len(cs) >= 12:
                    # zip() stops at the shorter sequence, so a row without
                    # the trailing 'Unidad' cell fills only the twelve months.
                    resultado['caudal_mensual'].update(zip(m_names, cs))
                    break
    except Exception as exc:
        if verbose:
            print(f"[{expediente_limpio}] fallo al leer el caudal concedido: {exc}")

    return resultado

def guardar_txt_formato_especifico(datos, ruta_carpeta):
    """Write one expediente as a UTF-8 text report and return the file path.

    Args:
        datos: Dict with an ``expediente`` key and, optionally, the
            ``ubicacion``, ``caudal_concedido`` and ``caudal_mensual`` dicts.
        ruta_carpeta: Folder in which ``<expediente>.txt`` is created.

    Returns:
        The path of the written file.

    Granted-flow fields that are missing, ``None`` or empty are written with
    the 'no information' placeholder instead of the literal text ``None``;
    monthly values that are missing or ``None`` are written as ``0``.
    """
    nombre = f"{datos['expediente']}.txt"
    ruta = os.path.join(ruta_carpeta, nombre)
    cc = datos.get('caudal_concedido') or {}
    cm = datos.get('caudal_mensual') or {}
    ub = datos.get('ubicacion') or {}

    sin_info = 'Sin informaci√≥n'
    campos_cc = ['Ejercicio del derecho', 'Tipo de distribuci√≥n', 'Caudal', 'Volumen']
    ms = ['Enero', 'Febrero', 'Marzo', 'Abril', 'Mayo', 'Junio', 'Julio', 'Agosto',
          'Septiembre', 'Octubre', 'Noviembre', 'Diciembre', 'Unidad']

    def _mensual(mes):
        # A None cell would make "\t".join() raise, so it is treated as missing.
        valor = cm.get(mes)
        return '0' if valor is None else str(valor)

    with open(ruta, 'w', encoding='utf-8') as f:
        f.write("="*60 + f"\nEXPEDIENTE: {datos['expediente']}\n" + "="*60 + "\n\n")
        f.write("PUNTOS DE CAPTACI√ìN (Ubicaci√≥n):\n" + "-"*40 + "\n")
        for k, v in ub.items():
            f.write(f"{k}: {v if v else '-'}\n")
        f.write("\nCAUDAL CONCEDIDO:\n" + "-"*40 + "\n")
        for campo in campos_cc:
            # The scraper pre-fills these keys with None, so a plain
            # dict.get(key, default) would never fall back to the placeholder.
            f.write(f"{campo}: {cc.get(campo) or sin_info}\n")
        f.write("\n")
        f.write("CAUDAL MENSUAL:\n" + "-"*40 + "\n")
        f.write("\t".join(ms) + "\n")
        f.write("\t".join(_mensual(m) for m in ms) + "\n")
    return ruta

def generar_fila_excel(datos):
    """Flatten one scraped expediente into a single spreadsheet row.

    The returned dict starts with the expediente code, followed by the
    location columns, the granted-flow columns and one ``Caudal_<name>``
    column per month plus ``Caudal_Unidad``. Keys absent from the source
    dicts produce an empty string.
    """
    concedido = datos.get('caudal_concedido', {})
    mensual = datos.get('caudal_mensual', {})
    ubicacion = datos.get('ubicacion', {})

    # (output column, key in the source dict) pairs, in output order.
    columnas_ubicacion = (
        ('Latitud', 'Latitud'),
        ('Longitud', 'Longitud'),
        ('Datum', 'Datum'),
        ('UTM_Norte', 'UTM norte'),
        ('UTM_Este', 'UTM este'),
        ('Huso', 'Huso'),
        ('Datum_UTM', 'Datum UTM'),
        ('Unidad_UTM', 'Unidad UTM'),
    )
    columnas_concedido = (
        ('Ejercicio del derecho', 'Ejercicio del derecho'),
        ('Tipo de distribuci√≥n', 'Tipo de distribuci√≥n'),
        ('Caudal Aprobado', 'Caudal'),
        ('Volumen Aprobado', 'Volumen'),
    )
    periodos = ('Enero', 'Febrero', 'Marzo', 'Abril', 'Mayo', 'Junio', 'Julio', 'Agosto',
                'Septiembre', 'Octubre', 'Noviembre', 'Diciembre', 'Unidad')

    fila = {'C√≥digo de Expediente': datos['expediente']}
    fila.update((columna, ubicacion.get(clave, '')) for columna, clave in columnas_ubicacion)
    fila.update((columna, concedido.get(clave, '')) for columna, clave in columnas_concedido)
    fila.update((f'Caudal_{periodo}', mensual.get(periodo, '')) for periodo in periodos)
    return fila

def procesar_codigo(codigo, driver_pool, verbose=False):
    """Scrape, save and log a single expediente code using a pooled driver.

    A driver is taken from *driver_pool* and always handed back when the work
    ends. On success the result is ``{'status': 'ok', 'codigo', 'fila'}``; any
    exception raised while scraping or saving is recorded with
    ``registrar_error`` and reported as ``{'status': 'error', 'codigo', 'msg'}``.
    """
    navegador = driver_pool.get_driver()
    codigo = str(codigo).strip()
    # The detail URL always ends in "-1"; append it only when it is missing.
    sufijo = "" if codigo.endswith('-1') else "-1"
    url = f"https://consulta-expedientes.mop.gob.cl/detalle-expediente/{codigo}{sufijo}"

    try:
        datos = extraer_datos_dga(url, navegador, verbose)
        guardar_txt_formato_especifico(datos, RUTAS['txt_salida'])
        fila = generar_fila_excel(datos)
        registrar_exito(codigo, datos)
    except Exception as error:
        mensaje = str(error)
        registrar_error(codigo, mensaje)
        return {'status': 'error', 'codigo': codigo, 'msg': mensaje}
    else:
        return {'status': 'ok', 'codigo': codigo, 'fila': fila}
    finally:
        driver_pool.return_driver(navegador)

def main():
    """Download every pending expediente and consolidate the rows into Excel.

    Steps:
      1. Abort when the browser binary at ``BRAVE_PATH`` does not exist.
      2. Collect the unique codes from both input workbooks.
      3. Drop codes already present in the success log, with or without the
         trailing ``-1``.
      4. Process the remaining codes on a thread pool backed by a driver pool,
         writing ``parcial.xlsx`` every ``BATCH_SAVE_SIZE`` successful rows.
      5. Close all drivers and write a timestamped ``Consolidado_*.xlsx``
         when at least one row was collected.
    """
    print("\n" + "="*70)
    print("üöÄ DESCARGA MASIVA DGA (CORREGIDA)")
    print("="*70)

    if not os.path.exists(BRAVE_PATH):
        print(f"‚ùå Brave no encontrado en: {BRAVE_PATH}")
        return

    codigos_procesar = set()
    codigos_procesar.update(leer_codigos_excel(ARCHIVO_EXCEL_1, "Registros Negativos"))
    codigos_procesar.update(leer_codigos_excel(ARCHIVO_EXCEL_2, "Registros Fuera Chile"))

    lista_codigos = sorted(codigos_procesar)
    if not lista_codigos:
        print("‚ùå No hay c√≥digos para procesar.")
        return

    log = cargar_log(RUTAS['log_exitos'])
    procesados = set(log.keys())
    # Strip only the trailing "-1": str.replace would also delete a "-1" that
    # occurs in the middle of a code and produce a key matching no real code.
    procesados.update(k[:-2] for k in log.keys() if k.endswith('-1'))

    pendientes = [c for c in lista_codigos if c not in procesados and f"{c}-1" not in log]

    print(f"\nüìã Total √∫nicos: {len(lista_codigos)}")
    print(f"‚úÖ Ya procesados: {len(lista_codigos) - len(pendientes)}")
    print(f"üöÄ Pendientes:    {len(pendientes)}")

    if not pendientes:
        print("\n‚ú® ¬°Todo actualizado!")
        return

    total = len(pendientes)
    ruta_parcial = os.path.join(RUTAS['excel_salida'], "parcial.xlsx")
    driver_pool = DriverPool(NUM_WORKERS)
    resultados = []

    try:
        with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
            futures = {executor.submit(procesar_codigo, c, driver_pool): c for c in pendientes}

            try:
                for i, fut in enumerate(as_completed(futures), start=1):
                    code = futures[fut]
                    try:
                        res = fut.result()
                    except Exception as exc:
                        # One failing worker must not discard the rows that
                        # were already collected; report it as an error.
                        res = {'status': 'error', 'codigo': code, 'msg': str(exc)}

                    if res['status'] == 'ok':
                        resultados.append(res['fila'])
                        print_safe(f"[{i}/{total}] ‚úÖ {code}")
                        # Checkpoint only after a new row was added, so that
                        # failures never rewrite an unchanged (or empty) file.
                        if len(resultados) % BATCH_SAVE_SIZE == 0:
                            pd.DataFrame(resultados).to_excel(ruta_parcial, index=False)
                    else:
                        print_safe(f"[{i}/{total}] ‚ùå {code} | {str(res.get('msg'))[:50]}")
            except KeyboardInterrupt:
                # Drop the queued work; otherwise leaving the executor block
                # would still wait for every remaining code to be processed.
                for pendiente in futures:
                    pendiente.cancel()
                raise

    except KeyboardInterrupt:
        print("\n‚ö†Ô∏è Interrumpido por usuario")
    finally:
        driver_pool.close_all()

    if resultados:
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        ruta_fin = os.path.join(RUTAS['excel_salida'], f"Consolidado_{ts}.xlsx")
        pd.DataFrame(resultados).to_excel(ruta_fin, index=False)
        print(f"\nüíæ Guardado final: {ruta_fin}")

# Start the bulk download only when this code runs as the main module.
if __name__ == "__main__":
    main()

# Fixing values

In [None]:
import pandas as pd
import numpy as np
import re

# Workbook to clean (presumably a consolidated file produced by the download
# step; confirm) and the path where the corrected copy is written.
input_file = r"\assessment_of_wells_chile\data\DGA\output_revisados\excel\Consolidado_20251216_222918.xlsx"
output_file = r"\assessment_of_wells_chile\data\DGA\output_revisados\excel\Consolidado_txt_individual.xlsx"

print(f"üìñ Leyendo archivo: {input_file} ...")
# Load the workbook; the statements further down convert its columns in place.
df = pd.read_excel(input_file)

def limpiar_formato_chileno_seguro(valor):
    """Convert a cell to float, reading text as a Chilean-formatted number.

    Missing or blank cells give NaN. Values that are already ``int`` or
    ``float`` are returned as ``float`` unchanged. For text, every character
    other than digits, ``,``, ``.`` and ``-`` is discarded; then ``.`` is
    treated as a thousands separator (removed) and ``,`` as the decimal mark.
    Text that still cannot be parsed gives NaN.
    """
    if pd.isna(valor) or str(valor).strip() == "":
        return np.nan

    # Already numeric: no separator handling needed.
    if isinstance(valor, (int, float)):
        return float(valor)

    solo_numerico = re.sub(r'[^\d,.-]', '', str(valor).strip())
    if not solo_numerico:
        return np.nan

    # One pass: drop thousands dots, turn the decimal comma into a point.
    normalizado = solo_numerico.translate({ord('.'): None, ord(','): '.'})

    try:
        return float(normalizado)
    except ValueError:
        return np.nan

# Month names used to build the per-month flow column names.
meses = ['Enero', 'Febrero', 'Marzo', 'Abril', 'Mayo', 'Junio',
         'Julio', 'Agosto', 'Septiembre', 'Octubre', 'Noviembre', 'Diciembre']

# Fixed columns to convert, followed by the monthly flow columns that exist.
cols_a_convertir = [
    'Latitud', 'Longitud', 'Datum',
    'UTM_Norte', 'UTM_Este', 'Huso', 'Datum_UTM',
    'Caudal Aprobado', 'Volumen Aprobado',
]
cols_a_convertir += [f'Caudal_{mes}' for mes in meses if f'Caudal_{mes}' in df.columns]

print("üîÑ Aplicando correcci√≥n de formato Chileno (segura)...")

# Convert each listed column cell by cell; columns absent from the file are skipped.
for col in cols_a_convertir:
    if col not in df.columns:
        continue
    df[col] = df[col].apply(limpiar_formato_chileno_seguro)

print(f"üíæ Guardando archivo corregido en: {output_file}")
df.to_excel(output_file, index=False)
print("‚úÖ ¬°Proceso completado! Los n√∫meros enteros (19, 188200) se han mantenido intactos.")

# Quick visual check of a few converted columns (shown as the cell output).
check_cols = [c for c in ('UTM_Norte', 'Huso', 'Caudal_Enero') if c in df.columns]
df[check_cols].head()