In [227]:
# Si no se está usando un ambiente virtual, descomentar y ejecutar las siguientes líneas
# !pip install numpy
# !pip install pandas
# !pip install ucimlrepo
#!pip install seaborn


In [228]:
import sys
print(sys.executable)

c:\Users\chile\anaconda3\envs\devstack\python.exe


In [229]:
# Cargando las librerías

import os
from pathlib import Path
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.api.types import CategoricalDtype
ignore_warnings = True
import seaborn as sns
import json
#from ucimlrepo import fetch_ucirepo

#import yaml

In [230]:
# --- Config mínima ---
DICT_TXT = Path(r"C:\Users\chile\projects\proy_insurance\data\raw\dictionary.txt")
CSV_PATH = Path(r"C:\Users\chile\projects\proy_insurance\data\raw\insurance_company_modified.csv")
OUT_DIR  = Path(r"C:\Users\chile\projects\proy_insurance\data\clean_data")
OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_CSV    = OUT_DIR / "coil2000_clean.csv"
OUT_REPORT = OUT_DIR / "coil2000_dtypes_report.csv"
N_COLS = 86
CAT_COLS = {"MOSTYPE", "MOSHOOFD", "MKOOPKLA"}

In [232]:
RANGES = {
    "MOSTYPE":  (1, 41),
    "MAANTHUI": (1, 10),
    "MGEMOMV":  (1, 6),
    "MGEMLEEF": (1, 6),
    "MOSHOOFD": (1, 10),
    "MGODRK":   (0, 9),
    "PWAPART":  (0, 9),
    "AWAPART":  (1, 12),
    "CARAVAN":  (0, 1),
}

INVALID_PATTERN = re.compile(r"(?i)^\s*$|^(nan|none|null|n/a|invalid|\?|unknown|error|missing|-)$")

# 1) Diccionario -> nombres (simple)
def parse_dictionary(txt_path: Path):
    txt = txt_path.read_text(encoding="utf-8", errors="ignore")
    names = []
    in_table = False
    for ln in txt.splitlines():
        ln = ln.strip()
        if re.match(r"^1\s+\S+", ln):  # empieza tabla
            in_table = True
        if not in_table:
            continue
        if not ln or ln.startswith("L0"):  # termina tabla
            break
        m = re.match(r"^(\d+)\s+(\S+)\s+.*$", ln)
        if m:
            names.append((int(m.group(1)), m.group(2)))
    cols = [n for _, n in sorted(names, key=lambda x: x[0])]
    if len(cols) < N_COLS:
        raise ValueError(f"El diccionario tiene {len(cols)} nombres; se requieren {N_COLS}.")
    return cols[:N_COLS]

# 2) Limpieza strings (simple)
def normalize_strings_to_nan(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for c in df.select_dtypes(include="object").columns:
        df[c] = df[c].astype("string").str.strip()
    return df.replace(INVALID_PATTERN, np.nan, regex=True)

# 3) Tipado: 3 categóricas; resto numérico (simple)
def cast_types(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    # numérico para no-categóricas
    non_cat = [c for c in df.columns if c not in CAT_COLS]
    df[non_cat] = df[non_cat].apply(pd.to_numeric, errors="coerce")
    # categóricas
    for c in CAT_COLS:
        if c in df.columns:
            df[c] = df[c].astype("category")
    return df

# 4) Reglas por intervalos cerrados (simple)
def apply_interval_rules(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    print(df.shape)

    for col, (lo, hi) in RANGES.items():
        if col not in df.columns:
            continue
        # convertir a numérico de forma robusta (string -> numeric)
        s = pd.to_numeric(df[col].astype("string").str.strip(), errors="coerce") \
            if col in CAT_COLS or not pd.api.types.is_numeric_dtype(df[col]) else df[col]
        mask = s.between(lo, hi, inclusive="both")
        removed = (~mask).sum()
        if removed:
            print(f"{col} ∈ [{lo}, {hi}]: filas eliminadas = {removed}")
        df = df[mask].copy()
    # CARAVAN a int si procede
    if "CARAVAN" in df.columns:
        vals = set(pd.Series(df["CARAVAN"]).dropna().unique().tolist())
        if vals <= {0, 1}:
            df["CARAVAN"] = pd.to_numeric(df["CARAVAN"], errors="coerce").astype("Int64").astype("int64")
    df.reset_index(drop=True, inplace=True)
    return df

# 5) Sin nulos en salida (simple)
def enforce_no_nulls(df: pd.DataFrame) -> pd.DataFrame:
    df = df.replace(r"^\s*$", np.nan, regex=True)
    before = len(df)
    df = df.dropna().reset_index(drop=True)
    print(f"Filas eliminadas por nulos/vacíos: {before - len(df)}")
    return df

# 6) Pipeline principal (compacto)
def main():
    cols = parse_dictionary(DICT_TXT)

    df = pd.read_csv(CSV_PATH, header=None)
    if df.shape[1] < N_COLS:
        raise ValueError(f"El CSV tiene {df.shape[1]} columnas; se requieren {N_COLS}.")
    df = df.iloc[:, :N_COLS].copy()
    df.columns = cols

    df = normalize_strings_to_nan(df)
    df = apply_interval_rules(df)
    df = cast_types(df)
    df = enforce_no_nulls(df)
    assert df.isna().sum().sum() == 0

    df.to_csv(OUT_CSV, index=False)

    report = pd.DataFrame({
        "columna": df.columns,
        "dtype_final": df.dtypes.astype(str).values,
        "nulos": df.isna().sum().values,
        "unicos": df.nunique(dropna=True).values,
        "es_categorica": [c in CAT_COLS for c in df.columns],
    })
    report.to_csv(OUT_REPORT, index=False)

    print(f"Guardado dataset limpio: {OUT_CSV}")
    print(f"Guardado reporte dtypes: {OUT_REPORT}")

if __name__ == "__main__":
    main()


(5940, 86)
MOSTYPE ∈ [1, 41]: filas eliminadas = 75
MAANTHUI ∈ [1, 10]: filas eliminadas = 67
MGEMOMV ∈ [1, 6]: filas eliminadas = 62
MGEMLEEF ∈ [1, 6]: filas eliminadas = 67
MOSHOOFD ∈ [1, 10]: filas eliminadas = 56
MGODRK ∈ [0, 9]: filas eliminadas = 26
PWAPART ∈ [0, 9]: filas eliminadas = 42
AWAPART ∈ [1, 12]: filas eliminadas = 2927
CARAVAN ∈ [0, 1]: filas eliminadas = 8
Filas eliminadas por nulos/vacíos: 1267
Guardado dataset limpio: C:\Users\chile\projects\proy_insurance\data\clean_data\coil2000_clean.csv
Guardado reporte dtypes: C:\Users\chile\projects\proy_insurance\data\clean_data\coil2000_dtypes_report.csv
