# 02 · Preprocessing Pipeline
Este cuaderno replica la limpieza de datos directamente con funciones locales para dejar el dataset listo para modelado.


In [1]:
from pathlib import Path
import re

import numpy as np
import pandas as pd
import yaml


## Configuración
Leemos los parámetros del YAML del proyecto y resolvemos rutas relevantes.


In [2]:
# The project root is taken to be the parent of the current working directory.
project_root = Path('..').resolve()
params_path = project_root / 'config' / 'params.yaml'
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding.
params = yaml.safe_load(params_path.read_text(encoding='utf-8'))

# Input CSV location is fixed here; the output location comes from the YAML
# (`paths.interim`) and is interpreted relative to the project root.
raw_path = (project_root / 'data' / 'enriched' / 'insurance_company_enriched.csv').resolve()
interim_rel = params['paths']['interim']
interim_path = (project_root / interim_rel).resolve()
# Create the destination folder now so the final save step finds it in place.
interim_path.parent.mkdir(parents=True, exist_ok=True)

print('RAW:', raw_path)
print('INTERIM:', interim_path)


RAW: /Users/jfts/Documents/ML_OPS_PROYECT/mlops_proyect/data/enriched/insurance_company_enriched.csv
INTERIM: /Users/jfts/Documents/ML_OPS_PROYECT/mlops_proyect/data/interim/data_clean.parquet


## Limpieza
Implementamos la rutina de preparación directamente en este cuaderno (sin depender de `src/`).


In [3]:
# Spellings treated as "no value". `standardize_missing` compares them against
# the stripped, lower-cased text of each cell.
MISSING_TOKENS = {
    '', ' ',
    'na', 'n/a', 'nan', 'none', 'null',
    '?', '--',
    'unknown', 'missing', 'invalid', 'bad',
}

def load_raw_dataframe(path: Path) -> pd.DataFrame:
    """Read the raw CSV at ``path`` into a DataFrame.

    ``keep_default_na=False`` turns off pandas' built-in list of NA spellings,
    so markers such as ``NA`` or an empty field are not converted to NaN at
    load time. ``low_memory=False`` is passed through to ``pd.read_csv``
    unchanged.
    """
    read_options = {'low_memory': False, 'keep_default_na': False}
    return pd.read_csv(path, **read_options)


def slugify(name: str) -> str:
    """Turn an arbitrary label into a lower-case, underscore-separated identifier.

    The label is stripped and lower-cased, every run of characters outside
    ``\\w`` becomes a single underscore, repeated underscores are collapsed and
    leading/trailing underscores are removed. An empty result becomes ``'col'``
    and a result starting with a digit is prefixed with ``'col_'``.
    """
    lowered = name.strip().lower()
    slug = re.sub(r"_+", '_', re.sub(r"[^\w]+", '_', lowered)).strip('_')
    slug = slug or 'col'
    return f'col_{slug}' if slug[0].isdigit() else slug


def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column names are slugified and unique.

    Each label is passed through ``slugify``. The first column that maps to a
    given slug keeps it as-is; later columns mapping to the same slug receive a
    numeric suffix (``name_1``, ``name_2``, ...). A suffixed candidate that is
    already in use (e.g. columns ``a``, ``a`` and ``a_1``) is skipped in favour
    of the next free suffix, so the resulting names never collide.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the renamed columns, in the original order.
    """
    df = df.copy()
    seen = {}      # slug -> next suffix to try for that slug
    used = set()   # every final name handed out so far
    new_cols = []
    for col in df.columns:
        base = slugify(str(col))
        suffix = seen.get(base, 0)
        candidate = base if suffix == 0 else f"{base}_{suffix}"
        # Another column may already own this exact name (its own slug can look
        # like "<base>_<n>"), so keep bumping until the name is free.
        while candidate in used:
            suffix += 1
            candidate = f"{base}_{suffix}"
        used.add(candidate)
        new_cols.append(candidate)
        seen[base] = suffix + 1
    df.columns = new_cols
    return df


def standardize_missing(df: pd.DataFrame, missing_tokens=None) -> pd.DataFrame:
    """Replace textual "missing value" markers with NaN in text columns.

    Every object- or string-dtype column is converted to text; a cell becomes
    NaN when it was already missing or when its stripped, lower-cased text
    equals one of the tokens. Remaining cells keep their text form (not
    stripped or lower-cased). Other columns are left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    missing_tokens : iterable of str, optional
        Markers to treat as missing. Defaults to the module-level
        ``MISSING_TOKENS``. Tokens are stripped and lower-cased before the
        comparison, and the empty string is always treated as missing.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the markers replaced by NaN.
    """
    if missing_tokens is None:
        missing_tokens = MISSING_TOKENS
    # Cell text is stripped before matching, so tokens must be stripped as well
    # or a padded token could never match.
    tokens = {str(token).strip().lower() for token in missing_tokens} | {''}

    df = df.copy()
    for col in df.columns:
        column = df[col]
        is_text = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )
        if not is_text:
            continue
        as_text = column.astype(str)
        # Keep already-missing cells missing explicitly instead of relying on
        # how they happen to be spelled once converted to text.
        is_missing = column.isna() | as_text.str.strip().str.lower().isin(tokens)
        df[col] = as_text.where(~is_missing, np.nan)
    return df


def coerce_numeric(df: pd.DataFrame, numeric_like_ratio: float = 0.8):
    """Convert text columns that are mostly numeric into numeric columns.

    Columns that are already numeric are reported as-is. Each object- or
    string-dtype column is parsed with ``pd.to_numeric(errors='coerce')``; when
    the share of its non-missing cells that parse successfully is at least
    ``numeric_like_ratio`` the column is replaced by the parsed values (cells
    that failed to parse become NaN). The share is measured against non-missing
    cells only, so a clean numeric column with many gaps is still converted.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    numeric_like_ratio : float, default 0.8
        Minimum fraction of non-missing cells that must parse as numbers.

    Returns
    -------
    tuple[pd.DataFrame, list]
        The converted copy and the names of all numeric columns (those numeric
        on input plus the ones converted here), in column order.
    """
    df = df.copy()
    numeric_cols = []
    for col in df.columns:
        column = df[col]
        if pd.api.types.is_numeric_dtype(column):
            numeric_cols.append(col)
            continue
        is_text = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )
        if not is_text:
            continue
        present = column.notna().sum()
        if present == 0:
            # Nothing to parse; leave the all-missing column as it is.
            continue
        coerced = pd.to_numeric(column, errors='coerce')
        parsed = coerced.notna().sum()
        if parsed == 0:
            continue
        if parsed / present >= numeric_like_ratio:
            df[col] = coerced
            numeric_cols.append(col)
    return df, numeric_cols


def mask_outliers(df: pd.DataFrame, threshold: float = 4.0):
    """Replace extreme values in numeric columns with missing values.

    For every numeric column a z-score is computed from that column's own mean
    and standard deviation (both computed in one pass over all values, the
    extreme ones included). Cells whose absolute z-score exceeds ``threshold``
    are set to missing. Columns whose standard deviation is zero or undefined
    are skipped.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    threshold : float, default 4.0
        Absolute z-score above which a value is masked.

    Returns
    -------
    tuple[pd.DataFrame, list]
        The masked copy and the names of the columns in which at least one
        value was masked.
    """
    df = df.copy()
    masked_cols = []
    for col in df.select_dtypes(include=[np.number]).columns:
        series = df[col]
        std = series.std(skipna=True)
        mean = series.mean(skipna=True)
        # NaN compares False here, so constant and empty columns are skipped.
        if not (std > 0):
            continue
        z = (series - mean) / std
        # Undefined comparisons (missing cells) count as "not an outlier".
        mask = (z.abs() > threshold).fillna(False).astype(bool)
        if mask.any():
            # Series.mask returns a new, properly upcast column; writing NaN
            # into an integer column through .loc is a deprecated silent upcast.
            df[col] = series.mask(mask)
            masked_cols.append(col)
    return df, masked_cols


def drop_sparse(df: pd.DataFrame, thresh: float = 0.95):
    """Remove columns whose share of missing values is strictly above ``thresh``.

    With ``thresh >= 1`` nothing can qualify, so the input frame itself is
    returned together with an empty list. Otherwise a new frame without the
    sparse columns is returned along with their names.
    """
    if thresh >= 1:
        return df, []
    na_share = df.isna().mean()
    sparse = [name for name, share in na_share.items() if share > thresh]
    return df.drop(columns=sparse), sparse


def clean_dataframe(df: pd.DataFrame, drop_na_thresh: float, outlier_z: float, numeric_like_ratio: float = 0.8):
    """Run the full cleaning sequence and report what each step touched.

    Steps, in order: normalise column names, standardise missing-value
    markers, coerce numeric-looking columns, mask outliers (skipped when
    ``outlier_z`` is ``None``) and drop sparse columns.

    Returns
    -------
    tuple[pd.DataFrame, dict]
        The cleaned frame and a report with the keys ``numeric_columns``,
        ``outlier_masked_cols`` and ``dropped_columns``, each holding a list of
        column names.
    """
    typed, numeric_cols = coerce_numeric(
        standardize_missing(normalize_columns(df)),
        numeric_like_ratio=numeric_like_ratio,
    )

    if outlier_z is None:
        # Outlier masking disabled: pass the frame through unchanged.
        trimmed, masked = typed, []
    else:
        trimmed, masked = mask_outliers(typed, threshold=outlier_z)

    result, dropped = drop_sparse(trimmed, drop_na_thresh)

    report = {
        'numeric_columns': numeric_cols,
        'outlier_masked_cols': masked,
        'dropped_columns': dropped,
    }
    return result, report


# Load the raw CSV and clean it with the thresholds from the `clean` section
# of the project parameters.
df_raw = load_raw_dataframe(raw_path)
clean_cfg = params['clean']
df_clean, summary = clean_dataframe(
    df_raw,
    numeric_like_ratio=0.8,
    drop_na_thresh=clean_cfg['drop_na_thresh'],
    outlier_z=clean_cfg['outlier_z'],
)
# Shape before and after cleaning, then a preview of the cleaned frame.
print(df_raw.shape, '→', df_clean.shape)
df_clean.head()


(5937, 96) → (5937, 96)


Unnamed: 0,mostype,maanthui,mgemomv,mgemleef,moshoofd,mgodrk,mgodpr,mgodov,mgodge,mrelge,...,mostype_label,mgemleef_label,moshoofd_label,mgodrk_label,mgodpr_label,mgodov_label,mgodge_label,pwapart_label,pwabedr_label,pwaland_label
0,37.0,1.0,2.0,2.0,8.0,1.0,4.0,,4.0,6.0,...,Mixed small town dwellers,30-40 years,Family with grown ups,1 - 10%,37 - 49%,,37 - 49%,f 50 – 99,f 0,f 0
1,37.0,1.0,2.0,2.0,8.0,0.0,4.0,2.0,4.0,3.0,...,Mixed small town dwellers,30-40 years,Family with grown ups,0%,37 - 49%,11 - 23%,37 - 49%,f 50 – 99,f 0,f 0
2,9.0,1.0,3.0,3.0,3.0,2.0,3.0,2.0,4.0,5.0,...,"Modern, complete families",40-50 years,Average Family,11 - 23%,24 - 36%,11 - 23%,37 - 49%,f 0,f 0,f 0
3,40.0,1.0,4.0,2.0,10.0,1.0,4.0,1.0,4.0,7.0,...,Large family farms,30-40 years,Farmers,1 - 10%,37 - 49%,1 - 10%,37 - 49%,f 0,f 0,f 0
4,23.0,1.0,2.0,1.0,5.0,0.0,5.0,0.0,5.0,0.0,...,Young and rising,20-30 years,Living well,0%,50 - 62%,0%,50 - 62%,f 0,f 0,f 0


### Resumen de operaciones
El diccionario `summary` devuelto por `clean_dataframe` nos permite auditar qué se modificó: aquí mostramos cuántas columnas afectó cada paso.


In [4]:
# `pd` is already imported in the notebook's import cell, so it is not
# re-imported here.
# Collapse the cleaning report to one figure per step: list/tuple entries are
# shown as their length, any other value is shown as-is.
pd.Series({
    step: (len(result) if isinstance(result, (list, tuple)) else result)
    for step, result in summary.items()
})


numeric_columns        86
outlier_masked_cols    86
dropped_columns         0
dtype: int64

### Métricas clave
Revisamos distribución de NA y estadísticas básicas después de limpiar.


In [5]:
# Share of missing cells per column, highest first; show the ten worst columns.
missing = df_clean.isnull().mean().sort_values(ascending=False)
missing.iloc[:10]


mgemleef_label    0.027960
mostype_label     0.027455
mgodpr_label      0.026444
pbrand            0.025265
pwapart_label     0.024760
mfgekind          0.024592
mgodpr            0.024592
mink3045          0.024592
mbermidd          0.024423
moshoofd_label    0.024255
dtype: float64

In [6]:
# Descriptive statistics for the numeric columns, one row per column
# (first twelve columns only).
numeric_cols = list(df_clean.select_dtypes(include='number').columns)
df_clean[numeric_cols].describe().transpose().head(12)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mostype,5801.0,25.552836,24.954089,1.0,10.0,30.0,35.0,523.0
maanthui,5815.0,1.683577,7.45388,1.0,1.0,1.0,1.0,194.0
mgemomv,5817.0,3.25202,8.635586,1.0,2.0,3.0,3.0,188.0
mgemleef,5802.0,3.639435,9.600588,1.0,2.0,3.0,3.0,210.0
moshoofd,5808.0,6.029959,6.235557,1.0,3.0,7.0,8.0,215.0
mgodrk,5819.0,0.786218,3.054002,0.0,0.0,0.0,1.0,145.0
mgodpr,5791.0,4.873597,6.275972,0.0,4.0,5.0,6.0,198.0
mgodov,5825.0,1.365665,5.74452,0.0,0.0,1.0,2.0,164.0
mgodge,5827.0,3.458898,5.241784,0.0,2.0,3.0,4.0,169.0
mrelge,5802.0,6.442434,6.604565,0.0,5.0,6.0,7.0,196.0


## Persistencia
Guardamos el dataset limpio para reutilizarlo en modelado y trazabilidad de DVC.


In [7]:
# Pick the writer from the configured file extension: Parquet for `.parquet`,
# CSV for anything else. The index is not written in either case.
writer = df_clean.to_parquet if interim_path.suffix == '.parquet' else df_clean.to_csv
writer(interim_path, index=False)
print('Dataset guardado en', interim_path)


Dataset guardado en /Users/jfts/Documents/ML_OPS_PROYECT/mlops_proyect/data/interim/data_clean.parquet


> **Notas**: ajusta los parámetros en `config/params.yaml` o las funciones de este cuaderno si necesitas reglas adicionales (nuevos tokens nulos, umbrales de outliers, etc.).
