# 03 · Baseline Modeling
Entrenamos un primer modelo supervisado usando el dataset limpio generado en el cuaderno de preprocesamiento; si no existe, lo recreamos aquí con funciones locales.


In [6]:
from pathlib import Path
import re

import numpy as np
import pandas as pd
import yaml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score


In [7]:
MISSING_TOKENS = {
    '', ' ', 'na', 'n/a', 'nan', 'none', 'null', '?', 'unknown', 'missing', 'invalid', 'bad', '--'
}

def load_raw_dataframe(path: Path) -> pd.DataFrame:
    return pd.read_csv(path, low_memory=False, keep_default_na=False)


def slugify(name: str) -> str:
    name = re.sub(r"[^\w]+", '_', name.strip().lower())
    name = re.sub(r"_+", '_', name).strip('_')
    if not name:
        name = 'col'
    if name[0].isdigit():
        name = f'col_{name}'
    return name


def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    seen = {}
    new_cols = []
    for col in df.columns:
        base = slugify(str(col))
        count = seen.get(base, 0)
        if count:
            new_cols.append(f"{base}_{count}")
        else:
            new_cols.append(base)
        seen[base] = count + 1
    df.columns = new_cols
    return df


def standardize_missing(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    tokens = {t.lower() for t in MISSING_TOKENS}
    for col in df.columns:
        if pd.api.types.is_object_dtype(df[col]):
            series = df[col].astype(str)
            mask = series.str.strip().str.lower().isin(tokens)
            df[col] = series.where(~mask).replace({'': np.nan})
    return df


def coerce_numeric(df: pd.DataFrame, numeric_like_ratio: float = 0.8):
    df = df.copy()
    numeric_cols = []
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            numeric_cols.append(col)
            continue
        if not pd.api.types.is_object_dtype(df[col]):
            continue
        coerced = pd.to_numeric(df[col], errors='coerce')
        non_na = coerced.notna().sum()
        if non_na == 0:
            continue
        ratio = non_na / len(df)
        if ratio >= numeric_like_ratio:
            df[col] = coerced
            numeric_cols.append(col)
    return df, numeric_cols


def mask_outliers(df: pd.DataFrame, threshold: float = 4.0):
    df = df.copy()
    masked_cols = []
    for col in df.select_dtypes(include=[np.number]).columns:
        series = df[col]
        std = series.std(skipna=True)
        mean = series.mean(skipna=True)
        if std and std > 0:
            z = (series - mean) / std
            mask = z.abs() > threshold
            if mask.any():
                df.loc[mask, col] = np.nan
                masked_cols.append(col)
    return df, masked_cols


def drop_sparse(df: pd.DataFrame, thresh: float = 0.95):
    if thresh >= 1:
        return df, []
    df = df.copy()
    ratios = df.isna().mean()
    to_drop = ratios[ratios > thresh].index.tolist()
    df = df.drop(columns=to_drop)
    return df, to_drop


def clean_dataframe(df: pd.DataFrame, drop_na_thresh: float, outlier_z: float, numeric_like_ratio: float = 0.8):
    steps = {}
    df1 = normalize_columns(df)
    df2 = standardize_missing(df1)
    df3, numeric_cols = coerce_numeric(df2, numeric_like_ratio=numeric_like_ratio)
    steps['numeric_columns'] = numeric_cols

    if outlier_z is not None:
        df4, masked = mask_outliers(df3, threshold=outlier_z)
        steps['outlier_masked_cols'] = masked
    else:
        df4 = df3
        steps['outlier_masked_cols'] = []

    df5, dropped = drop_sparse(df4, drop_na_thresh)
    steps['dropped_columns'] = dropped

    return df5, steps


## Carga de datos
Usamos el dataset limpio generado previamente; si no existe lo creamos aplicando las funciones locales de limpieza.


In [10]:
project_root = Path('..').resolve()
params = yaml.safe_load((project_root / 'config' / 'params.yaml').read_text())
clean_path = (project_root / params['paths']['interim']).resolve()
raw_path = (project_root / 'data' / 'enriched' / 'insurance_company_enriched.csv').resolve()
clean_path.parent.mkdir(parents=True, exist_ok=True)

if not clean_path.exists():
    print('Limpieza no encontrada, recreando...')
    df_clean, summary = clean_dataframe(
        load_raw_dataframe(raw_path),
        drop_na_thresh=params['clean']['drop_na_thresh'],
        outlier_z=params['clean']['outlier_z'],
        numeric_like_ratio=0.8,
    )
    if clean_path.suffix == '.parquet':
        df_clean.to_parquet(clean_path, index=False)
    else:
        df_clean.to_csv(clean_path, index=False)

df = pd.read_parquet(clean_path) if clean_path.suffix == '.parquet' else pd.read_csv(clean_path)

target_col = 'caravan' if 'caravan' in df.columns else 'CARAVAN'
print('Dataset shape:', df.shape)
print('Target column:', target_col)
print('Positive rate:', df[target_col].dropna().astype(float).mean())


Dataset shape: (5937, 96)
Target column: caravan
Positive rate: 0.11051816147357549


In [15]:
target_col = 'caravan'
label_cols = [c for c in df.columns if c.endswith('_label')]
features = df.drop(columns=label_cols + [target_col], errors='ignore')
X = features.apply(pd.to_numeric, errors='coerce').fillna(0.0)
y = df[target_col].fillna(0).astype(int)
print(X.shape, y.mean())


(5937, 85) 0.10813542193026782


## Entrenamiento
Creamos un pipeline simple (escalado + regresión logística) para obtener un baseline.


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
# Asegurar tipo numérico
y = pd.to_numeric(df[target_col], errors='coerce').fillna(0)

# Convertir todos los valores >1 en 1
y = np.where(y > 1, 1, y).astype(int)

print(np.unique(y))

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y  # ahora sí puedes estratificar
)

# Pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=500, class_weight='balanced'))
])

# Entrenamiento
pipeline.fit(X_train, y_train)

# Predicción
y_pred = pipeline.predict(X_test)
y_score = pipeline.predict_proba(X_test)[:, 1]

# Métricas
print(classification_report(y_test, y_pred, digits=3))
print('ROC-AUC:', roc_auc_score(y_test, y_score))


[0 1]
              precision    recall  f1-score   support

           0      0.964     0.666     0.788      1117
           1      0.103     0.606     0.177        71

    accuracy                          0.662      1188
   macro avg      0.534     0.636     0.482      1188
weighted avg      0.912     0.662     0.751      1188

ROC-AUC: 0.6693923613300212


**Interpretación de métricas**

- `precision` y `recall` para la clase positiva resumen cómo el modelo acierta sobre los clientes con Caravan; valores desbalanceados adelantan la necesidad de ajustar umbrales o balancear la muestra.
- El `f1-score` combina ambos efectos y el promedio ponderado ilustra el desempeño general pese al desbalance.
- `ROC-AUC` indica la capacidad de ranking; si se mantiene alrededor de 0.65-0.70 hay señal utilizable, pero es conveniente explorar modelos más expresivos y calibración.


## Interpretación rápida
Revisamos los coeficientes del modelo para identificar señales preliminares.


In [18]:
clf = pipeline.named_steps['clf']
coefs = pd.Series(clf.coef_[0], index=X.columns)
coefs.sort_values(key=np.abs, ascending=False).head(15)


mzpart      2.851997
maut2       1.520464
mink7512    1.479573
mberboer    1.363028
mrelge      1.282212
mkoopkla    1.248266
mzfonds     1.178221
mberarbo    1.071214
mgodpr      1.041312
mhhuur      0.979517
moshoofd    0.887548
mgemleef    0.887317
atractor    0.796642
mrelsa      0.734835
mgodrk      0.733523
dtype: float64

**Lectura de coeficientes**

- Coeficientes positivos empujan la probabilidad hacia la compra; suelen asociarse a hogares con mayor carga de pólizas o ingresos.
- Coeficientes negativos sugieren características que desalientan la compra; úsalos para detectar segmentos poco propensos.
- Ordenar por magnitud ayuda a seleccionar variables para nuevas transformaciones o para reducir dimensionalidad en modelos posteriores.
