# Demo: Pipeline de Preprocesamiento

Este notebook demuestra el uso del pipeline de preprocesamiento creado para el proyecto.

In [None]:
import sys
# Put the parent directory at the front of the module search path so the
# `src` package imported below is looked up there first.
# NOTE(review): '..' is resolved against the current working directory --
# presumably the notebook is run from a subfolder of the project root; confirm.
sys.path.insert(0, '..')

from src.data_processing.pipeline import DataPipeline
import pandas as pd
import numpy as np

## 1. Cargar y Preparar Datos

In [None]:
# Build the project pipeline with its default configuration and let it load
# the data. The call returns three objects; X and y are inspected below and
# df is kept for later use.
pipeline = DataPipeline()
X, y, df = pipeline.load_and_prepare_data()

# Report the shape of X first and of y second.
for label, data in (("X", X), ("y", y)):
    print(f"Shape de {label}: {data.shape}")

print("\nPrimeras filas de X:")
# Trailing expression: the notebook renders it as the cell's output.
X.head()

## 2. Aplicar Transformaciones

In [None]:
# Fit the pipeline on X and transform X in the same call.
X_transformed = pipeline.fit_transform(X)

# Each value is computed right before it is printed, so the output order is
# the same even if one of the lookups fails.
out_shape = X_transformed.shape
print(f"Shape después de transformación: {out_shape}")
out_type = type(X_transformed)
print(f"Tipo de datos: {out_type}")

print("\nPrimeras 5 filas, primeras 10 columnas:")
# Top-left corner of the result: rows 0-4, columns 0-9.
corner = X_transformed[:5, :10]
print(corner)

## 3. Nombres de Features

In [None]:
# Ask the pipeline for the names of the features it produces.
feature_names = pipeline.get_feature_names()

total_features = len(feature_names)
print(f"Total de features: {total_features}")

print("\nPrimeras 20 features:")
# Numbered listing (starting at 1) of at most the first 20 names.
leading_names = feature_names[:20]
for position, label in enumerate(leading_names, start=1):
    print(f"{position:2d}. {label}")

## 4. Distribución de Features por Tipo

In [None]:
# Classify every feature name by its prefix and print how many fall in each
# family.
#
# The original numeric filter excluded names starting with 'text' while the
# text counter only matched 'text_', so a name such as 'textual_score' was
# counted nowhere and the three figures could fall short of the total.
# Using one text prefix in both places makes the three families partition
# `feature_names`, so the counts always add up to TOTAL.
# NOTE(review): the prefixes are taken from this notebook; confirm they match
# the names the pipeline actually emits.
categorical_prefixes = ('Gender', 'Education', 'Job')
text_prefix = 'text_'
non_numeric_prefixes = categorical_prefixes + (text_prefix,)

# str.startswith accepts a tuple of prefixes; summing the booleans counts the
# matching names.
categorical_count = sum(f.startswith(categorical_prefixes) for f in feature_names)
text_count = sum(f.startswith(text_prefix) for f in feature_names)
# Anything that is neither categorical nor text is reported as numeric.
numeric_count = sum(not f.startswith(non_numeric_prefixes) for f in feature_names)

print("Distribución de Features:")
print(f"  Numéricas:    {numeric_count:3d}")
print(f"  Categóricas:  {categorical_count:3d}")
print(f"  Texto (TF-IDF): {text_count:3d}")
print(f"  {'='*30}")
print(f"  TOTAL:        {len(feature_names):3d}")

## 5. Verificación de Calidad

In [None]:
# Sanity checks on the transformed matrix. Every value is computed right
# before its own print so the output order is unchanged.
print("Verificaciones de Calidad:")
nan_total = np.isnan(X_transformed).sum()
print(f"  ✓ Valores nulos: {nan_total}")
inf_total = np.isinf(X_transformed).sum()
print(f"  ✓ Valores infinitos: {inf_total}")
# The transformed matrix must have one row per entry of y.
rows_match = X_transformed.shape[0] == len(y)
print(f"  ✓ Shape consistente: {rows_match}")

# Summary statistics taken over the whole matrix.
print("\nEstadísticas básicas:")
lowest = X_transformed.min()
print(f"  Min: {lowest:.4f}")
highest = X_transformed.max()
print(f"  Max: {highest:.4f}")
average = X_transformed.mean()
print(f"  Media: {average:.4f}")
spread = X_transformed.std()
print(f"  Std: {spread:.4f}")