In [None]:
import pandas as pd

# Función para limpiar el dataset
def limpiar_dataset(df):
    """Return a cleaned copy of the raw credit-default dataset.

    The cleaning steps are applied in this order:

    1. Rename the target column ``"default payment next month"`` to
       ``"default"`` (no-op if the column is absent).
    2. Drop the ``"ID"`` column if it is present.
    3. Drop every row that contains a missing value.
    4. Drop rows whose ``EDUCATION`` or ``MARRIAGE`` value is 0.
    5. Cap ``EDUCATION`` values greater than 4 at 4.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain the ``EDUCATION`` and ``MARRIAGE`` columns.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; the input frame is not modified. The original
        row index labels of the surviving rows are kept.

    Raises
    ------
    KeyError
        If ``EDUCATION`` or ``MARRIAGE`` is missing from ``df``.
    """
    cleaned = (
        df.rename(columns={"default payment next month": "default"})
        .drop(columns=["ID"], errors="ignore")
        .dropna()
    )

    valid_rows = (cleaned["EDUCATION"] != 0) & (cleaned["MARRIAGE"] != 0)
    # Take an explicit copy of the filtered rows so the assignment below
    # writes to an independent frame instead of a slice of another one
    # (the pattern that triggers pandas' SettingWithCopyWarning).
    cleaned = cleaned.loc[valid_rows].copy()

    # Collapse every EDUCATION code above 4 into the single code 4.
    cleaned.loc[cleaned["EDUCATION"] > 4, "EDUCATION"] = 4

    return cleaned

# Load the zipped train/test CSV files.
# index_col=False keeps pandas from using the first column as the row index.
df_train = pd.read_csv(
    "../files/input/train_data.csv.zip",
    index_col=False,
    compression="zip",
)
df_test = pd.read_csv(
    "../files/input/test_data.csv.zip",
    index_col=False,
    compression="zip",
)

# Apply the same cleaning routine to both splits.
df_train_clean = limpiar_dataset(df_train)
df_test_clean = limpiar_dataset(df_test)

In [2]:
# Split each cleaned frame into its feature matrix (X) and target vector (y);
# the target is the "default" column.
X_train, y_train = df_train_clean.drop(columns="default"), df_train_clean["default"]
X_test, y_test = df_test_clean.drop(columns="default"), df_test_clean["default"]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# These three columns are one-hot encoded; every other feature column is
# treated as numeric and passed through MinMaxScaler.
categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]
numeric_features = [
    name for name in X_train.columns if name not in categorical_features
]

# handle_unknown="ignore": categories not seen during fit do not raise at
# transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", MinMaxScaler(), numeric_features),
    ]
)

# Preprocess -> select features by ANOVA F-score -> logistic regression.
# SelectKBest's k is not set here, so it keeps its default unless overridden.
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("feature_selection", SelectKBest(score_func=f_classif)),
        (
            "classifier",
            LogisticRegression(solver="saga", max_iter=1000, random_state=42),
        ),
    ]
)

In [19]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search (step 4): the "<step>__<param>" names route each
# value list to the matching pipeline step.
param_grid = dict(
    feature_selection__k=range(1, 11),
    classifier__penalty=["l1", "l2"],
    classifier__C=[0.001, 0.01, 0.1, 1, 10, 100],
)

# 10-fold cross-validation scored by balanced accuracy, using all CPU cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=10,
    n_jobs=-1,
)

# Train: evaluate every parameter combination on the training split.
grid_search.fit(X_train, y_train)



0,1,2
,estimator,Pipeline(step...ver='saga'))])
,param_grid,"{'classifier__C': [0.001, 0.01, ...], 'classifier__penalty': ['l1', 'l2'], 'feature_selection__k': range(1, 11)}"
,scoring,'balanced_accuracy'
,n_jobs,-1
,refit,True
,cv,10
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,score_func,<function f_c...00277B25C45E0>
,k,1

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'saga'
,max_iter,10000


In [20]:
import gzip
import os
import pickle

# Persist the whole fitted GridSearchCV object as a gzip-compressed pickle,
# creating the target directory first if it does not exist yet.
os.makedirs("../files/models", exist_ok=True)
with gzip.open("../files/models/model.pkl.gz", mode="wb") as model_file:
    pickle.dump(grid_search, model_file)


In [21]:
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    balanced_accuracy_score, confusion_matrix
)
import json

# Calcular métricas
def calcular_metricas(y_true, y_pred, dataset):
    """Build the metrics record for one dataset split.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    dataset : str
        Label stored under the ``"dataset"`` key (the notebook passes
        ``"train"`` or ``"test"``).

    Returns
    -------
    dict
        Keys, in order: ``"type"`` (always ``"metrics"``), ``"dataset"``,
        ``"precision"``, ``"balanced_accuracy"``, ``"recall"``, ``"f1_score"``.
    """
    # (output key, scoring function) pairs, in the order they are emitted.
    scorers = (
        ("precision", precision_score),
        ("balanced_accuracy", balanced_accuracy_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    )
    report = {"type": "metrics", "dataset": dataset}
    for key, scorer in scorers:
        report[key] = scorer(y_true, y_pred)
    return report

def matriz_confusion_dict(y_true, y_pred, dataset):
    """Build the confusion-matrix record for one dataset split.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    dataset : str
        Label stored under the ``"dataset"`` key.

    Returns
    -------
    dict
        ``"type"`` (always ``"cm_matrix"``), ``"dataset"``, and the nested
        ``"true_0"`` / ``"true_1"`` dicts holding the ``"predicted_0"`` and
        ``"predicted_1"`` counts as plain Python ints.
    """
    # labels=[0, 1] fixes the matrix to 2x2 in that label order, so the
    # flattened row-major order is (true 0 -> pred 0, pred 1), (true 1 -> pred 0, pred 1).
    flat_counts = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    t0_p0, t0_p1, t1_p0, t1_p1 = (int(count) for count in flat_counts)
    return {
        "type": "cm_matrix",
        "dataset": dataset,
        "true_0": {"predicted_0": t0_p0, "predicted_1": t0_p1},
        "true_1": {"predicted_0": t1_p0, "predicted_1": t1_p1},
    }

# Predictions of the fitted search on both splits.
y_pred_train, y_pred_test = (
    grid_search.predict(features) for features in (X_train, X_test)
)

# Collect everything in a single list, in this order:
# metrics (train, test) followed by confusion matrices (train, test).
metrics = [
    build_record(y_true, y_pred, split_name)
    for build_record in (calcular_metricas, matriz_confusion_dict)
    for y_true, y_pred, split_name in (
        (y_train, y_pred_train, "train"),
        (y_test, y_pred_test, "test"),
    )
]

# Write the records as one JSON object per line, creating the output
# directory first if needed.
os.makedirs("../files/output", exist_ok=True)
with open("../files/output/metrics.json", mode="w", encoding="utf-8") as metrics_file:
    metrics_file.writelines(json.dumps(record) + "\n" for record in metrics)


In [22]:
# Report the search's score on each split. The search was configured with
# scoring="balanced_accuracy", so these values are balanced accuracies.
print("Train score:", grid_search.score(X_train, y_train))
print("Test score:", grid_search.score(X_test, y_test))


Train score: 0.6392682710528409
Test score: 0.6547057822566611


In [23]:
# Show the full list of metric and confusion-matrix records for a quick visual check.
print(metrics)


[{'type': 'metrics', 'dataset': 'train', 'precision': 0.6939338235294118, 'balanced_accuracy': 0.6392682710528409, 'recall': 0.31957671957671957, 'f1_score': 0.43761773655991887}, {'type': 'metrics', 'dataset': 'test', 'precision': 0.7017913593256059, 'balanced_accuracy': 0.6547057822566611, 'recall': 0.34942287513116477, 'f1_score': 0.46654991243432575}, {'type': 'cm_matrix', 'dataset': 'train', 'true_0': {'predicted_0': 15562, 'predicted_1': 666}, 'true_1': {'predicted_0': 3215, 'predicted_1': 1510}}, {'type': 'cm_matrix', 'dataset': 'test', 'true_0': {'predicted_0': 6790, 'predicted_1': 283}, 'true_1': {'predicted_0': 1240, 'predicted_1': 666}}]


In [24]:
# Report the training-split score again together with the hyper-parameter
# combination selected by the grid search.
print("Train balanced accuracy:", grid_search.score(X_train, y_train))
print("Best params:", grid_search.best_params_)


Train balanced accuracy: 0.6392682710528409
Best params: {'classifier__C': 0.1, 'classifier__penalty': 'l1', 'feature_selection__k': 1}
