In [1]:
import pandas as pd

# Función para limpiar el dataset
def limpiar_dataset(df):
    """Return a cleaned copy of the raw dataset; the input frame is not modified.

    Steps, in order:
      1. Rename the column "default payment next month" to "default".
      2. Drop the "ID" column if it is present.
      3. Drop every row that contains a missing value.
      4. Drop rows whose EDUCATION or MARRIAGE value is 0.
      5. Replace every EDUCATION value greater than 4 with 4.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain the "EDUCATION" and "MARRIAGE" columns.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with the cleaning steps applied.

    Raises
    ------
    KeyError
        If "EDUCATION" or "MARRIAGE" is missing from ``df``.
    """
    # Rename the target column
    df = df.rename(columns={"default payment next month": "default"})

    # Drop the ID column (ignored when absent)
    df = df.drop(columns=["ID"], errors="ignore")

    # Drop rows with missing data
    df = df.dropna()

    # Keep only rows where both coded fields are non-zero. The explicit copy
    # makes the result an independent frame, so the assignment below does not
    # go through a filtered slice (which raises SettingWithCopyWarning on
    # pandas versions without copy-on-write).
    keep = (df["EDUCATION"] != 0) & (df["MARRIAGE"] != 0)
    df = df.loc[keep].copy()

    # Collapse every EDUCATION level above 4 into level 4
    df.loc[df["EDUCATION"] > 4, "EDUCATION"] = 4

    return df

# Load the train and test splits from the zipped CSV files.
# index_col=False tells pandas not to use the first column as the index.
df_train = pd.read_csv(
    "../files/input/train_data.csv.zip",
    index_col=False,
    compression="zip",
)
df_test = pd.read_csv(
    "../files/input/test_data.csv.zip",
    index_col=False,
    compression="zip",
)

# Apply the same cleaning routine to both splits
df_train_clean = limpiar_dataset(df_train)
df_test_clean = limpiar_dataset(df_test)

In [2]:
# Split each cleaned frame into the target vector (y) and the feature matrix (X)
y_train = df_train_clean["default"]
X_train = df_train_clean.drop(columns="default")

y_test = df_test_clean["default"]
X_test = df_test_clean.drop(columns="default")

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Column groups: three columns are handled as categorical; every other
# feature column is handled as numerical.
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']
numerical_features = [
    name for name in X_train.columns if name not in categorical_features
]

# One-hot encode the categorical group (categories unseen during fit are
# ignored at transform time) and standardize the numerical group. Any column
# belonging to neither group is passed through unchanged.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numerical_features),
    ],
)

# Full modelling pipeline, applied in this order:
# preprocessing -> univariate feature selection (f_classif) -> PCA -> MLP classifier.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('selectk', SelectKBest(score_func=f_classif)),
        ('pca', PCA()),
        ('clf', MLPClassifier(random_state=42, max_iter=15000)),
    ]
)

In [4]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid. Every entry holds a single candidate, so the search
# evaluates exactly one configuration with cross-validation.
param_grid = dict(
    pca__n_components=[None],
    selectk__k=[20],
    clf__hidden_layer_sizes=[(50, 30, 40, 60)],
    clf__alpha=[0.28],
    clf__learning_rate_init=[0.001],
)

# 10-fold cross-validated search scored by balanced accuracy; the best
# configuration is refitted on the whole training set.
grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring='balanced_accuracy',
    cv=10,
    refit=True,
)

grid.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'clf__alpha': [0.28], 'clf__hidden_layer_sizes': [(50, ...)], 'clf__learning_rate_init': [0.001], 'pca__n_components': [None], ...}"
,scoring,'balanced_accuracy'
,n_jobs,
,refit,True
,cv,10
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,score_func,<function f_c...0024586DB14E0>
,k,20

0,1,2
,n_components,
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,hidden_layer_sizes,"(50, ...)"
,activation,'relu'
,solver,'adam'
,alpha,0.28
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,15000
,shuffle,True


In [5]:
import gzip
import os
import pickle

# Make sure the models directory exists, then store the fitted search object
# as a gzip-compressed pickle.
os.makedirs("../files/models", exist_ok=True)
with gzip.open("../files/models/model.pkl.gz", "wb") as model_file:
    pickle.dump(grid, model_file)

In [6]:
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score
import json

# Predictions of the fitted search object on both splits
y_pred_train = grid.predict(X_train)
y_pred_test = grid.predict(X_test)

# One metrics record per split, in train -> test order
metrics = [
    {
        "type": "metrics",
        "dataset": split_name,
        "precision": precision_score(y_true, y_pred),
        "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
    }
    for split_name, y_true, y_pred in (
        ("train", y_train, y_pred_train),
        ("test", y_test, y_pred_test),
    )
]

In [7]:
from sklearn.metrics import confusion_matrix

# Confusion matrices. The class order is pinned to [0, 1] so the
# [row, column] lookups below always address a 2x2 matrix laid out as
# rows = true class, columns = predicted class.
cm_train = confusion_matrix(y_train, y_pred_train, labels=[0, 1])
cm_test = confusion_matrix(y_test, y_pred_test, labels=[0, 1])


def _confusion_matrix_entry(dataset, cm):
    """Convert a 2x2 confusion matrix into the dictionary format used for the metrics file.

    Parameters
    ----------
    dataset : str
        Value stored under the "dataset" key (e.g. "train" or "test").
    cm : array-like of shape (2, 2)
        Confusion matrix indexed as cm[true_class, predicted_class].

    Returns
    -------
    dict
        Entry with "type", "dataset", "true_0" and "true_1" keys; all counts
        are converted to plain Python ints so they can be JSON-serialized.
    """
    return {
        "type": "cm_matrix",
        "dataset": dataset,
        "true_0": {
            "predicted_0": int(cm[0, 0]),
            "predicted_1": int(cm[0, 1])
        },
        "true_1": {
            "predicted_0": int(cm[1, 0]),
            "predicted_1": int(cm[1, 1])
        }
    }


cm_train_dict = _confusion_matrix_entry("train", cm_train)
cm_test_dict = _confusion_matrix_entry("test", cm_test)

# Add both entries to the metrics list. Any cm_matrix entry left over from a
# previous run of this cell is removed first, so re-running the cell does not
# duplicate records. The list is updated in place.
metrics[:] = [
    entry for entry in metrics if entry.get("type") != "cm_matrix"
] + [cm_train_dict, cm_test_dict]


In [8]:
# Create the output directory if it does not exist yet
os.makedirs("../files/output", exist_ok=True)

# Write the metrics as JSON Lines: one serialized record per line
with open("../files/output/metrics.json", "w") as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in metrics)