In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import balanced_accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, balanced_accuracy_score
# import os
import pickle

In [17]:
import pandas as pd

# Dataset cleaning helper
def limpiar_dataset(df):
    """Return a cleaned copy of a raw credit-default frame.

    Steps, in order:
      1. Rename the target column "default payment next month" to "default".
      2. Drop the "ID" column when present (ignored if it is missing).
      3. Drop every row that contains a missing value.
      4. Collapse any EDUCATION value outside {0, 1, 2, 3, 4} into 4,
         the code used here for the "others" category.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; must contain an "EDUCATION" column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.

    Raises
    ------
    KeyError
        If the "EDUCATION" column is absent.
    """
    valid_education = [0, 1, 2, 3, 4]

    cleaned = (
        df.rename(columns={"default payment next month": "default"})
        .drop(columns=["ID"], errors="ignore")
        .dropna()
    )

    # Vectorized replacement of the row-wise apply/lambda: keep values that
    # are in the valid set and substitute 4 everywhere else.
    education = cleaned["EDUCATION"]
    return cleaned.assign(
        EDUCATION=education.where(education.isin(valid_education), 4)
    )

# Load the train/test splits directly from the zipped CSV files; pandas
# decompresses them on the fly (compression="zip"), so the archives do not
# need to be extracted beforehand.
df_train = pd.read_csv(
    "../files/input/train_data.csv.zip",
    index_col=False,
    compression="zip",
)
df_test = pd.read_csv(
    "../files/input/test_data.csv.zip",
    index_col=False,
    compression="zip",
)

# Apply the cleaning step to both splits
df_train_clean = limpiar_dataset(df_train)
df_test_clean = limpiar_dataset(df_test)

In [19]:
# Preview the first rows of the cleaned training frame (shown as the cell output).
df_train_clean.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,310000,1,3,1,32,0,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,10000,2,3,1,49,-1,-1,-2,-1,2,...,1690,1138,930,0,0,2828,0,182,0,1
2,50000,1,2,1,28,-1,-1,-1,0,-1,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,80000,2,3,1,52,2,2,3,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,270000,1,1,2,34,1,2,0,0,2,...,22448,15490,17343,0,4000,2000,0,2000,2000,0


In [18]:
# Split each cleaned frame into its target vector (y) and feature matrix (X).
# The target is the "default" column; every other column is kept as a feature.
y_train = df_train_clean["default"]
X_train = df_train_clean.drop(columns=["default"])

y_test = df_test_clean["default"]
X_test = df_test_clean.drop(columns=["default"])

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

# Columns that are one-hot encoded; all remaining columns are passed through
# to the classifier unchanged (remainder="passthrough").
categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]

preprocessor = ColumnTransformer(
    [
        # handle_unknown="ignore": categories not seen during fit do not raise
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ],
    remainder="passthrough",
)

# Two-step pipeline: preprocessing followed by a random forest with a fixed
# seed. The step names are referenced by the grid-search parameter keys.
pipeline = Pipeline(
    [
        ("preprocessing", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

In [29]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters for the "classifier" step of the pipeline.
# 2 * 3 * 2 * 2 = 24 combinations are evaluated.
param_grid = dict(
    classifier__n_estimators=[200, 300],
    classifier__max_depth=[15, 20, None],
    classifier__min_samples_split=[2, 4],
    classifier__min_samples_leaf=[1, 2],
)

# Exhaustive search scored by balanced accuracy with 10-fold cross-validation,
# using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=10,
    n_jobs=-1,
    verbose=1,
)

# Kept as the last expression so the fitted estimator is rendered as cell output.
grid_search.fit(X_train, y_train)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'classifier__max_depth': [15, 20, ...], 'classifier__min_samples_leaf': [1, 2], 'classifier__min_samples_split': [2, 4], 'classifier__n_estimators': [200, 300]}"
,scoring,'balanced_accuracy'
,n_jobs,-1
,refit,True
,cv,10
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,4
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [35]:
import joblib
import gzip
import os

# Persist the fitted grid-search object as a gzip-compressed pickle,
# creating the destination directory first if it does not exist yet.
os.makedirs("../files/models", exist_ok=True)
with gzip.open("../files/models/model.pkl.gz", "wb") as model_file:
    pickle.dump(grid_search, model_file)


In [36]:
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score
import json

def calcular_metricas(y_true, y_pred, dataset_name):
    """Build the metrics record for one dataset split.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    dataset_name : str
        Label stored under the "dataset" key (e.g. "train" or "test").

    Returns
    -------
    dict
        Keys, in order: "type" (always "metrics"), "dataset", "precision",
        "balanced_accuracy", "recall" and "f1_score".
    """
    # Scorers are listed in the order their results appear in the record.
    scorers = {
        "precision": precision_score,
        "balanced_accuracy": balanced_accuracy_score,
        "recall": recall_score,
        "f1_score": f1_score,
    }

    record = {"type": "metrics", "dataset": dataset_name}
    for metric_name, scorer in scorers.items():
        record[metric_name] = scorer(y_true, y_pred)
    return record

# Predict on both splits with the fitted search object.
y_pred_train, y_pred_test = (
    grid_search.predict(features) for features in (X_train, X_test)
)

# One metrics record per split, train first and test second.
metrics = [
    calcular_metricas(y_true, y_pred, split_name)
    for y_true, y_pred, split_name in (
        (y_train, y_pred_train, "train"),
        (y_test, y_pred_test, "test"),
    )
]

In [37]:
from sklearn.metrics import confusion_matrix
def matriz_confusion_dict(y_true, y_pred, dataset_name):
    """Return the confusion matrix of one dataset split as a nested dict.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    dataset_name : str
        Label stored under the "dataset" key (e.g. "train" or "test").

    Returns
    -------
    dict
        {"type": "cm_matrix", "dataset": ..., "true_0": {...}, "true_1": {...}}
        where each inner dict holds "predicted_0" and "predicted_1" counts as
        plain Python ints.
    """
    # labels=[0, 1] fixes the row/column order: row = true class, column = predicted class.
    counts = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Convert each matrix row into a dict of built-in ints.
    true_0, true_1 = (
        {"predicted_0": int(row[0]), "predicted_1": int(row[1])}
        for row in counts
    )

    return {
        "type": "cm_matrix",
        "dataset": dataset_name,
        "true_0": true_0,
        "true_1": true_1,
    }

# Build the confusion-matrix rows, then rebuild `metrics` rather than
# appending in place. With `metrics.append(...)` every re-run of this cell
# added another pair of rows, which were then duplicated in metrics.json.
cm_rows = [
    matriz_confusion_dict(y_train, y_pred_train, "train"),
    matriz_confusion_dict(y_test, y_pred_test, "test"),
]
# Drop confusion-matrix rows left over from a previous run of this cell.
metrics = [row for row in metrics if row.get("type") != "cm_matrix"] + cm_rows

# Save metrics: one JSON object per line
os.makedirs("../files/output", exist_ok=True)
with open("../files/output/metrics.json", "w") as f:
    for row in metrics:
        json.dump(row, f)
        f.write("\n")
