In [4]:
# Se importan las librerías básicas para trabajar los datos y visualizarlos
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

DATA_PATH = "data/"

In [5]:
X_t0 = pd.read_parquet(DATA_PATH + "X_t0.parquet")
y_t0 = pd.read_parquet(DATA_PATH + "y_t0.parquet")

X_t1 = pd.read_parquet(DATA_PATH + "X_t1_new.parquet").reset_index(drop=True)
y_t1 = pd.read_parquet(DATA_PATH + "y_t1.parquet").reset_index(drop=True)


df_t0 = pd.concat([X_t0, y_t0], axis=1)
df_t1 = pd.concat([X_t1, y_t1], axis=1)

columns_to_drop = [
    'borrow_block_number',
    'wallet_address',
    'borrow_timestamp',
    'first_tx_timestamp',
    'last_tx_timestamp',
    'risky_first_tx_timestamp',
    'risky_last_tx_timestamp',
    'unique_borrow_protocol_count',
    'unique_lending_protocol_count',
]


df_t0.drop(columns=columns_to_drop, inplace=True)
df_t1.drop(columns=columns_to_drop, inplace=True)

In [6]:
from sklearn.model_selection import train_test_split

X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(
    df_t0.drop(columns=["target"]),
    df_t0["target"],
    test_size=0.3,
    random_state=42,
)

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    df_t1.drop(columns=["target"]),
    df_t1["target"],
    test_size=0.3,
    random_state=42,
)

In [10]:
# Definir las características numéricas y categóricas
numeric_features = X_train_0.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X_train_0.select_dtypes(include=['object']).columns

In [14]:
import xgboost as xgb
import optuna
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import joblib

# Pipeline para características numéricas con PCA (opcional)
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),   # Escalado estándar
    # ('pca', PCA(n_components=50))  # Si deseas usar PCA, descomenta esta línea
])

# Pipeline para características categóricas con OneHotEncoder
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocesador que combina los dos pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),  # Numéricas con PCA
        ('cat', categorical_transformer, categorical_features)  # Categóricas con OneHot
    ]
)

# Definir la función de optimización de hiperparámetros para el primer entrenamiento
def objective_first(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'max_depth': trial.suggest_int('max_depth', 3, 5),
        'eta': trial.suggest_float('eta', 1e-3, 1e-1, log=True),
        'gamma': trial.suggest_float('gamma', 1e-3, 1e-1, log=True),
        'verbosity': 0
    }

    # Crear el modelo con los parámetros sugeridos
    model = xgb.XGBClassifier(**params)

    # Crear el pipeline completo
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('xgb', model)
    ])

    # Entrenar el modelo en el primer conjunto de datos
    model_pipeline.fit(X_train_0, y_train_0)

    # Evaluar el modelo en el conjunto de validación
    y_pred_proba = model_pipeline.predict_proba(X_test_0)[:, 1]
    auc = roc_auc_score(y_test_0, y_pred_proba)
    return auc

# Ejecutar la optimización con Optuna para el primer conjunto de datos
study_first = optuna.create_study(direction='maximize')
study_first.optimize(objective_first, n_trials=30)

# Obtener los mejores parámetros del primer entrenamiento
best_params_first = study_first.best_params
print("Mejores parámetros para el primer modelo:", best_params_first)

# Crear el modelo final con los mejores parámetros obtenidos
model_first = xgb.XGBClassifier(**best_params_first)

# Entrenar el modelo con los mejores parámetros y todo el pipeline
model_pipeline_first = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgb', model_first)
])

# Entrenar el modelo en el primer conjunto de datos
model_pipeline_first.fit(X_train_0, y_train_0)

# Guardar el modelo entrenado
joblib.dump(model_pipeline_first, 'model_pipeline_first.pkl')

# Definir la función de optimización de hiperparámetros para el segundo entrenamiento (reentrenamiento)
def objective_second(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'max_depth': trial.suggest_int('max_depth', 3, 5),
        'eta': trial.suggest_float('eta', 1e-3, 1e-1, log=True),
        'gamma': trial.suggest_float('gamma', 1e-3, 1e-1, log=True),
        'verbosity': 0
    }

    # Crear el modelo con los parámetros sugeridos
    model = xgb.XGBClassifier(**params)

    # Crear el pipeline completo
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('xgb', model)
    ])

    # Reentrenar el modelo en el segundo conjunto de datos
    model_pipeline.fit(X_train_1, y_train_1)

    # Evaluar el modelo en el conjunto de validación del segundo conjunto
    y_pred_proba = model_pipeline.predict_proba(X_test_1)[:, 1]
    auc = roc_auc_score(y_test_1, y_pred_proba)
    return auc

# Ejecutar la optimización con Optuna para el segundo conjunto de datos
study_second = optuna.create_study(direction='maximize')
study_second.optimize(objective_second, n_trials=30)

# Obtener los mejores parámetros del segundo reentrenamiento
best_params_second = study_second.best_params
print("Mejores parámetros para el segundo modelo:", best_params_second)

# Crear el modelo final con los mejores parámetros obtenidos
model_second = xgb.XGBClassifier(**best_params_second)

# Entrenar el modelo con los mejores parámetros y todo el pipeline
model_pipeline_second = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgb', model_second)
])

# Reentrenar el modelo en el segundo conjunto de datos
model_pipeline_second.fit(X_train_1, y_train_1)

# Evaluar el modelo final en el conjunto de validación del segundo conjunto
y_pred_proba = model_pipeline_second.predict_proba(X_test_1)[:, 1]
y_pred = (y_pred_proba > 0.5).astype(int)

# Calcular métricas de evaluación
metrics = {
    "AUC": roc_auc_score(y_test_1, y_pred_proba),
    "Accuracy": accuracy_score(y_test_1, y_pred),
    "Precision": precision_score(y_test_1, y_pred),
    "Recall": recall_score(y_test_1, y_pred),
    "F1-Score": f1_score(y_test_1, y_pred)
}

# Mostrar todas las métricas
for metric_name, value in metrics.items():
    print(f"{metric_name}: {value:.4f}")



[I 2024-12-05 18:37:22,540] A new study created in memory with name: no-name-d3b12e92-aa2f-41cf-abe0-6184e6ceaedd
[I 2024-12-05 18:37:22,931] Trial 0 finished with value: 0.8593069652943254 and parameters: {'max_depth': 4, 'eta': 0.004625649129855801, 'gamma': 0.0038026007003080973}. Best is trial 0 with value: 0.8593069652943254.
[I 2024-12-05 18:37:23,466] Trial 1 finished with value: 0.8934972583845116 and parameters: {'max_depth': 5, 'eta': 0.03221729681269613, 'gamma': 0.013309549731846412}. Best is trial 1 with value: 0.8934972583845116.
[I 2024-12-05 18:37:23,954] Trial 2 finished with value: 0.8922740246607146 and parameters: {'max_depth': 5, 'eta': 0.029358843118617656, 'gamma': 0.0034695350835228754}. Best is trial 1 with value: 0.8934972583845116.
[I 2024-12-05 18:37:24,348] Trial 3 finished with value: 0.8929799277575141 and parameters: {'max_depth': 4, 'eta': 0.061590637505482354, 'gamma': 0.002382853357536746}. Best is trial 1 with value: 0.8934972583845116.
[I 2024-12-05

Mejores parámetros para el primer modelo: {'max_depth': 5, 'eta': 0.09925656108814808, 'gamma': 0.06371770302446195}


[I 2024-12-05 18:37:35,920] A new study created in memory with name: no-name-e7291a98-8804-46eb-bb72-5df3b866aa7c
[I 2024-12-05 18:37:36,639] Trial 0 finished with value: 0.8396410552179381 and parameters: {'max_depth': 3, 'eta': 0.008683999861378989, 'gamma': 0.0011810603004694918}. Best is trial 0 with value: 0.8396410552179381.
[I 2024-12-05 18:37:37,380] Trial 1 finished with value: 0.8377853261907651 and parameters: {'max_depth': 3, 'eta': 0.005870016689869966, 'gamma': 0.0037407062682524956}. Best is trial 0 with value: 0.8396410552179381.
[I 2024-12-05 18:37:38,450] Trial 2 finished with value: 0.8607933559848279 and parameters: {'max_depth': 5, 'eta': 0.01319448304050378, 'gamma': 0.014047948216191783}. Best is trial 2 with value: 0.8607933559848279.
[I 2024-12-05 18:37:39,122] Trial 3 finished with value: 0.8405531204913521 and parameters: {'max_depth': 3, 'eta': 0.009018201511039022, 'gamma': 0.007667634931332479}. Best is trial 2 with value: 0.8607933559848279.
[I 2024-12-05

Mejores parámetros para el segundo modelo: {'max_depth': 5, 'eta': 0.08577650809183676, 'gamma': 0.038585596277909986}
AUC: 0.8918
Accuracy: 0.8279
Precision: 0.8557
Recall: 0.6566
F1-Score: 0.7431


In [15]:
from zipfile import ZipFile
import os
def generateFiles(predict_data, clf_pipe):
    """Genera los archivos a subir en CodaLab

    Input
    ---------------
    predict_data: Dataframe con los datos de entrada a predecir
    clf_pipe: pipeline del clf

    Ouput
    ---------------
    archivo de txt
    """
    y_pred_clf = clf_pipe.predict_proba(predict_data)[:, 1]
    with open('./predictions.txt', 'w') as f:
        for item in y_pred_clf:
            f.write("%s\n" % item)
    
    with ZipFile('predictions.zip', 'w') as zipObj:
        zipObj.write('predictions.txt')
    os.remove('predictions.txt')

X_test = pd.read_parquet(DATA_PATH + "X_t2.parquet")

generateFiles(X_test, model_pipeline_second)