In [9]:
# Se importan las librerías básicas para trabajar los datos y visualizarlos
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

DATA_PATH = "data/"

In [10]:
X_t0 = pd.read_parquet(DATA_PATH + "X_t0.parquet")
y_t0 = pd.read_parquet(DATA_PATH + "y_t0.parquet")

X_t1 = pd.read_parquet(DATA_PATH + "X_t1_new.parquet").reset_index(drop=True)
y_t1 = pd.read_parquet(DATA_PATH + "y_t1.parquet").reset_index(drop=True)


df_t0 = pd.concat([X_t0, y_t0], axis=1)
df_t1 = pd.concat([X_t1, y_t1], axis=1)

columns_to_drop = [
    'borrow_block_number',
    'wallet_address',
    'borrow_timestamp',
    'first_tx_timestamp',
    'last_tx_timestamp',
    'risky_first_tx_timestamp',
    'risky_last_tx_timestamp',
    'unique_borrow_protocol_count',
    'unique_lending_protocol_count',
]


df_t0.drop(columns=columns_to_drop, inplace=True)
df_t1.drop(columns=columns_to_drop, inplace=True)

In [11]:
from sklearn.model_selection import train_test_split

X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(
    df_t0.drop(columns=["target"]),
    df_t0["target"],
    test_size=0.3,
    random_state=42,
)

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    df_t1.drop(columns=["target"]),
    df_t1["target"],
    test_size=0.3,
    random_state=42,
)

In [12]:
# Definir las características numéricas y categóricas
numeric_features = X_train_0.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X_train_0.select_dtypes(include=['object']).columns

In [13]:
import xgboost as xgb
import optuna
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import joblib

# Pipeline para características numéricas con PCA (opcional)
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),   # Escalado estándar
    # ('pca', PCA(n_components=50))  # Si deseas usar PCA, descomenta esta línea
])

# Pipeline para características categóricas con OneHotEncoder
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocesador que combina los dos pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),  # Numéricas con PCA
        ('cat', categorical_transformer, categorical_features)  # Categóricas con OneHot
    ]
)

# Definir la función de optimización de hiperparámetros para el primer entrenamiento
def objective_first(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'eta': trial.suggest_float('eta', 1e-3, 1e-1, log=True),
        'gamma': trial.suggest_float('gamma', 1e-3, 1e-1, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-3, 1e1, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 1e1, log=True),
        'verbosity': 0
    }

    # Crear el modelo con los parámetros sugeridos
    model = xgb.XGBClassifier(**params)

    # Crear el pipeline completo
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('xgb', model)
    ])

    # Entrenar el modelo en el primer conjunto de datos
    model_pipeline.fit(X_train_0, y_train_0)

    # Evaluar el modelo en el conjunto de validación
    y_pred_proba = model_pipeline.predict_proba(X_test_0)[:, 1]
    auc = roc_auc_score(y_test_0, y_pred_proba)
    return auc

# Ejecutar la optimización con Optuna para el primer conjunto de datos
study_first = optuna.create_study(direction='maximize')
study_first.optimize(objective_first, n_trials=50)

# Obtener los mejores parámetros del primer entrenamiento
best_params_first = study_first.best_params
print("Mejores parámetros para el primer modelo:", best_params_first)

# Crear el modelo final con los mejores parámetros obtenidos
model_first = xgb.XGBClassifier(**best_params_first)

# Entrenar el modelo con los mejores parámetros y todo el pipeline
model_pipeline_first = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgb', model_first)
])

# Entrenar el modelo en el primer conjunto de datos
model_pipeline_first.fit(X_train_0, y_train_0)

# Guardar el modelo entrenado
joblib.dump(model_pipeline_first, 'model_pipeline_first.pkl')

# Definir la función de optimización de hiperparámetros para el segundo entrenamiento (reentrenamiento)
def objective_second(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'eta': trial.suggest_float('eta', 1e-3, 1e-1, log=True),
        'gamma': trial.suggest_float('gamma', 1e-3, 1e-1, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-3, 1e1, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 1e1, log=True),
        'verbosity': 0
    }

    # Crear el modelo con los parámetros sugeridos
    model = xgb.XGBClassifier(**params)

    # Crear el pipeline completo
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('xgb', model)
    ])

    # Reentrenar el modelo en el segundo conjunto de datos
    model_pipeline.fit(X_train_1, y_train_1)

    # Evaluar el modelo en el conjunto de validación del segundo conjunto
    y_pred_proba = model_pipeline.predict_proba(X_test_1)[:, 1]
    auc = roc_auc_score(y_test_1, y_pred_proba)
    return auc

# Ejecutar la optimización con Optuna para el segundo conjunto de datos
study_second = optuna.create_study(direction='maximize')
study_second.optimize(objective_second, n_trials=50)

# Obtener los mejores parámetros del segundo reentrenamiento
best_params_second = study_second.best_params
print("Mejores parámetros para el segundo modelo:", best_params_second)

# Crear el modelo final con los mejores parámetros obtenidos
model_second = xgb.XGBClassifier(**best_params_second)

# Entrenar el modelo con los mejores parámetros y todo el pipeline
model_pipeline_second = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgb', model_second)
])

# Reentrenar el modelo en el segundo conjunto de datos
model_pipeline_second.fit(X_train_1, y_train_1)

# Evaluar el modelo final en el conjunto de validación del segundo conjunto
y_pred_proba = model_pipeline_second.predict_proba(X_test_1)[:, 1]
y_pred = (y_pred_proba > 0.5).astype(int)

# Calcular métricas de evaluación
metrics = {
    "AUC": roc_auc_score(y_test_1, y_pred_proba),
    "Accuracy": accuracy_score(y_test_1, y_pred),
    "Precision": precision_score(y_test_1, y_pred),
    "Recall": recall_score(y_test_1, y_pred),
    "F1-Score": f1_score(y_test_1, y_pred)
}

# Mostrar todas las métricas
for metric_name, value in metrics.items():
    print(f"{metric_name}: {value:.4f}")



[I 2024-12-05 19:00:33,891] A new study created in memory with name: no-name-cd3515b2-3e18-4ae7-8265-17b924033206
[I 2024-12-05 19:00:34,396] Trial 0 finished with value: 0.8973073832110783 and parameters: {'max_depth': 6, 'eta': 0.010694994221802671, 'gamma': 0.09932954523065837, 'subsample': 0.8668758387716123, 'colsample_bytree': 0.6699857540397218, 'lambda': 0.0035658340255091633, 'alpha': 0.04696183904865741}. Best is trial 0 with value: 0.8973073832110783.
[I 2024-12-05 19:00:36,506] Trial 1 finished with value: 0.9498262948506946 and parameters: {'max_depth': 10, 'eta': 0.04671001826685764, 'gamma': 0.012760409038279715, 'subsample': 0.8482650373626661, 'colsample_bytree': 0.9912908650536875, 'lambda': 0.002793213387553028, 'alpha': 0.7062549585554888}. Best is trial 1 with value: 0.9498262948506946.
[I 2024-12-05 19:00:36,984] Trial 2 finished with value: 0.930382423291652 and parameters: {'max_depth': 6, 'eta': 0.0901223470187328, 'gamma': 0.09914074724656664, 'subsample': 0.6

Mejores parámetros para el primer modelo: {'max_depth': 10, 'eta': 0.08246460320260632, 'gamma': 0.05909495061897502, 'subsample': 0.7217477057928843, 'colsample_bytree': 0.8349762231285862, 'lambda': 0.0021616058626457643, 'alpha': 0.29644943901514703}


[I 2024-12-05 19:01:50,153] A new study created in memory with name: no-name-cc58fa3e-09cb-4def-8711-4ddf0415238c
[I 2024-12-05 19:01:51,188] Trial 0 finished with value: 0.8750998540048622 and parameters: {'max_depth': 5, 'eta': 0.03143845968181066, 'gamma': 0.005165292371999061, 'subsample': 0.8777210887345823, 'colsample_bytree': 0.5025891843610275, 'lambda': 0.013692750443526074, 'alpha': 0.004181702446953714}. Best is trial 0 with value: 0.8750998540048622.
[I 2024-12-05 19:01:52,451] Trial 1 finished with value: 0.8726734689871956 and parameters: {'max_depth': 6, 'eta': 0.012376750458950475, 'gamma': 0.023022904735762947, 'subsample': 0.5894949967574334, 'colsample_bytree': 0.8234913318741208, 'lambda': 0.15155880962119173, 'alpha': 0.0025220220485178565}. Best is trial 0 with value: 0.8750998540048622.
[I 2024-12-05 19:01:56,771] Trial 2 finished with value: 0.909487028559161 and parameters: {'max_depth': 10, 'eta': 0.014721781044758889, 'gamma': 0.013827308942528328, 'subsample

Mejores parámetros para el segundo modelo: {'max_depth': 10, 'eta': 0.09414805757492885, 'gamma': 0.06397477831465628, 'subsample': 0.5306636963609598, 'colsample_bytree': 0.9716389723114033, 'lambda': 0.7471156416317043, 'alpha': 1.1309133358298906}
AUC: 0.9366
Accuracy: 0.8727
Precision: 0.9035
Recall: 0.7434
F1-Score: 0.8157


In [14]:
from zipfile import ZipFile
import os
def generateFiles(predict_data, clf_pipe):
    """Genera los archivos a subir en CodaLab

    Input
    ---------------
    predict_data: Dataframe con los datos de entrada a predecir
    clf_pipe: pipeline del clf

    Ouput
    ---------------
    archivo de txt
    """
    y_pred_clf = clf_pipe.predict_proba(predict_data)[:, 1]
    with open('./predictions.txt', 'w') as f:
        for item in y_pred_clf:
            f.write("%s\n" % item)
    
    with ZipFile('predictions.zip', 'w') as zipObj:
        zipObj.write('predictions.txt')
    os.remove('predictions.txt')

X_test = pd.read_parquet(DATA_PATH + "X_t2.parquet")

generateFiles(X_test, model_pipeline_second)