In [14]:
import pandas as pd
import os
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV 


In [4]:
# Directorio base
base_dir = os.path.join('..','data', 'train_test_val')

# Cargar los DataFrames
train_df = pd.read_csv(os.path.join(base_dir, 'train.csv'))
test_df = pd.read_csv(os.path.join(base_dir, 'test.csv'))
val_df = pd.read_csv(os.path.join(base_dir, 'val.csv'))

# Declaracion de la variable objetivo
target = 'income'

In [7]:
X_train = train_df.drop(columns=[target])
y_train = train_df[target]

X_val = val_df.drop(columns=[target])
y_val = val_df[target]

X_test = test_df.drop(columns=[target])
y_test = test_df[target]

In [11]:
#definimos el modelo
model =LGBMClassifier(random_state=42)
model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 7961, number of negative: 25170
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002208 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 694
[LightGBM] [Info] Number of data points in the train set: 33131, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.240289 -> initscore=-1.151098
[LightGBM] [Info] Start training from score -1.151098


In [17]:
y_train_pred = model.predict(X_train)
print("--- Entrenamiento ---")
print(confusion_matrix(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))
print("F1 (train):", f1_score(y_train, y_train_pred))
print("--- Validacion ---")
y_val_pred = model.predict(X_val)
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))
print("F1:", f1_score(y_val, y_val_pred))

--- Entrenamiento ---
[[23919  1251]
 [ 2584  5377]]
              precision    recall  f1-score   support

           0       0.90      0.95      0.93     25170
           1       0.81      0.68      0.74      7961

    accuracy                           0.88     33131
   macro avg       0.86      0.81      0.83     33131
weighted avg       0.88      0.88      0.88     33131

F1 (train): 0.7371307149222016
--- Validacion ---
[[5090  304]
 [ 560 1146]]
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      5394
           1       0.79      0.67      0.73      1706

    accuracy                           0.88      7100
   macro avg       0.85      0.81      0.82      7100
weighted avg       0.87      0.88      0.87      7100

F1: 0.7262357414448669


In [18]:
weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

model2 = LGBMClassifier(random_state=42, scale_pos_weight=weight, learning_rate=0.05, max_depth=10, n_estimators=1000, num_leaves=31, colsample_bytree=0.8, subsample=0.8, min_child_samples=20)
model2.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 7961, number of negative: 25170
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002071 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 694
[LightGBM] [Info] Number of data points in the train set: 33131, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.240289 -> initscore=-1.151098
[LightGBM] [Info] Start training from score -1.151098


In [19]:
y_train_pred2 = model2.predict(X_train)
print("--- Entrenamiento ---")
print(confusion_matrix(y_train, y_train_pred2))
print(classification_report(y_train, y_train_pred2))
print("F1 (train):", f1_score(y_train, y_train_pred2))
print("--- Validacion ---")
y_val_pred2 = model2.predict(X_val)
print(confusion_matrix(y_val, y_val_pred2))
print(classification_report(y_val, y_val_pred2))
print("F1:", f1_score(y_val, y_val_pred2))

--- Entrenamiento ---
[[22012  3158]
 [  386  7575]]
              precision    recall  f1-score   support

           0       0.98      0.87      0.93     25170
           1       0.71      0.95      0.81      7961

    accuracy                           0.89     33131
   macro avg       0.84      0.91      0.87     33131
weighted avg       0.92      0.89      0.90     33131

F1 (train): 0.8104204557612068
--- Validacion ---
[[4569  825]
 [ 275 1431]]
              precision    recall  f1-score   support

           0       0.94      0.85      0.89      5394
           1       0.63      0.84      0.72      1706

    accuracy                           0.85      7100
   macro avg       0.79      0.84      0.81      7100
weighted avg       0.87      0.85      0.85      7100

F1: 0.7223624432104997


Probamos con la tecnica de oversampling a ver si conseguimos mejores resultados.

In [None]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

In [22]:
model3 = LGBMClassifier(random_state=42, scale_pos_weight=weight, learning_rate=0.05, max_depth=10, n_estimators=1000, num_leaves=31, colsample_bytree=0.8, subsample=0.8, min_child_samples=20)
model3.fit(X_res, y_res)

[LightGBM] [Info] Number of positive: 25170, number of negative: 25170
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007034 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7847
[LightGBM] [Info] Number of data points in the train set: 50340, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [23]:
y_train_pred3 = model3.predict(X_res)
print("--- Entrenamiento ---")
print(confusion_matrix(y_res, y_train_pred3))
print(classification_report(y_res, y_train_pred3))
print("F1 (train):", f1_score(y_res, y_train_pred3))
print("--- Validacion ---")
y_val_pred3 = model3.predict(X_val)
print(confusion_matrix(y_val, y_val_pred3))
print(classification_report(y_val, y_val_pred3))
print("F1:", f1_score(y_val, y_val_pred3))

--- Entrenamiento ---
[[21821  3349]
 [  400 24770]]
              precision    recall  f1-score   support

           0       0.98      0.87      0.92     25170
           1       0.88      0.98      0.93     25170

    accuracy                           0.93     50340
   macro avg       0.93      0.93      0.93     50340
weighted avg       0.93      0.93      0.93     50340

F1 (train): 0.9296477697085702
--- Validacion ---
[[4549  845]
 [ 246 1460]]
              precision    recall  f1-score   support

           0       0.95      0.84      0.89      5394
           1       0.63      0.86      0.73      1706

    accuracy                           0.85      7100
   macro avg       0.79      0.85      0.81      7100
weighted avg       0.87      0.85      0.85      7100

F1: 0.7279980054849164


Observamos que con la tecnica de SMOTE mejora significativamente, aumenta la precision, el f1-score y recall en la clase minoritaria, y en validacion conseguimos que disminuyan los fasos negativos y aumenten las metricas. Realizaremos un gridsearch para ver si podemos optimizar aun mas los hiperparametros.

### Optimizacion de hiperparametros

In [28]:
from sklearn.model_selection import RandomizedSearchCV

In [35]:
param_dist = {
    'num_leaves': [15, 31, 63],
    'max_depth': [5, 10, 15],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 500, 1000],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_samples': [10, 20, 50],
    'scale_pos_weight': [3.1616631076497925]
    
}

random_search = RandomizedSearchCV(
    estimator=model3,
    param_distributions=param_dist,
    n_iter=30,
    scoring="f1",
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_res, y_res)
print("Mejores hiperparámetros: ", random_search.best_params_)
print("Mejor score: ", random_search.best_score_)

# Mejor modelo encontrado
best_model = random_search.best_estimator_

# Evaluación sobre el conjunto de validación
print("Mejor score (en validación): ", best_model.score(X_val, y_val))

# Predicción sobre el conjunto de validación
y_val_pred = best_model.predict(X_val)

# Métricas de evaluación
print("F1 en validación:", f1_score(y_val, y_val_pred))
print("Classification Report:\n", classification_report(y_val, y_val_pred))


Fitting 3 folds for each of 30 candidates, totalling 90 fits
[LightGBM] [Info] Number of positive: 25170, number of negative: 25170
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008544 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7847
[LightGBM] [Info] Number of data points in the train set: 50340, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Mejores hiperparámetros:  {'subsample': 1.0, 'scale_pos_weight': 3.1616631076497925, 'num_leaves': 15, 'n_estimators': 100, 'min_child_samples': 20, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 0.6}
Mejor score:  0.8222888570027962
Mejor score (en validación):  0.7214084507042253
F1 en validación: 0.6187355435620663
Classification Report:
               precision    recall  f1-score   support

           0       0.97  