# Readme

La entrega cuenta con otra notebook (feature_engineering.ipynb) donde se agregan las nuevas variables al dataset.

Esta notebook contiene las celdas para la optimización bayesiana con Optuna y el entrenamiento de los modelos.

Hay dos variables que funcionan de flags y son importantes a la hora de ejecutar.

* 'run_bayesian_optimization' : flag booleano.

    * True: Ejecuta la optimización con Optuna y luego plotea los análisis del experimento.

    * False: Omite los experimentos y utiliza los hiperparámetros considerados óptimos, que están hardcodeados en la notebook.

* 'debug' : flag booleano.

    * True: Se entrenan los modelos con 2 semillas para reducir tiempos y hacer pruebas.

    * False: Entrena los modelos con todas las semillas para hacer entregas.

In [None]:
# --- Business constants used by the gain metrics ---
ganancia_acierto = 780000  # gain credited for every targeted BAJA+2 client
costo_estimulo = 20000     # cost charged for every targeted client that is not BAJA+2

# --- Execution flags (described in the Readme cell) ---
run_bayesian_optimization = False
debug = False

# --- Experiment identifiers ---
estudio = 8              # suffix of the Optuna study name / storage file
submission_number = 12   # suffix of the saved model and prediction files

# --- Seeds: full list for submissions, short list for debug runs ---
semillas_debug = [400, 1000]
semillas = [50, 100, 150, 400, 700, 1000, 1500, 2000, 3000]

# Probability cut-off at or above which a client is flagged as positive.
threshold = 0.015

In [None]:
%pip install optuna



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit

from joblib import Parallel, delayed

from time import time

import lightgbm as lgb

import optuna
from optuna.visualization import plot_param_importances, plot_contour,  plot_slice, plot_optimization_history
import pickle



In [None]:

# Mount Google Drive inside the Colab runtime; every path used below lives
# under /content/drive. This cell only works when running on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Project root on the mounted Drive; every other location hangs off it.
base_path = '/content/drive/MyDrive/maestria/dm-eyf/'

# Trained boosters are written to / read from this folder.
modelos_path = f'{base_path}modelos/'

# Input dataset (output of the feature-engineering notebook) and its folder.
dataset_path = f'{base_path}datos/'
dataset_file = 'competencia_01_fe.csv'

In [None]:
df = pd.read_csv(f"{dataset_path}{dataset_file}")

# Per-row weight that also encodes the ternary class: BAJA+2 and BAJA+1 get a
# tiny distinctive offset, every other row (including unlabeled ones) gets 1.0.
# lgb_gan_eval later recovers the class from these exact values.
pesos_por_clase = {'BAJA+2': 1.00002, 'BAJA+1': 1.00001}
df['clase_peso'] = df['clase_ternaria'].map(pesos_por_clase).fillna(1.0)


In [None]:
# Binary target: 1 only for BAJA+2, 0 for everything else (missing included).
# The ternary label is removed once the binary one has been derived.
df = (
    df
    .assign(clase_binaria=(df['clase_ternaria'] == 'BAJA+2').astype(int))
    .drop(columns=['clase_ternaria'])
)

In [None]:
# Keep the client ids of the evaluation month (202104) and of the month to be
# predicted (202106) before the id column is dropped from the features.
clientes_test = df.loc[df['foto_mes'] == 202104, "numero_de_cliente"]
clientes_predict = df.loc[df['foto_mes'] == 202106, "numero_de_cliente"]

In [None]:
# Columns removed from the modelling frame: the client id plus two features
# (the reason for excluding them is not stated in this notebook).
columnas_descartadas = ['numero_de_cliente', 'tmobile_app', 'mplazo_fijo_dolares']
df = df.drop(columns=columnas_descartadas)

In [None]:
def _filas_de_meses(meses):
  """Return the rows of `df` whose foto_mes is in `meses`, without the foto_mes column."""
  return df[df['foto_mes'].isin(meses)].drop(columns=['foto_mes'])


# Training months, hold-out month and the month to be scored for the submission.
df_train = _filas_de_meses([202101, 202102, 202103])
df_test = _filas_de_meses([202104])
df_predict = _filas_de_meses([202106])

# Training set for the final models: the training months plus the hold-out month.
df_train_predict = _filas_de_meses([202101, 202102, 202103, 202104])

In [None]:
# Extract labels and sample weights before stripping them from the features.
df_train_weight = df_train['clase_peso']
df_train_clase_binaria_baja = df_train['clase_binaria']
df_test_clase_binaria_baja = df_test['clase_binaria']

# NOTE: despite its name, this is the target of df_train_predict (the training
# set of the final models), not of df_predict.
df_predict_clase_binaria_baja = df_train_predict['clase_binaria']
df_train_predict_weight = df_train_predict['clase_peso']

# Both columns must leave the feature frames: 'clase_binaria' is the target and
# 'clase_peso' encodes the ternary class (1.00002 == BAJA+2). Leaving
# 'clase_peso' in as a feature leaks the label into training and into the
# hold-out evaluation, inflating the measured gain.
columnas_no_features = ['clase_binaria', 'clase_peso']

df_train = df_train.drop(columns=columnas_no_features)
df_train_predict = df_train_predict.drop(columns=columnas_no_features)
df_test = df_test.drop(columns=columnas_no_features)
df_predict = df_predict.drop(columns=columnas_no_features)

In [None]:
def lgb_gan_eval(y_pred, data):
    """Custom LightGBM evaluation metric: best achievable cumulative gain.

    Rows are ranked by predicted score (highest first); each row weighted
    exactly 1.00002 contributes `ganancia_acierto` and every row with a
    smaller weight costs `costo_estimulo`. The metric is the maximum of the
    running total over that ranking.

    Parameters:
        y_pred: predicted scores, one per row of `data`.
        data: object exposing `get_weight()` with the per-row weights.

    Returns:
        Tuple `('gan_eval', best_gain, True)`; the final flag marks the
        metric as higher-is-better.
    """
    pesos = data.get_weight()
    aciertos = np.where(pesos == 1.00002, ganancia_acierto, 0)
    costos = np.where(pesos < 1.00002, costo_estimulo, 0)

    # Highest scores first, then accumulate the per-row gain along the ranking.
    orden_descendente = np.argsort(y_pred)[::-1]
    ganancia_acumulada = np.cumsum((aciertos - costos)[orden_descendente])

    return 'gan_eval', np.max(ganancia_acumulada) , True

In [None]:
# Accumulator declared for the optimisation experiments.
resultados_medias = []

# Stratified 70/30 splitter handed to `objective` through its `sss` argument.
sss_opt = StratifiedShuffleSplit(
    n_splits=5,
    test_size=0.3,
    random_state=semillas[1],
)

def objective(trial, X, y, weight, sss):
  """Optuna objective: cross-validated gain of a LightGBM model for one trial.

  Parameters:
      trial: Optuna trial used to sample the hyperparameters.
      X: training features handed to `lgb.Dataset`.
      y: training labels.
      weight: per-row weights; `lgb_gan_eval` reads them to compute the gain.
      sss: accepted for interface compatibility; not used by this function
          (the folds come from `lgb.cv` with `nfold=5`).

  Returns:
      The maximum of the mean `gan_eval` metric across boosting rounds.
      The 1-based round where it occurs is stored in the trial's user
      attribute "best_iter".
  """
  # Suggestions are requested in a fixed order so the sampler sees the same
  # sequence of parameters on every trial.
  n_leaves = trial.suggest_int('num_leaves', 8, 80)
  lr = trial.suggest_float('learning_rate', 0.01, 0.4)
  depth = trial.suggest_int("max_depth", -1, 50)
  min_leaf = trial.suggest_int('min_data_in_leaf', 1, 1000)
  feat_fraction = trial.suggest_float('feature_fraction', 0.1, 1.0)
  bins = trial.suggest_int('max_bin', 255, 500)
  n_iterations = trial.suggest_int('num_iterations', 100, 500)

  params = dict(
      objective='binary',
      metric='auc',
      boosting_type='rf',
      first_metric_only=True,
      boost_from_average=True,
      feature_pre_filter=False,
      max_bin=bins,
      max_depth=depth,
      num_leaves=n_leaves,
      learning_rate=lr,
      min_data_in_leaf=min_leaf,
      feature_fraction=feat_fraction,
      seed=semillas[1],
      verbose=-1,
      num_iterations=n_iterations,
  )

  train_data = lgb.Dataset(X, label=y, weight=weight)

  # Early-stopping patience scales inversely with the learning rate.
  patience = int(55 / lr)

  # NOTE(review): 'num_iterations' is also present in params; LightGBM
  # presumably uses it instead of num_boost_round — confirm against the docs.
  cv_results = lgb.cv(
      params,
      train_data,
      num_boost_round=110,
      callbacks=[lgb.early_stopping(patience)],
      feval=lgb_gan_eval,
      stratified=True,
      nfold=5
  )

  gan_por_ronda = cv_results['valid gan_eval-mean']
  max_gan = max(gan_por_ronda)
  best_iter = gan_por_ronda.index(max_gan) + 1

  trial.set_user_attr("best_iter", best_iter)

  return max_gan

# Optuna study persisted in a SQLite file next to the datasets. The location is
# derived from `dataset_path` (instead of repeating the absolute Drive path) so
# that changing `base_path` moves the study together with the rest of the data.
# `dataset_path` is absolute, hence the resulting "sqlite:////..." URL.
storage_name = f"sqlite:///{dataset_path}optimization_lgbm_{estudio}.db"
study_name = f"exp_{estudio}_lgbm-opt"

# load_if_exists=True resumes a previously stored study with the same name
# instead of failing or starting from scratch.
study = optuna.create_study(
    direction="maximize",
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,
)

[I 2025-10-12 21:59:30,965] Using an existing study with name 'exp_8_lgbm-opt' instead of creating a new one.


In [None]:
if run_bayesian_optimization:
  def _objective_sobre_train(trial):
    """Bind the training data to `objective` so Optuna only supplies the trial."""
    return objective(trial, df_train, df_train_clase_binaria_baja, df_train_weight, sss_opt)

  study.optimize(_objective_sobre_train, n_trials=50)



In [None]:
# Optimisation history of the study; nothing is shown when the search was skipped.
visualization = (
    optuna.visualization.plot_optimization_history(study)
    if run_bayesian_optimization
    else None
)
visualization

In [None]:
# Hyperparameter importances; nothing is shown when the search was skipped.
visualization = plot_param_importances(study) if run_bayesian_optimization else None
visualization

In [None]:
# Per-parameter slice plot; nothing is shown when the search was skipped.
visualization = plot_slice(study) if run_bayesian_optimization else None
visualization

In [None]:
# Contour plot over every tuned parameter; nothing is shown when the search was skipped.
visualization = plot_contour(study) if run_bayesian_optimization else None
visualization

In [None]:
visualization = None
if run_bayesian_optimization:
  # Parameters of interest for the focused contour plot.
  contour_params = ["num_leaves", "learning_rate", 'min_data_in_leaf','feature_fraction', 'bagging_fraction']

  # plot_contour raises ValueError when asked for a parameter that no completed
  # trial contains. `objective` in this notebook does not suggest
  # 'bagging_fraction', so keep only the names actually present in the study.
  completed_trials = study.get_trials(deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,))
  tuned_names = {name for finished in completed_trials for name in finished.params}
  visualization = plot_contour(study, params=[p for p in contour_params if p in tuned_names])
visualization

In [None]:
from lightgbm import LGBMClassifier

def build_and_save_models(semillas, train_dataset, y_target, weight, is_test):
  """Train one LightGBM model per seed, save each to disk and return them.

  Hyperparameters come from the best Optuna trial when the global
  `run_bayesian_optimization` is True, otherwise from the hard-coded set
  below. Also reads the globals `study`, `modelos_path` and
  `submission_number`.

  Parameters:
      semillas: iterable of seeds; one model is trained per seed.
      train_dataset: training features handed to `lgb.Dataset`.
      y_target: training labels.
      weight: per-row training weights.
      is_test: True for hold-out evaluation models ("test" files), False for
          the final submission models ("predict" files).

  Returns:
      dict mapping each seed to its trained `lgb.Booster`.
  """
  train_data = lgb.Dataset(train_dataset,
                              label=y_target,
                              weight=weight)

  base_params = {
      'objective': 'binary',
      'metric': 'auc',
      'boosting_type': 'rf',
      'first_metric_only': True,
      'boost_from_average': True,
      'feature_pre_filter': False,
      'max_bin': 31,
      'verbose': -1,
  }

  # The number of trees is passed explicitly as num_boost_round. Leaving an
  # iteration alias ('num_iterations' / 'n_estimators') inside params makes
  # LightGBM silently ignore num_boost_round.
  if run_bayesian_optimization:
    best_trial = study.best_trial
    tuned_params = dict(best_trial.params)
    tuned_params.pop('num_iterations', None)
    # Train for the round count that maximised the CV gain for this trial.
    num_boost_round = best_trial.user_attrs["best_iter"]
  else:
    # NOTE(review): 'min_child_samples' is a LightGBM alias of
    # 'min_data_in_leaf'; both are kept as originally recorded — confirm
    # which value is intended.
    tuned_params = {'num_leaves': 73, 'learning_rate': 0.2497842951354348, 'max_depth': 12, 'min_data_in_leaf': 633, 'feature_fraction': 0.17951553564916345, 'bagging_fraction': 0.7869774609783072, 'min_child_samples': 58}
    # Same tree count the original 'n_estimators': 582 entry produced.
    num_boost_round = 582

  # File names must match the pattern the loader cells look for
  # (lgb_test_{seed}_{submission}.txt / lgb_predict_{seed}_{submission}.txt),
  # otherwise saved models are never found and are retrained on every run.
  model_kind = 'test' if is_test else 'predict'

  modelos = {}
  print(f"Construimos los modelos para las semillas : {semillas}")

  for seed in semillas:
    print(f"Semilla: {seed}")

    params = {**base_params, 'seed': seed, **tuned_params}
    model = lgb.train(params,
                      train_data,
                      num_boost_round=num_boost_round)

    modelos[seed] = model
    model.save_model(modelos_path + f'lgb_{model_kind}_{seed}_{submission_number}.txt')
  return modelos

In [None]:
#lgb.plot_importance(model, figsize=(30, 40))
#plt.show()

In [None]:
#importances = model.feature_importances_
#feature_names = X_train.columns.tolist()
#importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
#importance_df = importance_df.sort_values('importance', ascending=False)
#importance_df[importance_df['importance'] > 0]

In [None]:
# Find the 20 least important features
#least_important_features = importance_df.tail(40)
#least_important_features['feature'].to_list()

In [None]:
def ganancia_prob(y_pred, y_true, threshold,prop = 1):
  """Gain obtained by targeting every row whose score reaches `threshold`.

  Each selected row with `y_true == 1` adds `ganancia_acierto`; each selected
  row with `y_true == 0` subtracts `costo_estimulo`. The total is divided by
  `prop`, which lets callers scale a partial sample up to the full set.
  """
  aciertos = np.where(y_true == 1, ganancia_acierto, 0)
  costos = np.where(y_true == 0, costo_estimulo, 0)
  seleccionados = y_pred >= threshold
  return (aciertos - costos)[seleccionados].sum() / prop

In [None]:
def binarize_predictions(y_pred, binarization_threshold):
    """Map scores to 0/1 labels: 1 where the score is at or above the threshold."""
    supera_umbral = y_pred >= binarization_threshold
    return np.where(supera_umbral, 1, 0)

In [None]:
def build_predictions(clientes, modelos, dataset, threshold,y_true=None):
  """Score `dataset` with every model and binarize the averaged scores.

  Parameters:
      clientes: client ids, one per row of `dataset`.
      modelos: dict seed -> model exposing `predict(dataset)`.
      dataset: feature rows to score.
      threshold: cut-off applied to the mean score (and to each model's
          scores when reporting its individual gain).
      y_true: optional true labels; when given, the gain of each individual
          model is printed.

  Returns:
      DataFrame with columns 'numero_de_cliente' and 'Predicted' (0/1).
  """
  scores_por_semilla = {}
  for seed in modelos:
    print(f"Semilla: {seed}")
    scores = modelos[seed].predict(dataset)
    print(scores)
    scores_por_semilla[seed] = scores
    if y_true is None:
      continue
    print(f"Ganancias de Modelo con semilla {seed}:", ganancia_prob(scores, y_true,threshold))

  # Ensemble: plain average of the per-seed scores, row by row.
  ensamble = np.mean(list(scores_por_semilla.values()), axis=0)
  etiquetas = binarize_predictions(ensamble, threshold)
  return pd.DataFrame({'numero_de_cliente': clientes, 'Predicted': etiquetas})

In [None]:
def private_vs_public(modelos, dataset, y_true):
  """Simulate public/private leaderboard gains for each model and plot them.

  The rows of `dataset` are repeatedly split 70/30 (stratified on `y_true`);
  the 30% part plays the public leaderboard and the 70% part the private one.
  For every split and model the gain is computed on each part and scaled by
  the part's proportion. One histogram per (model, part) is shown.

  Reads the globals `semillas` (split seed) and `threshold` (cut-off).

  Parameters:
      modelos: dict seed -> model exposing `predict(dataset)`.
      dataset: DataFrame with the feature rows to score.
      y_true: Series of true labels aligned positionally with `dataset`.
  """
  sss_futuro = StratifiedShuffleSplit(n_splits=15,
                              test_size=0.3,
                              random_state=semillas[1])

  # Predictions do not depend on the split, so score the full dataset once per
  # model and slice the labels inside the loop instead of re-predicting the
  # same rows for each of the 15 splits.
  etiquetas_por_modelo = {
      seed: binarize_predictions(model.predict(dataset), threshold)
      for seed, model in modelos.items()
  }

  rows = []
  for private_index, public_index in sss_futuro.split(dataset, y_true):
    row = {}
    for seed, etiquetas in etiquetas_por_modelo.items():
      row[str(seed) + "_public"] = ganancia_prob(etiquetas[public_index], y_true.iloc[public_index], threshold, 0.3)
      row[str(seed) + "_private"] = ganancia_prob(etiquetas[private_index], y_true.iloc[private_index], threshold, 0.7)
    rows.append(row)
  df_lb = pd.DataFrame(rows)

  # Long format: one row per (split, model, public/private) with its gain.
  df_lb_long = df_lb.reset_index()
  df_lb_long = df_lb_long.melt(id_vars=['index'], var_name='model_type', value_name='ganancia')
  df_lb_long[['modelo', 'tipo']] = df_lb_long['model_type'].str.split('_', expand=True)
  df_lb_long = df_lb_long[['ganancia', 'tipo', 'modelo']]

  g = sns.FacetGrid(df_lb_long, col="tipo", row="modelo", aspect=2)
  g.map(sns.histplot, "ganancia", kde=True)
  plt.show()

In [None]:
# Debug runs train with the short seed list to keep iterations fast.
semillas = semillas_debug if debug else semillas

In [None]:
import os
from lightgbm import LGBMClassifier

# Load previously saved hold-out ("test") models, one per seed.
# Training is skipped only when EVERY seed has a saved model: a single missing
# file triggers a full retrain, so the ensemble never silently shrinks to the
# subset of seeds that happened to be cached.
train_test_models = False

test_models = {}
for seed in semillas:
  model_file_path = modelos_path + f'lgb_test_{seed}_{submission_number}.txt'
  if os.path.exists(model_file_path):
    print(f"Cargamos el modelo de Test de la submission {submission_number} para la semilla {seed}")
    test_models[seed] = lgb.Booster(model_file=model_file_path)
  else:
    print(f"El modelo de Test para la semilla {seed} no existe en {model_file_path}. Se entrenará.")
    train_test_models = True

El modelo de Test para la semilla 50 no existe en /content/drive/MyDrive/maestria/dm-eyf/modelos/lgb_test_50_12.txt. Se entrenará.
El modelo de Test para la semilla 100 no existe en /content/drive/MyDrive/maestria/dm-eyf/modelos/lgb_test_100_12.txt. Se entrenará.
El modelo de Test para la semilla 150 no existe en /content/drive/MyDrive/maestria/dm-eyf/modelos/lgb_test_150_12.txt. Se entrenará.
El modelo de Test para la semilla 400 no existe en /content/drive/MyDrive/maestria/dm-eyf/modelos/lgb_test_400_12.txt. Se entrenará.
El modelo de Test para la semilla 700 no existe en /content/drive/MyDrive/maestria/dm-eyf/modelos/lgb_test_700_12.txt. Se entrenará.
El modelo de Test para la semilla 1000 no existe en /content/drive/MyDrive/maestria/dm-eyf/modelos/lgb_test_1000_12.txt. Se entrenará.
El modelo de Test para la semilla 1500 no existe en /content/drive/MyDrive/maestria/dm-eyf/modelos/lgb_test_1500_12.txt. Se entrenará.
El modelo de Test para la semilla 2000 no existe en /content/drive/

In [None]:
# Train (and save) the hold-out models when they could not be loaded from disk.
if train_test_models:
  test_models = build_and_save_models(
      semillas=semillas,
      train_dataset=df_train,
      y_target=df_train_clase_binaria_baja,
      weight=df_train_weight,
      is_test=True,
  )



Construimos los models para las semillas : [50, 100, 150, 400, 700, 1000, 1500, 2000, 3000]
Semilla: 50
Semilla: 100
Semilla: 150
Semilla: 400
Semilla: 700
Semilla: 1000
Semilla: 1500
Semilla: 2000
Semilla: 3000


In [None]:
# Load previously saved submission ("predict") models, one per seed.
# Training is skipped only when EVERY seed has a saved model: a single missing
# file triggers a full retrain, so the ensemble never silently shrinks to the
# subset of seeds that happened to be cached.
train_predict_models = False

predict_models = {}
for seed in semillas:
  model_file_path = modelos_path + f'lgb_predict_{seed}_{submission_number}.txt'
  if os.path.exists(model_file_path):
    print(f"Cargamos el modelo de Predicción de la submission {submission_number} para la semilla {seed}")
    predict_models[seed] = lgb.Booster(model_file=model_file_path)
  else:
    print(f"El modelo de Predicción para la semilla {seed} no existe en {model_file_path}. Se entrenará.")
    train_predict_models = True

El modelo de Predicción para la semilla 50 no existe en /content/drive/MyDrive/maestria/dm-eyf/modelos/lgb_predict_50_12.txt. Se entrenará.
El modelo de Predicción para la semilla 100 no existe en /content/drive/MyDrive/maestria/dm-eyf/modelos/lgb_predict_100_12.txt. Se entrenará.
El modelo de Predicción para la semilla 150 no existe en /content/drive/MyDrive/maestria/dm-eyf/modelos/lgb_predict_150_12.txt. Se entrenará.
El modelo de Predicción para la semilla 400 no existe en /content/drive/MyDrive/maestria/dm-eyf/modelos/lgb_predict_400_12.txt. Se entrenará.
El modelo de Predicción para la semilla 700 no existe en /content/drive/MyDrive/maestria/dm-eyf/modelos/lgb_predict_700_12.txt. Se entrenará.
El modelo de Predicción para la semilla 1000 no existe en /content/drive/MyDrive/maestria/dm-eyf/modelos/lgb_predict_1000_12.txt. Se entrenará.
El modelo de Predicción para la semilla 1500 no existe en /content/drive/MyDrive/maestria/dm-eyf/modelos/lgb_predict_1500_12.txt. Se entrenará.
El m

In [None]:
# Train (and save) the submission models when they could not be loaded from disk.
if train_predict_models:
  predict_models = build_and_save_models(
      semillas=semillas,
      train_dataset=df_train_predict,
      y_target=df_predict_clase_binaria_baja,
      weight=df_train_predict_weight,
      is_test=False,
  )

Construimos los models para las semillas : [50, 100, 150, 400, 700, 1000, 1500, 2000, 3000]
Semilla: 50
Semilla: 100
Semilla: 150
Semilla: 400
Semilla: 700
Semilla: 1000
Semilla: 1500
Semilla: 2000
Semilla: 3000


In [None]:
# Ensemble prediction on the hold-out month; per-seed gains are printed
# because the true labels are supplied.
test_predictions = build_predictions(
    clientes=clientes_test,
    modelos=test_models,
    dataset=df_test,
    threshold=threshold,
    y_true=df_test_clase_binaria_baja,
)

Semilla: 50
[0.0022148  0.00220403 0.00642298 ... 0.00589776 0.00340636 0.00266803]
Ganancias de Modelo con semilla 50: 737600000.0
Semilla: 100
[0.00220695 0.00220542 0.00723708 ... 0.00431245 0.00355499 0.00262798]
Ganancias de Modelo con semilla 100: 733600000.0
Semilla: 150
[0.00220316 0.00220316 0.00699901 ... 0.00582651 0.00345026 0.00270207]
Ganancias de Modelo con semilla 150: 734580000.0
Semilla: 400
[0.00220419 0.00220419 0.00634693 ... 0.00551365 0.00339337 0.00269484]
Ganancias de Modelo con semilla 400: 738680000.0
Semilla: 700
[0.00220288 0.00220288 0.00748512 ... 0.00447748 0.0034343  0.00260793]
Ganancias de Modelo con semilla 700: 736740000.0
Semilla: 1000
[0.00220683 0.00220683 0.00645021 ... 0.0048593  0.00353122 0.00270924]
Ganancias de Modelo con semilla 1000: 731600000.0
Semilla: 1500
[0.00221062 0.00221062 0.0070547  ... 0.00462166 0.00362126 0.00286754]
Ganancias de Modelo con semilla 1500: 724800000.0
Semilla: 2000
[0.00220066 0.00220066 0.00604465 ... 0.005370

In [None]:
# How many hold-out clients end up flagged (1) versus not flagged (0).
test_predictions.loc[:, "Predicted"].value_counts()

Unnamed: 0_level_0,count
Predicted,Unnamed: 1_level_1
0,154925
1,8493


In [None]:
# Ensemble prediction for the month to submit; no labels exist, so no gain is reported.
kaggle_predictions = build_predictions(
    clientes=clientes_predict,
    modelos=predict_models,
    dataset=df_predict,
    threshold=threshold,
    y_true=None,
)

Semilla: 50
[0.00231883 0.00231883 0.00720729 ... 0.04491245 0.01306287 0.01999197]
Semilla: 100
[0.00232261 0.00232029 0.00870493 ... 0.05158691 0.01388792 0.02198643]
Semilla: 150
[0.00231811 0.00231811 0.00838997 ... 0.05067763 0.01277869 0.02095379]
Semilla: 400
[0.00231919 0.00231919 0.00881202 ... 0.04348632 0.01277622 0.02026289]
Semilla: 700
[0.00231775 0.00231775 0.00830336 ... 0.04697103 0.01120965 0.02034119]
Semilla: 1000
[0.00232215 0.00232215 0.00805505 ... 0.05323175 0.01378581 0.02259322]
Semilla: 1500
[0.00232773 0.00232603 0.00872955 ... 0.05709232 0.0127921  0.02373459]
Semilla: 2000
[0.00231509 0.00231509 0.00871104 ... 0.04501386 0.01139906 0.01975826]
Semilla: 3000
[0.00231859 0.00231859 0.00885993 ... 0.0436218  0.01333286 0.02071133]
dict_values([array([0.00231883, 0.00231883, 0.00720729, ..., 0.04491245, 0.01306287,
       0.01999197]), array([0.00232261, 0.00232029, 0.00870493, ..., 0.05158691, 0.01388792,
       0.02198643]), array([0.00231811, 0.00231811, 0.

In [None]:
# How many clients of the submission month end up flagged (1) versus not flagged (0).
kaggle_predictions.loc[:, "Predicted"].value_counts()

Unnamed: 0_level_0,count
Predicted,Unnamed: 1_level_1
0,156790
1,7523


In [None]:
# Simulated public/private leaderboard gain distributions on the hold-out month.
private_vs_public(
    modelos=test_models,
    dataset=df_test,
    y_true=df_test_clase_binaria_baja,
)

In [None]:
# Write the submission file next to the datasets, tagged with the submission number.
kaggle_predictions.to_csv(f"{dataset_path}predictions_{submission_number}.csv", index=False)