In [1]:
!pip install optuna==3.6.1

Collecting optuna==3.6.1
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
Installing collected packages: optuna
  Attempting uninstall: optuna
    Found existing installation: optuna 4.5.0
    Uninstalling optuna-4.5.0:
      Successfully uninstalled optuna-4.5.0
Successfully installed optuna-3.6.1


In [3]:
import polars as pl

In [9]:
# @title
# src/optimization.py (actualizar)
from token import SEMI
import optuna
import lightgbm as lgb

import numpy as np
import os
import datetime



In [10]:
# @title
dataset_path = './'
modelos_directory = './'
csv = "competencia_02_fe.csv"
sufix = "us-0-1"

In [46]:
## VARIABLES DE ENTORNO Y CONFIGURACION
MES_TRAIN = [202002,202003, 202004, 202005, 202006, 202007, 202008, 202009, 202010,202011,202012,202101]
IS_EXPERIMENTO = False
MES_VALIDACION = 202102
STUDY_NAME = "study-comp2-us-0-1-vieja-opt"
GANANCIA_ACIERTO = 780000
COSTO_ESTIMULO = 20000
FINAL_PREDICT = 202108
MES_TEST = 202103
FINAL_TRAIN = [202003, 202004, 202005,202006, 202007, 202008, 202009, 202010,202011,202012, 202101,202102,202103,202104,202105,202106]
SEMILLA = [50,100,150,400,700,1000,1500,2000,3000,4000,5000,10000,15000,20000,25000]
UNDERSAMPLE_FRACTION = 0.1
RUN_BAYESIAN_OPTIMIZATION = False
N_TRIALS = 30
APLICAR_UNDERSAMPLING= False

In [12]:
# @title
def drop_columns(df : pl.DataFrame):

    col_drops = ["Visa_Finiciomora","Visa_Finiciomora",
          "Visa_fultimo_cierre", "Master_fultimo_cierre",
          "Visa_Fvencimiento", "Master_Fvencimiento",'tmobile_app','mprestamos_personales','cprestamos_personales'
      ]

    if "Master_Finiciomora" in df.columns:
      col_drops.append("Master_Finiciomora")

    df = df.drop(col_drops)
    return df

In [13]:
# @title


def ganancia_optima_idealizada(df :pl.DataFrame, ternaria : pl.Series) -> float:

  df_ganancias = df.hstack(ternaria.to_frame())
  df_ganancias = df_ganancias.with_columns(
      pl.when(pl.col('clase_ternaria').is_in(["BAJA+2"]))
        .then(780000)
        .alias('ganancia_individual')
  )

  ganancia = df_ganancias['ganancia_individual'].sum()
  return ganancia

In [14]:
# @title
def undersample_df(df: pl.DataFrame, fraction) -> pl.DataFrame:
  clientes_solo_continuas = df.group_by("numero_de_cliente").agg(n_bajas=pl.col("clase_binaria").sum()).filter(pl.col("n_bajas") == 0)
  clientes_solo_continuas_undersampled = clientes_solo_continuas.sample(fraction=1-fraction, seed=1000)
  df = df.filter(~pl.col('numero_de_cliente').is_in(clientes_solo_continuas_undersampled["numero_de_cliente"]))
  return df


In [15]:
# @title
def generate_clase_peso(df : pl.DataFrame):

    df = df.with_columns(
        pl.lit(1.0).alias('clase_peso')
    ).with_columns(
        pl.when(pl.col('clase_ternaria') == 'BAJA+2')
        .then(pl.lit(1.00002))
        .otherwise(pl.col('clase_peso'))
        .alias('clase_peso')
    ).with_columns(
        pl.when(pl.col('clase_ternaria') == 'BAJA+1')
        .then(pl.lit(1.00001))
        .otherwise(pl.col('clase_peso'))
        .alias('clase_peso')
    )

    return df

## SE BINARIZA LA CLASE OBJETIVO.
def generate_clase_binaria(df : pl.DataFrame):

    df = df.with_columns(pl.lit(0).alias('clase_binaria'))

    df = df.with_columns(
        pl.when(pl.col('clase_ternaria').is_in(['BAJA+2'])).then(pl.lit(1)).otherwise(pl.lit(0)).alias('clase_binaria')
    )

    return df

In [19]:
df_crudo = pl.read_csv(os.path.join(csv), infer_schema_length=None)
df_crudo = df_crudo.sort(by=["numero_de_cliente", "foto_mes"], descending=[False, False])

In [21]:
df = generate_clase_peso(df_crudo)
df = generate_clase_binaria(df)
df = drop_columns(df)

df_base_train_optuna = df.filter(pl.col('foto_mes').is_in(MES_TRAIN))
if APLICAR_UNDERSAMPLING:
  df_base_train_optuna = undersample_df(df_base_train_optuna, UNDERSAMPLE_FRACTION)

df_train_for_optuna_targets = df_base_train_optuna.select(['numero_de_cliente', 'clase_binaria','clase_peso'])
df_train_for_optuna_features = df_base_train_optuna.drop(['numero_de_cliente', 'clase_binaria','clase_peso','foto_mes',"clase_ternaria"])

# --- Prepare data for final model testing/training (MES_TRAIN, always full size) ---
df_base_train_full = df.filter(pl.col('foto_mes').is_in(MES_TRAIN))
df_train_for_testing_targets = df_base_train_full.select(['numero_de_cliente', 'clase_binaria','clase_peso'])
df_train_for_testing_features = df_base_train_full.drop(['numero_de_cliente', 'clase_binaria','clase_peso','foto_mes',"clase_ternaria"])


# MES_TEST, FINAL_PREDICT, FINAL_TRAIN, MES_VALIDACION)
df_test_base = df.filter(pl.col('foto_mes') == MES_TEST)
df_test_with_target = df_test_base.select(['numero_de_cliente', 'clase_binaria','clase_peso', "clase_ternaria"])
df_test_features = df_test_base.drop(['clase_binaria','clase_peso','foto_mes',"clase_ternaria"])

df_predict_base = df.filter(pl.col('foto_mes') == FINAL_PREDICT)
df_predict_with_target = df_predict_base.select(['numero_de_cliente'])
if IS_EXPERIMENTO:
  df_predict_with_target = df_predict_base.select(['numero_de_cliente', 'clase_binaria','clase_peso','clase_ternaria'])
  df_predict_features = df_predict_base.drop(['clase_binaria','clase_peso','foto_mes',"clase_ternaria"])
else:
  df_predict_features = df_predict_base.drop(['clase_binaria','clase_peso','foto_mes',"clase_ternaria"])

df_train_predict_base = df.filter(pl.col('foto_mes').is_in(FINAL_TRAIN))
df_train_predict_with_target = df_train_predict_base.select(['numero_de_cliente', 'clase_binaria','clase_peso'])
df_train_predict_features = df_train_predict_base.drop(['numero_de_cliente','clase_binaria','clase_peso','foto_mes',"clase_ternaria"])

#df_val_base = df.filter(pl.col('foto_mes') == MES_VALIDACION)
#df_val_with_target = df_val_base.select(['numero_de_cliente', 'clase_binaria','clase_peso'])
#df_val_features = df_val_base.drop(['clase_binaria','clase_peso','foto_mes',"clase_ternaria"])

In [22]:
# @title
## SE ARMAN LAS PREDICCIONES PROMEDIADAS
def build_predictions(modelos, dataset : pl.DataFrame) -> pl.DataFrame:
  predicciones = {}

  # Sort the dataset by numero_de_cliente to ensure consistent order
  dataset = dataset.sort("numero_de_cliente")

  clientes = dataset["numero_de_cliente"]
  df_to_predict = dataset.drop(["numero_de_cliente"])

  # Convert the dataset to a NumPy array for LightGBM prediction
  dataset_np = df_to_predict.to_numpy()

  for seed,model in modelos.items():
    if seed in SEMILLA:
      predictions = model.predict(dataset_np)
      predicciones[seed] = predictions

  mean_predictions = np.mean(list(predicciones.values()), axis=0)
  return pl.DataFrame({'numero_de_cliente': clientes, 'Predicted': mean_predictions})

In [23]:
# @title
def lgb_gan_eval(y_pred, data: lgb.Dataset):
  weight = data.get_weight()

  ganancia = np.where(weight == 1.00002, GANANCIA_ACIERTO, 0) - np.where(
      weight < 1.00002, COSTO_ESTIMULO, 0
  )

  ganancia = ganancia[np.argsort(y_pred)[::-1]]
  ganancia = np.cumsum(ganancia)

  return "gan_eval", float(np.max(ganancia)), True


In [24]:
# @title
def objective(trial) -> float:

    logger.info(f"Begin Trial {trial.number}")
    num_leaves = trial.suggest_int('num_leaves', 8, 80)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.4)
    max_depth = trial.suggest_int("max_depth", 10, 100)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 1, 1000)
    feature_fraction = trial.suggest_float('feature_fraction', 0.1, 1.0)
    max_bin = trial.suggest_int('max_bin', 255, 500)
    num_iterations = trial.suggest_int('num_iterations', 100, 500)

    logger.info(f"Opt Train Data : {len(df_train_for_optuna_features.columns)} , {df_train_for_optuna_targets["clase_binaria"].shape} , {df_train_for_optuna_targets["clase_peso"].shape}")
    opt_train_pd = df_train_for_optuna_features.to_numpy()
    opt_y_pd = df_train_for_optuna_targets["clase_binaria"].to_numpy()
    opt_weight_pd = df_train_for_optuna_targets["clase_peso"].to_numpy()

    train_data = lgb.Dataset(opt_train_pd,
                                label=opt_y_pd,
                                weight=opt_weight_pd)

    #opt_X_val_pd = df_val_features.to_numpy()
    #opt_y_val_pd = df_val_with_target["clase_binaria"].to_numpy()
    #weight_val_pd = df_val_with_target["clase_peso"].to_numpy()
    #val_data = lgb.Dataset(opt_X_val_pd,label=opt_y_val_pd,weight=weight_val_pd)

    modelos = {}
    params = {
      'objective': 'binary',
      'metric': 'custom',
      'boosting_type': 'rf',
      'first_metric_only': True,
      'boost_from_average': True,
      'feature_pre_filter': False,
      'max_bin': max_bin,
      'max_depth': max_depth,
      'num_leaves': num_leaves,
      'learning_rate': learning_rate,
      'min_data_in_leaf': min_data_in_leaf,
      'feature_fraction': feature_fraction,
      'seed': SEMILLA[1],
      'verbose': -1,
      'num_iterations': num_iterations
      }

    cv_results = lgb.cv(
        params,
        train_data,
        num_boost_round=110,
        callbacks=[lgb.early_stopping( int((50 + 5) / learning_rate))],
        feval=lgb_gan_eval,
        stratified=True,
        nfold=5
    )
    max_gan = np.mean(cv_results['valid gan_eval-mean'])
    best_iter = cv_results['valid gan_eval-mean'].index(max_gan) + 1

    trial.set_user_attr("best_iter", best_iter)

    return max_gan


# SE INTENTA RECUPERAR UN ESTUDIO O SE INICIA UNO NUEVO
#storage_name = f"sqlite:////{os.path.join(BUCKETS, BUCKET_TARGET,STUDY_NAME)}.db"
storage_name = f"sqlite:///{os.path.join(modelos_directory,STUDY_NAME)}.db"
study_name = STUDY_NAME

study = optuna.create_study(
    direction="maximize",
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,
)



# HAY UN FLAG EN EL CONFIG PARA EVITAR CORRER LA OPTIMIZACION SIEMPRE
if RUN_BAYESIAN_OPTIMIZATION:
  logger.info(f"Run Optimization with {N_TRIALS}")
  # Pass the training data explicitly to the objective function
  study.optimize(lambda trial: objective(trial), n_trials=N_TRIALS)

[I 2025-11-16 21:44:43,022] Using an existing study with name 'study-comp2-us-0-1-vieja-opt' instead of creating a new one.


In [49]:
# @title

## SE ARMA EL MODELO Y DE SER POSIBLE SE PERSISTE PARA PODER USARLO PARA OTRA PREDICCION.
def build_and_save_or_load_models(study, semillas : list, train_dataset : pl.DataFrame, y_target : pl.DataFrame, undersampling_fraction, is_test, is_final=False) -> dict:

  modelos = {}
  if is_test:
    sufix = "test"
  else:
    if not is_final:
        sufix = "final"
    else:
        sufix = "predict"

  all_models_exist = True
  for seed in SEMILLA:
    model_name = f"lgb_predict_{seed}_{sufix}.txt"
    model_file_path = os.path.join(modelos_directory, model_name)
    if not os.path.exists(model_file_path):
      all_models_exist = False
      break

  if all_models_exist:
    for seed in SEMILLA:
        model_name = f"lgb_predict_{seed}_{sufix}.txt"
        model_file_path = os.path.join(modelos_directory, model_name)
        modelos[seed] = lgb.Booster(model_file=model_file_path)
  else:
    train_dataset_pd = train_dataset.to_pandas()
    y_target_np = y_target["clase_binaria"].to_numpy()
    weight_np = y_target["clase_peso"].to_numpy()

    train_data = lgb.Dataset(train_dataset_pd,
                                label=y_target_np,
                                weight=weight_np)

    if len(study.trials) == 0:
      raise RuntimeError("No trials found in study. Run optimization first.")

    best_params = study.best_trial.params.copy()
    best_num_boost_round = study.best_trial.user_attrs.get("best_iter", 110)

    # Remove 'num_iterations' from best_params as we use best_num_boost_round
    if 'num_iterations' in best_params:
        del best_params['num_iterations']


    best_params['min_data_in_leaf'] = int(best_params['min_data_in_leaf']*80 / undersampling_fraction)

    for seed in semillas:
      params = {
              'objective': 'binary',
                'metric': 'custom',
                'boosting_type': 'rf',
                'first_metric_only': True,
                'boost_from_average': True,
                'feature_pre_filter': False,
                'seed': seed,
                'verbose': -1,
                **best_params
          }

      model = lgb.train(params, train_data, num_boost_round=best_num_boost_round)

      modelos[seed] = model
      model.save_model(os.path.join(modelos_directory,f"lgb_predict_{seed}_{sufix}.txt"))

  return modelos

In [30]:
# @title


def build_final_predictions(predict_models, df_predict, n_envios):
  mean_predictions = build_predictions(predict_models, df_predict)
  sorted_mean_predictions = mean_predictions.sort('Predicted', descending=True)
  final_predictions = sorted_mean_predictions.with_columns(
        (pl.arange(0, sorted_mean_predictions.height) < n_envios)
        .cast(pl.Int8)
        .alias("Predicted")
    )

  return final_predictions.select(["numero_de_cliente", "Predicted"])

In [31]:
# @title
def cantidad_envios(y_pred : pl.DataFrame, y_true : pl.DataFrame) -> float:
    df_eval = y_pred.join(y_true, on="numero_de_cliente")
    print(df_eval)
    df_ordenado = df_eval.sort("Predicted", descending=True)

    # Ganancia individual por fila, cast to Float64 to prevent potential overflow
    df_ordenado = df_ordenado.with_columns([
        pl.when(pl.col("clase_binaria") == 1)
          .then(pl.lit(GANANCIA_ACIERTO).cast(pl.Float64))
          .otherwise(pl.lit(-COSTO_ESTIMULO).cast(pl.Float64))
          .alias("ganancia_individual")
    ])

    # Ganancia acumulada
    df_ordenado = df_ordenado.with_columns([
        pl.col("ganancia_individual").cum_sum().alias("ganancia_acumulada")
    ])

    # Obtener ganancia maxima
    ganancia_maxima = df_ordenado.select(pl.col("ganancia_acumulada").max()).item()

    # Find the index of the first occurrence of the maximum cumulative gain
    idx_max_ganancia = df_ordenado["ganancia_acumulada"].arg_max()

    # The number of sends is the index + 1 (since index is 0-based)
    cantidad_envios_real = idx_max_ganancia + 1

    return float(ganancia_maxima), cantidad_envios_real

In [40]:
test_models = build_and_save_or_load_models(study, SEMILLA, df_train_for_testing_features, df_train_for_testing_targets, undersampling_fraction=UNDERSAMPLE_FRACTION, is_test = True)

In [41]:

if "clase_ternaria" in df_test_with_target.columns:
  df_test_ternaria = df_test_with_target["clase_ternaria"]
  log = f"Ganancia 'optima idealizada' en Prediccion usada como pruebas: {ganancia_optima_idealizada(df_test_features, df_test_ternaria)}"
print(log)
log = f"Clases ternarias en Test: {df_test_ternaria.value_counts()}"
print(log)

# The columns to drop from df_test_features are already dropped when df_test_features was created.
# comp_predictions = build_predictions(test_models, df_test) # Original line
comp_predictions = build_predictions(test_models, df_test_features)
ganancia, n_envios_final = cantidad_envios(comp_predictions, df_test_with_target)
print(f"Ganancia en Prediccion de Experimento : {ganancia} con {n_envios_final} envios")

Ganancia 'optima idealizada' en Prediccion usada como pruebas: 741000000
Clases ternarias en Test: shape: (3, 2)
┌────────────────┬────────┐
│ clase_ternaria ┆ count  │
│ ---            ┆ ---    │
│ str            ┆ u32    │
╞════════════════╪════════╡
│ BAJA+2         ┆ 950    │
│ BAJA+1         ┆ 1027   │
│ CONTINUA       ┆ 161132 │
└────────────────┴────────┘
shape: (163_109, 5)
┌───────────────────┬───────────┬───────────────┬────────────┬────────────────┐
│ numero_de_cliente ┆ Predicted ┆ clase_binaria ┆ clase_peso ┆ clase_ternaria │
│ ---               ┆ ---       ┆ ---           ┆ ---        ┆ ---            │
│ i64               ┆ f64       ┆ i32           ┆ f64        ┆ str            │
╞═══════════════════╪═══════════╪═══════════════╪════════════╪════════════════╡
│ 249221323         ┆ 0.002044  ┆ 0             ┆ 1.0        ┆ CONTINUA       │
│ 249227600         ┆ 0.001356  ┆ 0             ┆ 1.0        ┆ CONTINUA       │
│ 249234235         ┆ 0.017435  ┆ 0             ┆ 1.0  

In [37]:
comp_predictions.filter(pl.col("Predicted") > 0.5)

numero_de_cliente,Predicted
i64,f64
249654219,0.998332
249809121,0.770113
250130651,0.742283
250211454,1.0
250371967,0.955707
…,…
1548033591,0.885683
1558353134,0.595546
1558597439,0.655407
1559900287,0.592469


In [50]:
predict_models = build_and_save_or_load_models(study, SEMILLA,df_train_predict_features, df_train_predict_with_target, undersampling_fraction=UNDERSAMPLE_FRACTION, is_test = False, is_final=True)

In [51]:
# Retrieve best n_envios from the study's best trial
print(n_envios_final)
print(f"Predecimos {FINAL_PREDICT}")
print(f"Entrenamos {FINAL_TRAIN}")



if IS_EXPERIMENTO:
  if "clase_ternaria" in df_predict_with_target.columns:
    df_predict_ternaria = df_predict_with_target["clase_ternaria"]
    log = f"Ganancia 'optima idealizada' en Prediccion usada como pruebas: {ganancia_optima_idealizada(df_predict_features, df_predict_ternaria)}"
    logger.info(log)
    print(log)
    log = f"Clases ternarias en Prediccion: {df_predict_ternaria.value_counts()}"
    logger.info(log)
    print(log)

  # The columns to drop are already handled when df_predict_features was created
  # if cols_to_drop:
  #   logger.info(f"Dropping columns from df_predict: {cols_to_drop}")
  #   print(f"Dropping columns from df_predict: {cols_to_drop}")
  #   df_predict = df_predict.drop(cols_to_drop)

  comp_predictions = build_final_predictions(predict_models, df_predict_features, n_envios_final)
  print(comp_predictions["Predicted"].value_counts())
  ganancia, n_envios_final = cantidad_envios(comp_predictions, df_predict_with_target)
  logger.info(f"Ganancia en Prediccion de Experimento : {ganancia} con {n_envios_final} envios")
  print(f"Ganancia en Prediccion de Experimento : {ganancia} con {n_envios_final} envios")

  print(f"Forzando n_envios {forzar_n_envios(comp_predictions, df_predict_with_target, best_n_envios)}")


else:

  # The columns to drop are already handled when df_predict_features was created
  # if cols_to_drop:
  #   logger.info(f"Dropping columns from df_predict: {cols_to_drop}")
  #   df_predict = df_predict.drop(cols_to_drop)

  #prediction_path = os.path.join(BUCKETS, BUCKET_TARGET, f"predictions.csv")
  prediction_path = "predictions.csv"
  comp_predictions = build_final_predictions(predict_models, df_predict_features, n_envios_final)
  comp_predictions.write_csv(prediction_path)

print("Terminó el programa")

11703
Predecimos 202108
Entrenamos [202003, 202004, 202005, 202006, 202007, 202008, 202009, 202010, 202011, 202012, 202101, 202102, 202103, 202104, 202105, 202106]
Terminó el programa


In [None]:
import matplotlib.pyplot as plt
lgb.plot_importance(predict_models[50], figsize=(30, 40))
plt.show()