In [83]:
%pip install optuna==3.6.1



In [84]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

import lightgbm as lgb

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice, plot_contour

from time import time

import pickle

In [85]:
# Mount Google Drive in this Colab runtime; the dataset path used below lives under /content/drive.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [86]:
# --- Data location -----------------------------------------------------------
# Keep the trailing slash: paths are built by plain string concatenation.
dataset_path = '/content/drive/MyDrive/MaestDataMining/Especializacion/2C/DMEF/Datasets/'
dataset_file = 'competencia_01.csv'

# --- Constants used by the profit metric --------------------------------------
ganancia_acierto = 273_000  # credited for each targeted BAJA+2 client
costo_estimulo = 7_000      # charged for every other targeted client

# --- Months (foto_mes values) used for the hyper-parameter search -------------
mes_train, mes_test = 202102, 202104

# add your own seeds
semilla = 714

data = pd.read_csv(f"{dataset_path}{dataset_file}")

In [87]:
# Quick sanity check: (rows, columns) of the frame that was just loaded.
data.shape

(981946, 155)

Vamos a asignar pesos a las clases. En unos minutos explicaremos las razones detrás de esta decisión. Mientras tanto, pueden aprovechar el código para ajustar el peso de la clase **BAJA+2** según lo deseen.





clase_binaria1: toma el valor 1 solo si clase_ternaria es 'BAJA+2'

clase_binaria2: toma el valor 1 si clase_ternaria es diferente de 'CONTINUA'

In [88]:
# The weight doubles as a class tag: BAJA+2 and BAJA+1 get slightly different
# values just above 1.0 so they can still be told apart after being merged
# into a single binary target. Every other row keeps weight 1.0.
data['clase_peso'] = np.select(
    [data['clase_ternaria'] == 'BAJA+2', data['clase_ternaria'] == 'BAJA+1'],
    [1.00002, 1.00001],
    default=1.0,
)

In [89]:
# clase_binaria1: 1 only for BAJA+2.
# clase_binaria2: 1 for every row whose class is not exactly 'CONTINUA'
# (rows with a missing class therefore also end up as 1).
data['clase_binaria1'] = data['clase_ternaria'].eq('BAJA+2').astype(int)
data['clase_binaria2'] = data['clase_ternaria'].ne('CONTINUA').astype(int)

In [90]:
# Target / weight columns that must never reach the model as features.
columnas_no_features = ['clase_ternaria', 'clase_peso', 'clase_binaria1', 'clase_binaria2']

# --- Training month ---
train_data = data.loc[data['foto_mes'] == mes_train]
X_train = train_data.drop(columns=columnas_no_features)
y_train_binaria1, y_train_binaria2 = train_data['clase_binaria1'], train_data['clase_binaria2']
w_train = train_data['clase_peso']

# --- Test month ---
test_data = data.loc[data['foto_mes'] == mes_test]
X_test = test_data.drop(columns=columnas_no_features)
y_test_binaria1, y_test_class = test_data['clase_binaria1'], test_data['clase_ternaria']
w_test = test_data['clase_peso']

In [91]:
# Median-impute the missing values of the test features.
# NOTE(review): the imputer is fitted on X_test itself, and `Xif` is not
# referenced by any later cell visible in this notebook - confirm it is needed.
imp_mean = SimpleImputer(strategy='median', missing_values=np.nan)
Xif = imp_mean.fit(X_test).transform(X_test)

Para evaluar la calidad del modelo, crearemos nuestra propia función de evaluación que calcule la ganancia. La razón de incluir los pesos es precisamente para poder implementar esta función de evaluación de manera adecuada. Al combinar las clases BAJA+1 y BAJA+2 en una sola, necesitamos una forma de diferenciarlas, y es aquí donde entra en juego el weight. Este parámetro nos permitirá distinguir entre ambas clases al momento de evaluarlas dentro del algoritmo.

In [92]:
def lgb_gan_eval(y_pred, data):
    """Custom LightGBM metric: best cumulative profit along the score ranking.

    Rows are told apart through their weight: a weight of exactly 1.00002
    earns `ganancia_acierto`, any smaller weight costs `costo_estimulo`.
    Rows are ordered by descending prediction and the running profit is
    accumulated; the metric is the maximum of that running total.

    Returns the (name, value, is_higher_better) tuple expected from a `feval`.
    """
    pesos = data.get_weight()

    aciertos = np.where(pesos == 1.00002, ganancia_acierto, 0)
    costos = np.where(pesos < 1.00002, costo_estimulo, 0)

    # Highest predictions first
    orden = np.argsort(y_pred)[::-1]
    acumulada = np.cumsum((aciertos - costos)[orden])

    return 'gan_eval', np.max(acumulada), True

LGBM necesita su propio tipo de Datasets:


In [93]:
# One Dataset per target definition; both share the same features and weights.
train_data1, train_data2 = (
    lgb.Dataset(X_train, label=etiqueta, weight=w_train)
    for etiqueta in (y_train_binaria1, y_train_binaria2)
)

In [94]:
def objective(trial):
    """Optuna objective: cross-validated profit of one LightGBM configuration.

    Samples a hyper-parameter set from `trial`, runs a stratified 5-fold
    `lgb.cv` on the global training split (X_train, y_train_binaria2,
    w_train) with `lgb_gan_eval` as the only metric, and records the boosting
    round with the best mean profit in the trial's ``best_iter`` user attribute.

    Returns:
        The best mean validation profit multiplied by the number of folds
        (presumably to scale the per-fold figure back to the whole month).
    """
    n_folds = 5  # single source of truth for both `nfold` and the final scaling

    # Names, ranges and call order of the suggest_* calls are kept unchanged so
    # the search space stays compatible with the trials already in the study.
    tuned = {
        'num_leaves': trial.suggest_int('num_leaves', 8, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 1000),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 10.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 10.0),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0.0, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
    }

    params = {
        'objective': 'binary',
        'metric': 'custom',  # only the feval below is reported
        'boosting_type': 'gbdt',
        'first_metric_only': True,
        'boost_from_average': True,
        'feature_pre_filter': False,
        'max_bin': 31,
        **tuned,
        'seed': semilla,
        'verbose': -1
    }

    dtrain = lgb.Dataset(X_train, label=y_train_binaria2, weight=w_train)

    # No early stopping: all rounds are run and the best one is picked afterwards.
    cv_results = lgb.cv(
        params,
        dtrain,
        num_boost_round=100,
        feval=lgb_gan_eval,
        stratified=True,
        nfold=n_folds,
        seed=semilla
    )

    # LightGBM >= 4.0 prefixes the cv result keys with "valid "; older releases
    # use the bare metric name. Accept both so the lookup does not raise
    # KeyError depending on the installed version.
    if 'valid gan_eval-mean' in cv_results:
        metric_key = 'valid gan_eval-mean'
    else:
        metric_key = 'gan_eval-mean'

    gan_por_ronda = list(cv_results[metric_key])
    max_gan = max(gan_por_ronda)
    best_iter = gan_por_ronda.index(max_gan) + 1  # rounds are 1-based

    trial.set_user_attr("best_iter", best_iter)

    return max_gan * n_folds


In [95]:
# The study is persisted in a SQLite file next to the dataset, so the search
# can be resumed across Colab sessions.
study_name = "exp_302_lgbm"
storage_name = f"sqlite:///{dataset_path}optimization_lgbm.db"

study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    direction="maximize",
    load_if_exists=True,  # reuse the stored trials when the study already exists
)

[I 2024-10-13 23:58:55,406] Using an existing study with name 'exp_302_lgbm' instead of creating a new one.


[I 2024-10-12 16:58:27,001] Trial 77 finished with value: 114506000.0 and parameters: {'num_leaves': 24, 'learning_rate': 0.11681200404658175, 'min_data_in_leaf': 200, 'feature_fraction': 0.23784197896792073, 'bagging_fraction': 0.6944512419517778, 'max_depth': 10, 'lambda_l1': 1.1536973623742635, 'lambda_l2': 3.3304940551026623, 'min_gain_to_split': 0.2827894890056999, 'bagging_freq': 2}. Best is trial 77 with value: 114506000.0.


In [96]:
# Run additional trials; they are appended to whatever the stored study already holds.
study.optimize(objective, n_trials = 178) # raise this further

[I 2024-10-14 00:00:02,979] Trial 15 finished with value: 104097000.0 and parameters: {'num_leaves': 15, 'learning_rate': 0.017632876678178873, 'min_data_in_leaf': 486, 'feature_fraction': 0.8358625509823963, 'bagging_fraction': 0.6637184209052275, 'max_depth': 8, 'lambda_l1': 5.532834547455817, 'lambda_l2': 2.829492510015686, 'min_gain_to_split': 0.93573325246915, 'bagging_freq': 3}. Best is trial 14 with value: 105105000.0.
[I 2024-10-14 00:00:36,624] Trial 16 finished with value: 107044000.0 and parameters: {'num_leaves': 99, 'learning_rate': 0.0852261158407892, 'min_data_in_leaf': 982, 'feature_fraction': 0.8202750245808443, 'bagging_fraction': 0.6275379745927084, 'max_depth': 3, 'lambda_l1': 6.95688909577289, 'lambda_l2': 7.59925823690142, 'min_gain_to_split': 0.31694570516114695, 'bagging_freq': 2}. Best is trial 16 with value: 107044000.0.
[I 2024-10-14 00:01:03,889] Trial 17 finished with value: 107107000.0 and parameters: {'num_leaves': 62, 'learning_rate': 0.07647693998401717

In [97]:
# Which hyper-parameters explain most of the variation in the objective value.
plot_param_importances(study)

[I 2024-10-13 03:49:49,418] Trial 293 finished with value: 114828000.0 and parameters: {'num_leaves': 16, 'learning_rate': 0.0904129007526445, 'min_data_in_leaf': 292, 'feature_fraction': 0.28180962949951294, 'bagging_fraction': 0.9809595314504485, 'max_depth': 19, 'lambda_l1': 1.6196058700397447, 'lambda_l2': 8.275196353225926, 'min_gain_to_split': 0.3894426147286418, 'bagging_freq': 4}. Best is trial 293 with value: 114828000.0.


Y finalmente tomamos los hiperparámetros del mejor trial y reentrenamos el modelo
con todos los datos de abril (202104), para luego predecir sobre junio (202106)

In [98]:
# Months for the final model: train on 202104, predict 202106.
# NOTE(review): this overwrites the mes_train / mes_test values set in the
# configuration cell, so any earlier cell re-run after this point will pick up
# these months instead of the ones used for the hyper-parameter search.
mes_train = 202104
mes_test = 202106

In [99]:
# Target / weight columns that must never reach the model as features.
columnas_no_features = ['clase_ternaria', 'clase_peso', 'clase_binaria1', 'clase_binaria2']

# --- Training month of the final model ---
train_data = data.loc[data['foto_mes'] == mes_train]
X_train = train_data.drop(columns=columnas_no_features)
y_train_binaria1, y_train_binaria2 = train_data['clase_binaria1'], train_data['clase_binaria2']
w_train = train_data['clase_peso']

# --- Month to be scored ---
test_data = data.loc[data['foto_mes'] == mes_test]
X_test = test_data.drop(columns=columnas_no_features)
y_test_binaria1, y_test_class = test_data['clase_binaria1'], test_data['clase_ternaria']
w_test = test_data['clase_peso']

In [100]:
# Median-impute the missing values of the features of the month being scored.
# NOTE(review): the imputer is fitted on X_test itself, and `Xif` is not
# referenced by any later cell visible in this notebook - confirm it is needed.
imp_mean = SimpleImputer(strategy='median', missing_values=np.nan)
Xif = imp_mean.fit(X_test).transform(X_test)

In [101]:
mejor_trial = study.best_trial

# Number of boosting rounds stored by the objective for the winning trial.
best_iter = mejor_trial.user_attrs["best_iter"]
print(f"Mejor cantidad de árboles para el mejor modelo: {best_iter}")

# Hyper-parameters taken from the best trial. They are looked up one by one so
# a best trial that lacks any of them fails loudly with a KeyError.
hiperparametros_optimizados = (
    'num_leaves',
    'learning_rate',
    'min_data_in_leaf',
    'feature_fraction',
    'bagging_fraction',
    'max_depth',
    'lambda_l1',
    'lambda_l2',
    'min_gain_to_split',
    'bagging_freq',
)

params = {
    # Fixed settings, same as during the search (minus the custom metric)
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'first_metric_only': True,
    'boost_from_average': True,
    'feature_pre_filter': False,
    'max_bin': 31,
    # Tuned settings
    **{nombre: mejor_trial.params[nombre] for nombre in hiperparametros_optimizados},
    'seed': semilla,
    'verbose': 0
}

# From here on `train_data` is the LightGBM Dataset, no longer the DataFrame slice.
train_data = lgb.Dataset(X_train, label=y_train_binaria2, weight=w_train)

model = lgb.train(params, train_data, num_boost_round=best_iter)


Mejor cantidad de árboles para el mejor modelo: 65


In [None]:
# Importance of every feature in the final model, highest first
# (LightGBM's default importance type counts the splits that use the feature).
feature_names = list(X_train.columns)
importances = model.feature_importance()

importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

# Show only the features the model actually used
importance_df.query('importance > 0')


Unnamed: 0,feature,importance
22,mcuentas_saldo,60
5,cliente_edad,52
8,mrentabilidad_annual,46
52,mpayroll,44
33,mprestamos_personales,42
...,...,...
118,Master_mconsumospesos,1
83,mtransferencias_emitidas,1
80,ctransferencias_recibidas,1
38,cplazo_fijo,1


In [102]:
# Score every row of X_test with the final model; the result is positional
# (same row order as X_test).
y_pred_lgm = model.predict(X_test)

In [103]:
def ganancia_prob(y_pred, y_true, prop = 1, umbral = 0.025):
    """Profit obtained by targeting every row scored at or above `umbral`.

    Args:
        y_pred: predicted scores, one per row.
        y_true: binary labels aligned positionally with `y_pred`; rows equal
            to 1 earn `ganancia_acierto`, rows equal to 0 cost `costo_estimulo`.
        prop: divisor applied to the total (e.g. the sampled proportion when
            the profit must be extrapolated); 1 leaves the total unchanged.
        umbral: minimum score for a row to be targeted. Defaults to 0.025,
            the cut-off that was previously hard-coded.

    Returns:
        The summed profit of the targeted rows divided by `prop`.
    """
    # Work on plain arrays so the mask is always applied positionally,
    # whatever index the inputs carry.
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)

    ganancia = np.where(y_true == 1, ganancia_acierto, 0) - np.where(y_true == 0, costo_estimulo, 0)
    return ganancia[y_pred >= umbral].sum() / prop

ganancia_lgbm = ganancia_prob(y_pred_lgm, y_test_binaria1)
print("Ganancia LGBM:", ganancia_lgbm)

# With no positive label in the test month every targeted client is counted as
# pure cost, so the figure above is not a real profit estimate. That is what
# happens when the month being scored has no known class yet.
if not (y_test_binaria1 == 1).any():
    print("ADVERTENCIA: el mes de test no tiene ningún BAJA+2 etiquetado; "
          "la ganancia anterior solo refleja costos y no es interpretable.")


Ganancia LGBM: -105028000.0


Predicción para junio (202106) con el modelo entrenado en abril (202104)

In [104]:
# Dimensions (rows, columns) of the feature matrix being scored.
X_test.shape

(164876, 154)

In [114]:
# Build the submission in a fresh frame instead of mutating `test_data`.
# y_pred_lgm is positional (same row order as X_test). The previous version
# wrote it into `test_data` and then sorted `test_data` in place, so running
# the cell a second time attached the scores to the wrong clients; it also
# wrote into a slice of `data`, which triggers SettingWithCopyWarning.
assert len(y_pred_lgm) == len(X_test), "y_pred_lgm and X_test are out of sync"

# Number of clients to flag as positive (the top X by predicted probability)
X = 8700

ranking = pd.DataFrame({
    'numero_de_cliente': X_test['numero_de_cliente'].to_numpy(),
    'pred_prob': y_pred_lgm,
}).sort_values(by='pred_prob', ascending=False)

# Everyone starts as negative; the first X rows of the ranking become positive
ranking['Predicted'] = 0
ranking.iloc[:X, ranking.columns.get_loc('Predicted')] = 1

# Output file for Kaggle
output = ranking[['numero_de_cliente', 'Predicted']]

# Save as CSV, with the seed in the file name
nombre_archivo = f'predicciones_kaggle_{semilla}.csv'
output.to_csv(nombre_archivo, index=False)
print(f"Archivo {nombre_archivo} guardado.")

# Download the file to the local machine (Colab only)
from google.colab import files
files.download(nombre_archivo)

Archivo predicciones_kaggle_714.csv guardado.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>