In [3]:
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import pickle

In [None]:
# Move into the data/in folder that sits under the parent of the current working directory.
# NOTE(review): the destination depends on the cwd at run time, so running this cell twice
# resolves a different folder the second time; confirm where the kernel is started.
os.chdir(os.path.join(os.path.dirname(os.getcwd()), 'data', 'in'))

In [None]:
# Load the client table and the request table from CSV files named relative to the cwd.
clientes_df = pd.read_csv('train_clientes_sample.csv')
requerimientos_df = pd.read_csv('train_requerimientos_sample.csv')

In [None]:
def generar_variables_ingenieria(clientes_df):
    """Add six-month trend, average and count features to the client table.

    For the ``SDO_ACTIVO``, ``NRO_ACCES_CANAL{1,2,3}`` and ``NRO_ENTID_SSFF``
    monthly series (columns suffixed ``_MENOS0`` .. ``_MENOS5``) this derives:

    * ``VAR_*_6M``: value at ``MENOS0`` minus value at ``MENOS5``.
    * ``PROM_*_0M_2M`` / ``PROM_*_3M_5M``: row-wise mean of months 0-2 / 3-5.
    * ``VAR_*_3M``: the 0-2 mean minus the 3-5 mean.
    * ``PROM_*_6M``: row-wise mean over all six months.

    ``MESES_CON_SEGURO`` and ``MESES_CON_SALDO`` are the row-wise sums of the six
    ``FLG_SEGURO_MENOS*`` and ``FLG_SDO_OTSSFF_MENOS*`` columns respectively.

    Parameters
    ----------
    clientes_df : pandas.DataFrame
        Table containing the monthly columns listed above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``clientes_df`` with the derived columns added.

    Raises
    ------
    KeyError
        If any of the required monthly columns is missing.
    """
    # Work on a copy: the caller's frame stays untouched, so the calling cell
    # can be re-run from the same input without carrying over earlier edits.
    df = clientes_df.copy()

    def columnas(prefijo, meses):
        """Column names for one monthly series restricted to the given months."""
        return [f"{prefijo}{i}" for i in meses]

    def promedio(prefijo, meses):
        """Row-wise mean of one monthly series over the given months."""
        return df[columnas(prefijo, meses)].mean(axis=1)

    recientes, anteriores, semestre = range(3), range(3, 6), range(6)

    # SDO_ACTIVO (balance)
    base = "SDO_ACTIVO_MENOS"
    df["VAR_SDO_ACTIVO_6M"] = df[f"{base}0"] - df[f"{base}5"]
    df["PROM_SDO_ACTIVO_0M_2M"] = promedio(base, recientes)
    df["PROM_SDO_ACTIVO_3M_5M"] = promedio(base, anteriores)
    df["VAR_SDO_ACTIVO_3M"] = df["PROM_SDO_ACTIVO_0M_2M"] - df["PROM_SDO_ACTIVO_3M_5M"]
    df["PROM_SDO_ACTIVO_6M"] = promedio(base, semestre)

    # FLG_SEGURO (binary flag): number of flagged months
    df["MESES_CON_SEGURO"] = df[columnas("FLG_SEGURO_MENOS", semestre)].sum(axis=1)

    # Channel accesses
    for canal in [1, 2, 3]:
        base = f"NRO_ACCES_CANAL{canal}_MENOS"
        nombre = f"NRO_ACCES_CANAL{canal}"
        df[f"VAR_{nombre}_6M"] = df[f"{base}0"] - df[f"{base}5"]
        df[f"PROM_{nombre}_6M"] = promedio(base, semestre)
        df[f"PROM_{nombre}_0M_2M"] = promedio(base, recientes)
        df[f"PROM_{nombre}_3M_5M"] = promedio(base, anteriores)
        df[f"VAR_{nombre}_3M"] = df[f"PROM_{nombre}_0M_2M"] - df[f"PROM_{nombre}_3M_5M"]

    # Number of financial entities
    base = "NRO_ENTID_SSFF_MENOS"
    df["PROM_NRO_ENTID_SSFF_6M"] = promedio(base, semestre)
    df["VAR_NRO_ENTID_SSFF_6M"] = df[f"{base}0"] - df[f"{base}5"]
    df["PROM_NRO_ENTID_SSFF_0M_2M"] = promedio(base, recientes)
    df["PROM_NRO_ENTID_SSFF_3M_5M"] = promedio(base, anteriores)
    df["VAR_NRO_ENTID_SSFF_3M"] = df["PROM_NRO_ENTID_SSFF_0M_2M"] - df["PROM_NRO_ENTID_SSFF_3M_5M"]

    # Balance held in other entities: number of flagged months
    df["MESES_CON_SALDO"] = df[columnas("FLG_SDO_OTSSFF_MENOS", semestre)].sum(axis=1)

    return df


In [None]:
def imputacion_variables(clientes_df,requerimientos_df):
    """Fill missing values in the client and request tables.

    * ``RANG_INGRESO`` and ``FLAG_LIMA_PROVINCIA`` (clients) and ``DICTAMEN``
      (requests) are filled with the column's most frequent value.
    * ``EDAD`` and ``ANTIGUEDAD`` (clients) are filled with the column median.

    Parameters
    ----------
    clientes_df : pandas.DataFrame
        Client table; not modified.
    requerimientos_df : pandas.DataFrame
        Request table; not modified.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.DataFrame]
        Imputed copies ``(clientes, requerimientos)``.

    Raises
    ------
    KeyError
        If one of the imputed columns is missing from its table.
    """
    clientes = clientes_df.copy()
    requerimientos = requerimientos_df.copy()

    def rellenar_con_moda(df, columna):
        """Fill ``columna`` with its first mode; leave it as-is if it has no values."""
        modas = df[columna].mode()
        if modas.empty:
            # Entirely missing column: there is no mode to impute with.
            return
        # Assign the result back instead of calling fillna(inplace=True) on the
        # selected column, which is not guaranteed to update the parent frame.
        df[columna] = df[columna].fillna(modas.iloc[0])

    rellenar_con_moda(clientes, 'RANG_INGRESO')
    rellenar_con_moda(clientes, 'FLAG_LIMA_PROVINCIA')

    for columna in ('EDAD', 'ANTIGUEDAD'):
        clientes[columna] = clientes[columna].fillna(clientes[columna].median())

    rellenar_con_moda(requerimientos, 'DICTAMEN')

    return clientes, requerimientos

In [None]:
def encoder_categoricos(clientes_df):
    """Encode the categorical client columns as integers.

    Steps:

    1. In ``RANG_SDO_PASIVO_MENOS0`` the value ``'Cero'`` is replaced by
       ``'Rango_SDO_00'``.
    2. ``FLAG_LIMA_PROVINCIA`` is mapped ``'Lima' -> 1``, ``'Provincia' -> 0``
       (any other value becomes missing, as ``Series.map`` does).
    3. Every remaining ``object``/``category`` column is label-encoded with
       its own ``LabelEncoder``.

    Parameters
    ----------
    clientes_df : pandas.DataFrame
        Client table; not modified.

    Returns
    -------
    tuple[pandas.DataFrame, dict]
        The encoded copy and a ``{column name: fitted LabelEncoder}`` dict.
    """
    # Work on a copy so the caller keeps its raw categorical values; mapping
    # FLAG_LIMA_PROVINCIA a second time on an already-mapped frame would turn
    # the whole column into missing values.
    df = clientes_df.copy()

    df['RANG_SDO_PASIVO_MENOS0'] = df['RANG_SDO_PASIVO_MENOS0'].replace('Cero', 'Rango_SDO_00')
    df['FLAG_LIMA_PROVINCIA'] = df['FLAG_LIMA_PROVINCIA'].map({'Lima': 1, 'Provincia': 0})

    encoders_clientes = {}
    for col in df.select_dtypes(include=['object', 'category']).columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        encoders_clientes[col] = le

    return df, encoders_clientes

In [None]:
def construir_variables_requerimientos(df_reqs, id_col='ID_CORRELATIVO'):
    """Aggregate the request table to one row per ``id_col`` value.

    The result is indexed by ``id_col`` and holds, in this order:

    * ``total_requerimientos``: number of request rows.
    * ``nro_tipos_requerimiento``, ``nro_dictamenes``,
      ``nro_productos_servicios``, ``nro_submotivos``: distinct-value counts
      of ``TIPO_REQUERIMIENTO2``, ``DICTAMEN``, ``PRODUCTO_SERVICIO_2`` and
      ``SUBMOTIVO_2``.
    * ``tipo_*`` / ``dictamen_*``: per-category row counts of
      ``TIPO_REQUERIMIENTO2`` and ``DICTAMEN``.

    Parameters
    ----------
    df_reqs : pandas.DataFrame
        One row per request.
    id_col : str
        Name of the grouping column.

    Returns
    -------
    pandas.DataFrame
    """
    por_cliente = df_reqs.groupby(id_col)

    # size() on a DataFrame groupby is a Series, so it is always turned into a frame.
    piezas = [por_cliente.size().to_frame('total_requerimientos')]

    distintos = {
        'TIPO_REQUERIMIENTO2': 'nro_tipos_requerimiento',
        'DICTAMEN': 'nro_dictamenes',
        'PRODUCTO_SERVICIO_2': 'nro_productos_servicios',
        'SUBMOTIVO_2': 'nro_submotivos',
    }
    for columna, nombre in distintos.items():
        piezas.append(por_cliente[columna].nunique().to_frame(nombre))

    # One indicator column per category, summed per client to obtain counts.
    for columna, prefijo in (('TIPO_REQUERIMIENTO2', 'tipo'), ('DICTAMEN', 'dictamen')):
        indicadores = pd.get_dummies(df_reqs[columna], prefix=prefijo)
        indicadores[id_col] = df_reqs[id_col]
        piezas.append(indicadores.groupby(id_col).sum())

    return pd.concat(piezas, axis=1)


In [None]:
def estandarizacion(df_final):
    """Fit a ``StandardScaler`` on the predictor columns and apply it.

    ``ID_CORRELATIVO``, ``CODMES`` and ``ATTRITION`` are passed through
    unchanged; every other column is scaled. The scaled columns come out in
    the sorted order produced by ``Index.difference``.

    Parameters
    ----------
    df_final : pandas.DataFrame
        Merged table containing the three pass-through columns.

    Returns
    -------
    tuple[pandas.DataFrame, StandardScaler]
        The pass-through columns followed by the scaled predictors, and the
        fitted scaler.
    """
    passthrough = ['ID_CORRELATIVO', 'CODMES', 'ATTRITION']
    predictoras = df_final.columns.difference(passthrough)

    scaler = StandardScaler()
    valores = scaler.fit_transform(df_final[predictoras])
    escaladas = pd.DataFrame(valores, columns=predictoras, index=df_final.index)

    return pd.concat([df_final[passthrough], escaladas], axis=1), scaler

In [None]:
# Training-time preprocessing, in order: derive client features, impute missing values,
# encode categoricals, aggregate requests per client, inner-join both tables on
# ID_CORRELATIVO, and standardize the predictors.
# NOTE(review): clientes_df / requerimientos_df are rebound to processed frames here, so
# re-running this cell feeds already-processed data back in; reload the CSVs before re-running.
clientes_df = generar_variables_ingenieria(clientes_df)
clientes_df,requerimientos_df = imputacion_variables(clientes_df,requerimientos_df)
clientes_df, artifact_encoders_clientes = encoder_categoricos(clientes_df)
requerimientos_df = construir_variables_requerimientos(requerimientos_df)
df_final = clientes_df.merge(requerimientos_df, on='ID_CORRELATIVO', how='inner')
df_final, artifact_scaler = estandarizacion(df_final)

In [None]:
# Show the column names of the standardized table.
df_final.columns

### test

In [None]:
# Move into the data/out folder that sits under the parent of the current working directory.
# NOTE(review): relative to whatever the cwd is when the cell runs; not safe to run twice.
os.chdir(os.path.join(os.path.dirname(os.getcwd()), 'data', 'out'))

In [None]:
# Display the current working directory to check the previous move.
os.getcwd()

In [None]:
# Load the client and request test tables from CSV files named relative to the cwd.
df_data_test = pd.read_csv("clientes_data_test.csv")
df_requerimientos_test = pd.read_csv("requerimientos_data_test.csv")

In [None]:
# Go up two directory levels from the current working directory.
os.chdir('../..')

In [None]:
# Enter outputs/preprocess; the functions defined below open their CSV and pickle
# files by bare file name, so they resolve against this folder.
os.chdir('outputs/preprocess')

In [None]:
# Display the current working directory to check the previous moves.
os.getcwd()

In [None]:
def prepare_impute_missing(df_data, x_cols, path_parametros="imputacion_parametros.csv"):
    """Fill missing values using imputation values stored in a CSV file.

    The CSV must have a ``variable`` column (column name) and a ``valor``
    column (value to impute). When a variable appears more than once, the
    first row is used.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Table to impute; not modified.
    x_cols : iterable of str
        Columns of ``df_data`` to fill.
    path_parametros : str
        Path of the parameters CSV. Defaults to ``imputacion_parametros.csv``
        in the current working directory.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_data`` with the requested columns filled.

    Raises
    ------
    IndexError
        If a requested column has no row in the parameters file.
    """
    df_data_imputed = df_data.copy()
    df_impute_parameters = pd.read_csv(path_parametros)

    for col in x_cols:
        matches = df_impute_parameters.loc[df_impute_parameters["variable"] == col, "valor"]
        if matches.empty:
            raise IndexError(
                f"No imputation value stored for column '{col}' in '{path_parametros}'"
            )
        impute_value = matches.iloc[0]

        # When 'valor' holds both text and numbers the whole CSV column is read
        # as strings; convert back so a numeric column is not filled with text
        # such as '35.0' and silently turned into an object column.
        if pd.api.types.is_numeric_dtype(df_data_imputed[col]):
            try:
                impute_value = pd.to_numeric(impute_value)
            except (ValueError, TypeError):
                # Not a number after all (e.g. an all-missing categorical column
                # that was read as float): keep the stored value unchanged.
                pass

        df_data_imputed[col] = df_data_imputed[col].fillna(impute_value)

    return df_data_imputed

In [None]:
def generar_variables_ingenieria(clientes_df):
    """Add six-month trend, average and count features to the client table.

    For the ``SDO_ACTIVO``, ``NRO_ACCES_CANAL{1,2,3}`` and ``NRO_ENTID_SSFF``
    monthly series (columns suffixed ``_MENOS0`` .. ``_MENOS5``) this derives
    ``VAR_*_6M`` (month 0 minus month 5), ``PROM_*_0M_2M`` and
    ``PROM_*_3M_5M`` (row-wise means of months 0-2 and 3-5), ``VAR_*_3M``
    (difference of those two means) and ``PROM_*_6M`` (mean of all six months).
    ``MESES_CON_SEGURO`` and ``MESES_CON_SALDO`` are the row-wise sums of the
    six ``FLG_SEGURO_MENOS*`` and ``FLG_SDO_OTSSFF_MENOS*`` columns.

    Parameters
    ----------
    clientes_df : pandas.DataFrame
        Table containing the monthly columns listed above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``clientes_df`` with the derived columns added.

    Raises
    ------
    KeyError
        If any of the required monthly columns is missing.
    """
    # Work on a copy so the caller's frame is left untouched.
    resultado = clientes_df.copy()

    def media(prefijo, inicio, fin):
        """Row-wise mean of ``prefijo{inicio}`` .. ``prefijo{fin - 1}``."""
        return resultado[[f"{prefijo}{i}" for i in range(inicio, fin)]].mean(axis=1)

    def suma(prefijo):
        """Row-wise sum of the six monthly columns of ``prefijo``."""
        return resultado[[f"{prefijo}{i}" for i in range(6)]].sum(axis=1)

    # Balance
    saldo = "SDO_ACTIVO_MENOS"
    resultado["VAR_SDO_ACTIVO_6M"] = resultado[f"{saldo}0"] - resultado[f"{saldo}5"]
    resultado["PROM_SDO_ACTIVO_0M_2M"] = media(saldo, 0, 3)
    resultado["PROM_SDO_ACTIVO_3M_5M"] = media(saldo, 3, 6)
    resultado["VAR_SDO_ACTIVO_3M"] = (
        resultado["PROM_SDO_ACTIVO_0M_2M"] - resultado["PROM_SDO_ACTIVO_3M_5M"]
    )
    resultado["PROM_SDO_ACTIVO_6M"] = media(saldo, 0, 6)

    # Months with the insurance flag set
    resultado["MESES_CON_SEGURO"] = suma("FLG_SEGURO_MENOS")

    # Channel accesses
    for canal in [1, 2, 3]:
        base = f"NRO_ACCES_CANAL{canal}_MENOS"
        etiqueta = f"NRO_ACCES_CANAL{canal}"
        resultado[f"VAR_{etiqueta}_6M"] = resultado[f"{base}0"] - resultado[f"{base}5"]
        resultado[f"PROM_{etiqueta}_6M"] = media(base, 0, 6)
        resultado[f"PROM_{etiqueta}_0M_2M"] = media(base, 0, 3)
        resultado[f"PROM_{etiqueta}_3M_5M"] = media(base, 3, 6)
        resultado[f"VAR_{etiqueta}_3M"] = (
            resultado[f"PROM_{etiqueta}_0M_2M"] - resultado[f"PROM_{etiqueta}_3M_5M"]
        )

    # Number of financial entities
    entidades = "NRO_ENTID_SSFF_MENOS"
    resultado["PROM_NRO_ENTID_SSFF_6M"] = media(entidades, 0, 6)
    resultado["VAR_NRO_ENTID_SSFF_6M"] = resultado[f"{entidades}0"] - resultado[f"{entidades}5"]
    resultado["PROM_NRO_ENTID_SSFF_0M_2M"] = media(entidades, 0, 3)
    resultado["PROM_NRO_ENTID_SSFF_3M_5M"] = media(entidades, 3, 6)
    resultado["VAR_NRO_ENTID_SSFF_3M"] = (
        resultado["PROM_NRO_ENTID_SSFF_0M_2M"] - resultado["PROM_NRO_ENTID_SSFF_3M_5M"]
    )

    # Months with a balance in other entities
    resultado["MESES_CON_SALDO"] = suma("FLG_SDO_OTSSFF_MENOS")

    return resultado

In [None]:
def construir_variables_requerimientos(df_reqs, id_col='ID_CORRELATIVO'):
    """Summarize the request table as one row per ``id_col`` value.

    Output columns, in order: ``total_requerimientos`` (row count), the
    distinct-value counts ``nro_tipos_requerimiento``, ``nro_dictamenes``,
    ``nro_productos_servicios`` and ``nro_submotivos``, then the per-category
    counts ``tipo_*`` (from ``TIPO_REQUERIMIENTO2``) and ``dictamen_*``
    (from ``DICTAMEN``). The result is indexed by ``id_col``.
    """
    def contar_distintos(columna, nombre):
        # Number of distinct values of `columna` per client.
        return df_reqs.groupby(id_col)[columna].nunique().rename(nombre).to_frame()

    def contar_por_categoria(columna, prefijo):
        # Indicator columns per category, summed per client to get counts.
        indicadores = pd.get_dummies(df_reqs[columna], prefix=prefijo)
        indicadores[id_col] = df_reqs[id_col]
        return indicadores.groupby(id_col).sum()

    bloques = [
        df_reqs.groupby(id_col).size().rename('total_requerimientos').to_frame(),
        contar_distintos('TIPO_REQUERIMIENTO2', 'nro_tipos_requerimiento'),
        contar_distintos('DICTAMEN', 'nro_dictamenes'),
        contar_distintos('PRODUCTO_SERVICIO_2', 'nro_productos_servicios'),
        contar_distintos('SUBMOTIVO_2', 'nro_submotivos'),
        contar_por_categoria('TIPO_REQUERIMIENTO2', 'tipo'),
        contar_por_categoria('DICTAMEN', 'dictamen'),
    ]
    return pd.concat(bloques, axis=1)

In [None]:
def apply_label_encoders_to_test(df_test, path_encoder='label_encoder_train.pkl'):
    """Encode the categorical columns of a test table with stored encoders.

    ``'Cero'`` in ``RANG_SDO_PASIVO_MENOS0`` is replaced by ``'Rango_SDO_00'``
    and ``FLAG_LIMA_PROVINCIA`` is mapped ``'Lima' -> 1``, ``'Provincia' -> 0``.
    Then each ``{column: encoder}`` entry loaded from ``path_encoder`` is
    applied with ``encoder.transform``.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Table to encode; not modified.
    path_encoder : str
        Pickle file holding a ``{column name: fitted encoder}`` dict. Defaults
        to ``label_encoder_train.pkl`` in the current working directory.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``df_test``.

    Raises
    ------
    FileNotFoundError
        If ``path_encoder`` does not exist.
    KeyError
        If a column named in the encoder dict is missing from ``df_test``.
    """
    # Work on a copy so the caller keeps its raw categorical values.
    df = df_test.copy()

    df['RANG_SDO_PASIVO_MENOS0'] = df['RANG_SDO_PASIVO_MENOS0'].replace('Cero', 'Rango_SDO_00')
    df['FLAG_LIMA_PROVINCIA'] = df['FLAG_LIMA_PROVINCIA'].map({'Lima': 1, 'Provincia': 0})

    # SECURITY: pickle.load can execute arbitrary code; only load artifacts
    # written by this project's own training run.
    with open(path_encoder, 'rb') as f:
        encoders_clientes = pickle.load(f)

    # NOTE(review): presumably these are scikit-learn LabelEncoder objects, whose
    # transform rejects values not seen during fit; confirm how new categories
    # in test data should be handled.
    for col, le in encoders_clientes.items():
        df[col] = le.transform(df[col])

    return df

In [None]:
def aplicar_estandarizacion_test(df_test, path_scaler='scaler_train.pkl'):
    """Scale the predictor columns of a test table with a stored scaler.

    ``ID_CORRELATIVO``, ``CODMES`` and ``ATTRITION`` are passed through
    unchanged when present; every other column is transformed by the scaler
    loaded from ``path_scaler``. The predictors are handed to the scaler in
    the sorted order produced by ``Index.difference``.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Table to scale; not modified.
    path_scaler : str
        Pickle file holding the fitted scaler. Defaults to
        ``scaler_train.pkl`` in the current working directory.

    Returns
    -------
    pandas.DataFrame
        The available pass-through columns followed by the scaled predictors.

    Raises
    ------
    FileNotFoundError
        If ``path_scaler`` does not exist.
    """
    # SECURITY: pickle.load can execute arbitrary code; only load artifacts
    # written by this project's own training run.
    with open(path_scaler, 'rb') as f:
        scaler = pickle.load(f)

    no_escalar = ['ID_CORRELATIVO', 'CODMES', 'ATTRITION']
    # A table without the target column must still be scorable, so only the
    # pass-through columns that actually exist are selected.
    presentes = [col for col in no_escalar if col in df_test.columns]

    columnas_a_escalar = df_test.columns.difference(no_escalar)
    df_escaladas = pd.DataFrame(
        scaler.transform(df_test[columnas_a_escalar]),
        columns=columnas_a_escalar,
        index=df_test.index,
    )

    return pd.concat([df_test[presentes], df_escaladas], axis=1)

In [None]:
def prepare_dataset(df_data_test,df_requerimientos_test):
    """Run the test-time preprocessing on the client and request tables.

    Steps, in order: impute both tables from the stored parameters, derive
    the client features, aggregate requests per client, apply the stored
    label encoders, inner-join on ``ID_CORRELATIVO`` and apply the stored
    scaler.

    Parameters
    ----------
    df_data_test : pandas.DataFrame
        Client table.
    df_requerimientos_test : pandas.DataFrame
        Request table.

    Returns
    -------
    pandas.DataFrame
        The joined, standardized table. Clients without any request row are
        dropped by the inner join.
    """
    clientes = prepare_impute_missing(
        df_data_test, ['RANG_INGRESO','FLAG_LIMA_PROVINCIA','EDAD','ANTIGUEDAD']
    )
    requerimientos = prepare_impute_missing(df_requerimientos_test, ['DICTAMEN'])

    clientes = generar_variables_ingenieria(clientes)
    requerimientos_agregados = construir_variables_requerimientos(requerimientos)
    clientes = apply_label_encoders_to_test(clientes)

    combinado = clientes.merge(requerimientos_agregados, on='ID_CORRELATIVO', how='inner')
    return aplicar_estandarizacion_test(combinado)

In [None]:
# Run the test-time preprocessing on the loaded test tables.
# NOTE(review): this rebinds df_final, which earlier held the training table.
df_final = prepare_dataset(df_data_test,df_requerimientos_test)

## Model

In [34]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import sklearn.metrics as metrics

In [4]:
# Move into the data/out folder that sits under the parent of the current working directory.
# NOTE(review): relative to whatever the cwd is when the cell runs; not safe to run twice.
os.chdir(os.path.join(os.path.dirname(os.getcwd()), 'data', 'out'))

In [5]:
# Display the current working directory to check the previous move.
os.getcwd()

'c:\\Users\\Omen\\UP\\Teoria importante\\ML OPS\\Proyecto final\\Bank_Attrition_Detection_MLOps\\data\\out'

In [6]:
# Load the prepared training and test tables from CSV files named relative to the cwd.
data_train_prepared = pd.read_csv('data_train_prepared.csv')
data_test_prepared = pd.read_csv('data_test_prepared.csv')

In [7]:
# Go up two directory levels, then into outputs/preprocess.
os.chdir('../..')
os.chdir('outputs/preprocess')

In [8]:
# Display the current working directory to check the previous moves.
os.getcwd()

'c:\\Users\\Omen\\UP\\Teoria importante\\ML OPS\\Proyecto final\\Bank_Attrition_Detection_MLOps\\outputs\\preprocess'

In [9]:
# Read the predictor names and the target name saved by the preprocessing step.
# Both come back as plain Python lists, so y_col is a one-column selector, not a string.
x_cols = pd.read_csv('x_col_names.csv')['x_col'].tolist()
y_col = pd.read_csv('y_col_name.csv')['y_col'].tolist()

##### XGBoost

In [None]:
# Hyperparameter grid for the XGBoost search:
# 3 * 3 * 3 * 3 * 3 * 2 * 3 = 1458 combinations.
param_grid = {
    'max_depth': [3, 5, 7],
    'eta': [0.05, 0.1, 0.2], 
    'gamma': [0, 1, 5],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.6, 0.8, 1.0],
    'n_estimators': [50, 100],
    'scale_pos_weight': [1, 5, 10]
}

In [None]:
# Base XGBoost classifier with a fixed random_state.
# NOTE(review): use_label_encoder is presumably deprecated in recent XGBoost releases;
# confirm against the installed version.
xgb_model = XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='auc',random_state=42)

# Exhaustive search over param_grid with 3-fold cross-validation, scored by ROC AUC.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='roc_auc', 
    cv=3,
    verbose=1,
    n_jobs=-1
)

In [31]:
# Fit the search on the training predictors. y_col is a list, so the selection is a
# one-column frame; values.ravel() flattens it to the 1-D target passed to fit().
grid_search.fit(data_train_prepared[x_cols], data_train_prepared[y_col].values.ravel())

Fitting 3 folds for each of 1458 candidates, totalling 4374 fits


In [49]:
# Report the best parameter combination and its mean cross-validated score.
print("Mejores hiperparámetros:", grid_search.best_params_)
print("Mejor puntuación:", grid_search.best_score_)


Mejores hiperparámetros: {'max_depth': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 50, 'n_estimators': 200}
Mejor puntuación: 0.7873420309895427


In [36]:
# Take the best estimator found by the search and score the test table with it:
# predicted labels for the classification report, probabilities for the AUC cell.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(data_test_prepared[x_cols])
y_proba = best_model.predict_proba(data_test_prepared[x_cols])

print(classification_report(data_test_prepared[y_col], y_pred))

              precision    recall  f1-score   support

           0       0.92      0.99      0.95      7659
           1       0.65      0.11      0.19       768

    accuracy                           0.91      8427
   macro avg       0.79      0.55      0.57      8427
weighted avg       0.89      0.91      0.88      8427



In [37]:
# ROC AUC on the test table, computed from the second column of predict_proba.
metrics.roc_auc_score(data_test_prepared[y_col], y_proba[:,1])

0.8014765784806545

#### Random Forest

In [39]:
# Base Random Forest with a fixed random_state and class_weight='balanced'.
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
# Hyperparameter grid: 3 * 4 * 3 * 3 = 108 combinations.
model_parameters_grid = {
    'n_estimators': [50, 100, 200],            
    'max_depth': [None, 4, 6, 8],              
    'min_samples_leaf': [1, 10, 50],           
    'min_impurity_decrease': [0.0, 0.01, 0.05]
}

In [40]:
# Exhaustive search over the Random Forest grid with 3-fold cross-validation, scored by ROC AUC.
# NOTE(review): this rebinds grid_search, so the XGBoost search results are no longer
# reachable through that name after this cell.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=model_parameters_grid,
    scoring='roc_auc', 
    cv=3,
    verbose=1,
    n_jobs=-1
)
# y_col is a list, so the selection is a one-column frame; ravel() flattens it to 1-D.
grid_search.fit(data_train_prepared[x_cols], data_train_prepared[y_col].values.ravel())

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [41]:
# Report the best parameter combination and its mean cross-validated score.
print("Mejores hiperparámetros:", grid_search.best_params_)
print("Mejor puntuación:", grid_search.best_score_)

Mejores hiperparámetros: {'max_depth': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 50, 'n_estimators': 200}
Mejor puntuación: 0.7873420309895427


In [None]:
# Evaluate the estimator selected by the grid search fitted just above.
# best_model is rebound here because, when the notebook runs top to bottom, it would
# otherwise still hold the XGBoost estimator assigned in the earlier evaluation cell,
# and this report would describe the wrong model.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(data_test_prepared[x_cols])

print(classification_report(data_test_prepared[y_col], y_pred))

              precision    recall  f1-score   support

           0       0.96      0.78      0.86      7659
           1       0.23      0.66      0.34       768

    accuracy                           0.77      8427
   macro avg       0.59      0.72      0.60      8427
weighted avg       0.89      0.77      0.81      8427



In [None]:
# Take the best estimator of the current grid search and get its test-set probabilities.
best_model = grid_search.best_estimator_
y_proba = best_model.predict_proba(data_test_prepared[x_cols])

In [43]:
# ROC AUC on the test table, computed from the second column of predict_proba.
metrics.roc_auc_score(data_test_prepared[y_col], y_proba[:,1])

0.7907760001849675

In [45]:
# One row per grid candidate: its parameters, rank, and the mean / standard deviation
# of the cross-validated score (ROC AUC, per the search's scoring setting).
df_model_results = pd.DataFrame({'model_parameters': grid_search.cv_results_['params'],
                                         'model_rank': grid_search.cv_results_['rank_test_score'],
                                         'auc_score_mean': grid_search.cv_results_['mean_test_score'],
                                         'auc_score_std': grid_search.cv_results_['std_test_score']})
# Relative spread of the score across folds: standard deviation divided by mean.
df_model_results['auc_score_cv'] = df_model_results['auc_score_std'] / df_model_results['auc_score_mean']

In [46]:
# Display the per-candidate cross-validation summary.
df_model_results

Unnamed: 0,model_parameters,model_rank,auc_score_mean,auc_score_std,auc_score_cv
0,"{'max_depth': None, 'min_impurity_decrease': 0...",33,0.757063,0.011879,0.015691
1,"{'max_depth': None, 'min_impurity_decrease': 0...",23,0.769074,0.014866,0.019330
2,"{'max_depth': None, 'min_impurity_decrease': 0...",9,0.777544,0.012890,0.016578
3,"{'max_depth': None, 'min_impurity_decrease': 0...",6,0.780888,0.014441,0.018493
4,"{'max_depth': None, 'min_impurity_decrease': 0...",5,0.784526,0.014083,0.017951
...,...,...,...,...,...
103,"{'max_depth': 8, 'min_impurity_decrease': 0.05...",73,0.689820,0.010067,0.014594
104,"{'max_depth': 8, 'min_impurity_decrease': 0.05...",73,0.689820,0.010067,0.014594
105,"{'max_depth': 8, 'min_impurity_decrease': 0.05...",73,0.689820,0.010067,0.014594
106,"{'max_depth': 8, 'min_impurity_decrease': 0.05...",73,0.689820,0.010067,0.014594


In [47]:
# Pair each input feature name seen during fit with the importance reported by the best estimator.
pd.DataFrame({'variable': grid_search.feature_names_in_, 'importance': grid_search.best_estimator_.feature_importances_})

Unnamed: 0,variable,importance
0,ANTIGUEDAD,0.035263
1,EDAD,0.021177
2,FLAG_LIMA_PROVINCIA,0.001374
3,FLG_BANCARIZADO,0.000766
4,FLG_NOMINA,0.019950
...,...,...
82,nro_submotivos,0.001171
83,nro_tipos_requerimiento,0.000594
84,tipo_Reclamo,0.002411
85,tipo_Solicitud,0.002563
