In [None]:
import sys

# The star import stays first among third-party imports so later explicit imports win on any name clash.
from pycaret.classification import *
from sklearn.preprocessing import MinMaxScaler
import joblib
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Project helpers live in ../utils, so that folder must be on sys.path before they are imported.
sys.path.insert(1, '../utils')
from run_models.train_predict import train_model, calification_model
from outliers.outlier import outlier

In [None]:
# Experiment identifier; it is interpolated into the data paths used by the load and export cells.
experiment_name = "experiment_5"

## Lectura de Datos

In [None]:
# Load the base modelling table for the current experiment.
# Forward slashes are used because a backslash is only a path separator on Windows;
# "../data/..." resolves on Windows, Linux and macOS alike.
data = pd.read_csv("../data/{}/pmod_base_model.csv".format(experiment_name))
data.head()

In [None]:
# Quick data-quality summary of the raw table: shape, infinities, nulls and negatives.
print("Cantidad de valores:", data.shape)

n_infinite = data.isin([np.inf, -np.inf]).sum().sum()
print("Cantidad de datos con valores infinitos: ", n_infinite)

n_null = data.isna().sum().sum()
print("Cantidad de datos con valores nulos: ", n_null)

# "marca_pago" and "ajustes_banco" are excluded from the sign check.
sign_checked = data.drop(columns=["marca_pago", "ajustes_banco"])
n_negative = (sign_checked < 0).sum().sum()
print("Cantidad de datos con valores negativos: ", n_negative)

In [None]:
# Period held out as the out-of-time (OOT) scoring set; everything before it is used for training.
# NOTE(review): the value looks like a YYYYMM period code -- confirm against the data dictionary.
OOT_PERIOD = 202401

# .copy() makes both subsets independent frames, so the column assignments done in the
# encoding cells do not raise SettingWithCopyWarning or depend on view/copy semantics.
data_model = data[data["fecha_var_rpta_alt"] < OOT_PERIOD].copy()
data_oot = data[data["fecha_var_rpta_alt"] == OOT_PERIOD].copy()

## Encoding de variables categóricas

In [None]:
# Frequency encoding for "marca_pago": each category is replaced by its relative
# frequency in the training subset. The mapping is kept for reuse on the OOT data.
mp_share = data_model["marca_pago"].value_counts(normalize=True)
encoding_dict_mp = mp_share.to_dict()
print(encoding_dict_mp)

data_model["marca_pago"] = data_model["marca_pago"].map(encoding_dict_mp)
data_model["marca_pago"].value_counts()

In [None]:
# Frequency encoding for "ajustes_banco": each category is replaced by its relative
# frequency in the training subset. The mapping is kept for reuse on the OOT data.
ab_share = data_model["ajustes_banco"].value_counts(normalize=True)
encoding_dict_ab = ab_share.to_dict()
print(encoding_dict_ab)

data_model["ajustes_banco"] = data_model["ajustes_banco"].map(encoding_dict_ab)
data_model["ajustes_banco"].value_counts()

In [None]:
# Frequency encoding for "lote": each category is replaced by its relative
# frequency in the training subset. The mapping is kept for reuse on the OOT data.
lt_share = data_model["lote"].value_counts(normalize=True)
encoding_dict_lt = lt_share.to_dict()
print(encoding_dict_lt)

data_model["lote"] = data_model["lote"].map(encoding_dict_lt)
data_model["lote"].value_counts()

## Escalamiento de variables

In [None]:
# Columns kept out of the scaler: identifiers, the period column, the target and the
# three frequency-encoded categoricals.
cols_not_scaled = ['nit_enmascarado', 'num_oblig_enmascarado',
                   'num_oblig_orig_enmascarado', 'fecha_var_rpta_alt',
                   'var_rpta_alt', 'marca_pago', 'ajustes_banco', 'lote']
features_to_scale = data_model.drop(columns=cols_not_scaled)

# Fit the min-max scaler on the training features only; MM is reused later on the OOT data.
MM = MinMaxScaler()
data_model_norm = pd.DataFrame(
    MM.fit_transform(features_to_scale),
    columns=features_to_scale.columns,
)

# Re-attach the target and encoded columns by position (.values), because the scaled
# frame carries a fresh 0..n-1 index that differs from data_model's index.
for passthrough_col in ["var_rpta_alt", "marca_pago", "ajustes_banco", "lote"]:
    data_model_norm[passthrough_col] = data_model[passthrough_col].values
data_model_norm.head()

## Eliminación de atípicos

In [None]:
# Run the project's `outlier` helper on the scaled training frame.
# NOTE(review): given the section title it presumably returns the frame with atypical
# rows removed -- confirm against the helper in ../utils/outliers.
df_outliers = outlier(data_model_norm)
# Discard the old index and renumber the rows from 0.
# NOTE: this overwrites data_model_norm, so re-running the cell feeds the helper its own output.
data_model_norm = df_outliers.reset_index(drop=True)
print(data_model_norm.shape)
data_model_norm.head()

## Entrenar Modelo

In [None]:

# Model identifiers handed to the project's train_model helper (only 'xgboost' here).
models = ['xgboost']
# NOTE(review): "var_rpta_alt" is presumably the target column name expected by
# train_model -- confirm against ../utils/run_models/train_predict.py.
best_model = train_model(data_model_norm, "var_rpta_alt", models)

In [None]:
# Show the model returned by train_model. The former `best_model = best_model`
# self-assignment was a no-op and has been removed.
print(best_model)

## Tunear Modelo

In [None]:
# Search space for tuning: two candidate values per hyperparameter.
params = dict(
    n_estimators=[100, 200],
    max_depth=[5, 7],
    learning_rate=[0.1, 0.2],
    subsample=[0.8, 1],
    colsample_bytree=[0.8, 1],
)

# Tune the trained model over the custom grid with 5 folds, optimising F1.
best_model_tuned = tune_model(
    best_model,
    fold=5,
    custom_grid=params,
    optimize='F1',
)
print(best_model_tuned)

In [None]:
# Diagnostic plots for the tuned model, drawn in the same order as before:
# feature importance, ROC/AUC and confusion matrix.
for plot_kind in ('feature', 'auc', 'confusion_matrix'):
    plot_model(best_model_tuned, plot=plot_kind)

## Calificar

In [None]:
# Preview the out-of-time subset (rows whose fecha_var_rpta_alt equals 202401) before scoring.
data_oot.head()

In [None]:
# Apply the frequency encodings learned on the training subset to the OOT data.
# Fix: the third mapping used to be written into "var_rpta_alt" (from "var_rpta_alt"),
# which left "lote" un-encoded and clobbered the target column; it now encodes "lote".
# Categories that were not seen in training map to NaN.
oot_encodings = {
    "marca_pago": encoding_dict_mp,
    "ajustes_banco": encoding_dict_ab,
    "lote": encoding_dict_lt,
}

data_calification = data_oot.copy()
for encoded_col, encoding in oot_encodings.items():
    data_calification[encoded_col] = data_oot[encoded_col].map(encoding)
data_calification.head()

In [None]:
# Columns that bypass the MinMaxScaler: the target plus the three frequency-encoded columns.
vars_no_scaler = [
    "var_rpta_alt",
    "marca_pago",
    "ajustes_banco",
    "lote",
]
# Every column present in the training frame, in training order.
vars_model = data_model_norm.columns

In [None]:

# Select the training columns in training order, then drop the ones that are not scaled.
data_calification_ = data_calification[vars_model].drop(columns=vars_no_scaler)

# Scale with the scaler fitted on the training data. The DataFrame itself is passed
# (not `.values`) so scikit-learn can check the column names and order against those
# seen at fit time, instead of warning about missing feature names and scaling blindly.
data_calification_norm = pd.DataFrame(
    MM.transform(data_calification_),
    columns=data_calification_.columns,
)

In [None]:
# Copy the encoded columns onto the scaled OOT frame by position (.values), since the
# scaled frame has a fresh 0..n-1 index.
for encoded_col in ("marca_pago", "ajustes_banco", "lote"):
    data_calification_norm[encoded_col] = data_calification[encoded_col].values
data_calification_norm.head()

In [None]:

# Score the OOT frame using exactly the feature columns the tuned model was fitted on.
model_features = best_model_tuned.feature_names_in_
predictions = predict_model(
    best_model_tuned,
    data=data_calification_norm[model_features],
    raw_score=True,
)

# Write the predicted label and the class-1 score back onto the OOT frame by position.
data_calification["var_rpta_alt"] = predictions["prediction_label"].values
data_calification["Prob_uno"] = predictions["prediction_score_1"].values
data_calification.head()

### Archivo submit

In [None]:
# Build the submission key "<nit>#<original obligation>#<obligation>".
data_calification["ID"] = (
    data_calification["nit_enmascarado"].astype(str)
    + "#" + data_calification["num_oblig_orig_enmascarado"].astype(str)
    + "#" + data_calification["num_oblig_enmascarado"].astype(str)
)
new_dataframe = data_calification[["ID", "var_rpta_alt"]]
# Sanity check: null count per column in the submission.
print(new_dataframe.isna().sum())
# Forward slashes: with backslashes the file only lands in ../data/<experiment>/ on Windows;
# elsewhere the whole string is taken as a single file name in the working directory.
new_dataframe.to_csv("../data/{}/pmod_base_model_calification.csv".format(experiment_name), index=False)

### Archivo Entrega

In [None]:
# Rebuild the key "<nit>#<original obligation>#<obligation>" so this cell works on its own.
data_calification["ID"] = (
    data_calification["nit_enmascarado"].astype(str)
    + "#" + data_calification["num_oblig_orig_enmascarado"].astype(str)
    + "#" + data_calification["num_oblig_enmascarado"].astype(str)
)
# The delivery file also carries the class-1 score.
new_dataframe = data_calification[["ID", "var_rpta_alt", "Prob_uno"]]
# Sanity check: null count per column in the delivery file.
print(new_dataframe.isna().sum())
# Forward slashes keep the relative path valid on Windows, Linux and macOS.
new_dataframe.to_csv("../documentacion/pmod_base_model_calification.csv", index=False)

### Almacenar modelo

In [None]:
# Persist the tuned model with PyCaret's save_model under the relative name 'models/best_model_tuned'.
# NOTE(review): assumes a `models/` folder already exists next to the notebook -- confirm.
save_model(best_model_tuned, 'models/best_model_tuned')

In [None]:
# Persist the fitted MinMaxScaler so scoring code can reuse the training-time scaling.
# `joblib` is imported in the notebook's top import cell rather than here.
scaler_filename = "models/scaler_mm.save"
joblib.dump(MM, scaler_filename)

In [None]:
# Display the feature names stored on the tuned model; the prediction cell selects columns with them.
best_model_tuned.feature_names_in_