In [None]:
# NOTE(review): wildcard import. tune_model and plot_model, used further down,
# are presumably provided by it — confirm and consider importing them explicitly.
from pycaret.classification import *
from sklearn.preprocessing import MinMaxScaler

import pandas as pd
# Lift pandas' display limits so displayed frames show every column and every row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np

import sys
# Add the sibling utils directory to the import path; presumably this is what
# makes the project-local packages imported below resolvable.
sys.path.insert(1, '../utils')
from run_models.train_predict import train_model, calification_model
from outliers.outlier import outlier

In [None]:
# Experiment identifier; it is interpolated into the input and output data paths below.
experiment_name = "experiment_3_4"

## Lectura de Datos

In [None]:
# Load the base modelling table for the current experiment.
# Forward slashes are used so the relative path resolves on Windows and POSIX
# alike (a backslash path is treated as a single file name outside Windows).
data = pd.read_csv("../data/{}/pmod_base_model.csv".format(experiment_name))
data.head()

In [None]:
# Data-quality summary of the full dataset: its shape, followed by the number of
# infinite, missing and negative cells. marca_pago is excluded from the sign check.
quality_checks = [
    ("Cantidad de datos con valores infinitos: ", lambda frame: frame.isin([np.inf, -np.inf])),
    ("Cantidad de datos con valores nulos: ", lambda frame: frame.isna()),
    ("Cantidad de datos con valores negativos: ", lambda frame: frame.drop(columns=["marca_pago"]) < 0),
]

print("Cantidad de valores:", data.shape)
for label, build_mask in quality_checks:
    # Each mask is boolean; summing per column and then overall counts the flagged cells.
    print(label, build_mask(data).sum().sum())

In [None]:
# Replace missing marca_pago entries with the explicit label "NA" so that they
# form a category of their own when the column is encoded later.
# NOTE(review): pandas' read_csv parses the literal text "NA" as missing by
# default, so some of these may be genuine "NA" labels in the file — confirm.
data = data.assign(marca_pago=data["marca_pago"].fillna("NA"))
data.shape

In [None]:
# Temporal split on fecha_var_rpta_alt: periods strictly before OOT_PERIOD form
# the modelling set, the OOT_PERIOD rows are held out for scoring, and any later
# period is left out of both.
OOT_PERIOD = 202401
period = data["fecha_var_rpta_alt"]

# Explicit copies: both subsets are modified in later cells, and assigning into
# a filtered slice of `data` would raise SettingWithCopyWarning.
data_model = data[period < OOT_PERIOD].copy()
data_oot = data[period == OOT_PERIOD].copy()

In [None]:
# Same data-quality summary as for the full dataset, restricted to the modelling subset.
print("Cantidad de valores:", data_model.shape)

n_infinite = data_model.isin([np.inf, -np.inf]).sum().sum()
print("Cantidad de datos con valores infinitos: ", n_infinite)

n_missing = data_model.isna().sum().sum()
print("Cantidad de datos con valores nulos: ", n_missing)

# marca_pago is dropped before the sign check.
without_marca_pago = data_model.drop(columns=["marca_pago"])
n_negative = (without_marca_pago < 0).sum().sum()
print("Cantidad de datos con valores negativos: ", n_negative)

In [None]:
# Frequency-encode marca_pago: each label is replaced by its relative frequency
# within the modelling rows. encoding_dict (label -> frequency) is reused later
# to encode the out-of-time rows.
#
# The raw labels are read from `data` (same index as data_model) rather than from
# data_model itself, so re-running this cell rebuilds the same dictionary instead
# of "encoding" values that were already replaced by frequencies.
marca_pago_raw = data.loc[data_model.index, "marca_pago"]
encoding_dict = marca_pago_raw.value_counts(normalize=True).to_dict()

# assign() returns a new frame, so no write goes into a filtered slice of `data`.
data_model = data_model.assign(marca_pago=marca_pago_raw.map(encoding_dict))
data_model["marca_pago"].value_counts()

In [None]:
# Min-max scale every predictor. Identifier columns, the period column and the
# target are excluded from scaling; the target is appended unscaled afterwards.
non_feature_columns = [
    'nit_enmascarado', 'num_oblig_enmascarado',
    'num_oblig_orig_enmascarado', 'fecha_var_rpta_alt',
    'var_rpta_alt',
]
feature_frame = data_model.drop(columns=non_feature_columns)

# MM is kept as a fitted scaler because it is passed to the scoring step later.
MM = MinMaxScaler()
scaled_values = MM.fit_transform(feature_frame)

# Column names come from the frame that was actually scaled, not from a
# positional slice, so they stay correct whatever the column order of data_model.
data_model_norm = pd.DataFrame(scaled_values, columns=feature_frame.columns)

# .values ignores the index: data_model_norm has a fresh 0..n-1 index while
# data_model keeps the original row labels.
data_model_norm["var_rpta_alt"] = data_model["var_rpta_alt"].values
data_model_norm.head()

In [None]:
# Pass the scaled modelling frame through the project's outlier() helper and
# renumber the rows of its result from zero.
# NOTE(review): presumably outlier() returns the frame with outlying rows
# removed — confirm against utils/outliers.
# NOTE(review): this cell overwrites its own input, so running it twice applies
# outlier() to data that has already been filtered.
df_outliers = outlier(data_model_norm)
data_model_norm = df_outliers.reset_index(drop=True)

remaining_shape = data_model_norm.shape
print(remaining_shape)
data_model_norm.head()

## Entrenar Modelo

In [None]:

# Candidate estimators, identified by the ids handed to the project's train_model helper.
models = ["xgboost", "lightgbm"]

# Target column of the modelling frame.
target_column = "var_rpta_alt"
best_model = train_model(data_model_norm, target_column, models)

In [None]:
# train_model's result is indexed with [0] to obtain the estimator to tune.
# NOTE(review): assumes train_model returns a list or tuple — confirm.
# The container check keeps the cell safe to re-run: without it a second
# execution would index into the estimator that was already extracted.
if isinstance(best_model, (list, tuple)):
    best_model = best_model[0]
print(best_model)

## Tunear Modelo

In [None]:
# Hyper-parameter grid explored by tune_model.
# No 'objective' entry on purpose: the estimator is a classifier tuned for F1,
# so it must keep its own classification objective. The previous value
# 'reg:squarederror' is a regression loss, and it is not a valid objective at
# all if the selected model is LightGBM.
params = {
    'n_estimators': [400, 450],
    'max_depth': [9, 10],
    'learning_rate': [.1],
}

# 5-fold cross-validated search over the grid, selecting by F1.
best_model_tuned = tune_model(best_model, fold=5, custom_grid=params, optimize='F1')
print(best_model_tuned)

In [None]:
# Diagnostic plots for the tuned model, drawn in this order:
# feature importance, ROC/AUC, confusion matrix.
for plot_kind in ('feature', 'auc', 'confusion_matrix'):
    plot_model(best_model_tuned, plot=plot_kind)

## Calificar

In [None]:
# Build the scoring frame from the out-of-time rows.
# A copy is taken so data_oot keeps its raw marca_pago labels; the cell can then
# be re-run without mapping values that were already encoded, and no write goes
# into a filtered slice of `data`.
data_calification = data_oot.copy()

# Apply the frequency encoding learned on the modelling rows.
# Labels that are absent from encoding_dict become NaN.
data_calification["marca_pago"] = data_calification["marca_pago"].map(encoding_dict)
data_calification.head()

In [None]:
# All columns of the modelling frame. The last one is the target (it was
# appended after scaling), so dropping it leaves the predictor names.
vars_model = data_model_norm.columns
vars_model[:len(vars_model) - 1]

In [None]:
# Score the out-of-time rows with the tuned model via the project's
# calification_model helper. It receives the predictor names (every modelling
# column except the last) and the fitted MinMaxScaler.
# NOTE(review): this cell overwrites its own input, so a second run feeds the
# already-scored frame back into calification_model.
predictor_columns = vars_model[:-1]
data_calification = calification_model(
    best_model_tuned,
    data_calification,
    predictor_columns,
    scaler=MM,
)
data_calification.head()

In [None]:
# Build a composite record ID: nit # original obligation # obligation.
data_calification["ID"] = (
    data_calification["nit_enmascarado"].astype(str)
    + "#" + data_calification["num_oblig_orig_enmascarado"].astype(str)
    + "#" + data_calification["num_oblig_enmascarado"].astype(str)
)

# Keep only the ID and the var_rpta_alt column for delivery.
# NOTE(review): presumably var_rpta_alt holds the model output at this point —
# confirm against calification_model.
new_dataframe = data_calification[["ID", "var_rpta_alt"]]

# Report missing values per exported column before writing.
print(new_dataframe.isna().sum())

# Forward slashes keep the relative path valid on Windows and POSIX alike; with
# backslashes, non-Windows systems would write a file literally named
# "..\data\...csv" into the working directory.
new_dataframe.to_csv("../data/{}/pmod_base_model_calification.csv".format(experiment_name), index=False)