# Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.calibration import calibration_curve
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

import xgboost as xgb
from scipy import stats

import warnings
# Suppress every warning routed through the `warnings` module from this point
# on. NOTE(review): this also hides numerical warnings (e.g. NumPy
# divide-by-zero / invalid-value RuntimeWarnings), so silent inf/NaN results
# are possible in the metric helpers of this notebook.
warnings.filterwarnings("ignore")

# NOTE(review): wildcard import — the names brought in by `obiwan` are not
# visible from this file and may shadow anything imported above; confirm.
from obiwan import *

# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Functions

In [None]:
# PowerStat: Kolmogorov-Smirnov statistic expressed on a 0-100 scale
def calculate_power_stat(y_true, y_pred_proba):
    """Return the KS separation between the two classes, times 100.

    The statistic is read off the ROC curve as the largest gap between the
    true-positive rate and the false-positive rate over all thresholds
    returned by ``roc_curve``.

    Args:
        y_true: Binary ground-truth labels.
        y_pred_proba: Predicted scores for the positive class.

    Returns:
        The maximum of ``tpr - fpr`` multiplied by 100.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_pred_proba)
    separation = true_pos_rate - false_pos_rate
    return max(separation) * 100

# Assess the discriminatory power of each variable on its own
def power_stat_by_feature(X, y, model):
    """Compute the PowerStat of a single-feature model for every column of X.

    For each column, ``model`` is refit in place on that column alone and
    scored on the very same rows it was fitted on (in-sample), so after the
    call ``model`` holds the fit for the last column.

    Args:
        X: DataFrame of candidate features.
        y: Binary target aligned with ``X``.
        model: Estimator exposing ``fit`` and ``predict_proba``.

    Returns:
        Dict mapping each column name to its PowerStat formatted as a string
        with two decimals (values are ``str``, not ``float``).
    """
    results = {}
    for column in X.columns:
        single_feature = X[[column]]
        model.fit(single_feature, y)
        # Column 1 of predict_proba is the positive-class probability.
        positive_scores = model.predict_proba(single_feature)[:, 1]
        results[column] = format(calculate_power_stat(y, positive_scores), ".2f")
    return results


# Visual calibration check
def plot_calibration_curve(y_true, y_pred_proba, n_bins=10):
    """Plot observed positive rate against mean predicted probability.

    The scores are binned with ``calibration_curve`` and drawn on the current
    matplotlib figure together with a dashed grey diagonal as the
    perfect-calibration reference. The figure is shown immediately.

    Args:
        y_true: Binary ground-truth labels.
        y_pred_proba: Predicted probabilities for the positive class.
        n_bins: Number of bins passed to ``calibration_curve``.
    """
    observed_rate, mean_predicted = calibration_curve(
        y_true, y_pred_proba, n_bins=n_bins
    )

    # Model curve first, then the reference diagonal (drawing order is kept).
    plt.plot(mean_predicted, observed_rate, marker='o')
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray')

    plt.xlabel('Predicted probability')
    plt.ylabel('True probability')
    plt.title('Calibration Curve')
    plt.show()

def jeffreys_test(y_true, y_pred_proba, threshold=0.5):
    """Return the p-value of a chi-square test on the confusion table.

    Scores are turned into 0/1 predictions at ``threshold`` (inclusive), the
    predictions are cross-tabulated against ``y_true`` and the table is passed
    to ``scipy.stats.chi2_contingency``.

    NOTE(review): despite the name, this computes a chi-square independence
    test between actual and predicted classes; the Jeffreys test used in PD
    calibration is usually a Beta-posterior binomial test — confirm which one
    is intended before relying on this value.

    Args:
        y_true: Binary ground-truth labels.
        y_pred_proba: Array-like of predicted probabilities supporting
            vectorised comparison (NumPy array or pandas Series).
        threshold: Cut-off at or above which a score counts as positive.

    Returns:
        The p-value reported by ``chi2_contingency``.
    """
    predicted_class = (y_pred_proba >= threshold).astype(int)
    contingency = pd.crosstab(y_true, predicted_class)
    # Element 0 is the test statistic, element 1 the p-value.
    outcome = stats.chi2_contingency(contingency)
    return outcome[1]


# Scale-grade test (example: comparing predictions above/below 0.5)
def scale_test(y_true, y_pred_proba, threshold=0.5):
    """Return the spread between the mean scores of the two threshold groups.

    Predictions are split into those strictly above ``threshold`` and the
    rest; the mean prediction of each non-empty group is taken and the
    population standard deviation of those means is returned. When every
    prediction falls on the same side there is a single group and the result
    is 0.0.

    Args:
        y_true: Unused; kept for a signature parallel to the other tests.
        y_pred_proba: NumPy array of predicted probabilities.
        threshold: Cut-off separating the two groups.

    Returns:
        ``np.std`` of the per-group mean predictions.
    """
    is_above = y_pred_proba > threshold
    means_per_group = []
    for flag in np.unique(is_above):
        members = y_pred_proba[is_above == flag]
        means_per_group.append(np.mean(members))
    return np.std(means_per_group)


# Population Stability Index (PSI)
def calculate_psi(expected, actual, buckettype='bins', buckets=10):
    """Return the PSI between a reference and a current sample of scores.

    Both samples are histogrammed over ``buckets`` equal-width bins spanning
    [0, 1]; values outside that interval are not counted in any bin but still
    count towards the sample size. PSI is the sum over bins of
    ``(actual% - expected%) * ln(actual% / expected%)``.

    A bin that is empty in either sample would make the logarithm or the
    ratio undefined (inf/NaN), so a zero share is replaced by a small
    positive constant before the formula is applied. Shares that are already
    positive are left untouched, so results for fully populated bins are
    unchanged.

    Args:
        expected: Array-like of reference scores (e.g. development sample).
        actual: Array-like of scores to compare against the reference.
        buckettype: Accepted for backward compatibility; currently ignored,
            bins are always equal-width on [0, 1].
        buckets: Number of bins.

    Returns:
        The PSI value as a NumPy float.
    """
    # Stand-in share for empty bins; keeps log() and the ratio finite.
    empty_bucket_share = 1e-4

    breakpoints = np.linspace(0, 1, buckets + 1)

    expected_percents = np.histogram(expected, bins=breakpoints)[0] / len(expected)
    actual_percents = np.histogram(actual, bins=breakpoints)[0] / len(actual)

    expected_percents = np.where(
        expected_percents == 0, empty_bucket_share, expected_percents
    )
    actual_percents = np.where(
        actual_percents == 0, empty_bucket_share, actual_percents
    )

    return np.sum(
        (actual_percents - expected_percents)
        * np.log(actual_percents / expected_percents)
    )

# Load data

In [1]:
# Load the sample file; fields are separated by semicolons.
df = pd.read_csv('_T1_AMOSTRA_CI.csv', sep=';')

# Convert DTREF to datetimes so it can be compared against cut-off dates
# in the train/test split further down.
df['DTREF'] = pd.to_datetime(df['DTREF'])

# Name of the target column used by the modelling cells.
target = "DEF_12"

NameError: name 'pd' is not defined

In [None]:
# Show the first five rows as a quick sanity check of the load.
df.head(5)

# Model

In [None]:
# Columns kept for modelling: the reference date, the SCR_* variables, the two
# raw amounts used to build the ratio below, and the target.
cols1 = [
    "DTREF",
    "SCR_MARITAL_STATUS",
    "SCR_AGE",
    "SCR_LEVEL_STUDIES_AGE",
    "SCR_HABIT",
    "SCR_TYPE_CONTR",
    "SCR_PROFFESION",
    "SCR_OP_CRED_POS",
    "SCR_SDO3M_PREST_POS",
    "SCR_REC_PRZ",
    "SCR_ANT",
    "SCR_NUM_DEB",
    "SLDULTRESM",
    "VALPREST",
    "SCR_DIAS_SLD_DEV",
    target,
]

# Complete-case selection: any row with a missing value in these columns is dropped.
df1 = df.loc[:, cols1].dropna()

# Ratio of SLDULTRESM to VALPREST.
# NOTE(review): rows where VALPREST is 0 would yield inf/NaN here — confirm
# that cannot happen in the data.
df1['SaldoDO3M/Prest'] = df1["SLDULTRESM"].div(df1["VALPREST"])

# The raw amounts are only needed to build the ratio.
df1 = df1.drop(columns=["SLDULTRESM", "VALPREST"])

In [None]:
# Out-of-time split: rows dated up to 2021-12-31 are used to fit the model,
# later rows are held out to check how well it learned.
df_train = df1.loc[df1['DTREF'] <= '2021-12-31']
df_test = df1.loc[df1['DTREF'] > '2021-12-31']

# Separate predictors from the target; DTREF only serves the split above.
X_train, y_train = df_train.drop(columns=['DTREF', target]), df_train[target]
X_test, y_test = df_test.drop(columns=['DTREF', target]), df_test[target]

from imblearn.over_sampling import SMOTE

# Oversample the training partition only, leaving the test set untouched.
# A float sampling_strategy is the minority/majority ratio requested after
# resampling (0.5 -> one minority row for every two majority rows).
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Class counts before and after resampling.
print(f'Antes do SMOTE: {y_train.value_counts()}')
print(f'Depois do SMOTE: {y_train_smote.value_counts()}')

In [None]:
# Display the training features (the original, pre-SMOTE frame).
X_train