In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from datasets import load_dataset

In [15]:
#1. Carregar dados
dataset = load_dataset('thzliaa/pp_inadimplencia')
df = dataset['train'].to_pandas()

In [16]:
df.columns

Index(['data_base', 'uf', 'tcb', 'sr', 'cliente', 'ocupacao', 'porte',
       'modalidade', 'origem', 'indexador', 'vencido_acima_de_15_dias',
       'carteira_ativa', 'carteira_inadimplida_arrastada',
       'ativo_problematico', 'mes', 'mes_texto', 'ano', 'regiao',
       'inadimplente'],
      dtype='object')

In [17]:
# 2. Features e Target
X = df[['uf', 'tcb', 'sr', 'ocupacao', 'porte', 'modalidade', 'origem', 
        'indexador', 'carteira_ativa', ]]
y = df['inadimplente']  # 0 ou 1

# 3. Definir quais colunas são categóricas e numéricas
colunas_categoricas = ['uf', 'tcb', 'sr', 'ocupacao', 'porte', 'modalidade', 'origem', 
        'indexador']
colunas_numericas = ['carteira_ativa']

In [18]:
# 4. Pré-processamento
preprocessador = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), colunas_categoricas),
    ('num', StandardScaler(), colunas_numericas)
])

In [19]:
# 6. Separação treino/teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [20]:
# 7. Random Forest Pipeline
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessador),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [21]:
# 8. Regressão Logística Pipeline
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessador),
    ('classifier', LogisticRegression(max_iter=1000))
])

In [22]:
# 9. Treinamento
rf_pipeline.fit(X_train, y_train)
lr_pipeline.fit(X_train, y_train)

In [23]:
# 10. Avaliação
for name, model in [('Random Forest', rf_pipeline), ('Regressão Logística', lr_pipeline)]:
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:,1]
    print(f"\n{name}")
    print(classification_report(y_test, y_pred))
    print("ROC AUC:", roc_auc_score(y_test, y_proba))


Random Forest
              precision    recall  f1-score   support

           0       0.99      0.99      0.99    144577
           1       1.00      1.00      1.00    361500

    accuracy                           0.99    506077
   macro avg       0.99      0.99      0.99    506077
weighted avg       0.99      0.99      0.99    506077

ROC AUC: 0.9982699574839256

Regressão Logística
              precision    recall  f1-score   support

           0       0.73      0.55      0.63    144577
           1       0.84      0.92      0.87    361500

    accuracy                           0.81    506077
   macro avg       0.78      0.73      0.75    506077
weighted avg       0.80      0.81      0.80    506077

ROC AUC: 0.8827548783009099


In [25]:
# 11. Importância das features - Random Forest
importances = rf_pipeline.named_steps['classifier'].feature_importances_
feature_names = (rf_pipeline.named_steps['preprocessor']
                                .transformers_[0][2] + 
                 list(rf_pipeline.named_steps['preprocessor']
                                .transformers_[1][1]
                                .get_feature_names_out(colunas_categoricas)))

feat_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False)

plt.figure(figsize=(10,6))
sns.barplot(x=feat_imp.values[:10], y=feat_imp.index[:10])
plt.title('Top 10 Importância - Random Forest')
plt.show()

ValueError: input_features is not equal to feature_names_in_

In [None]:
# 12. Importância das features - Regressão Logística
coefficients = lr_pipeline.named_steps['classifier'].coef_[0]
feat_imp_lr = pd.Series(coefficients, index=feature_names).sort_values(key=abs, ascending=False)

plt.figure(figsize=(10,6))
sns.barplot(x=feat_imp_lr.values[:10], y=feat_imp_lr.index[:10])
plt.title('Top 10 Coeficientes - Regressão Logística')
plt.show()

In [None]:
# 8. Salvar pipeline inteiro
# with open('modelo_pipeline.pkl', 'wb') as f:
#     pickle.dump(pipeline, f)