# Lendo as bibliotecas usadas

In [None]:
import pandas as pd
import itertools
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import *
from sklearn.model_selection import GridSearchCV
import time
from scipy.stats import ks_2samp
from xgboost import plot_importance
import seaborn as sns
from scipy import stats
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, \
roc_curve, precision_recall_curve, auc, average_precision_score

import warnings

warnings.filterwarnings('ignore')

from datetime import datetime
from sklearn.inspection import plot_partial_dependence
import shap

# import scikitplot as skplt

import json
import os
import boto3
from bayes_opt import BayesianOptimization
import pickle

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

# Importando a base de dados

### função read_file
A função abaixo vai tentar ler os dados localmente da pasta data, mas caso o arquivo não exista,
será baixado o arquivo do s3 e salvo localmente na pasta data. 

In [None]:
# Load the modelling base into df_final. The commented-out line shows a local
# parquet source that was used previously.
# df_final = pd.read_parquet('../data/default_mod_nodoc_ocp3.parquet')

# FIXME(review): the right-hand side of this assignment is missing, so the cell
# does not parse as written; supply the loader call before running.
df_final = 

# Keep only cohorts ("safra") from 2019-05 through 2022-07, both ends included.
df_final = df_final[df_final['safra'] >= '2019-05']
df_final = df_final[df_final['safra'] <= '2022-07']

# Response variable.
# FIXME(review): target_ is an empty string here; set it to the name of the
# target column, since the cells below index frames with df[target_].
target_ = ''
# Drop rows whose target value is missing.
df_final = df_final[df_final[target_].notnull()]

In [None]:
df_final

## Removendo as Variaveis com nenhuma ou baixa relevancia pro modelo

In [None]:
# Columns judged to have little or no relevance to the model.
drop_col = []

# Rebind instead of dropping in place so the cell can be re-run safely:
# errors="ignore" makes a second execution a no-op instead of a KeyError.
df_final = df_final.drop(columns=drop_col, errors="ignore")

In [None]:
# Candidate model features: every column from position 15 onward.
# NOTE(review): the offset 15 presumably skips identifier/metadata columns —
# confirm against the actual column order, which depends on what drop_col removed.
train_features = df_final.columns.values[15:] 

In [None]:
print(len(train_features))
print(train_features)

# Variável Resposta

Nessa etapa, vamos visualizar a variável resposta em função do tempo. Além disso, vamos analisar também a influência do control_group x bad rate no tempo.

In [None]:
def plot_stability_target(df,bland):
    """Plot the per-cohort mean of the target column and save the figure.

    Parameters
    ----------
    df : DataFrame
        Must contain a "safra" column and the column named by the module-level
        ``target_``. Its "safra" column is converted to datetime IN PLACE, so
        the caller's frame keeps the datetime values afterwards.
    bland : str
        Label prepended to the plot title.

    Reads the module-level names ``target_`` and ``img_path``.
    """
    size = (16, 8)
    plt.figure(figsize=size)

    sns.set(font_scale=1.5)
    # In-place conversion; visible to the caller after this function returns.
    df["safra"] = pd.to_datetime(df["safra"])

    target_mean_by_safra = df.groupby("safra")[target_].mean()
    axis = target_mean_by_safra.plot(figsize=size, title=bland + '-target_fpd30')
    axis.set(ylim=(0, 0.3))

    plt.savefig(img_path + 'bad rate' + 'open')

plot_stability_target(df_final,'Open')

In [None]:
# Keep labelled rows only and cast the target to int. The explicit .copy()
# makes df_var_final an independent frame, so the column assignment below is
# not a write on a filtered selection (the resulting SettingWithCopyWarning
# would be hidden by the global warnings filter).
df_var_final = df_final[df_final[target_].notnull()].copy()
df_var_final[target_] = df_var_final[target_].astype(int)
# 70/30 random split with a fixed seed.
df_train, df_test = train_test_split(df_var_final,test_size=0.3, random_state=101)
print("numero de variáveis",len(train_features))

In [None]:
print(len(df_train))
print(len(df_test))

# Modelo: XGBoost

Nessa etapa, vamos criar nosso modelo usando o XGBoost. Para estimar os hiperparâmetros do modelo, vamos usar uma Otimização Bayesiana (`BayesianOptimization`), que explora intervalos de valores previamente informados para cada hiperparâmetro e busca a combinação que maximiza o AUC ROC médio na validação cruzada.

O treinamento do Modelo será feito usando Cross-Validation. 

In [None]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

In [None]:
# NOTE(review): this replaces the column-derived train_features assigned earlier
# with an empty list; fill in the final feature names, otherwise the frames
# selected with df[train_features] below have no columns.
train_features = []

In [None]:
# Design matrices and integer targets used by the hyper-parameter search.
X_train = df_train[train_features].copy()
Y_train = df_train[target_].astype(int).copy()

X_test = df_test[train_features].copy()
Y_test = df_test[target_].astype(int).copy()

# Search space: (lower, upper) bound per hyper-parameter.
pbounds = {'learning_rate': (0.01, 1.0),
    'n_estimators': (10,100),
    'max_depth': (3,30),
    'subsample': (0.1, 1.0),  # Change for big datasets
    # Forwarded to XGBoost as colsample_bytree, which must be strictly positive,
    # hence the lower bound of 0.1 rather than 0.
    'colsample': (0.1, 1.0),  # Change for datasets with lots of features
    'gamma': (0, 10),
    'min_child_weight': (10,100)}


def xgboost_hyper_param(learning_rate,
                        n_estimators,
                        max_depth,
                        subsample,
                        colsample,
                        gamma,
                        min_child_weight):
    """Objective for the Bayesian optimizer: mean 10-fold CV ROC AUC.

    Every argument is a float proposed by the optimizer inside ``pbounds``.
    ``max_depth`` and ``n_estimators`` are truncated to int before use;
    ``colsample`` is passed to XGBoost as ``colsample_bytree``.

    Reads the module-level ``X_train`` and ``Y_train``.

    Returns
    -------
    float
        Mean of the ``roc_auc`` cross-validation scores.
    """
    clf = XGBClassifier(
        max_depth=int(max_depth),
        learning_rate=learning_rate,
        n_estimators=int(n_estimators),
        # Both sampling parameters are part of the search space, so they must
        # reach the model; otherwise the optimizer tunes dimensions with no effect.
        subsample=subsample,
        colsample_bytree=colsample,
        gamma=gamma,
        min_child_weight=min_child_weight,
        eval_metric = 'auc')
    return np.mean(cross_val_score(clf, X_train, Y_train, cv=10, scoring='roc_auc'))

optimizer = BayesianOptimization(
        f=xgboost_hyper_param,
        pbounds=pbounds,
        random_state=1,)

# NOTE(review): newer bayes_opt releases configure the acquisition function
# differently from the acq= keyword — confirm against the installed version.
optimizer.maximize(n_iter=15, init_points=8, acq='ei')

# Best parameter set found, persisted for later cells/runs.
parameters = optimizer.max['params']

with open('./config/parameters.json','w') as p:
    json.dump(parameters,p)

In [None]:
with open('./config/parameters_optimizer/FL - parameters_fpd30.json','r') as p:
    parameters1 = json.load(p)
parameters1

In [None]:
# Print the loaded parameter dict itself, not the literal string "parameters1".
print(parameters1)

In [None]:
# Final classifier configuration; tuned values come from the loaded parameters1 dict.
model_kwargs = {
    'gamma': parameters1['gamma'],
    'learning_rate': parameters1['learning_rate'],
    'max_depth': int(parameters1['max_depth']),
    # Fixed at 120 instead of the tuned int(parameters1['min_child_weight']).
    'min_child_weight': 120,
    'n_estimators': int(parameters1['n_estimators']),
    'subsample': parameters1['subsample'],
    'eval_metric': 'auc',
}
xgb_model = xgb.XGBClassifier(**model_kwargs)

# Explanatory columns are cast to float; the response variable to int.
X_train = df_train[train_features].astype(float)
Y_train = df_train[target_].astype(int).copy()
xgb_model.fit(X_train, Y_train)
        

# Validação do Modelo 

A Validação do modelo será feita, calculando:


* ROC e KS na base de teste
* ROC por Safra
* Gráfico de Feature importance (métrica: SHAP)
* SHAP das variáveis
* Gráfico de dependências Parciais
* Gráficos para análise da distribuição da probabilidade


In [None]:
def get_ks(df,target,prob="Probability"):
    """Two-sample Kolmogorov-Smirnov test between the scores of the two classes.

    Parameters
    ----------
    df : DataFrame
        Must contain the ``target`` and ``prob`` columns.
    target : str
        Name of the binary (0/1) target column.
    prob : str, default "Probability"
        Name of the score column to compare across classes.

    Returns
    -------
    The result of ``scipy.stats.ks_2samp`` (statistic first, p-value second).
    """
    scores_class_0 = df.loc[df[target] == 0, prob]
    scores_class_1 = df.loc[df[target] == 1, prob]

    return ks_2samp(scores_class_0, scores_class_1)

def get_ks_safra(df,target_,prob_):
    """KS statistic between the two target classes for each cohort ("safra").

    NOTE: a later cell defines another ``get_ks_safra`` with a different
    signature; once that cell runs it replaces this definition.

    Parameters
    ----------
    df : DataFrame
        Must contain "safra", ``target_`` and ``prob_`` columns. Not modified.
    target_ : str
        Name of the binary (0/1) target column.
    prob_ : str
        Name of the score column.

    Returns
    -------
    DataFrame
        Columns "safra" (``YYYY-MM`` string, ascending) and "KS" (float
        statistic; NaN when a cohort holds only one of the two classes).
    """
    # Month labels are computed on a separate Series so the caller's frame is untouched.
    safra_month = pd.to_datetime(df['safra']).dt.strftime('%Y-%m')

    rows = []
    for s in sorted(safra_month.dropna().unique()):
        df_temp = df[safra_month == s]
        scores_class_0 = df_temp.loc[df_temp[target_] == 0, prob_]
        scores_class_1 = df_temp.loc[df_temp[target_] == 1, prob_]
        if len(scores_class_0) == 0 or len(scores_class_1) == 0:
            # ks_2samp cannot be computed on an empty sample.
            ks_value = float('nan')
        else:
            ks_value = ks_2samp(scores_class_0, scores_class_1)[0]
        rows.append((s, ks_value))

    return pd.DataFrame(rows, columns=['safra', 'KS'])


def evaluate_model(df_train,df_test,target,model,train_features):
    """Score both frames with ``model`` and plot train/validation ROC curves.

    Parameters
    ----------
    df_train, df_test : DataFrame
        Frames holding ``train_features`` and ``target``. A 'Probability'
        column with the positive-class score is written onto BOTH frames in
        place.
    target : str
        Name of the binary target column; also used in titles and file name.
    model : fitted classifier exposing ``predict_proba``.
    train_features : sequence of column names fed to the model.

    The figure is saved under the module-level ``img_path``.
    """
    # Scores are stored on the caller's frames themselves.
    df_train['Probability'] = model.predict_proba(df_train[train_features])[:,1]
    df_test['Probability'] = model.predict_proba(df_test[train_features])[:,1]

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize = (14, 6))

    # One (fpr, tpr, auc) triple per panel: train first, then validation.
    curves = []
    for frame in (df_train, df_test):
        fpr, tpr, _ = roc_curve(frame[target], frame['Probability'])
        curves.append((fpr, tpr, auc(fpr, tpr)))

    sns.set(font_scale=1.5)
    for ax, prefix, (fpr, tpr, roc_auc) in zip(axes, ('Train - ', 'Validation - '), curves):
        ax.plot(fpr, tpr, color='darkorange', label='AUC = {}'.format(round(roc_auc, 3)))
        ax.plot([0, 1], [0, 1], color='navy', linestyle='--')  # chance diagonal
        ax.legend(loc="lower right")
        ax.set(xlabel='False Positive Rate', ylabel='True Positive Rate')
        # Use the `target` argument rather than a module-level variable so the
        # labels always match the column that was actually evaluated.
        ax.set_title(prefix + target)

    fig.suptitle('Receiver Operating Characteristic (ROC) Curve')
    plt.savefig(img_path + 'auc roc' + 'open_' + target)

In [None]:
evaluate_model(df_train = df_train
               ,df_test = df_test
               ,target  =  target_
               ,model   = xgb_model
               ,train_features =  train_features)

In [None]:
print('Train')
print(get_ks(df_train,target_))
print('Test')
print(get_ks(df_test,target_))

In [None]:
def get_auc_safra(df_test,df_train,target):
    """Plot AUC ROC per cohort ("safra") over the union of test and train rows.

    Both frames must already carry 'safra', ``target`` and 'Probability'
    columns. The inputs are not modified; the figure is saved under the
    module-level ``img_path``.
    """
    # pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
    df_safra = pd.concat([df_test, df_train])
    df_safra['safra'] = pd.to_datetime(df_safra['safra']).dt.strftime('%Y-%m')
    figsize=(10,5)

    safra = df_safra.sort_values("safra").safra.unique()
    roc = []
    for s in safra:
        df_temp = df_safra[df_safra["safra"] == s]
        fpr, tpr, _ = roc_curve(df_temp[target], df_temp['Probability'])
        roc.append(auc(fpr, tpr))

    sns.set(font_scale=1.5)
    df_safra_roc = pd.DataFrame({"safra":safra,"auc_roc":roc})
    df_safra_roc['safra'] = pd.to_datetime(df_safra_roc['safra'])
    df_safra_roc = df_safra_roc.set_index('safra')
    ax = df_safra_roc.plot(figsize = figsize)
    ax.set(ylim=(0.4, 1))
    ax.set_title('AUC ROC ' + 'fpd30')
    plt.savefig(img_path + 'roc_safra' +'open')

In [None]:
get_auc_safra(df_test,df_train,target_)

In [None]:
def get_ks_safra(df_test,df_train,target):
    """Plot the KS statistic per cohort ("safra") over test and train rows combined.

    Both frames must already carry 'safra', ``target`` and 'Probability'
    columns. The inputs are not modified.
    """
    # pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
    df_safra = pd.concat([df_test, df_train])
    df_safra['safra'] = pd.to_datetime(df_safra['safra']).dt.strftime('%Y-%m')

    ks = []
    dt = []
    for s in df_safra.sort_values("safra").safra.unique():
        df_temp = df_safra[df_safra["safra"] == s]
        # Use the `target` argument (not a module-level variable); element 0
        # of the ks_2samp result is the KS statistic.
        ks.append(get_ks(df = df_temp, target = target)[0])
        dt.append(s)

    df_ks = pd.DataFrame({'safra':dt,
                          'KS':ks}).set_index('safra')

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize = (14, 6))
    ax = df_ks.plot(ax = ax,title = 'KS por Safra')
    ax.set(ylim=(0.25,1))
    plt.show()

In [None]:
get_ks_safra(df_test = df_test, df_train = df_train, target= target_)

### Distribuição da prob

In [None]:
# Score distribution: density (left) and box plot (right) of "Probability",
# split by the "sacam_tudo" column.
# NOTE(review): in the cells shown here df_temp is only assigned inside
# functions, so on a fresh kernel this cell presumably fails with NameError —
# confirm which frame (and which hue column) was intended.
fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=False)
sns.set(font_scale=1) 
ay = sns.kdeplot(ax=axes[0],data=df_temp, x="Probability", common_norm=False,hue='sacam_tudo',fill=True,color= ["blue","red"])
ax = sns.boxplot(ax=axes[1],x='sacam_tudo', y="Probability", data=df_temp)

### Feature Importance das variáveis

In [None]:
# SHAP summary plot computed on the training features.
plt.figure(figsize=(5, 3))
xgb_model_shap = xgb_model  # alias reused by the SHAP cells
shap_train_matrix = df_train[train_features]
shap_values = shap.TreeExplainer(xgb_model_shap).shap_values(shap_train_matrix)
shap.summary_plot(
    shap_values,
    shap_train_matrix.reset_index().drop(columns="index"),
    show=False,  # keep the figure open so the title and savefig below apply to it
)
plt.title('Impact Positive or Negative on Probability Model')
plt.savefig(img_path + 'shap_impact_' + target_)

In [None]:
df_test['Open_Finance_Score']

In [None]:
# SHAP values on the test set, shown as a bar-type summary plot.
shap_test_matrix = df_test[train_features]
shap_values = shap.TreeExplainer(xgb_model_shap).shap_values(shap_test_matrix)
shap.summary_plot(
    shap_values,
    shap_test_matrix,
    plot_type="bar",
    title="Feature Importance",
    show=False,  # keep the figure open so the title and savefig below apply to it
)
plt.title('Feature Importance - ' + str(target_))
plt.savefig(img_path + 'shap_feature_importance_' + target_)

In [None]:
# Mean absolute SHAP value per feature, ranked from most to least important.
# Building the frame from a dict keeps shap_importance numeric; a transposed
# list-of-lists would yield object-dtype columns.
shap_sum = np.abs(shap_values).mean(axis=0)
importance_df = pd.DataFrame({'column_name': list(train_features),
                              'shap_importance': shap_sum})
importance_df = importance_df.sort_values('shap_importance', ascending=False)

In [None]:
# Top 21 features by SHAP importance, split into three contiguous groups of
# seven for the partial-dependence plots. The slice bounds are adjacent
# (0-7, 7-14, 14-21) so that no rank is skipped between groups.
ranked_columns = importance_df.sort_values("shap_importance",ascending = False)["column_name"]
features1 = ranked_columns.iloc[0:7].values
features2 = ranked_columns.iloc[7:14].values
features3 = ranked_columns.iloc[14:21].values

In [None]:
importance_df[importance_df.shap_importance <= 0.0]["column_name"].values

In [None]:
list(importance_df['column_name'])

In [None]:
plot_vars = importance_df.sort_values("shap_importance",ascending = False)

### Dependências Parciais das Variáveis

In [None]:
# Partial-dependence plots for the first feature group (features1).
# Test rows with missing values in these two columns are excluded first.
df_dep = df_test[df_test["scrcrdpnm6mmlv3"].notnull()]
df_dep = df_dep[df_dep["trend_ploan_due_3m"].notnull()]
sns.set(font_scale=1.5) 
sns.set(rc={'figure.figsize':(15,8)})
# NOTE(review): plot_partial_dependence was removed from newer scikit-learn
# releases in favour of PartialDependenceDisplay.from_estimator — confirm the
# pinned scikit-learn version before upgrading.
plot_partial_dependence(xgb_model,
                        features=features1, 
                        X=df_dep[train_features], 
                        feature_names=train_features)
plt.savefig(img_path + 'partial_dependence1' +target_)

In [None]:
# Partial-dependence plots for the second feature group (features2).
# Relies on df_dep having been built in an earlier cell.
sns.set(rc={'figure.figsize':(15,8)})
plot_partial_dependence(xgb_model,
                        features=features2, 
                        X=df_dep[train_features], 
                        feature_names=train_features)
plt.savefig(img_path + 'partial_dependence2' +target_)

In [None]:
# Partial-dependence plots for the third feature group (features3), on a taller figure.
# Relies on df_dep having been built in an earlier cell.
sns.set(rc={'figure.figsize':(15,12)})
plot_partial_dependence(xgb_model,
                        features=features3, 
                        X=df_dep[train_features], 
                        feature_names=train_features)
plt.savefig(img_path + 'partial_dependence3' +target_)

# Análise por Brand

In [None]:
def evaluate_model_brand(df_train,df_test,target,model,train_features,brand,color):
    """Score the given frames and plot train/validation ROC curves for one brand.

    Parameters
    ----------
    df_train, df_test : DataFrame
        Frames (already filtered by the caller) holding ``train_features`` and
        ``target``. A 'Probability' column is written onto both.
    target : str
        Name of the binary target column.
    model : fitted classifier exposing ``predict_proba``.
    train_features : sequence of column names fed to the model.
    brand : str
        Used in the figure title and in the saved file name.
    color : str
        Matplotlib colour for both the ROC curve and the diagonal.

    The figure is saved under the module-level ``img_path``.
    """
    for frame in (df_train, df_test):
        frame['Probability'] = model.predict_proba(frame[train_features])[:,1]

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize = (12, 5))

    # (fpr, tpr, auc) for train, then for validation.
    roc_data = []
    for frame in (df_train, df_test):
        false_pos, true_pos, _ = roc_curve(frame[target], frame['Probability'])
        roc_data.append((false_pos, true_pos, auc(false_pos, true_pos)))

    sns.set(font_scale=1)
    for panel, panel_title, (false_pos, true_pos, area) in zip(axes, ('Train', 'Validation'), roc_data):
        panel.plot(false_pos, true_pos, color=color, label='AUC = {}'.format(round(area, 3)))
        panel.plot([0, 1], [0, 1], color=color, linestyle='--')
        panel.legend(loc="lower right")
        panel.set(xlabel='False Positive Rate', ylabel='True Positive Rate')
        panel.set_title(panel_title)

    fig.suptitle(brand + '- ROC')
    plt.savefig(img_path + 'auc roc' + brand)

In [None]:
evaluate_model_brand(df_train = df_train[df_train['brand'] == 'GERU']
               ,df_test = df_test[df_test['brand'] == 'GERU']
               ,target  =  target_
               ,model   = xgb_model
               ,train_features =  train_features
               ,brand = 'GERU'
               ,color = 'darkblue')

In [None]:
evaluate_model_brand(df_train = df_train[df_train['brand'] == 'REBEL']
               ,df_test = df_test[df_test['brand'] == 'REBEL']
               ,target  =  target_
               ,model   = xgb_model
               ,train_features =  train_features
               ,brand = 'REBEL'
               ,color = 'darkgreen')

In [None]:
def _auc_by_safra(df_brand, target):
    """Return one AUC ROC value per cohort, indexed by cohort date in ascending order."""
    safras = sorted(df_brand['safra'].dropna().unique())
    values = []
    for s in safras:
        df_temp = df_brand[df_brand['safra'] == s]
        fpr, tpr, _ = roc_curve(df_temp[target], df_temp['Probability'])
        values.append(auc(fpr, tpr))
    out = pd.DataFrame({"safra": safras, "auc_roc": values})
    out['safra'] = pd.to_datetime(out['safra'])
    return out.set_index('safra')


def get_auc_safra_brand(df_test,df_train,target):
    """Plot AUC ROC per cohort ("safra") for the GERU and REBEL brands side by side.

    Both frames must already carry 'safra', 'brand', ``target`` and
    'Probability' columns. The inputs are not modified; the figure is saved
    under the module-level ``img_path``.
    """
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize = (12, 7))

    # pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
    df_safra = pd.concat([df_test, df_train])
    df_safra['safra'] = pd.to_datetime(df_safra['safra']).dt.strftime('%Y-%m')
    figsize=(15,8)

    # Cohorts are sorted inside the helper so each line is drawn in
    # chronological order rather than in row-appearance order.
    panels = [('GERU', 'Geru - AUC ROC'), ('REBEL', 'Rebel - AUC ROC')]
    series = [_auc_by_safra(df_safra[df_safra['brand'] == brand], target)
              for brand, _ in panels]

    sns.set(font_scale=1.5)
    for ax, (_, title), brand_auc in zip(axes, panels, series):
        ax = brand_auc.plot(ax = ax,figsize = figsize)
        ax.set(ylim=(0.4, 1))
        ax.set_title(title)

    plt.savefig(img_path + 'roc_safra_brand' +'open')

In [None]:
get_auc_safra_brand(df_test,df_train,target_)

In [None]:
def box_dist_(df,brand):
    """Plot the score distribution by target class for one brand and save it.

    Parameters
    ----------
    df : DataFrame
        Rows to plot; must contain 'Probability' and the column named by the
        module-level ``target_``.
    brand : str
        Used as the figure title and in the saved file name.

    Left panel: density of 'Probability' per target class. Right panel: box
    plot of 'Probability' per target class. The figure is saved under the
    module-level ``img_path``.
    """
    sns.reset_orig()
    fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=False)
    sns.set(font_scale=1) 
    ay = sns.kdeplot(ax=axes[0],data=df, x="Probability", common_norm=False,hue=target_,fill=True,color= ["blue","red"])
    # Both panels must use the frame passed in; reading a module-level frame
    # here would show all brands in the box plot regardless of `df`.
    ax = sns.boxplot(ax=axes[1],x=target_, y="Probability", data=df)
    fig.suptitle(brand)
    plt.savefig(img_path + 'dist_brand'+brand +target_)

In [None]:
box_dist_(df_test[df_test['brand'] == 'GERU'],'geru')

In [None]:
box_dist_(df_test[df_test['brand'] == 'REBEL'],'rebel')

# Criando Rating

No final do processo, vamos criar um rating para categorizar a probabilidade do modelo de tal forma que seja ordenável. Para isso, vamos usar um método de clusterização da probabilidade (discretização por k-means) que vai dividir a base em grupos.

Após esse processo, vamos criar um gráfico que mostra a estabilidade dos ratings em função do tempo.

In [None]:
from sklearn.preprocessing import KBinsDiscretizer
from feature_engine.discretisation import EqualWidthDiscretiser
from feature_engine.discretisation import DecisionTreeDiscretiser
from utils.ratings_funcoes_de_viz.ratings_funcoes_de_viz import statistic_anl_categ

In [None]:
def get_bins(df,bin_,prob="Open_Finance_Score",stat="median"):
    """Cluster a score column into k-means bins and return one statistic per bin.

    Parameters
    ----------
    df : DataFrame
        Must contain the ``prob`` column.
    bin_ : int
        Number of bins requested from ``KBinsDiscretizer``.
    prob : str, default "Open_Finance_Score"
        Name of the score column to discretise.
    stat : {"size", "min", "mean", "median", "max"}, default "median"
        Which per-bin summary to return.

    Returns
    -------
    Series
        The requested statistic of ``prob`` for each bin, positioned by
        ascending bin id.
    """
    probs = df[[prob]]

    discretizer = KBinsDiscretizer(n_bins=bin_, encode='ordinal', strategy='kmeans')
    bin_ids = discretizer.fit_transform(probs)

    # Pair each score with its bin id positionally (fresh 0..n-1 index).
    scored = probs.reset_index(drop=True)
    scored['bin'] = bin_ids[:, 0]

    df_corte = scored.groupby('bin', as_index = False).agg({prob:['size','min', 'mean', 'median', 'max']})
    return df_corte[prob][stat]

In [None]:
df_rating_test = df_test[df_test['safra'] <= '2021-07']

In [None]:
df_rating_train = df_train[df_train['safra'] <= '2021-07']

In [None]:
# Stack test and train rows; pd.concat replaces DataFrame.append, which no
# longer exists in pandas >= 2.0.
df_full = pd.concat([df_rating_test, df_rating_train])

In [None]:
bin_ = get_bins(df_full,4,stat = "median")
print(bin_)
bins_   = [0,0.65,1]
print(bins_)

In [None]:
bin_[3] = 0.65

In [None]:
lower_limit = [0,bin_[3]]
upper_limit = [bin_[3],1]
rating = {'rating':['1','2'],
          'lower_limit':lower_limit,
          'upper_limit':upper_limit}

rating = pd.DataFrame(rating)

In [None]:
# Render the rating limits as a standalone table image.
fig, ax = plt.subplots()
sns.set(font_scale=1.2)
# Hide the axes so only the table is drawn.
fig.patch.set_visible(True)
ax.axis('off')
ax.axis('tight')

ax.table(cellText=rating.values, colLabels=rating.columns, loc='center')

fig.tight_layout()
plt.savefig(img_path + 'rating_table_' +target_)
plt.show()

In [None]:
df_rating_train = df_train[df_train['safra'] >= '2021-01']

In [None]:
df_rating_test = df_test[df_test['safra'] >= '2021-01']

## Aplicando o Rating na base Inteira (treino + test)
## mensal

In [None]:
sns.set(rc={'figure.figsize':(12,7)})
sns.set(font_scale=2) 
# Stack test and train rows; pd.concat replaces DataFrame.append, which no
# longer exists in pandas >= 2.0.
df_full = pd.concat([df_rating_test, df_rating_train])
# Map the score onto the rating labels using the cut points in bins_.
# include_lowest=True keeps a score equal to the first edge inside the first
# bin instead of turning it into NaN.
# NOTE(review): this reads a "score" column, while other cells use
# 'Probability' / 'Open_Finance_Score' — confirm the intended column.
df_full['rating'] = pd.cut(df_full["score"] , bins=bins_,labels=['Approved','Declined'], include_lowest=True)
statistic_anl_categ(df_full, target_, 'rating','')
plt.savefig(img_path + 'rating_all_df' +target_)

In [None]:
df_full[df_full['safra'] == '2022-06-01']['fpd30'].describe()