---
---
# Previsão do Nível de Satisfação dos Clientes do Santander

### _Santander Customer Satisfaction_
---
---

### Previsão com modelo XGBoosting // _Prediction with XGBoosting model_

In [1]:
# Imports
import joblib
import pickle
import numpy as np
import pandas as pd
import sklearn
#from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix
from sklearn.metrics import accuracy_score
from utils import *
import warnings
warnings.filterwarnings("ignore")
np.random.seed(31415)

In [2]:
# Versões dos pacotes usados neste jupyter notebook // Versions of packages used in this jupyter notebook
#!pip install -q -U watermark
%reload_ext watermark
%watermark -a "Tatiana Novaes Carvalho" --iversions

Author: Tatiana Novaes Carvalho

numpy     : 1.22.3
joblib    : 1.1.0
pandas    : 1.4.2
matplotlib: 3.5.1
sklearn   : 1.1.2
seaborn   : 0.11.2



In [3]:
# Ingestão de novos dados // New data ingestion
file = 'test'
df = pd.read_csv(f'../datasets/{file}.csv', index_col = 0)

# Leitura de scaler, pca e modelo
scaler = joblib.load(open('../models/scaler.pkl', 'rb'))
pca = joblib.load(open('../models/pca.pkl', 'rb'))
model = joblib.load(open('../models/best_model.pkl', 'rb'))

In [4]:
# Variáveis com valor zero
cols_zero = ['ind_var2_0', 'ind_var2', 'ind_var13_medio_0', 'ind_var13_medio', 'ind_var18_0', 'ind_var18', 'ind_var27_0',
             'ind_var28_0', 'ind_var28', 'ind_var27','ind_var41', 'ind_var46_0', 'ind_var46', 'num_var13_medio_0',
             'num_var13_medio', 'num_var18_0', 'num_var18', 'num_var27_0', 'num_var28_0', 'num_var28', 'num_var27',
             'num_var41', 'num_var46_0', 'num_var46', 'saldo_var13_medio', 'saldo_var18', 'saldo_var28', 'saldo_var27',
             'saldo_var41', 'saldo_var46', 'delta_imp_amort_var18_1y3','delta_imp_reemb_var33_1y3',
             'delta_num_reemb_var33_1y3', 'imp_amort_var18_hace3', 'imp_amort_var34_hace3', 'imp_reemb_var13_hace3',
             'imp_reemb_var33_hace3', 'imp_reemb_var33_ult1', 'imp_trasp_var17_out_hace3', 'imp_trasp_var33_out_hace3',
             'num_var2_0_ult1', 'num_var2_ult1', 'num_meses_var13_medio_ult3', 'num_reemb_var13_hace3',
             'num_reemb_var33_hace3', 'num_reemb_var33_ult1', 'num_trasp_var17_out_hace3', 'num_trasp_var33_out_hace3',
             'saldo_var2_ult1', 'saldo_medio_var13_medio_hace3','saldo_medio_var13_medio_ult1']

#Variáveis excluídas por multicolinearidade 
correlated_features = ['imp_op_var41_comer_ult1', 'imp_op_var41_comer_ult3', 'imp_op_var41_efect_ult1',
                       'imp_op_var41_efect_ult3', 'imp_op_var41_ult1', 'num_var31_0','num_op_var40_ult3',
                       'num_op_var41_hace2', 'num_op_var41_hace3', 'num_op_var41_ult3', 'num_op_var41_ult1', 
                       'num_op_var39_ult3', 'num_op_var39_ult1', 'num_op_var41_comer_ult1','num_op_var41_comer_ult3',
                       'num_var37_med_ult2', 'num_var37_0', 'saldo_var8', 'saldo_var24', 'saldo_var42', 'saldo_var12',
                       'saldo_var13_corto', 'saldo_var31', 'saldo_var17','saldo_var26', 'saldo_var33', 'saldo_var44',
                       'saldo_medio_var17_hace2', 'saldo_medio_var17_hace3', 'saldo_medio_var17_ult1',
                       'saldo_medio_var17_ult3', 'imp_venta_var44_ult1','num_var22_ult3', 'num_var45_hace2', 
                       'num_var45_ult3', 'num_op_var39_comer_ult3', 'num_op_var41_efect_ult3', 'num_op_var41_efect_ult1',
                       'num_op_var39_efect_ult3','saldo_medio_var8_ult3', 'saldo_medio_var12_ult1',
                       'saldo_medio_var12_ult3', 'saldo_medio_var13_corto_ult1', 'saldo_medio_var13_corto_ult3',
                       'saldo_medio_var13_largo_ult1','saldo_medio_var13_largo_ult3', 'saldo_medio_var33_ult3', 
                       'saldo_medio_var44_ult3']

# Variáveis excluídas devido à forma da distribuição 
not_gauss_feat = ['num_var43_emit_ult1', 'delta_imp_aport_var13_1y3', 'num_var37', 'num_trasp_var11_ult1', 'num_var35',
                  'num_var45_ult1', 'num_ent_var16_ult1', 'var3', 'num_var22_ult1', 'num_op_var39_efect_ult1',
                  'num_var43_recib_ult1', 'num_var22_hace3', 'num_var22_hace2']

# Variáveis excluídas devido à esparsidade
sparse_feats = ['ind_var1', 'ind_var6_0', 'ind_var6', 'ind_var14', 'ind_var17_0', 'ind_var17', 'ind_var19', 'ind_var20_0',
                'ind_var20', 'ind_var29_0', 'ind_var29', 'ind_var31_0', 'ind_var31', 'ind_var32_cte', 'ind_var32_0',
                'ind_var32', 'ind_var33_0', 'ind_var33', 'ind_var34_0', 'ind_var34', 'ind_var40', 'ind_var39', 
                'ind_var44_0', 'ind_var44', 'num_var1', 'num_var6_0', 'num_var6', 'num_var14', 'num_var17',
                'num_var20_0', 'num_var20', 'num_op_var40_hace3', 'num_var29_0', 'num_var29', 'num_var31',
                'num_var32_0', 'num_var32', 'num_var33_0', 'num_var33', 'num_var34_0', 'num_var34', 'num_var40',
                'num_var39', 'num_var44_0', 'num_var44', 'saldo_var6', 'saldo_var29','saldo_var34',
                'delta_imp_amort_var34_1y3', 'delta_imp_aport_var17_1y3', 'delta_imp_aport_var33_1y3',
                'delta_imp_reemb_var13_1y3', 'delta_imp_reemb_var17_1y3','delta_imp_trasp_var17_in_1y3',
                'delta_imp_trasp_var17_out_1y3', 'delta_imp_trasp_var33_in_1y3', 'delta_imp_trasp_var33_out_1y3',
                'delta_imp_venta_var44_1y3','delta_num_aport_var13_1y3', 'delta_num_aport_var17_1y3',
                'delta_num_aport_var33_1y3', 'delta_num_compra_var44_1y3', 'delta_num_reemb_var13_1y3',
                'delta_num_reemb_var17_1y3', 'delta_num_trasp_var17_in_1y3', 'delta_num_trasp_var17_out_1y3',
                'delta_num_trasp_var33_in_1y3', 'delta_num_trasp_var33_out_1y3','delta_num_venta_var44_1y3', 
                'imp_amort_var18_ult1', 'imp_amort_var34_ult1', 'imp_aport_var33_hace3', 'imp_aport_var33_ult1',
                'imp_var7_emit_ult1', 'imp_reemb_var17_hace3','imp_trasp_var17_in_hace3', 'imp_trasp_var17_in_ult1',
                'imp_trasp_var17_out_ult1', 'imp_trasp_var33_in_hace3', 'imp_trasp_var33_in_ult1',
                'imp_trasp_var33_out_ult1','imp_venta_var44_hace3', 'ind_var7_emit_ult1', 'ind_var7_recib_ult1',
                'num_aport_var13_ult1', 'num_aport_var17_hace3', 'num_aport_var17_ult1', 'num_aport_var33_hace3',
                'num_aport_var33_ult1', 'num_var7_emit_ult1', 'num_var7_recib_ult1', 'num_compra_var44_hace3',
                'num_compra_var44_ult1', 'num_meses_var13_largo_ult3', 'num_meses_var17_ult3','num_meses_var29_ult3',
                'num_meses_var33_ult3', 'num_meses_var44_ult3', 'num_op_var40_efect_ult1', 'num_op_var40_efect_ult3',
                'num_reemb_var13_ult1', 'num_reemb_var17_hace3','num_reemb_var17_ult1', 'num_sal_var16_ult1',
                'num_trasp_var17_in_hace3', 'num_trasp_var17_in_ult1', 'num_trasp_var17_out_ult1',
                'num_trasp_var33_in_hace3','num_trasp_var33_in_ult1', 'num_trasp_var33_out_ult1', 'num_venta_var44_hace3',
                'num_venta_var44_ult1', 'saldo_medio_var13_medio_hace2', 'saldo_medio_var13_medio_ult3',
                'saldo_medio_var29_hace2', 'saldo_medio_var29_hace3', 'saldo_medio_var29_ult1', 'saldo_medio_var29_ult3']

In [5]:
# Exclusão de features // Features removal
df2 = df.drop(columns = cols_zero + correlated_features + not_gauss_feat + sparse_feats, axis=1)

In [6]:
# Identificação das variáveis  // Variables identification

def identify_variables_type_new(df, n = 10):
    """
    Identifica variáveis categóricas e numéricas com base na quantidade de categorias de cada variável.
    Por padrão, variáveis com 10 ou menos categorias são consideradas categóricas.
    
    Identifies categorical and numerical features based on the quantity of categories.
    By default, features with less than or equal to 10 classes are categorical.
    """
    cat_features = []
    num_features = []
    
    for col in df.columns:
        if df[col].nunique() <= n:
            cat_features.append(col)
        else:
            num_features.append(col)
            
    return cat_features, num_features

def convert_variables_dtype_new(df, cat_features, num_features):
    """
    Converte o tipo de dados de variáveis categóricas e numéricas.
    Converts the data type of categorical and numerical variables.
    """
    for feat in cat_features:
        df[feat] = df[feat].astype('category')
        
    for feat in num_features:
        df[feat] = df[feat].astype('float64')
    
    return df

# Separação das variáveis categóricas e numéricas // Separation of categorial and numerical features
cat_features, num_features = identify_variables_type_new(df2)

# Conversão dos tipos de dados das variáveis // Conversion of variables data types
df2 = convert_variables_dtype_new(df2, cat_features, num_features)

In [7]:
# Padronização das variáveis // Standardization of the variables
col_std = [col for col in df2.columns]
df2.loc[:,col_std] = scaler.transform(df2.loc[:,col_std])

In [8]:
# Aplicação de PCA // Application of PCA
cols_pca = ['PCA'+str(i) for i in range(1,3)]
feat_pca = pca.transform(df2)
df_pca = pd.DataFrame(feat_pca, columns = cols_pca)
df3 = df_pca.merge(df2, left_index = True, right_index = True)

In [9]:
# Previsão com modelo XGBoosting // Prediction with XGBoosting model
# AUC Score 0.781494 e Acurácia 0.911355
prediction = model.predict(df3)

In [10]:
# Visualização das previsões // Predictions visualization

inv_scaler = scaler.inverse_transform(df2)
df_new = pd.DataFrame(inv_scaler, columns=df2.columns)

pred = pd.DataFrame(prediction)

df_new = df_new.merge(pred, left_index = True, right_index = True)
df_new = df_new.rename(columns={0: 'TARGET'})
df_new.loc[:, 'TARGET'] = df_new.loc[:,'TARGET'].map(lambda x: 'Satisfied' if x == 0 else 'Unsatisfied')
display(df_new.head(10))

Unnamed: 0,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,imp_op_var39_efect_ult1,...,saldo_medio_var13_largo_hace2,saldo_medio_var13_largo_hace3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,var38,TARGET
0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,2.220446e-16,0.0,0.0,...,0.0,0.0,0.0,1.110223e-16,0.0,0.0,0.0,0.0,40532.1,Satisfied
1,35.0,0.0,0.0,0.0,0.0,0.0,0.0,2.220446e-16,0.0,0.0,...,0.0,0.0,0.0,1.110223e-16,0.0,0.0,0.0,0.0,45486.72,Satisfied
2,23.0,0.0,0.0,0.0,0.0,0.0,0.0,2.220446e-16,0.0,60.0,...,0.0,0.0,0.0,1.110223e-16,0.0,0.0,0.0,0.0,46993.95,Satisfied
3,24.0,0.0,0.0,0.0,0.0,0.0,0.0,2.220446e-16,0.0,0.0,...,0.0,0.0,0.0,1.110223e-16,0.0,0.0,0.0,0.0,187898.61,Satisfied
4,23.0,0.0,0.0,0.0,0.0,0.0,0.0,2.220446e-16,0.0,0.0,...,0.0,0.0,0.0,1.110223e-16,0.0,0.0,0.0,0.0,73649.73,Satisfied
5,43.0,0.0,0.0,0.0,0.0,0.0,0.0,2.220446e-16,0.0,0.0,...,0.0,0.0,0.0,1.110223e-16,0.0,0.0,0.0,0.0,53250.87,Unsatisfied
6,39.0,495.0,2334.42,4815.42,0.0,0.0,0.0,2.220446e-16,0.0,1560.0,...,0.0,0.0,7077.51,1.110223e-16,7599.0,0.0,0.0,0.0,58316.64,Satisfied
7,29.0,0.0,0.0,0.0,0.0,0.0,0.0,2.220446e-16,0.0,0.0,...,0.0,0.0,0.0,1.110223e-16,0.0,0.0,0.0,0.0,46898.49,Satisfied
8,53.0,0.0,0.0,0.0,0.0,0.0,0.0,2.220446e-16,0.0,0.0,...,0.0,0.0,0.0,1.110223e-16,0.0,0.0,0.0,0.0,110356.98,Satisfied
9,37.0,0.0,0.0,0.0,0.0,0.0,0.0,2.220446e-16,0.0,0.0,...,0.0,0.0,0.0,1.110223e-16,0.0,0.0,0.0,0.0,41366.49,Satisfied
