Libraries used

In [37]:
import os
import pickle

import pandas as pd
import numpy as np

from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

Train Data Overview

In [38]:
path_file = "Datasets/treino.csv"

train_df = pd.read_csv(path_file)

In [39]:
print('Total Rows: {}\nTotal Columns: {}'.format(train_df.shape[0], train_df.shape[1]))
print('Columns: ', train_df.columns[0], train_df.columns[1], train_df.columns[2], train_df.columns[3])
print('Size of Train file in GB \'{}\' '.format(os.path.getsize(path_file)/10**9))

Total Rows: 110000
Total Columns: 11
Columns:  inadimplente util_linhas_inseguras idade vezes_passou_de_30_59_dias
Size of Train file in GB '0.005595578' 


In [40]:
train_df.head()

Unnamed: 0,inadimplente,util_linhas_inseguras,idade,vezes_passou_de_30_59_dias,razao_debito,salario_mensal,numero_linhas_crdto_aberto,numero_vezes_passou_90_dias,numero_emprestimos_imobiliarios,numero_de_vezes_que_passou_60_89_dias,numero_de_dependentes
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


Performance metric used

&nbsp; &nbsp; 1. F1 Score: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html

&nbsp; &nbsp; 2. Precision: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html

&nbsp; &nbsp; 3. Recall: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html

&nbsp; &nbsp; 4. Accuracy: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html

&nbsp; &nbsp; 5. AUC: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html

Exploratory Data Analysis

In [41]:
print(train_df.shape)
print(train_df.columns)

(110000, 11)
Index(['inadimplente', 'util_linhas_inseguras', 'idade',
       'vezes_passou_de_30_59_dias', 'razao_debito', 'salario_mensal',
       'numero_linhas_crdto_aberto', 'numero_vezes_passou_90_dias',
       'numero_emprestimos_imobiliarios',
       'numero_de_vezes_que_passou_60_89_dias', 'numero_de_dependentes'],
      dtype='object')


In [42]:
data_copy = train_df.copy()
duplicates = data_copy.duplicated(train_df.columns)
       
print('Total Duplicates:',duplicates.sum())
print('-'*50,'Removing Duplicates','-'*50)

data_copy = data_copy[~duplicates]

print('Shape of Data Frame Now',data_copy.shape)

Total Duplicates: 363
-------------------------------------------------- Removing Duplicates --------------------------------------------------
Shape of Data Frame Now (109637, 11)


In [57]:
train_df = data_copy

Analyzing class distribution

In [46]:
train_df.inadimplente.value_counts()

0    102319
1      7318
Name: inadimplente, dtype: int64

Observação: A accuracy deve ser usada em datasets com a mesma proporção de exemplos para cada classe, e quando as penalidades de acerto e erro para cada classe forem as mesmas. Em problemas com classes desproporcionais, ela causa uma falsa impressão de bom desempenho. Por exemplo, no nosso dataset 93,32% dos exemplos pertençam a classe 0 (não inadiplente), só de classificar todos os exemplos naquela classe já se atinge uma precisão de mais 93%, mesmo que todos os exemplos da outra classe estejam classificados incorretamente. Dessa forma, vamos escolher a métrica AUC (Area Under the ROC Curve). Esta é uma métrica interessante para tarefas com classes desproporcionais. Nela, mede-se a área sob uma curva formada pelo gráfico entre a taxa de exemplos positivos, que realmente são positivos, e a taxa de falsos positivos.

Analyzing salario_mensal column 

In [47]:
print("Describing salario_medio column")
print(train_df.salario_mensal.describe())

Describing salario_medio column
count    8.817700e+04
mean     6.641456e+03
std      1.338760e+04
min      0.000000e+00
25%      3.400000e+03
50%      5.400000e+03
75%      8.233000e+03
max      3.008750e+06
Name: salario_mensal, dtype: float64


Personas

In [48]:
print("Defaulter´s persona")
print("Average salario_mensal: %.2f" % train_df[train_df.inadimplente == 1].salario_mensal.mean())
print("Average idade: %.2f" % train_df[train_df.inadimplente == 1].idade.mean())
print("Average numero_emprestimos_imobiliarios: %.2f" % train_df[train_df.inadimplente == 1].numero_emprestimos_imobiliarios.mean())
print("Average numero_vezes_passou_90_dias: %.2f" % train_df[train_df.inadimplente == 1].numero_vezes_passou_90_dias.mean())
print("Average util_linhas_inseguras: %.2f" % train_df[train_df.inadimplente == 1].util_linhas_inseguras.mean())
print("Average vezes_passou_de_30_59_dias: %.2f" % train_df[train_df.inadimplente == 1].vezes_passou_de_30_59_dias.mean())
print("Average razao_debito: %.2f" % train_df[train_df.inadimplente == 1].razao_debito.mean())
print("Average numero_linhas_crdto_aberto: %.2f" % train_df[train_df.inadimplente == 1].numero_linhas_crdto_aberto.mean())
print("Average numero_de_vezes_que_passou_60_89_dias: %.2f" % train_df[train_df.inadimplente == 1].numero_de_vezes_que_passou_60_89_dias.mean())


Defaulter´s persona
Average salario_mensal: 5642.13
Average idade: 45.96
Average numero_emprestimos_imobiliarios: 0.99
Average numero_vezes_passou_90_dias: 2.00
Average util_linhas_inseguras: 2.85
Average vezes_passou_de_30_59_dias: 2.30
Average razao_debito: 304.65
Average numero_linhas_crdto_aberto: 7.90
Average numero_de_vezes_que_passou_60_89_dias: 1.74


In [49]:
print("Non defaulter´s persona")
print("Average salario_mensal: %.2f" % train_df[train_df.inadimplente == 0].salario_mensal.mean())
print("Average idade: %.2f" % train_df[train_df.inadimplente == 0].idade.mean())
print("Average numero_emprestimos_imobiliarios: %.2f" % train_df[train_df.inadimplente == 0].numero_emprestimos_imobiliarios.mean())
print("Average numero_vezes_passou_90_dias: %.2f" % train_df[train_df.inadimplente == 0].numero_vezes_passou_90_dias.mean())
print("Average util_linhas_inseguras: %.2f" % train_df[train_df.inadimplente == 0].util_linhas_inseguras.mean())
print("Average vezes_passou_de_30_59_dias: %.2f" % train_df[train_df.inadimplente == 0].vezes_passou_de_30_59_dias.mean())
print("Average razao_debito: %.2f" % train_df[train_df.inadimplente == 0].razao_debito.mean())
print("Average numero_linhas_crdto_aberto: %.2f" % train_df[train_df.inadimplente == 0].numero_linhas_crdto_aberto.mean())
print("Average numero_de_vezes_que_passou_60_89_dias: %.2f" % train_df[train_df.inadimplente == 0].numero_de_vezes_que_passou_60_89_dias.mean())


Non defaulter´s persona
Average salario_mensal: 6715.49
Average idade: 52.72
Average numero_emprestimos_imobiliarios: 1.03
Average numero_vezes_passou_90_dias: 0.12
Average util_linhas_inseguras: 6.17
Average vezes_passou_de_30_59_dias: 0.26
Average razao_debito: 359.67
Average numero_linhas_crdto_aberto: 8.51
Average numero_de_vezes_que_passou_60_89_dias: 0.11


Pre-Processing

In [68]:
X = train_df[['util_linhas_inseguras', 'idade', 'vezes_passou_de_30_59_dias', 'razao_debito', 'salario_mensal', 'numero_linhas_crdto_aberto', 'numero_vezes_passou_90_dias', 'numero_emprestimos_imobiliarios', 'numero_de_vezes_que_passou_60_89_dias', 'numero_de_dependentes']]
y = train_df["inadimplente"]

In [69]:
imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)

X = imp.fit_transform(X)

X.shape

(109637, 10)

In [70]:
scaler = StandardScaler()

X = scaler.fit_transform(X)

X.shape

(109637, 10)

Model

In [71]:
class Metrics():
    def get_results(self, y_true, y_pred):
        f1 = f1_score(y_true, y_pred, zero_division=1)
        precision = precision_score(y_true, y_pred, zero_division=1)
        recall = recall_score(y_true, y_pred, zero_division=1)
        accuracy = accuracy_score(y_true, y_pred)
        auc = roc_auc_score(y_true, y_pred)

        return f1, precision, recall, accuracy, auc


In [72]:
kf = StratifiedKFold(n_splits=10)

metrics = Metrics()
n_fold = 1
best_result = {"F1": 0, "Precision": 0, "Recall": 0, "Accuracy": 0, "AUC": 0}

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    f1, precision, recall, accuracy, auc = metrics.get_results(y_test, y_pred)

    print(f"Score at fold {n_fold}: F1-Score = {f1} Precision = {precision} Recall = {recall} Accuracy = {accuracy} AUC = {auc}")

    if best_result["AUC"] < auc:
        best_result = {"F1": f1, "Precision": precision, "Recall": recall, "Accuracy": accuracy, "AUC": auc}

        pickle.dump(model, open("model.sav", 'wb'))

    n_fold += 1

print("Best Results")
print(best_result)

Score at fold 1: F1-Score = 0.27208121827411175 Precision = 0.5296442687747036 Recall = 0.1830601092896175 Accuracy = 0.9346041590660343 AUC = 0.5857149647308134
Score at fold 2: F1-Score = 0.2814667988107037 Precision = 0.5126353790613718 Recall = 0.19398907103825136 Accuracy = 0.9338744983582634 AUC = 0.590397584776358
Score at fold 3: F1-Score = 0.274390243902439 Precision = 0.5357142857142857 Recall = 0.18442622950819673 Accuracy = 0.9348777818314484 AUC = 0.5864957574436996
Score at fold 4: F1-Score = 0.2702149437052201 Precision = 0.5387755102040817 Recall = 0.18032786885245902 Accuracy = 0.9349689894199197 AUC = 0.5846420423230239
Score at fold 5: F1-Score = 0.2900919305413687 Precision = 0.5748987854251012 Recall = 0.19398907103825136 Accuracy = 0.9366107260124042 AUC = 0.5918635738303064
Score at fold 6: F1-Score = 0.3023023023023023 Precision = 0.5655430711610487 Recall = 0.2062841530054645 Accuracy = 0.9364283108354615 AUC = 0.5974735854941318
Score at fold 7: F1-Score = 0.2

Performing predictions on the test set

In [74]:
loaded_model = pickle.load(open("model.sav", 'rb'))

test_df = pd.read_csv("Datasets/teste.csv")

imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)

X_test = imp.fit_transform(test_df)

scaler = StandardScaler()

X_test = scaler.fit_transform(X_test)

test_df["inadimplente"] = loaded_model.predict(X_test)

test_df.to_csv("Datasets/teste_pred.csv", index=0)