# Projeto Introdução a Redes Neurais

**Equipe:**
- Eduardo Luiz Silva - <els6@cin.ufpe.br>
- Ianní Muliterno - <iwmb1@de.ufpe.br>
- Tu Chin Hung - <tch1@de.ufpe.br>
- Wellington Barbosa de Almeida - <wba@cin.ufpe.br>

**Professor:** Germano Crispim Vasconcelos - <gcv@cin.ufpe.br>

**Objetivo:**
> Realizar um estudo experimental sobre a aplicação de modelos de redes neurais em um problema do mundo real.

**Problema**
> Análise de Risco de Crédito, Com base no perfil de clientes, decidir a quem conceder crédito (risco de inadimplência) 

## Etapa 1 - Divisão da base de dados:

## Base de dados

[Análise de Risco de Crédito](http://www.cin.ufpe.br/~gcv/web_lci/TRN) (+-400K Registros para Treinamento e +-130K Registros para Teste). Tamanho: 731 MB 

Importando os modulos básicos

In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import csv
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import roc_auc_score, average_precision_score

import matplotlib
matplotlib.use('nbagg')
import matplotlib.pyplot as plt

Antes de seguir para o próximo passo, lembre de realizar o download da base de dados na pasta `/data` e executar o script python para converte-la para `csv`

#### Convertendo em um [dataframe pandas](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html)

In [2]:
data_set = pd.read_csv("data/trn.csv", sep=',', header=0, low_memory=False)

In [3]:
data_set.head()

Unnamed: 0,INDEX,UF_1,UF_2,UF_3,UF_4,UF_5,UF_6,UF_7,IDADE,SEXO_1,...,CEP4_7,CEP4_8,CEP4_9,CEP4_10,CEP4_11,CEP4_12,CEP4_13,CEP4_14,IND_BOM_1_1,IND_BOM_1_2
0,0,1,1,1,0,0,0,0,0.135098,1,...,0,0,1,1,0,1,1,1,0,1
1,1,1,0,1,0,0,1,0,0.273504,1,...,0,1,0,1,1,0,0,0,1,0
2,2,1,0,1,0,0,1,0,0.28191,0,...,1,1,0,0,0,0,1,0,1,0
3,3,1,1,1,0,0,0,0,0.225741,0,...,1,1,0,1,1,0,1,0,1,0
4,4,1,1,0,0,0,1,0,0.480403,0,...,1,1,1,0,0,1,0,1,1,0


#### Estatísticas sobre as variáveis

In [4]:
data_set.describe()

Unnamed: 0,INDEX,UF_1,UF_2,UF_3,UF_4,UF_5,UF_6,UF_7,IDADE,SEXO_1,...,CEP4_7,CEP4_8,CEP4_9,CEP4_10,CEP4_11,CEP4_12,CEP4_13,CEP4_14,IND_BOM_1_1,IND_BOM_1_2
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4999.5,0.8924,0.6831,0.4726,0.2944,0.2537,0.2195,0.1843,0.4565028,0.5223,...,0.4283,0.4205,0.4221,0.4621,0.44,0.4306,0.4341,0.4267,0.6588,0.3412
std,2886.89568,0.30989,0.465291,0.499274,0.455795,0.43515,0.413929,0.387748,0.2542518,0.499527,...,0.494857,0.493664,0.493919,0.498586,0.496412,0.495185,0.495663,0.494623,0.474136,0.474136
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.506237e-16,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2499.75,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2515817,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4999.5,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.4368994,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,7499.25,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.6596725,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,9999.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### Pre-processamento e limpeza da base

In [5]:
# remove instancias duplicadas
data_set.drop_duplicates(inplace=True)

# merge das duas ultimas colunas na coluna 'y'
data_set = data_set.drop('IND_BOM_1_2', 1)
data_set = data_set.rename(columns={'IND_BOM_1_1': 'y'})

# renomeia algumas colunas por conveniência
data_set.rename(columns={'INDEX': 'index'}, inplace=True)

# encontra as variavies categoricas
for col in data_set.columns:
    if ((data_set[col].quantile() == 1.0) | (data_set[col].quantile() == 0.0)):
        data_set[col] = data_set[col].astype('category')

# Embaralha o dataset
data_set = data_set.sample(frac=1)

#### Computa quantidade de exemplos da cada classe

In [6]:
# Classe 1
A = data_set.loc[data_set.y == 1]
size_A = A['index'].count()

# Classe 2
B = data_set.loc[data_set.y == 0]
size_B = B['index'].count()

print("Quantidade de instancias da classe 1: {}".format(A['index'].count()))
print("Quantidade de instancias da classe 2: {}".format(B['index'].count()))

Quantidade de instancias da classe 1: 6588
Quantidade de instancias da classe 2: 3412


#### Particionamento dos Dados com K-folds
Utiliza *10-fold Cross Validation*

Em cada rodada:
- 1 fold para test
- 1 fold para validação
- 8 folds para treino

In [7]:
# TO DO: 4-fold



In [8]:
from sklearn.model_selection import train_test_split

In [9]:

'''
# Classe A
X = A.iloc[:, :-1]
y = A.iloc[:, -1]
AX_train, AX_test, Ay_train, Ay_test = train_test_split(X, y, test_size=1/4, random_state=42, stratify=y)

AX_train, AX_val, Ay_train, Ay_val = train_test_split(AX_train, Ay_train, test_size=1/3, random_state=42, stratify=Ay_train)

# Classe B
X = B.iloc[:, :-1]
y = B.iloc[:, -1]
BX_train, BX_test, By_train, By_test = train_test_split(X, y, test_size=1/4, random_state=42, stratify=y)

BX_train, BX_val, By_train, By_val = train_test_split(BX_train, By_train, test_size=1/3, random_state=42, stratify=By_train)
'''
X = data_set.iloc[:, :-1].values
y = data_set.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/4, random_state=42, stratify=y)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=1/3, random_state=42, stratify=y_train)

TypeError: take_nd() got an unexpected keyword argument 'axis'

In [None]:
print("Classe 1(A): X Treino: {}".format(AX_train['index'].count()))
print("Classe 1(A): X validação: {}".format(AX_val['index'].count()))
print("Classe 1(A): X test: {}".format(AX_test['index'].count()))
print("sum: {}".format(AX_train['index'].count() + AX_val['index'].count() + AX_test['index'].count()))

print("Classe 0(B): X Treino: {}".format(BX_train['index'].count()))
print("Classe 0(B): X validação: {}".format(BX_val['index'].count()))
print("Classe 0(B): X test: {}".format(BX_test['index'].count()))
print("sum: {}".format(BX_train['index'].count() + BX_val['index'].count() + BX_test['index'].count()))

#### Replicação (oversampling) da Classe Minoritária (classe 1) com [SMOTE](http://contrib.scikit-learn.org/imbalanced-learn/stable/generated/imblearn.over_sampling.SMOTE.html)

In [None]:
'''
3412 - 1647 -> B_test

1765 -> 75% -> SMOTE(1323) -> 3294
1765 -> 25% -> SMOTE(441) -> 1647



1765 -> 75% -> SMOTE(1323) -> 3294
1765 -> 25% -> SMOTE(441) -> 1647
'''

In [None]:
'''
# Alternativa 1: reduz a classe majoritaria até a minoritaria
if(size_A < size_B):
    B = B.iloc[:size_A]
else:
    A = A.iloc[:size_B]
'''
'''
# Alternativa 2: SMOTE
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)

X_train_to_smote = pd.concat([AX_train, BX_train]).values
y_train_to_smote = pd.concat([Ay_train, By_train]).values

X_val_to_smote = pd.concat([AX_val, BX_val]).values
y_val_to_smote = pd.concat([Ay_val, By_val]).values

X_train_smoted, y_train_smoted = sm.fit_sample(X_train_to_smote, y_train_to_smote)
X_val_smoted, y_val_smoted = sm.fit_sample(X_val_to_smote, y_val_to_smote)

print("Total com smote: X Treino: {}".format(np.count_nonzero(y_train_smoted)))
print("Total com smote: X validação: {}".format(np.count_nonzero(X_val_smoted)))

pd.DataFrame(X_train_smoted)
'''

#### Normaliza os dados

In [None]:
'''
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(pd.concat([AX_train,BX_train]))
y_train = pd.concat([Ay_train,By_train])
X_val = scaler.transform(pd.concat([AX_val,BX_val]))
y_val = pd.concat([Ay_val,By_val])
X_test = scaler.transform(pd.concat([AX_test,BX_test]))
y_test = pd.concat([Ay_test,By_test])
'''

#### Treino

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

input_dim = X_train.shape[1]

classifier = Sequential()

classifier.add(Dense(16, activation='tanh', input_dim=input_dim))

classifier.add(Dense(1, activation='sigmoid'))

classifier.compile(optimizer='adam', loss='mean_squared_error')

# Treino
history = classifier.fit(X_train, y_train, batch_size=64, epochs=100000, callbacks=[EarlyStopping(patience=3)], validation_data=(X_val, y_val))

In [None]:
def extract_final_losses(history):
    """Função para extrair o melhor loss de treino e validação.
    
    Argumento(s):
    history -- Objeto retornado pela função fit do keras.
    
    Retorno:
    Dicionário contendo o melhor loss de treino e de validação baseado 
    no menor loss de validação.
    """
    train_loss = history.history['loss']
    val_loss = history.history['val_loss']
    idx_min_val_loss = np.argmin(val_loss)
    return {'train_loss': train_loss[idx_min_val_loss], 'val_loss': val_loss[idx_min_val_loss]}

def plot_training_error_curves(history):
    """Função para plotar as curvas de erro do treinamento da rede neural.
    
    Argumento(s):
    history -- Objeto retornado pela função fit do keras.
    
    Retorno:
    A função gera o gráfico do treino da rede e retorna None.
    """
    train_loss = history.history['loss']
    val_loss = history.history['val_loss']
    
    fig, ax = plt.subplots()
    ax.plot(train_loss, label='Train')
    ax.plot(val_loss, label='Validation')
    ax.set(title='Training and Validation Error Curves', xlabel='Epochs', ylabel='Loss (MSE)')
    ax.legend()
    plt.show()

def compute_performance_metrics(y, y_pred_class, y_pred_scores=None):
    accuracy = accuracy_score(y, y_pred_class)
    recall = recall_score(y, y_pred_class)
    precision = precision_score(y, y_pred_class)
    f1 = f1_score(y, y_pred_class)
    performance_metrics = (accuracy, recall, precision, f1)
    if y_pred_scores is not None:
        auroc = roc_auc_score(y, y_pred_scores)
        aupr = average_precision_score(y, y_pred_scores)
        performance_metrics = performance_metrics + (auroc, aupr)
    return performance_metrics

def print_metrics_summary(accuracy, recall, precision, f1, auroc=None, aupr=None):
    print()
    print("{metric:<18}{value:.4f}".format(metric="Accuracy:", value=accuracy))
    print("{metric:<18}{value:.4f}".format(metric="Recall:", value=recall))
    print("{metric:<18}{value:.4f}".format(metric="Precision:", value=precision))
    print("{metric:<18}{value:.4f}".format(metric="F1:", value=f1))
    if auroc is not None:
        print("{metric:<18}{value:.4f}".format(metric="AUROC:", value=auroc))
    if aupr is not None:
        print("{metric:<18}{value:.4f}".format(metric="AUPR:", value=aupr))

In [None]:
plot_training_error_curves(history)