In [20]:
#Importando as bibliotecas necessárias para manipular Dados e Redes Neurais
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Leitura de Dados e Discretização:

In [21]:
#Lendo o arquivo (que foi renomeado para dataset.csv)
data = pd.read_csv("dataset.csv")
#Renomeando colunas para melhorar visualização
data = data.rename(columns={"hypertension":"hyper_t","heart_disease":"heart_d","smoking_history":"smok_h","HbA1c_level":"HbA1c","blood_glucose_level":"blood_g_l","diabetes":"diab"})
#Removendo duplicatas
data.drop_duplicates(inplace=True)
#Mostrando resumo dos dados
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96146 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   gender     96146 non-null  object 
 1   age        96146 non-null  float64
 2   hyper_t    96146 non-null  int64  
 3   heart_d    96146 non-null  int64  
 4   smok_h     96146 non-null  object 
 5   bmi        96146 non-null  float64
 6   HbA1c      96146 non-null  float64
 7   blood_g_l  96146 non-null  int64  
 8   diab       96146 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 7.3+ MB


Discretização dos Dados

In [22]:
#Verificando os valores presentes em 'gender', para discretizá-los
print(data["gender"].unique())
#Discretizando com base nas informações do código anterior
data["gender"] = data["gender"].replace({"Male":1,"Female":0,"Other":2})
#Fazendo cast para int64
data["gender"] = data["gender"].astype(np.int64)
#Mostrando resumo dos dados
data.info()

['Female' 'Male' 'Other']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 96146 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   gender     96146 non-null  int64  
 1   age        96146 non-null  float64
 2   hyper_t    96146 non-null  int64  
 3   heart_d    96146 non-null  int64  
 4   smok_h     96146 non-null  object 
 5   bmi        96146 non-null  float64
 6   HbA1c      96146 non-null  float64
 7   blood_g_l  96146 non-null  int64  
 8   diab       96146 non-null  int64  
dtypes: float64(3), int64(5), object(1)
memory usage: 7.3+ MB


In [23]:
#Removendo todas as linhas com idades não inteiras no dataset
data = data[data["age"].mod(1) == 0]
#Fazendo cast para int64
data["age"] = data["age"].astype(np.int64)
#Mostrando resumo dos dados
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94133 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   gender     94133 non-null  int64  
 1   age        94133 non-null  int64  
 2   hyper_t    94133 non-null  int64  
 3   heart_d    94133 non-null  int64  
 4   smok_h     94133 non-null  object 
 5   bmi        94133 non-null  float64
 6   HbA1c      94133 non-null  float64
 7   blood_g_l  94133 non-null  int64  
 8   diab       94133 non-null  int64  
dtypes: float64(2), int64(6), object(1)
memory usage: 7.2+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["age"] = data["age"].astype(np.int64)


In [24]:
#Verificando os valores presentes em 'smok_h', para discretizá-los
print(data["smok_h"].unique())
#Discretizando com base nas informações do código anterior
data["smok_h"] = data["smok_h"].replace({"ever":2,"never":0,"current":2,"not current":0,"No Info":-1,"former":1})
#Fazendo cast para int64
data["smok_h"] = data["smok_h"].astype(np.int64)
#Mostrando resumo dos dados
data.info()

['never' 'No Info' 'current' 'former' 'ever' 'not current']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 94133 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   gender     94133 non-null  int64  
 1   age        94133 non-null  int64  
 2   hyper_t    94133 non-null  int64  
 3   heart_d    94133 non-null  int64  
 4   smok_h     94133 non-null  int64  
 5   bmi        94133 non-null  float64
 6   HbA1c      94133 non-null  float64
 7   blood_g_l  94133 non-null  int64  
 8   diab       94133 non-null  int64  
dtypes: float64(2), int64(7)
memory usage: 7.2 MB


Normalizando Dados por MinMax

In [25]:
print(data)

       gender  age  hyper_t  heart_d  smok_h    bmi  HbA1c  blood_g_l  diab
0           0   80        0        1       0  25.19    6.6        140     0
1           0   54        0        0      -1  27.32    6.6         80     0
2           1   28        0        0       0  27.32    5.7        158     0
3           0   36        0        0       2  23.45    5.0        155     0
4           1   76        1        1       2  20.14    4.8        155     0
...       ...  ...      ...      ...     ...    ...    ...        ...   ...
99994       0   36        0        0      -1  24.60    4.8        145     0
99996       0    2        0        0      -1  17.37    6.5        100     0
99997       1   66        0        0       1  27.83    5.7        155     0
99998       0   24        0        0       0  35.42    4.0        100     0
99999       0   57        0        0       2  22.43    6.6         90     0

[94133 rows x 9 columns]


In [26]:
#Normalizando por MinMax
columns = ["age", "smok_h", "bmi", "HbA1c", "blood_g_l"]

for i, c in enumerate(columns):
    meanc = data[c].mean()
    stdc = data[c].std();
    data[c] = data[c].apply( lambda x: (x - meanc)/stdc  )
    print(f"Done ({i}/5)", end='\r')
print(data)

Done (0/5)Done (1/5)Done (2/5)Done (3/5)Done (4/5)       gender       age  hyper_t  heart_d    smok_h       bmi     HbA1c  \
0           0  1.705652        0        1 -0.049725 -0.349686  0.989909   
1           0  0.517822        0        0 -1.056554 -0.030409  0.989909   
2           1 -0.670008        0        0 -0.049725 -0.030409  0.152729   
3           0 -0.304522        0        0  1.963933 -0.610503 -0.498410   
4           1  1.522909        1        1  1.963933 -1.106656 -0.684450   
...       ...       ...      ...      ...       ...       ...       ...   
99994       0 -0.304522        0        0 -1.056554 -0.438124 -0.684450   
99996       0 -1.857838        0        0 -1.056554 -1.521865  0.896889   
99997       1  1.066052        0        0  0.957104  0.046037  0.152729   
99998       0 -0.852751        0        0 -0.049725  1.183740 -1.428609   
99999       0  0.654880        0        0  1.963933 -0.763396  0.989909   

       blood_g_l  diab  
0       0.040799   

In [27]:
#Normalizacao por Mean
# data['age'] = data['age'].apply(lambda x: (x - data['age'].mean())/(data['age'].std()))
# print("Done")
# data['smok_h'] = data['smok_h'].apply(lambda x: (x - data['smok_h'].mean())/(data['smok_h'].std()))
# print("Done")
# data['bmi'] = data['bmi'].apply(lambda x: (x - data['bmi'].mean())/(data['bmi'].std()))
# print("Done")
# data['blood_g_l'] = data['blood_g_l'].apply(lambda x: (x - data['blood_g_l'].mean())/(data['blood_g_l'].std()))
# print("Done")
# data['HbA1c'] = data['HbA1c'].apply(lambda x: (x - data['HbA1c'].mean())/(data['HbA1c'].std()))
# print("Done")

In [10]:
print(data)

       gender       age  hyper_t  heart_d    smok_h       bmi     HbA1c  \
0           0  1.705652        0        1 -0.049725 -0.349686  0.989909   
1           0  0.517822        0        0 -1.056554 -0.030409  0.989909   
2           1 -0.670008        0        0 -0.049725 -0.030409  0.152729   
3           0 -0.304522        0        0  1.963933 -0.610503 -0.498410   
4           1  1.522909        1        1  1.963933 -1.106656 -0.684450   
...       ...       ...      ...      ...       ...       ...       ...   
99994       0 -0.304522        0        0 -1.056554 -0.438124 -0.684450   
99996       0 -1.857838        0        0 -1.056554 -1.521865  0.896889   
99997       1  1.066052        0        0  0.957104  0.046037  0.152729   
99998       0 -0.852751        0        0 -0.049725  1.183740 -1.428609   
99999       0  0.654880        0        0  1.963933 -0.763396  0.989909   

       blood_g_l  diab  
0       0.040799     0  
1      -1.421173     0  
2       0.479391     0  

In [None]:
#Exporta o DataFrame como arquivo .csv
#data.to_csv("datasetNormMinMax.csv", index=False)

Adequação dos valores para usá-los na Rede Neural de Aprendizagem

In [28]:
#Transformando o DataFrame para Tensor
t = torch.Tensor(data.values)
#size_tLearn guarda o tam de 80% das posicoes de t
size_tLearn = int(0.8 * len(t))
#size_tLearn guarda o tam de 20% das posicoes de t
size_tTest = len(t) - size_tLearn
# Separa o tensor t em tLearn e tTeste. 80% para tLearn e 20% para tTest
tLearn, tTeste = torch.split(t, [size_tLearn, size_tTest])

In [29]:
#Slice para atribuir as entradas da rede neural a x. (Todas as linhas e todas as colunas exceto a última)
x = torch.Tensor(tLearn[:,:-1])
print(x.shape)
#Slice para atribuir as saídas do DataFrame a y. (Todas as linhas e apenas a última coluna)
y = torch.Tensor(tLearn[:,-1])
print(y.shape)
#Fazendo o reshape de y
y = y.reshape((y.shape[0], 1))
print(y.shape)

torch.Size([75306, 8])
torch.Size([75306])
torch.Size([75306, 1])


In [30]:
class MLPReLU(nn.Module):
    def __init__(self):
        super(MLPReLU,self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(8, 20),
            nn.ReLU(),            
            nn.Linear(20, 20),
            nn.ReLU(),                   
            nn.Linear(20, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.layers(x)   

In [14]:
class MLPLeakyReLU(nn.Module):
    def __init__(self):
        super(MLPLeakyReLU,self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(8, 20),
            nn.LeakyReLU(),            
            nn.Linear(20, 20),
            nn.LeakyReLU(),                   
            nn.Linear(20, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.layers(x)   

Implementando a Rede Neural

In [31]:
#Instanciando a Rede Neural
net = MLPLeakyReLU()
#Taxa de aprendizado
learningRate = 0.01
#Inicializando otimizador
optimizer = torch.optim.SGD(net.parameters(),lr=learningRate)
#Usando Entropia Cruzada Binária (Não convergia usando MSELoss)
lossFn = nn.BCELoss()

#Função para mudar o otimizador arbitrariamente
def otimizador(x):
    print(x)
    global optimizer
    global learningRate
    if(x == "SGD"):    
        optimizer = torch.optim.SGD(net.parameters(),lr=learningRate)
    elif(x == "Adam"):
        optimizer = torch.optim.Adam(net.parameters(),lr=learningRate)
    elif(x == "Adagrad"):
        optimizer = torch.optim.Adagrad(net.parameters(),lr=learningRate)
    elif(x == "RMSprop"):    
        optimizer = torch.optim.RMSprop(net.parameters(),lr=learningRate)

#Inicializando o contador de epochs
epoch = 0

Loop de Treinamento

In [33]:
#Taxa de aprendizado
learningRate = 0.01
#Limite de epochs
epochs = 100000
#Valores de Loss considerado como convergência aceitável
limite = 0.0199

#SGD Adam Adagrad RMSprop
otimizador("Adam")

#Loop de treinamento
for e in range(epochs):
    optimizer.zero_grad()
    d = net(x)
    loss = lossFn(d,y)
    loss.backward()
    if(e % 100 == 99 ):
        print("epoch(%d): %.4f"%(e,loss.item()))
    optimizer.step()
    if(loss.item() <= limite):
        print("Epoch(%d), loss(%.4f)"%(epoch+e,loss.item()))
        break

epoch += e
print("Épocas: ",epoch)

Adam
epoch(99): 0.0863
epoch(199): 0.0835
epoch(299): 0.0829
epoch(399): 0.0826
epoch(499): 0.0825
epoch(599): 0.0825
epoch(699): 0.0823
epoch(799): 0.0821
epoch(899): 0.0821
epoch(999): 0.0819
epoch(1099): 0.0818
epoch(1199): 0.0817
epoch(1299): 0.0816
epoch(1399): 0.0816
epoch(1499): 0.0816
epoch(1599): 0.0816
epoch(1699): 0.0816
epoch(1799): 0.0814
epoch(1899): 0.0814
epoch(1999): 0.0815
epoch(2099): 0.0814
epoch(2199): 0.0817
epoch(2299): 0.0813
epoch(2399): 0.0813
epoch(2499): 0.0813
epoch(2599): 0.0813
epoch(2699): 0.0813
epoch(2799): 0.0812
epoch(2899): 0.0811
epoch(2999): 0.0812
epoch(3099): 0.0812
epoch(3199): 0.0811
epoch(3299): 0.0810
epoch(3399): 0.0811
epoch(3499): 0.0810
epoch(3599): 0.0810
epoch(3699): 0.0809
epoch(3799): 0.0809
epoch(3899): 0.0808
epoch(3999): 0.0814
epoch(4099): 0.0807
epoch(4199): 0.0808
epoch(4299): 0.0809
epoch(4399): 0.0806
epoch(4499): 0.0806
epoch(4599): 0.0805
epoch(4699): 0.0804
epoch(4799): 0.0808
epoch(4899): 0.0806
epoch(4999): 0.0804
epoch(

Adequação dos valores para usá-los na Rede Neural de Teste

In [None]:
#Slice para atribuir as entradas da rede neural a x. (Todas as linhas e todas as colunas exceto a última)
x = torch.Tensor(tTeste[:,:-1])
print(x.shape)
#Slice para atribuir as saídas do DataFrame a y. (Todas as linhas e apenas a última coluna)
y = torch.Tensor(tTeste[:,-1])
print(y.shape)
#Fazendo o reshape de y
y = y.reshape((y.shape[0], 1))
print(y.shape)

Loop de Teste

In [None]:
net.eval()  # Altera o modo para o modo de avaliação (influencia o comportamento de certas camadas, como Dropout)
with torch.no_grad():
    outputs = net(x)
    predicted = (outputs >= 0.5).float()  # Transforma as saídas em previsões binárias (0 ou 1)
    accuracy = (predicted == y).sum().item() / y.size(0)
    print("Accuracy: %.4f"%(accuracy))