In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from copy import deepcopy

# Load the dataset.
hcc = pd.read_csv("hcc_dataset.csv", sep=",")

# Normalise the missing-value markers.
# read_csv parses cells holding the text "None" as NaN, but in this table "None" is a
# valid value, so every NaN present right after loading is restored to the string 'None'.
hcc = hcc.replace(np.nan, 'None')
# Cells that are really empty are written as '?' in the file; those become NaN.
hcc = hcc.replace('?', np.nan)

for column in hcc.columns:
    # A text column becomes numeric when at least one of its values parses as a number;
    # values that do not parse are coerced to NaN and imputed below.
    # 'Nodules' is deliberately excluded and stays as text.
    if hcc[column].dtype == 'object' and column != 'Nodules':
        as_number = pd.to_numeric(hcc[column], errors='coerce')
        if as_number.notnull().any():
            hcc[column] = as_number

    # Choose the fill value for the missing cells of this column.
    if hcc[column].dtype == 'object':
        # Text columns: upper-case for consistent labels, fill with the most frequent value.
        hcc[column] = hcc[column].str.upper()
        value = hcc[column].value_counts().idxmax()
    else:
        # Numeric columns: fill with the column mean.
        value = hcc[column].mean()
    # Assign the filled column back. An in-place call on the `hcc[column]` selection
    # works on a temporary object and is not guaranteed to update `hcc` itself.
    hcc[column] = hcc[column].fillna(value)


# Original author's note: the numeric values still need to be normalised
# (min-max scaling is applied in a later cell of this notebook).
hcc.head()

Unnamed: 0,Gender,Symptoms,Alcohol,HBsAg,HBeAg,HBcAb,HCVAb,Cirrhosis,Endemic,Smoking,...,ALP,TP,Creatinine,Nodules,Major_Dim,Dir_Bil,Iron,Sat,Ferritin,Class
0,MALE,NO,YES,NO,NO,NO,NO,YES,NO,YES,...,150.0,7.1,0.7,1,3.5,0.5,85.598837,37.028941,438.997647,LIVES
1,FEMALE,YES,NO,NO,NO,NO,YES,YES,NO,YES,...,212.211605,8.961039,1.127089,1,1.8,1.93,85.598837,37.028941,438.997647,LIVES
2,MALE,NO,YES,YES,NO,YES,NO,YES,NO,YES,...,109.0,7.0,2.1,5,13.0,0.1,28.0,6.0,16.0,LIVES
3,MALE,YES,YES,NO,NO,NO,NO,YES,NO,YES,...,174.0,8.1,1.11,2,15.7,0.2,85.598837,37.028941,438.997647,DIES
4,MALE,YES,YES,YES,NO,YES,NO,YES,NO,YES,...,109.0,6.9,1.8,1,9.0,1.93,59.0,15.0,22.0,LIVES


In [2]:
encoded_hcc = hcc.copy()  # work on a copy so the cleaned dataset is not altered

# Categorical columns whose values have a natural order, listed from lowest to highest.
columns_classification = {
    'PS': ['ACTIVE', 'RESTRICTED', 'AMBULATORY', 'SELFCARE', 'DISABLED'],
    'Encephalopathy': ['NONE', 'GRADE I/II', 'GRADE III/IV'],
    'Ascites': ['NONE', 'MILD', 'MODERATE/SEVERE']
}

# Two-valued columns are reduced to a single 0/1 indicator column. Each entry reads
# [category encoded as 0, category encoded as 1]; columns not listed are NO/YES.
binary_categories = {
    'Gender': ['FEMALE', 'MALE'],
    'Class': ['DIES', 'LIVES'],
}
default_binary = ['NO', 'YES']

# Encode every text column. 'Nodules' is skipped: it is kept as text by the cleaning
# cell and is carried over unchanged.
encoded_parts = []    # encoded frames, in processing order
encoded_columns = []  # source columns replaced by an encoded frame
for column in encoded_hcc.columns:
    if encoded_hcc[column].dtype != 'object' or column == 'Nodules':
        continue

    if column in columns_classification:
        # Ordered categories -> integer-valued codes following the declared order.
        enc = OrdinalEncoder(categories=[columns_classification[column]]).set_output(transform="pandas")
        enc_transform = enc.fit_transform(encoded_hcc[[column]])
    else:
        # Passing the categories explicitly guarantees that the indicator column exists
        # even when the positive category never occurs in the data (it is then all 0),
        # instead of the column silently disappearing from the dataset.
        categories = binary_categories.get(column, default_binary)
        enc = OneHotEncoder(categories=[categories], handle_unknown='ignore', sparse_output=False).set_output(transform="pandas")
        enc_transform = enc.fit_transform(encoded_hcc[[column]])
        # Keep only the positive indicator (e.g. 'Symptoms_YES', 'Gender_MALE', 'Class_LIVES');
        # the complementary column would carry the same information.
        enc_transform = enc_transform[[f'{column}_{categories[-1]}']]

    encoded_columns.append(column)
    encoded_parts.append(enc_transform)

# Replace the source columns by their encodings in one step: untouched columns keep their
# order and the encoded ones are appended in processing order.
encoded_hcc = pd.concat([encoded_hcc.drop(columns=encoded_columns), *encoded_parts], axis=1)

encoded_hcc.to_csv('encoded_hcc.csv', index=False)  # persist the encoded dataset

In [3]:
# Show the encoded dataset as the cell output for a visual check.
encoded_hcc



Unnamed: 0,Age,Grams_day,Packs_year,INR,AFP,Hemoglobin,MCV,Leucocytes,Platelets,Albumin,...,Varices_YES,Spleno_YES,PHT_YES,PVT_YES,Metastasis_YES,Hallmark_YES,PS,Encephalopathy,Ascites,Class_LIVES
0,67,137.000000,15.000000,1.530000,95.000000,13.700000,106.600000,4.900000,99.000000,3.400000,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,62,0.000000,20.464286,1.421851,19299.951146,12.879012,95.119753,1473.961549,113206.442654,3.445535,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,78,50.000000,50.000000,0.960000,5.800000,8.900000,79.800000,8.400000,472.000000,3.300000,...,0.0,0.0,1.0,0.0,1.0,1.0,2.0,0.0,1.0,1.0
3,77,40.000000,30.000000,0.950000,2440.000000,13.400000,97.100000,9.000000,279.000000,3.700000,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,76,100.000000,30.000000,0.940000,49.000000,14.300000,95.100000,6.400000,199.000000,4.100000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,40,71.008547,20.464286,0.840000,19299.951146,15.400000,109.200000,9.300000,184000.000000,4.600000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
161,68,0.000000,0.000000,1.330000,4887.000000,12.100000,88.900000,2.500000,141.000000,3.000000,...,1.0,1.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0
162,65,71.008547,48.000000,1.130000,75.000000,13.300000,90.000000,8.000000,385000.000000,4.300000,...,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
163,44,71.008547,20.464286,2.140000,94964.000000,15.600000,117.300000,5200.000000,118000.000000,4.800000,...,1.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0


**ESCALONAMENTO DOS DADOS**

In [4]:
# Split the columns by dtype: numeric ones are rescaled, the rest are carried over untouched.
colunas_numericas = encoded_hcc.select_dtypes(include=['int', 'float']).columns
colunas_nao_numericas = encoded_hcc.select_dtypes(exclude=['int', 'float']).columns

# Rescale every numeric column with MinMaxScaler.
# NOTE(review): the scaler is fitted on all rows, i.e. before any train/test split, so
# rows later used for testing influence the scaling — confirm this is acceptable.
scaler = MinMaxScaler()
# fit_transform returns a plain NumPy array; rebuild the frame with the original column
# names AND the original index so the concat below lines rows up correctly even when
# encoded_hcc does not have a default 0..n-1 index.
dados_escalados_numericos = pd.DataFrame(
    scaler.fit_transform(encoded_hcc[colunas_numericas]),
    columns=colunas_numericas,
    index=encoded_hcc.index,
)
# Scaled numeric columns first, followed by the non-numeric columns.
dados_escalados = pd.concat([dados_escalados_numericos, encoded_hcc[colunas_nao_numericas]], axis=1)


In [5]:
# Show the scaled dataset as the cell output for a visual check.
dados_escalados

Unnamed: 0,Age,Grams_day,Packs_year,INR,AFP,Hemoglobin,MCV,Leucocytes,Platelets,Albumin,...,Spleno_YES,PHT_YES,PVT_YES,Metastasis_YES,Hallmark_YES,PS,Encephalopathy,Ascites,Class_LIVES,Nodules
0,0.643836,0.274000,0.029412,0.173367,0.000052,0.635036,0.740519,0.000208,0.000212,0.500000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1
1,0.575342,0.000000,0.040126,0.146194,0.010660,0.575110,0.511372,0.113232,0.246634,0.515178,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1
2,0.794521,0.100000,0.098039,0.030151,0.000003,0.284672,0.205589,0.000477,0.001025,0.466667,...,0.0,1.0,0.0,1.0,1.0,0.5,0.0,0.5,1.0,5
3,0.780822,0.080000,0.058824,0.027638,0.001347,0.613139,0.550898,0.000523,0.000604,0.600000,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2
4,0.767123,0.200000,0.058824,0.025126,0.000026,0.678832,0.510978,0.000323,0.000430,0.733333,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,0.273973,0.142017,0.040126,0.000000,0.010660,0.759124,0.792415,0.000546,0.400869,0.900000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5
161,0.657534,0.000000,0.000000,0.123116,0.002699,0.518248,0.387226,0.000023,0.000303,0.366667,...,1.0,1.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,1
162,0.616438,0.142017,0.094118,0.072864,0.000041,0.605839,0.409182,0.000446,0.838779,0.800000,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,5
163,0.328767,0.142017,0.040126,0.326633,0.052456,0.773723,0.954092,0.399898,0.257078,0.966667,...,1.0,1.0,1.0,1.0,1.0,0.5,0.0,0.0,0.0,5


**SELEÇÃO DAS VARIÁVEIS MAIS IMPORTANTES, A SEREM USADAS NO ALGORITMO**

In [6]:
# Columns kept for the model; per the original author's note they were chosen from the
# relevance measured in the earlier steps. 'Class_LIVES' is the target column.
# Earlier candidate list, kept for reference:
# ['Hemoglobin','Iron',' Albumin','Sat','MCV','AFP','Ferritin','ALP', 'PS','Symptoms_YES','Ascites','Metastasis_YES','Encephalopathy','Class_LIVES']
colunas_selecionadas = [
    "Hemoglobin",
    " Albumin",  # NOTE(review): leading space presumably matches the header in the source CSV — confirm
    "Iron",
    "Leucocytes",
    "INR",
    "Total_Bil",
    "Dir_Bil",
    "Ferritin",
    "ALP",
    "Class_LIVES",
]
# Independent copy, so later changes do not touch dados_escalados.
dados_selecionados = dados_escalados.loc[:, colunas_selecionadas].copy()

In [7]:
# Show the selected feature columns plus the target as the cell output.
dados_selecionados

Unnamed: 0,Hemoglobin,Albumin,Iron,Leucocytes,INR,Total_Bil,Dir_Bil,Ferritin,ALP,Class_LIVES
0,0.635036,0.500000,0.382138,0.000208,0.173367,0.044776,0.013699,0.196860,0.151954,1.0
1,0.575110,0.515178,0.382138,0.113232,0.146194,0.069352,0.062671,0.196860,0.215518,1.0
2,0.284672,0.466667,0.125000,0.000477,0.030151,0.002488,0.000000,0.007175,0.110062,1.0
3,0.613139,0.600000,0.382138,0.000523,0.027638,0.002488,0.003425,0.196860,0.176475,0.0
4,0.678832,0.733333,0.263393,0.000323,0.025126,0.009950,0.062671,0.009865,0.110062,1.0
...,...,...,...,...,...,...,...,...,...,...
160,0.759124,0.900000,0.382138,0.000546,0.000000,0.004975,0.062671,0.196860,0.110062,1.0
161,0.518248,0.366667,0.382138,0.000023,0.123116,0.082090,0.075342,0.196860,0.284780,0.0
162,0.605839,0.800000,0.382138,0.000446,0.072864,0.007463,0.062671,0.196860,0.183628,1.0
163,0.773723,0.966667,0.382138,0.399898,0.326633,0.019900,0.062671,0.196860,0.172388,0.0


**KNN ALGORITHM (n_neighbors=5)**

In [8]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


# Separate the predictors (X) from the target (y).
X = dados_selecionados.drop(columns=['Class_LIVES'])
y = dados_selecionados['Class_LIVES']

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
# NOTE(review): imputation and min-max scaling were fitted on the full dataset in earlier
# cells, so the test rows influenced the preprocessing — the scores may be optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# KNN classifier with k=5 (n_neighbors is varied in the other experiments).
knn = KNeighborsClassifier(n_neighbors=5)

# Fit on the training split.
knn.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = knn.predict(X_test)

# Overall accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia: {accuracy:.2f}')

# Per-class precision / recall / F1.
print("Relatório de Classificação")
print(classification_report(y_test, y_pred))

# Confusion matrix.
print("Matriz de confusão")
print(confusion_matrix(y_test, y_pred))


Acurácia: 0.64
Relatório de Classificação
              precision    recall  f1-score   support

         0.0       0.53      0.62      0.57        13
         1.0       0.72      0.65      0.68        20

    accuracy                           0.64        33
   macro avg       0.63      0.63      0.63        33
weighted avg       0.65      0.64      0.64        33

Matriz de confusão
[[ 8  5]
 [ 7 13]]


**KNN ALGORITHM (n_neighbors=3)**

In [9]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


# Separate the predictors (X) from the target (y).
X = dados_selecionados.drop(columns=['Class_LIVES'])
y = dados_selecionados['Class_LIVES']

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
# NOTE(review): imputation and min-max scaling were fitted on the full dataset in earlier
# cells, so the test rows influenced the preprocessing — the scores may be optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# KNN classifier with k=3 (n_neighbors is varied in the other experiments).
knn = KNeighborsClassifier(n_neighbors=3)

# Fit on the training split.
knn.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = knn.predict(X_test)

# Overall accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia: {accuracy:.2f}')

# Per-class precision / recall / F1.
print("Relatório de Classificação")
print(classification_report(y_test, y_pred))

# Confusion matrix.
print("Matriz de confusão")
print(confusion_matrix(y_test, y_pred))

Acurácia: 0.61
Relatório de Classificação
              precision    recall  f1-score   support

         0.0       0.50      0.54      0.52        13
         1.0       0.68      0.65      0.67        20

    accuracy                           0.61        33
   macro avg       0.59      0.59      0.59        33
weighted avg       0.61      0.61      0.61        33

Matriz de confusão
[[ 7  6]
 [ 7 13]]



**KNN ALGORITHM (n_neighbors=2)**




In [10]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


# Separate the predictors (X) from the target (y).
X = dados_selecionados.drop(columns=['Class_LIVES'])
y = dados_selecionados['Class_LIVES']

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
# NOTE(review): imputation and min-max scaling were fitted on the full dataset in earlier
# cells, so the test rows influenced the preprocessing — the scores may be optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# KNN classifier with k=2 (n_neighbors is varied in the other experiments).
# NOTE(review): k is even, so the two neighbours can disagree — check how the classifier
# resolves such ties before comparing this run against the odd-k runs.
knn = KNeighborsClassifier(n_neighbors=2)

# Fit on the training split.
knn.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = knn.predict(X_test)

# Overall accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia: {accuracy:.2f}')

# Per-class precision / recall / F1.
print("Relatório de Classificação")
print(classification_report(y_test, y_pred))

# Confusion matrix.
print("Matriz de confusão")
print(confusion_matrix(y_test, y_pred))

Acurácia: 0.58
Relatório de Classificação
              precision    recall  f1-score   support

         0.0       0.48      0.77      0.59        13
         1.0       0.75      0.45      0.56        20

    accuracy                           0.58        33
   macro avg       0.61      0.61      0.58        33
weighted avg       0.64      0.58      0.57        33

Matriz de confusão
[[10  3]
 [11  9]]


**KNN ALGORITHM (n_neighbors=1)**

In [11]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


# Separate the predictors (X) from the target (y).
X = dados_selecionados.drop(columns=['Class_LIVES'])
y = dados_selecionados['Class_LIVES']

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
# NOTE(review): imputation and min-max scaling were fitted on the full dataset in earlier
# cells, so the test rows influenced the preprocessing — the scores may be optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# KNN classifier with k=1 (n_neighbors is varied in the other experiments).
knn = KNeighborsClassifier(n_neighbors=1)

# Fit on the training split.
knn.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = knn.predict(X_test)

# Overall accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f'Acurácia: {accuracy:.2f}')

# Per-class precision / recall / F1.
print("Relatório de Classificação")
print(classification_report(y_test, y_pred))

# Confusion matrix.
print("Matriz de confusão")
print(confusion_matrix(y_test, y_pred))

Acurácia: 0.61
Relatório de Classificação
              precision    recall  f1-score   support

         0.0       0.50      0.46      0.48        13
         1.0       0.67      0.70      0.68        20

    accuracy                           0.61        33
   macro avg       0.58      0.58      0.58        33
weighted avg       0.60      0.61      0.60        33

Matriz de confusão
[[ 6  7]
 [ 6 14]]
