In [2]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from copy import deepcopy

# Load the dataset.
# pandas parses the literal text "None" as a missing value on read, but in this
# table it is a legitimate category, so every NaN produced by the parser is
# turned back into the string 'None'. Genuinely empty cells are marked with '?',
# and those become the real NaNs.
hcc = (
    pd.read_csv("hcc_dataset.csv", sep=",")
    .replace(np.nan, 'None')
    .replace('?', np.nan)
)

for column in hcc.columns:
    # Convert text columns that hold numbers to a numeric dtype. A column is
    # treated as numeric as soon as one of its values parses as a number;
    # everything that does not parse becomes NaN and is imputed below.
    # 'Nodules' is deliberately left as text.
    if hcc[column].dtype == 'object' and column != 'Nodules':
        as_number = pd.to_numeric(hcc[column], errors='coerce')
        if as_number.notna().any():
            hcc[column] = as_number

    # Choose the fill value for the missing cells of this column.
    if hcc[column].dtype == 'object':
        # Text columns: normalise to UPPERCASE and use the most frequent value.
        hcc[column] = hcc[column].str.upper()
        value = hcc[column].value_counts().idxmax()
    else:
        # Numeric columns: use the column mean.
        value = hcc[column].mean()

    # Assign the result back instead of calling an in-place method on
    # `hcc[column]`: that expression is a temporary object, so an in-place
    # update on it is not guaranteed to reach `hcc`.
    hcc[column] = hcc[column].fillna(value)


# NOTE: the numeric values are still not normalised at this point.
hcc.head()

Unnamed: 0,Gender,Symptoms,Alcohol,HBsAg,HBeAg,HBcAb,HCVAb,Cirrhosis,Endemic,Smoking,...,ALP,TP,Creatinine,Nodules,Major_Dim,Dir_Bil,Iron,Sat,Ferritin,Class
0,MALE,NO,YES,NO,NO,NO,NO,YES,NO,YES,...,150.0,7.1,0.7,1,3.5,0.5,85.598837,37.028941,438.997647,LIVES
1,FEMALE,YES,NO,NO,NO,NO,YES,YES,NO,YES,...,212.211605,8.961039,1.127089,1,1.8,1.93,85.598837,37.028941,438.997647,LIVES
2,MALE,NO,YES,YES,NO,YES,NO,YES,NO,YES,...,109.0,7.0,2.1,5,13.0,0.1,28.0,6.0,16.0,LIVES
3,MALE,YES,YES,NO,NO,NO,NO,YES,NO,YES,...,174.0,8.1,1.11,2,15.7,0.2,85.598837,37.028941,438.997647,DIES
4,MALE,YES,YES,YES,NO,YES,NO,YES,NO,YES,...,109.0,6.9,1.8,1,9.0,1.93,59.0,15.0,22.0,LIVES


In [3]:
# Encode on a separate copy so the cleaned `hcc` frame is left untouched.
encoded_hcc = deepcopy(hcc)

# Categorical columns that have an ordering; each level is encoded by its
# position in the list.
columns_classification = {
    'PS': ['ACTIVE', 'RESTRICTED', 'AMBULATORY', 'SELFCARE', 'DISABLED'],
    'Encephalopathy': ['NONE', 'GRADE I/II', 'GRADE III/IV'],
    'Ascites': ['NONE', 'MILD', 'MODERATE/SEVERE'],
}

# A one-hot encoded column keeps a single indicator column so that no redundant
# information is attached. These are the exceptions to the default rule of
# keeping the YES indicator (YES -> 1, NO -> 0):
#   Gender -> MALE indicator  (male 1, female 0)
#   Class  -> LIVES indicator (lives 1, dies 0)
kept_indicator = {'Gender': '_MALE$', 'Class': '_LIVES$'}

replaced_columns = []  # source columns that were encoded
encoded_parts = []     # their encoded replacements, in the same order

# Turn the categorical variables into numbers.
for column in encoded_hcc.columns:
    # Only text columns are encoded; Nodules is skipped because it already
    # holds numeric values.
    if hcc[column].dtype != 'object' or column == 'Nodules':
        continue

    is_ordered = column in columns_classification
    if is_ordered:
        # Ordered levels become a single integer-coded column.
        enc = OrdinalEncoder(categories=[columns_classification[column]])
    else:
        # Unordered levels become one 0/1 column per level.
        enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    enc.set_output(transform="pandas")

    # The encoder returns a new table that still has to be attached to the dataset.
    enc_transform = enc.fit_transform(encoded_hcc[[column]])

    if not is_ordered:
        enc_transform = enc_transform.filter(regex=kept_indicator.get(column, '_YES$'))

    replaced_columns.append(column)
    encoded_parts.append(enc_transform)

# Remove every source column that was encoded and append the encoded columns
# after the remaining ones, in the order they were processed.
encoded_hcc = pd.concat(
    [encoded_hcc.drop(columns=replaced_columns), *encoded_parts],
    axis=1,
)

# Write the encoded data to a file.
encoded_hcc.to_csv('encoded_hcc.csv', index=False)


Unnamed: 0,Gender,Symptoms,Alcohol,HBsAg,HBeAg,HBcAb,HCVAb,Cirrhosis,Endemic,Smoking,...,ALP,TP,Creatinine,Nodules,Major_Dim,Dir_Bil,Iron,Sat,Ferritin,Class
0,MALE,NO,YES,NO,NO,NO,NO,YES,NO,YES,...,150.000000,7.100000,0.700000,1,3.5,0.50,85.598837,37.028941,438.997647,LIVES
1,FEMALE,YES,NO,NO,NO,NO,YES,YES,NO,YES,...,212.211605,8.961039,1.127089,1,1.8,1.93,85.598837,37.028941,438.997647,LIVES
2,MALE,NO,YES,YES,NO,YES,NO,YES,NO,YES,...,109.000000,7.000000,2.100000,5,13.0,0.10,28.000000,6.000000,16.000000,LIVES
3,MALE,YES,YES,NO,NO,NO,NO,YES,NO,YES,...,174.000000,8.100000,1.110000,2,15.7,0.20,85.598837,37.028941,438.997647,DIES
4,MALE,YES,YES,YES,NO,YES,NO,YES,NO,YES,...,109.000000,6.900000,1.800000,1,9.0,1.93,59.000000,15.000000,22.000000,LIVES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,FEMALE,NO,YES,NO,NO,NO,YES,YES,NO,YES,...,109.000000,7.600000,0.700000,5,3.0,1.93,85.598837,37.028941,438.997647,LIVES
161,FEMALE,YES,NO,NO,NO,NO,NO,YES,NO,NO,...,280.000000,6.700000,0.700000,1,2.2,2.30,85.598837,37.028941,438.997647,DIES
162,MALE,NO,YES,NO,NO,NO,NO,YES,NO,YES,...,181.000000,7.500000,1.460000,5,18.6,1.93,85.598837,37.028941,438.997647,LIVES
163,MALE,NO,YES,YES,NO,YES,YES,YES,YES,YES,...,170.000000,8.400000,0.740000,5,18.0,1.93,85.598837,37.028941,438.997647,DIES


**DADOS ESCALONADOS**

In [4]:
# Min-max scale the numeric columns of the cleaned frame and re-attach the
# non-numeric columns unchanged.
# 'number' selects every numeric dtype; the 'int' / 'float' aliases only match
# the platform's default integer and float widths.
colunas_numericas = hcc.select_dtypes(include='number').columns
colunas_nao_numericas = hcc.select_dtypes(exclude='number').columns

scaler = MinMaxScaler()
# fit_transform returns a plain array, so the frame is rebuilt with the original
# column names AND the original index; without the index, the concat below
# would align rows by label and misplace them whenever `hcc` does not have a
# default 0..n-1 index.
dados_escalados_numericos = pd.DataFrame(
    scaler.fit_transform(hcc[colunas_numericas]),
    columns=colunas_numericas,
    index=hcc.index,
)
dados_escalados = pd.concat(
    [dados_escalados_numericos, hcc[colunas_nao_numericas]],
    axis=1,
)


In [None]:
# Display the scaled dataset (scaled numeric columns followed by the non-numeric ones).
dados_escalados

Unnamed: 0,Age,Grams_day,Packs_year,INR,AFP,Hemoglobin,MCV,Leucocytes,Platelets,Albumin,...,Spleno,PHT,PVT,Metastasis,Hallmark,PS,Encephalopathy,Ascites,Nodules,Class
0,0.643836,0.274000,0.029412,0.173367,0.000052,0.635036,0.740519,0.000208,0.000212,0.500000,...,NO,NO,NO,NO,YES,ACTIVE,NONE,NONE,1,LIVES
1,0.575342,0.000000,0.040126,0.146194,0.010660,0.575110,0.511372,0.113232,0.246634,0.515178,...,NO,NO,NO,NO,YES,ACTIVE,NONE,NONE,1,LIVES
2,0.794521,0.100000,0.098039,0.030151,0.000003,0.284672,0.205589,0.000477,0.001025,0.466667,...,NO,YES,NO,YES,YES,AMBULATORY,NONE,MILD,5,LIVES
3,0.780822,0.080000,0.058824,0.027638,0.001347,0.613139,0.550898,0.000523,0.000604,0.600000,...,NO,NO,NO,YES,YES,ACTIVE,NONE,NONE,2,DIES
4,0.767123,0.200000,0.058824,0.025126,0.000026,0.678832,0.510978,0.000323,0.000430,0.733333,...,NO,NO,NO,NO,YES,ACTIVE,NONE,NONE,1,LIVES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,0.273973,0.142017,0.040126,0.000000,0.010660,0.759124,0.792415,0.000546,0.400869,0.900000,...,NO,NO,NO,NO,NO,ACTIVE,NONE,NONE,5,LIVES
161,0.657534,0.000000,0.000000,0.123116,0.002699,0.518248,0.387226,0.000023,0.000303,0.366667,...,YES,YES,NO,NO,YES,AMBULATORY,NONE,NONE,1,DIES
162,0.616438,0.142017,0.094118,0.072864,0.000041,0.605839,0.409182,0.000446,0.838779,0.800000,...,YES,YES,NO,NO,YES,ACTIVE,NONE,NONE,5,LIVES
163,0.328767,0.142017,0.040126,0.326633,0.052456,0.773723,0.954092,0.399898,0.257078,0.966667,...,YES,YES,YES,YES,YES,AMBULATORY,NONE,NONE,5,DIES
