In [7]:
# Experimento simples de uso do Random Forest sem redução de dimensões e comparando com um baseline usando DummyClassifier

import pandas as pd

# Load the exam-results dataset.
# The path is written with forward slashes on purpose: the previous
# ".\data-set\exames.csv" literal relied on "\d" and "\e" being unrecognised
# escape sequences (which recent Python versions warn about) and could only be
# resolved on Windows. Forward slashes work on Windows, Linux and macOS alike.
resultados_exames = pd.read_csv("./data-set/exames.csv")

# Show the first 10 rows
resultados_exames.head(10)

Unnamed: 0,id,diagnostico,exame_1,exame_2,exame_3,exame_4,exame_5,exame_6,exame_7,exame_8,...,exame_24,exame_25,exame_26,exame_27,exame_28,exame_29,exame_30,exame_31,exame_32,exame_33
0,842302,M,17.99,10.38,122.8,103.78,1001.0,0.1184,0.2776,0.3001,...,184.6,2019.0,0.1622,0.6656,0.7119,0.786,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,103.78,1326.0,0.08474,0.07864,0.0869,...,158.8,1956.0,0.1238,0.1866,0.2416,0.786,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,103.78,1203.0,0.1096,0.1599,0.1974,...,152.5,1709.0,0.1444,0.4245,0.4504,0.786,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,103.78,386.1,0.1425,0.2839,0.2414,...,98.87,567.7,0.2098,0.8663,0.6869,0.786,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,103.78,1297.0,0.1003,0.1328,0.198,...,152.2,1575.0,0.1374,0.205,0.4,0.786,0.1625,0.2364,0.07678,0.854454
5,843786,M,12.45,15.7,82.57,103.78,477.1,0.1278,0.17,0.1578,...,103.4,741.6,0.1791,0.5249,0.5355,0.786,0.1741,0.3985,0.1244,0.804347
6,844359,M,18.25,19.98,119.6,103.78,1040.0,0.09463,0.109,0.1127,...,153.2,1606.0,0.1442,0.2576,0.3784,0.786,0.1932,0.3063,0.08368,
7,84458202,M,13.71,20.83,90.2,103.78,577.9,0.1189,0.1645,0.09366,...,110.6,897.0,0.1654,0.3682,0.2678,0.786,0.1556,0.3196,0.1151,
8,844981,M,13.0,21.82,87.5,103.78,519.8,0.1273,0.1932,0.1859,...,106.2,739.3,0.1703,0.5401,0.539,0.786,0.206,0.4378,0.1072,
9,84501001,M,12.46,24.04,83.97,103.78,475.9,0.1186,0.2396,0.2273,...,97.65,711.4,0.1853,1.058,1.105,0.786,0.221,0.4366,0.2075,


In [8]:
# Show the dataframe's dimensions as a (rows, columns) tuple
resultados_exames.shape

(569, 35)

In [9]:
# Importação das bibliotecas para splits dos datasets
from sklearn.model_selection import train_test_split
from numpy import random

# Fix the seed of NumPy's global random generator to keep the randomness under control
SEED = 123143
random.seed(SEED)

# Build the feature matrix by removing three columns from the raw dataset:
#   - id:          excluded because it adds no value to the training process
#   - diagnostico: excluded because it is the target of the classification
#   - exame_33:    excluded because it is (at least mostly) empty
#                  NOTE(review): the original note says this column holds only empty
#                  values, yet the head(10) preview of the raw data shows a couple of
#                  non-null entries - confirm before relying on that statement.
valores_exames = resultados_exames.drop(
    labels=['id', 'diagnostico', 'exame_33'],
    axis="columns",
)
valores_exames.head(10)

Unnamed: 0,exame_1,exame_2,exame_3,exame_4,exame_5,exame_6,exame_7,exame_8,exame_9,exame_10,...,exame_23,exame_24,exame_25,exame_26,exame_27,exame_28,exame_29,exame_30,exame_31,exame_32
0,17.99,10.38,122.8,103.78,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.786,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,103.78,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.786,0.186,0.275,0.08902
2,19.69,21.25,130.0,103.78,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.786,0.243,0.3613,0.08758
3,11.42,20.38,77.58,103.78,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.786,0.2575,0.6638,0.173
4,20.29,14.34,135.1,103.78,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.786,0.1625,0.2364,0.07678
5,12.45,15.7,82.57,103.78,477.1,0.1278,0.17,0.1578,0.08089,0.2087,...,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.786,0.1741,0.3985,0.1244
6,18.25,19.98,119.6,103.78,1040.0,0.09463,0.109,0.1127,0.074,0.1794,...,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.786,0.1932,0.3063,0.08368
7,13.71,20.83,90.2,103.78,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,...,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.786,0.1556,0.3196,0.1151
8,13.0,21.82,87.5,103.78,519.8,0.1273,0.1932,0.1859,0.09353,0.235,...,30.73,106.2,739.3,0.1703,0.5401,0.539,0.786,0.206,0.4378,0.1072
9,12.46,24.04,83.97,103.78,475.9,0.1186,0.2396,0.2273,0.08543,0.203,...,40.68,97.65,711.4,0.1853,1.058,1.105,0.786,0.221,0.4366,0.2075


In [10]:
# Target vector: only the diagnosis column is captured here (the id column is not used)
diagnostico = resultados_exames.diagnostico

# Split features and target into train and test sets, holding out 30% of the rows
# for testing (train_test_split's default test size is 25%).
# random_state=SEED pins the shuffle to this call, so re-running this cell on its own
# reproduces the same split instead of depending on whatever state the global NumPy
# random generator happens to be in.
treino_x, teste_x, treino_y, teste_y = train_test_split(
    valores_exames, diagnostico, test_size=0.3, random_state=SEED
)

# Report the shape of each of the four resulting sets
print("Treino x: " + str(treino_x.shape), " | Teste x: " + str(teste_x.shape), " | Treino y: " + str(treino_y.shape), " | Teste y: " + str(teste_y.shape))

Treino x: (398, 32)  | Teste x: (171, 32)  | Treino y: (398,)  | Teste y: (171,)


In [11]:
# Importação do Random Forest
from sklearn.ensemble import RandomForestClassifier

# Build a Random Forest with 100 decision trees. The tree count is set explicitly
# because the default depends on the scikit-learn version (10 in older releases,
# 100 from version 0.22 onwards).
# random_state=SEED makes the forest itself reproducible, so the score below does not
# depend on the state the global NumPy random generator was left in by earlier cells.
classificador = RandomForestClassifier(n_estimators=100, random_state=SEED)

# Original author's note: Random Forest does not accept empty values as input, so
# missing values have to be dealt with before fitting.
# NOTE(review): whether NaNs are rejected depends on the scikit-learn version - confirm.

# Train the model on the training sets
classificador.fit(treino_x, treino_y)

# Print the model's score on the test set, as a percentage
print("Resultado da classificação: %.2f%%" %(classificador.score(teste_x,teste_y)*100))

Resultado da classificação: 92.40%


In [12]:
# Precisamos ter um base line para saber se o resultado da nossa classificação é bom ou não. Para isso, iremos criar um novo modelo de classificação utilizando o dummy classifier.
from sklearn.dummy import DummyClassifier

# Re-seed NumPy's global random generator before building the baseline classifier.
# NOTE(review): the "most_frequent" strategy should not need any randomness, so this
# re-seed looks redundant; it is kept so the global generator state stays unchanged
# for anything that runs after this cell.
SEED = 123143
random.seed(SEED)

# Baseline model using the "most_frequent" strategy: predict always returns the most
# frequent class label found in the y passed to fit, and predict_proba returns the
# matching one-hot encoded vector. In short, it always guesses the most common y value.
# fit returns the estimator itself, so construction and training are chained.
classificador_bobo = DummyClassifier(strategy="most_frequent").fit(treino_x, treino_y)

# Scores of both models on the same test set, as percentages rounded to 2 decimals
resultado_classificador_bobo = round(classificador_bobo.score(teste_x, teste_y) * 100, 2)
resultado_classificador_rf = round(classificador.score(teste_x, teste_y) * 100, 2)

# Print the baseline score next to the Random Forest score
print("Resultado da classificação boba: ", str(resultado_classificador_bobo), "%", sep="")
print("Resultado da classificação com Random Forest: ", str(resultado_classificador_rf), "%", sep="")

# Gap between the Random Forest and the baseline.
# Original author's reading: since the Random Forest clearly beats the dummy
# classifier, its result is considered very good.
# NOTE(review): the value printed with "%" is a difference in percentage points, and
# it comes from a single train/test split - treat the conclusion with that in mind.
print("Diferença de : ", str(round(resultado_classificador_rf - resultado_classificador_bobo, 2)), "%", sep="")

Resultado da classificação boba: 66.67%
Resultado da classificação com Random Forest: 92.4%
Diferença de : 25.73%
