In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import make_scorer
from sklearn.svm import SVC
import pandas as pd

In [2]:
df = pd.read_csv('./agaricus_lepiota_small_c.csv')

# Codificação do atributo de saída (class): e → 0 e p → 1

In [3]:
transformers = [
   ('oe_class', OrdinalEncoder(categories=[['e', 'p']]), ['class']),
]

ct = ColumnTransformer(transformers=transformers, remainder='passthrough')

y_oe = ct.fit_transform(df)

df = pd.DataFrame(y_oe, columns=df.columns)
df['class'] = df['class'].astype(int)

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,x,s,y,t,a,f,w,b,g,...,s,w,w,p,w,o,p,n,v,d
1,0,f,s,y,f,n,f,c,b,p,...,s,w,w,p,w,o,f,n,y,g
2,0,k,s,w,f,c,f,w,b,g,...,s,w,n,p,w,t,e,w,n,g
3,0,f,f,n,t,n,f,c,b,w,...,s,g,w,p,w,o,p,k,v,d
4,1,x,s,w,t,p,f,c,n,w,...,s,w,w,p,w,o,p,n,s,u


# Exclusão de dados faltantes

In [4]:
df = df.drop('stalk-root', axis=1) 
df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,x,s,y,t,a,f,w,b,g,...,s,w,w,p,w,o,p,n,v,d
1,0,f,s,y,f,n,f,c,b,p,...,s,w,w,p,w,o,f,n,y,g
2,0,k,s,w,f,c,f,w,b,g,...,s,w,n,p,w,t,e,w,n,g
3,0,f,f,n,t,n,f,c,b,w,...,s,g,w,p,w,o,p,k,v,d
4,1,x,s,w,t,p,f,c,n,w,...,s,w,w,p,w,o,p,n,s,u
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,x,f,p,f,c,f,w,n,n,...,s,w,w,p,w,o,p,n,v,d
996,1,x,y,n,f,n,f,c,n,w,...,y,w,y,p,w,o,e,w,v,d
997,0,x,f,g,f,n,f,c,b,u,...,s,g,g,p,w,o,e,k,y,d
998,0,b,s,w,t,a,f,c,b,b,...,s,g,w,p,w,o,p,h,y,p


# Codificação de atributos categóricos

In [5]:
nominal_attr = ['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 
           'gill-color', 'stalk-shape', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 
           'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 
           'habitat']

for column in nominal_attr:
    transformers = [
        ('oe_' + column, OneHotEncoder(), [column])
    ]

    ct_oe = ColumnTransformer(
        transformers, remainder='passthrough'
    )

    X_oe = ct_oe.fit_transform(df)

    df[column] = X_oe[:, 0]

df 

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
996,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
997,0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
998,0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, make_scorer
import numpy as np

In [7]:
def custom_positive_accuracy(y_true, y_pred):
    return accuracy_score(y_true[y_true == 1], y_pred[y_true == 1])

In [8]:
X = df.drop('class', axis=1)
y = df['class']

In [9]:
from sklearn.model_selection import GridSearchCV #busca exaustiva para otimização de hiperparâmetro por validação cruzada

# Validação cruzada em dois níveis com KNN

In [10]:
def do_knn():
    positive_accuracy_scorer = make_scorer(custom_positive_accuracy)

    k1 = 10 #controla o número de vias da validação cruzada para estimar o desempenho do modelo
    k2 = 5 #controla o número de vida da validação cruzada para otimização de hiperparametros

    #usar o protocolo de validação cruzada estratificada
    skf = StratifiedKFold(n_splits=k1, shuffle=True, random_state=1)

    acuracias = []

    #a função split retorna os índices das instâncias que devem ser usadas para o treinamento e o teste.
    for idx_treino, idx_teste in skf.split(X, y):
        
        #extrair as instâncias de treinamento de acordo com os índices fornecidos pelo skf.split
        X_treino = X.iloc[idx_treino]
        y_treino = y.iloc[idx_treino]
        
        #extrair as instâncias de teste de acordo com os índices fornecidos pelo skf.split
        X_teste = X.iloc[idx_teste]
        y_teste = y.iloc[idx_teste]
        
        #colocar todas as variáveis na mesma escala, usando o conjunto de treinamento para calcular os parâmetros da escala
        ss = StandardScaler()
        ss.fit(X_treino)
        X_treino = ss.transform(X_treino)
        X_teste = ss.transform(X_teste)
        
        #combinações de parametros otimizar. Aqui estamos apenas otimizando o número de vizinhos mais próximos para o knn (k).
        #Entretanto, podemos colocar todos os valores de todos os parametros. O sklearn se encarrega de gerar todas as combinações.
        params = {'n_neighbors' : range(1,30,2)}

        #instanciar um KNN com parametros padrão
        knn = KNeighborsClassifier()

        #instanciar um GridSearchCV com k2 vias.
        knn = GridSearchCV(knn, params, cv=StratifiedKFold(n_splits=k2), scoring=positive_accuracy_scorer)
        
        #realizar a otimização dos hiperparâmetros e treinar o modelo final com a melhor combinação de hiperparametros com todos os dados de treinamento
        knn.fit(X_treino, y_treino)
        
        acuracias.append(positive_accuracy_scorer(knn, X_teste, y_teste))  # Use a métrica personalizada para calcular a acurácia
        
    #calcular as estatísticas da validação cruzada. Estas estatísticas nos dão uma confiança que, na média, este é o desempenho esperado
    #do classificador no mundo real.
    # print("min: %.2f, max: %.2f, avg +- std: %.2f+-%.2f" % (min(acuracias), max(acuracias), np.mean(acuracias), np.std(acuracias)))
    return acuracias

# Validação cruzada em dois níveis com SVM

In [23]:
def do_svm():
    k1 = 10 #controla o número de vias da validação cruzada para estimar o desempenho do modelo
    k2 = 5 #controla o número de vida da validação cruzada para otimização de hiperparametros

    positive_accuracy_scorer = make_scorer(custom_positive_accuracy)

    #usar o protocolo de validação cruzada estratificada
    skf = StratifiedKFold(n_splits=k1, shuffle=True, random_state=1)

    acuracias = []

    #a função split retorna os índices das instâncias que devem ser usadas para o treinamento e o teste.
    for idx_treino, idx_teste in skf.split(X, y):
        
        #extrair as instâncias de treinamento de acordo com os índices fornecidos pelo skf.split
        X_treino = X.iloc[idx_treino]
        y_treino = y.iloc[idx_treino]
        
        #extrair as instâncias de teste de acordo com os índices fornecidos pelo skf.split
        X_teste = X.iloc[idx_teste]
        y_teste = y.iloc[idx_teste]
        
        #colocar todas as variáveis na mesma escala, usando o conjunto de treinamento para calcular os parâmetros da escala
        ss = StandardScaler()
        ss.fit(X_treino)
        X_treino = ss.transform(X_treino)
        X_teste = ss.transform(X_teste)
        
        #combinações de parametros otimizar. Aqui estamos apenas otimizando o número de vizinhos mais próximos para o knn (k).
        #Entretanto, podemos colocar todos os valores de todos os parametros. O sklearn se encarrega de gerar todas as combinações.
        params = {
            'C': [0.1, 1, 10, 100, 1000],
            'gamma': ['scale', 'auto', 2e-2, 2e-3, 2e-4],
        }
        #instanciar um SVM com parametros padrão
        svm = SVC(kernel='rbf')

        #instanciar um GridSearchCV com k2 vias.
        svm = GridSearchCV(svm, params, cv=StratifiedKFold(n_splits=k2), scoring=positive_accuracy_scorer)

        #realizar a otimização dos hiperparâmetros e treinar o modelo final com a melhor combinação de hiperparametros com todos os dados de treinamento
        svm.fit(X_treino, y_treino)
        
        acuracias.append(positive_accuracy_scorer(svm, X_teste, y_teste))  # Use a métrica personalizada para calcular a acurácia
        
    return acuracias

In [24]:
def calcular_estatisticas(resultados):
    return np.mean(resultados), np.std(resultados), np.min(resultados), np.max(resultados)

def imprimir_estatisticas(resultados):
    media, desvio, mini, maxi = calcular_estatisticas(resultados)
    print("Resultados: %.2f +- %.2f, min: %.2f, max: %.2f" % (media, desvio, mini, maxi))

In [25]:
accs_knn = do_knn()
accs_svm = do_svm()

In [26]:
imprimir_estatisticas(accs_knn)
imprimir_estatisticas(accs_svm)

Resultados: 0.87 +- 0.03, min: 0.81, max: 0.94
Resultados: 0.89 +- 0.06, min: 0.77, max: 0.98


In [27]:
from scipy.stats import ttest_ind_from_stats

In [28]:
# calculamos a média e o desvio padrão dos resultados
media_knn, std_knn, _, _ = calcular_estatisticas(accs_knn)
media_svm, std_svm, _, _ = calcular_estatisticas(accs_svm)

#calcular o pvalor usando o teste t de Student para duas amostras independentes
_, pvalor = ttest_ind_from_stats(media_knn, std_knn, len(accs_knn), media_svm, std_svm, len(accs_svm))

In [29]:
def rejeitar_hip_nula(media_amostral1, desvio_padrao_amostral1, n1, media_amostral2, desvio_padrao_amostral2, n2, alpha=0.05):
    _, pvalor = ttest_ind_from_stats(media_amostral1, desvio_padrao_amostral1, n1, media_amostral2, desvio_padrao_amostral2, n2)
    return pvalor <= alpha

In [30]:
rejeitar_hip_nula(media_knn, std_knn, len(accs_knn), media_svm, std_svm, len(accs_svm))

False

Pelo retorno da função ```rejeitar_hip_nula``` ser false, podemos concluir que não há evidências suficientes para rejeitar a hipótese nula, ou seja, não há diferença significativa entre os classificadores.

**Você usaria algum classificador que criou para decidir se comeria ou não um cogumelo classificado por
ele? Justifique usando o desempenho obtido e o resultado do teste de hipótese.**

**R**: Não usaria, pois apesar de as acurácias máximas e as médias serem relativamente altas, houve casos nos quais o classificador obteve baixas acur´cias