In [2]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Fixed seed so the train/test split (and every score derived from it) is the
# same on each Restart & Run All.
SEED = 42

# Load the tab-separated file and overwrite the parsed header with
# descriptive column names.
data = pd.read_csv('mushrooms.tsv', sep="\t")
data.columns = [
    "edibility", "cap_shape", "cap_surface", "cap_color", "bruises?", "odor",
    "gill_attachment", "gill_spacing", "gill_size", "gill_color",
    "stalk_shape", "stalk_root", "stalk_surface_above_ring",
    "stalk_surface_below_ring", "stalk_color_above_ring",
    "stalk_color_below_ring", "veil_type", "veil_color", "ring_number",
    "ring_type", "spore_print_color", "population", "habitat",
]

data = (
    data
    # dropna() returns a new frame instead of modifying in place, so the
    # result has to be kept for the rows to actually be removed.
    .dropna()
    # Binary target: 1 where the edibility column equals 'e', 0 otherwise.
    .assign(edible=lambda frame: (frame["edibility"] == 'e').astype(int))
    .drop(columns=["edibility"])
)

# One-hot encode the remaining categorical columns as 0/1 integers; the
# already-numeric "edible" column is passed through unchanged.
data = pd.get_dummies(data=data, dtype=int)

# Every column except the target is a model input.
features = [column for column in data.columns if column != "edible"]

# NOTE(review): train_size=0.3 fits on 30% of the rows and evaluates on the
# remaining 70% — confirm this is intended rather than test_size=0.3.
data_train, data_test = train_test_split(data, train_size=0.3, random_state=SEED)

### Dla 3 wybranych cech:

In [12]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Columns used in this experiment.
# NOTE(review): `features` is built from the one-hot encoded frame's columns,
# so this slice picks three indicator columns by position, not three original
# attributes — confirm that this is the intended selection.
selected_columns = features[3:6]

# Inputs and single-column target frames for the train and test partitions.
x_train, y_train = data_train[selected_columns], data_train[["edible"]]
x_test, y_expected = data_test[selected_columns], data_test[["edible"]]

# Fit with a hand-picked smoothing value and score on the held-out rows.
# fit() returns the estimator itself, so `mbayes` is the fitted model.
mbayes = MultinomialNB(alpha=0.0001).fit(x_train, y_train.to_numpy().ravel())
y_pred = mbayes.predict(x_test)
accuracy = accuracy_score(y_expected, y_pred)

print(f'Dokładność modelu dla 3 wybranych cech: {accuracy}')

# Check which alpha value gives the model its best results.
# The accuracy above was computed with the fixed alpha; the search result is
# only printed, not fed back into `mbayes`.
parameters = {'alpha': np.arange(0.0001, 1, 0.001)}
gridsearch = GridSearchCV(MultinomialNB(), parameters).fit(
    x_train, y_train.to_numpy().ravel()
)
print(gridsearch.best_params_)

0.5655511811023622
{'alpha': 0.0001}


### Dla wszystkich cech:

In [14]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import numpy as np
import math 

# Use every column listed in `features` as model input; the target is kept as
# a single-column frame for both partitions.
x_train = data_train[features]
x_test = data_test[features]
y_train = data_train[["edible"]]
y_expected = data_test[["edible"]]

# Flatten the target frame to a 1-D array, the shape passed to fit().
train_labels = y_train.to_numpy().ravel()

# Fit with a hand-picked smoothing value and score on the held-out rows.
mbayes = MultinomialNB(alpha=0.0001)
mbayes.fit(x_train, train_labels)
y_pred = mbayes.predict(x_test)
accuracy = accuracy_score(y_expected, y_pred)

print(f'Dokładność modelu dla wszystkich wybranych cech: {accuracy}')

# Check which alpha value gives the model its best results.
# Only the best parameter set is printed; `mbayes` above keeps its fixed alpha.
parameters = {'alpha': np.arange(0.0001, 1, 0.001)}
gridsearch = GridSearchCV(MultinomialNB(), parameters)
gridsearch.fit(x_train, train_labels)
print(gridsearch.best_params_)

0.9940944881889764
{'alpha': 0.0001}
