In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import f1_score, accuracy_score
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, cv, Pool
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

In [2]:
all_data = pd.read_csv('Mushrooms.csv')

In [3]:
all_data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


### [Dataset Source](https://www.kaggle.com/datasets/uciml/mushroom-classification)


### Attribute Information:
<ul>
<li><b>ID number</b></li>
<li><b>classes</b> : edible=e, poisonous=p</li>
<li><b>cap-shape</b> : bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s</li>
<li><b>cap-surface</b> fibrous=f,grooves=g,scaly=y,smooth=s</li>
<li><b>cap-color</b>: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y</li>
<li><b>bruises</b>: bruises=t,no=f</li>
<li><b>odor</b>: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s</li>
<li><b>gill-attachment</b>: attached=a,descending=d,free=f,notched=n</li>
<li><b>gill-spacing</b>: close=c,crowded=w,distant=d</li>
<li><b>gill-size</b>broad=b,narrow=n</li>
<li><b>gill-color</b>: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y</li>
<li><b>stalk-shape</b>: enlarging=e,tapering=t</li>
<li><b>stalk-root</b>: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?</li>
<li><b>stalk-surface-above-ring</b>: fibrous=f,scaly=y,silky=k,smooth=s</li>
<li><b>stalk-surface-below-ring</b>: fibrous=f,scaly=y,silky=k,smooth=s</li>
<li><b>stalk-color-above-ring</b>: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y</li>
<li><b>stalk-color-below-ring</b>: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y</li>
<li><b>veil-type</b>: partial=p,universal=u</li>
<li><b>veil-color</b>: brown=n,orange=o,white=w,yellow=y</li>
<li><b>ring-number</b>: none=n,one=o,two=t</li>
<li><b>ring-type</b>: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z</li>
<li><b>spore-print-color</b>: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y</li>
<li><b>population</b>: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y</li>
<li><b>habitat</b>: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d</li>
</ul>

In [4]:
s = (all_data.dtypes == 'object')
object_cols = list(s[s].index)
ordinal_encoder = OrdinalEncoder(dtype = int)
tmp = all_data.copy()
tmp[object_cols] = ordinal_encoder.fit_transform(tmp[object_cols])

X, y = tmp.drop(columns = 'class'), tmp['class']

In [5]:
skf = StratifiedKFold(shuffle = True, random_state=42)

## Training the models

In [6]:
results = pd.DataFrame(columns = ['f1', 'accuracy'])

### Decision tree

In [7]:
dt_params = {'max_depth': [5, 10, 50, 100, 200],
             'min_samples_split': [2, 10, 20],
             'min_samples_leaf': [1, 5, 10]}

In [8]:
dt_clf = GridSearchCV(tree.DecisionTreeClassifier(),
                      dt_params,
                      cv = skf,
                      n_jobs = -1,
                      scoring = ['f1', 'accuracy'],
                      refit = 'f1',
                      verbose = 1)
dt_clf.fit(X, y)

Fitting 5 folds for each of 45 candidates, totalling 225 fits


In [9]:
dt_f1_mean = dt_clf.cv_results_['mean_test_f1'][dt_clf.cv_results_['rank_test_f1'].argmin()]
dt_acc_mean = dt_clf.cv_results_['mean_test_accuracy'][dt_clf.cv_results_['rank_test_f1'].argmin()]

results.loc['Decision Tree'] = [dt_f1_mean, dt_acc_mean]

### Random forest

In [10]:
rf_params = {'n_estimators': [100, 200],
             'max_depth': [10, None],
             'min_samples_split': [2, 10],
             'min_samples_leaf': [1, 10]}

In [11]:
rf_clf = GridSearchCV(RandomForestClassifier(),
                      rf_params,
                      cv = skf,
                      n_jobs = -1,
                      scoring = ['f1', 'accuracy'],
                      refit = 'f1',
                      verbose = 1)
rf_clf.fit(X, y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [12]:
rf_f1_mean = rf_clf.cv_results_['mean_test_f1'][rf_clf.cv_results_['rank_test_f1'].argmin()]
rf_acc_mean = rf_clf.cv_results_['mean_test_accuracy'][rf_clf.cv_results_['rank_test_f1'].argmin()]

results.loc['Random Forest'] = [rf_f1_mean, rf_acc_mean]

### xGboost

In [13]:
xgb_params = {'min_child_weight': [1, 5, 10],
              'gamma': [0.5, 1, 1.5, 2, 5],
              'subsample': [0.6, 0.8, 1.0],
              'colsample_bytree': [0.6, 0.8, 1.0],
              'max_depth': [3, 4, 5]}

In [14]:
xgb = XGBClassifier(learning_rate=0.02,
                    n_estimators=600,
                    objective='binary:logistic',
                    silent=True)

In [15]:
xgb_rs = RandomizedSearchCV(xgb,
                            param_distributions = xgb_params,
                            n_iter = 30,
                            scoring = ['f1', 'accuracy'],
                            refit = 'f1',
                            n_jobs = -1,
                            cv = skf,
                            verbose = 1)
xgb_rs.fit(X, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
Parameters: { "silent" } are not used.



In [16]:
xgb_f1_mean = xgb_rs.cv_results_['mean_test_f1'][xgb_rs.cv_results_['rank_test_f1'].argmin()]
xgb_acc_mean = xgb_rs.cv_results_['mean_test_accuracy'][xgb_rs.cv_results_['rank_test_f1'].argmin()]

results.loc['xGboost'] = [xgb_f1_mean, xgb_acc_mean]

### CatBoost

In [17]:
cat_features = X.select_dtypes(include=['object']).columns.tolist()
cat = CatBoostClassifier(cat_features=cat_features)

In [18]:
def cv_scores(cv_data):
    cv_data.head(10)

    best_acc_value = cv_data['test-Accuracy-mean'].max()
    best_acc_iter = cv_data['test-Accuracy-mean'].values.argmax()

    best_f1_value = cv_data['test-F1-mean'].max()
    best_f1_iter = cv_data['test-F1-mean'].values.argmax()
    return best_f1_value, best_acc_value

In [19]:
cat_params = {'loss_function': 'Logloss',
              'iterations': 100,
              'custom_loss': ['F1', 'Accuracy'],
              'learning_rate': 0.5,}

In [20]:
train_pool = Pool(data=X, label=y, cat_features=cat_features, has_header=True)
cv_data = cv(params = cat_params,
             pool = train_pool,
             verbose = False,
             folds = skf,
             plot=True)

cat_f1_mean, cat_acc_mean = cv_scores(cv_data)

results.loc['CatBoost'] = [cat_f1_mean, cat_acc_mean]

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]



bestTest = 0.000643712018
bestIteration = 49

Training on fold [1/5]

bestTest = 0.0005747078835
bestIteration = 78

Training on fold [2/5]

bestTest = 0.0005027081879
bestIteration = 56

Training on fold [3/5]

bestTest = 0.0007864869364
bestIteration = 39

Training on fold [4/5]

bestTest = 0.0004705575731
bestIteration = 28



### kNN

Since KNN cant handle categorical features, we will one-hot-encode every categorical feature and then reduce dimensionality with svd decomposition.

In [21]:
oh_enc = OneHotEncoder(drop = 'first')
X_oh = oh_enc.fit_transform(X)

In [22]:
decomp = TruncatedSVD()
knn = KNeighborsClassifier()

knn_pipe = Pipeline(steps=[('svd', decomp),
                           ('knn', knn)])

knn_params = {"svd__n_components": [2, 5, 10],
              "svd__n_iter": [5],
              "knn__n_neighbors": [1, 2, 3, 4, 5],
              "knn__weights" : ["uniform", "distance"],
              "knn__metric" : ["euclidean"]}

In [23]:
knn_clf = GridSearchCV(knn_pipe,
                      knn_params,
                      cv = skf,
                      n_jobs = -1,
                      scoring = ['f1', 'accuracy'],
                      refit = 'f1',
                      verbose = 1)

knn_clf.fit(X, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [24]:
knn_f1_mean = knn_clf.cv_results_['mean_test_f1'][knn_clf.cv_results_['rank_test_f1'].argmin()]
knn_acc_mean = knn_clf.cv_results_['mean_test_accuracy'][knn_clf.cv_results_['rank_test_f1'].argmin()]

results.loc['KNN'] = [knn_f1_mean, knn_acc_mean]

### Naive Bayes

In [25]:
gnb = GaussianNB()
gnb_pipe = Pipeline(steps=[('svd', decomp),
                           ('gnb', gnb)])
gnb_params = {"svd__n_components": [2, 5, 10],
              "svd__n_iter": [5]}

In [26]:
gnb_clf = GridSearchCV(gnb_pipe,
                       gnb_params,
                       cv = skf,
                       n_jobs = -1,
                       scoring = ['f1', 'accuracy'],
                       refit = 'f1',
                       verbose = 1)

gnb_clf.fit(X, y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [27]:
gnb_f1_mean = gnb_clf.cv_results_['mean_test_f1'][gnb_clf.cv_results_['rank_test_f1'].argmin()]
gnb_acc_mean = gnb_clf.cv_results_['mean_test_accuracy'][gnb_clf.cv_results_['rank_test_f1'].argmin()]

results.loc['GaussianNB'] = [gnb_f1_mean, gnb_acc_mean]

### Logistic Regression

In [28]:
lr = LogisticRegression(max_iter=1000)
lr_params = [{'penalty': ['l1', 'l2'],
              'C': [0.1, 1, 10],
              'solver': ['liblinear']},

             {'penalty': [None],
              'solver': ['lbfgs']},]

lr_clf = GridSearchCV(lr,
                      lr_params,
                      cv = skf,
                      n_jobs = -1,
                      scoring = ['f1', 'accuracy'],
                      refit = 'f1',
                      verbose = 1)

lr_clf.fit(X, y)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


In [29]:
lr_f1_mean = lr_clf.cv_results_['mean_test_f1'][lr_clf.cv_results_['rank_test_f1'].argmin()]
lr_acc_mean = lr_clf.cv_results_['mean_test_accuracy'][lr_clf.cv_results_['rank_test_f1'].argmin()]

results.loc['LogisticRegression'] = [lr_f1_mean, lr_acc_mean]

## Results

In [30]:
results

Unnamed: 0,f1,accuracy
Decision Tree,1.0,1.0
Random Forest,1.0,1.0
xGboost,1.0,1.0
CatBoost,1.0,1.0
KNN,0.999362,0.999385
GaussianNB,0.82464,0.842195
LogisticRegression,0.965668,0.966888


In [31]:
results.to_csv('msuhroom_resuts.csv')

In [32]:
results.round(3)

Unnamed: 0,f1,accuracy
Decision Tree,1.0,1.0
Random Forest,1.0,1.0
xGboost,1.0,1.0
CatBoost,1.0,1.0
KNN,0.999,0.999
GaussianNB,0.825,0.842
LogisticRegression,0.966,0.967
