# Data Wrangle same as notebook Data_Wrangling_0 but will try to impute using kvm as model instead of random forest classifier
* Explore and look at data
* check for shape, missing data, and characteristics
* Initiative to determine if a mushroom is edible or not via looking at its characteristics

In [1]:
# import packages

import numpy as np
import pandas as pd
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import klib
import plotly.figure_factory as ff
from IPython.core.display import display
from scipy import stats
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import joblib
# pd.set_option('display.max_columns', None)
# pd.reset_option('max_rows')
#np.set_printoptions(threshold=sys.maxsize)

plt.style.use('dark_background')
plt.rcParams.update({"grid.linewidth":0.5, "grid.alpha":0.5})
sns.set(style='ticks', context='talk')

In [2]:
path = r'../../data/raw/mushrooms.csv'

mushroom = pd.read_csv(path)
mushroom.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
mushroom.shape, mushroom.isnull().sum()

((8124, 23),
 class                       0
 cap-shape                   0
 cap-surface                 0
 cap-color                   0
 bruises                     0
 odor                        0
 gill-attachment             0
 gill-spacing                0
 gill-size                   0
 gill-color                  0
 stalk-shape                 0
 stalk-root                  0
 stalk-surface-above-ring    0
 stalk-surface-below-ring    0
 stalk-color-above-ring      0
 stalk-color-below-ring      0
 veil-type                   0
 veil-color                  0
 ring-number                 0
 ring-type                   0
 spore-print-color           0
 population                  0
 habitat                     0
 dtype: int64)

* The classes for the data appear to be poisonous, edible, so it appears to be binary like
* We have 8k rows and 23 columns
* There is no perceivable missing data but need to check whether any erratic data

In [4]:
mushroom.info(), mushroom.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

(None,
        class cap-shape cap-surface cap-color bruises  odor gill-attachment  \
 count   8124      8124        8124      8124    8124  8124            8124   
 unique     2         6           4        10       2     9               2   
 top        e         x           y         n       f     n               f   
 freq    4208      3656        3244      2284    4748  3528            7914   
 
        gill-spacing gill-size gill-color  ... stalk-surface-below-ring  \
 count          8124      8124       8124  ...                     8124   
 unique            2         2         12  ...                        4   
 top               c         b          b  ...                        s   
 freq           6812      5612       1728  ...                     4936   
 
        stalk-color-above-ring stalk-color-below-ring veil-type veil-color  \
 count                    8124                   8124      8124       8124   
 unique                      9                      9         1

* the data type needs to be converted to string or text
* perhaps improve readability for folder would helpful for future purposes
* to check any errors on data, need to check columns for unique values


In [5]:
# get max length of characters used for column names
mush_col = mushroom.columns

# convert each column to type string with given max character length
for col in mush_col:
    mushroom[col] = mushroom[col].astype('string')

# display converted column types to string
mushroom.dtypes

class                       string
cap-shape                   string
cap-surface                 string
cap-color                   string
bruises                     string
odor                        string
gill-attachment             string
gill-spacing                string
gill-size                   string
gill-color                  string
stalk-shape                 string
stalk-root                  string
stalk-surface-above-ring    string
stalk-surface-below-ring    string
stalk-color-above-ring      string
stalk-color-below-ring      string
veil-type                   string
veil-color                  string
ring-number                 string
ring-type                   string
spore-print-color           string
population                  string
habitat                     string
dtype: object

In [6]:
# with open('../../data/raw/opafy19nid/opafy19nid.dat') as f:
#     # next(f)
#     tmp = pd.DataFrame(l.rstrip().split() for l in f)
# print(tmp)

In [7]:
mushroom.nunique()

class                        2
cap-shape                    6
cap-surface                  4
cap-color                   10
bruises                      2
odor                         9
gill-attachment              2
gill-spacing                 2
gill-size                    2
gill-color                  12
stalk-shape                  2
stalk-root                   5
stalk-surface-above-ring     4
stalk-surface-below-ring     4
stalk-color-above-ring       9
stalk-color-below-ring       9
veil-type                    1
veil-color                   4
ring-number                  3
ring-type                    5
spore-print-color            9
population                   6
habitat                      7
dtype: int64

* we can look at documentation for names for each labeled values

In [8]:
# use to check if any values explain missingness
for count, col in enumerate(mush_col):
    print(count, col, mushroom[col].unique(), '\n')

0 class <StringArray>
['p', 'e']
Length: 2, dtype: string 

1 cap-shape <StringArray>
['x', 'b', 's', 'f', 'k', 'c']
Length: 6, dtype: string 

2 cap-surface <StringArray>
['s', 'y', 'f', 'g']
Length: 4, dtype: string 

3 cap-color <StringArray>
['n', 'y', 'w', 'g', 'e', 'p', 'b', 'u', 'c', 'r']
Length: 10, dtype: string 

4 bruises <StringArray>
['t', 'f']
Length: 2, dtype: string 

5 odor <StringArray>
['p', 'a', 'l', 'n', 'f', 'c', 'y', 's', 'm']
Length: 9, dtype: string 

6 gill-attachment <StringArray>
['f', 'a']
Length: 2, dtype: string 

7 gill-spacing <StringArray>
['c', 'w']
Length: 2, dtype: string 

8 gill-size <StringArray>
['n', 'b']
Length: 2, dtype: string 

9 gill-color <StringArray>
['k', 'n', 'g', 'p', 'w', 'h', 'u', 'e', 'b', 'r', 'y', 'o']
Length: 12, dtype: string 

10 stalk-shape <StringArray>
['e', 't']
Length: 2, dtype: string 

11 stalk-root <StringArray>
['e', 'c', 'b', 'r', '?']
Length: 5, dtype: string 

12 stalk-surface-above-ring <StringArray>
['s', 'f', '

* column 11, stalk-root has a missing value column ?
* next need to check how how many ? values in column 11
* Also veil type only has one distinct value and has no use so we will just drop this for now

In [9]:
# drop low variance column
mushroom.drop('veil-type', axis=1, inplace=True)
mush_col = mushroom.columns
mush_col

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-color', 'ring-number', 'ring-type',
       'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [10]:
# reformat names to more interpreted labels
mapping = { 'cap-shape': {'b': 'bell', 'c': 'conical', 'x': 'convex', 'f': 'flat', 'k': 'knobbed', 's': 'sunken'},
            'cap-surface': {'s': 'smooth', 'f': 'fibrous', 'g': 'grooves', 'y': 'scaly'},
            'cap-color': {'n': 'brown', 'b': 'buff', 'c': 'cinnamon', 'g': 'gray', 'r': 'green', 'p':'pink',
                          'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow'},
            'bruises': {'t': 'bruises', 'f': 'no'},
            'odor': {'a': 'almond', 'l': 'anise', 'c': 'creosote', 'y': 'fishy', 'f': 'foul', 'm': 'musty',
                     'n': 'none', 'p': 'pungent', 's': 'spicy'},
            'gill-attachment': {'a': 'attached', 'd': 'descending', 'f': 'free','n': 'notched'},
            'gill-spacing': {'c': 'close', 'w': 'crowded', 'd': 'distant'},
            'gill-size': {'b': 'broad', 'n': 'narrow'},
            'gill-color': {'k': 'black', 'n': 'brown', 'b': 'buff', 'h': 'chocolate', 'g': 'gray', 'r': 'green',
                           'o': 'orange', 'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow'},
            'stalk-shape': {'e': 'enlarging', 't': 'tapering'},
            'stalk-root': {'b': 'bulbous', 'c': 'club', 'u': 'cup','e': 'equal', 'z': 'rhizomorphs',
                           'r': 'rooted', '?': 'missing'},
            'stalk-surface-above-ring': {'f': 'fibrous', 'y': 'scaly', 'k': 'silky', 's': 'smooth'},
            'stalk-surface-below-ring': {'f': 'fibrous', 'y': 'scaly', 'k': 'silky', 's': 'smooth'},
            'stalk-color-above-ring': {'n': 'brown', 'b': 'buff', 'c': 'cinnamon', 'g': 'gray', 'o': 'orange',
                                       'p': 'pink', 'e': 'red', 'w': 'white', 'y': 'yellow'},
            'stalk-color-below-ring': {'n': 'brown', 'b': 'buff', 'c': 'cinnamon', 'g': 'gray', 'o': 'orange',
                                       'p': 'pink', 'e': 'red', 'w': 'white', 'y': 'yellow'},
            'veil-color': {'n': 'brown', 'o': 'orange', 'w': 'white', 'y': 'yellow'},
            'ring-number': {'n': 'none', 'o': 'one', 't': 'two'},
            'ring-type': {'c': 'cobwebby', 'e': 'evanescent', 'f': 'flaring', 'l': 'large', 'n': 'none',
                          'p': 'pendant', 's': 'sheathing', 'z': 'zone'},
            'spore-print-color': {'k': 'black', 'n': 'brown', 'b': 'buff', 'h': 'chocolate', 'r': 'green',
                                  'o': 'orange', 'u': 'purple', 'w': 'white', 'y': 'yellow'},
            'population': {'a': 'abundant', 'c': 'clustered', 'n': 'numerous', 's': 'scattered',
                           'v': 'several', 'y': 'solitary'},
            'habitat': {'g': 'grasses', 'l': 'leaves', 'm': 'meadows', 'd': 'woods', 'p': 'paths',
                        'u': 'urban', 'w': 'waste'},
            'class': {'e': 'edible', 'p':'poisonous'}
          }
mushroom.replace(mapping, inplace = True)
mushroom.head()


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,poisonous,convex,smooth,brown,bruises,pungent,free,close,narrow,black,...,smooth,smooth,white,white,white,one,pendant,black,scattered,urban
1,edible,convex,smooth,yellow,bruises,almond,free,close,broad,black,...,smooth,smooth,white,white,white,one,pendant,brown,numerous,grasses
2,edible,bell,smooth,white,bruises,anise,free,close,broad,brown,...,smooth,smooth,white,white,white,one,pendant,brown,numerous,meadows
3,poisonous,convex,scaly,white,bruises,pungent,free,close,narrow,brown,...,smooth,smooth,white,white,white,one,pendant,black,scattered,urban
4,edible,convex,smooth,gray,no,none,free,crowded,broad,black,...,smooth,smooth,white,white,white,one,evanescent,brown,abundant,grasses


In [11]:
# make a column for missing and nonmissing data
missing_stalk_roots = [val  if val == 'missing' else 'non-missing' for val in mushroom['stalk-root']]

mushroom.insert(12, 'missing_stalk_roots', missing_stalk_roots)
mushroom.head()



Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,poisonous,convex,smooth,brown,bruises,pungent,free,close,narrow,black,...,smooth,smooth,white,white,white,one,pendant,black,scattered,urban
1,edible,convex,smooth,yellow,bruises,almond,free,close,broad,black,...,smooth,smooth,white,white,white,one,pendant,brown,numerous,grasses
2,edible,bell,smooth,white,bruises,anise,free,close,broad,brown,...,smooth,smooth,white,white,white,one,pendant,brown,numerous,meadows
3,poisonous,convex,scaly,white,bruises,pungent,free,close,narrow,brown,...,smooth,smooth,white,white,white,one,pendant,black,scattered,urban
4,edible,convex,smooth,gray,no,none,free,crowded,broad,black,...,smooth,smooth,white,white,white,one,evanescent,brown,abundant,grasses


# Predict and impute missing data
* Separate dataframe from missing and non missing data
* preprocess the known data
* split known data
* perform KNN to train and get score
* use this model to predict our missing data values

In [12]:
# drop mushroom missing column as we no longer need it
mushroom.drop('missing_stalk_roots', axis=1, inplace=True)
'missing_stalk_roots' in mushroom.columns

False

In [13]:
# separate known and unknown data
unknown_stalk = mushroom['stalk-root'] == 'missing'

known_mushroom = mushroom.loc[~unknown_stalk]
display(known_mushroom)

unknown_mushroom = mushroom.loc[unknown_stalk]
display(unknown_mushroom)


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,poisonous,convex,smooth,brown,bruises,pungent,free,close,narrow,black,...,smooth,smooth,white,white,white,one,pendant,black,scattered,urban
1,edible,convex,smooth,yellow,bruises,almond,free,close,broad,black,...,smooth,smooth,white,white,white,one,pendant,brown,numerous,grasses
2,edible,bell,smooth,white,bruises,anise,free,close,broad,brown,...,smooth,smooth,white,white,white,one,pendant,brown,numerous,meadows
3,poisonous,convex,scaly,white,bruises,pungent,free,close,narrow,brown,...,smooth,smooth,white,white,white,one,pendant,black,scattered,urban
4,edible,convex,smooth,gray,no,none,free,crowded,broad,black,...,smooth,smooth,white,white,white,one,evanescent,brown,abundant,grasses
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7986,edible,bell,scaly,brown,no,none,free,close,broad,white,...,scaly,scaly,brown,brown,white,two,pendant,white,solitary,paths
8001,edible,convex,scaly,brown,no,none,free,close,broad,white,...,scaly,scaly,brown,brown,white,two,pendant,white,solitary,paths
8038,edible,convex,scaly,gray,bruises,none,free,close,broad,white,...,smooth,smooth,white,white,white,two,pendant,white,solitary,paths
8095,poisonous,convex,scaly,cinnamon,no,musty,free,close,broad,yellow,...,silky,scaly,cinnamon,cinnamon,white,none,none,white,clustered,woods


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
3984,edible,convex,scaly,buff,bruises,none,free,close,broad,red,...,smooth,smooth,red,white,white,two,evanescent,white,clustered,waste
4023,poisonous,convex,scaly,red,no,fishy,free,close,narrow,buff,...,silky,smooth,white,white,white,one,evanescent,white,several,paths
4076,edible,flat,scaly,purple,no,none,free,close,narrow,chocolate,...,smooth,fibrous,white,white,white,one,flaring,chocolate,solitary,woods
4100,poisonous,convex,scaly,red,no,fishy,free,close,narrow,buff,...,silky,smooth,pink,pink,white,one,evanescent,white,several,woods
4104,poisonous,convex,scaly,brown,no,foul,free,close,narrow,buff,...,smooth,smooth,pink,pink,white,one,evanescent,white,several,leaves
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,edible,knobbed,smooth,brown,no,none,attached,close,broad,yellow,...,smooth,smooth,orange,orange,orange,one,pendant,buff,clustered,leaves
8120,edible,convex,smooth,brown,no,none,attached,close,broad,yellow,...,smooth,smooth,orange,orange,brown,one,pendant,buff,several,leaves
8121,edible,flat,smooth,brown,no,none,attached,close,broad,brown,...,smooth,smooth,orange,orange,orange,one,pendant,buff,clustered,leaves
8122,poisonous,knobbed,scaly,brown,no,fishy,free,close,narrow,buff,...,smooth,silky,white,white,white,one,evanescent,white,several,leaves


### Preprocessing
We are using a label encoder even though the categories are not ordinal.
This is due to the fact that we will be using a tree model and tree models have a behavior
to perform well with label encoders even when no ordering relationship is present
Also save time in not needing to perform one hot encoding and then pca

In [14]:
# preprocess known mushrooms
# also drop class (due to data leakage later in notebook)
X = known_mushroom.drop(['stalk-root', 'class'], axis=1)
y = known_mushroom[['stalk-root']]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [16]:
X_cols = X_train.columns

one_hot_X = OneHotEncoder(handle_unknown='ignore', sparse=False)
one_hot_y = OneHotEncoder(handle_unknown='ignore', sparse=False)

X_train = one_hot_X.fit_transform(X_train)
X_test = one_hot_X.transform(X_test)
# X_train = one_hot_X.fit_transform(X_train)
# X_test = one_hot_X.transform(X_test)

# encode y_train and fit y_test on that encoder
y_train = one_hot_y.fit_transform(y_train)
y_test = one_hot_y.transform(y_test)

In [17]:
X_train.shape, y_train.shape

((4233, 93), (4233, 4))

In [18]:
knn_pipe = Pipeline([('rf', KNeighborsClassifier())])
knn_param_grid = {}

In [19]:
knn_grid = GridSearchCV(knn_pipe, knn_param_grid, cv=5)
knn_grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=Pipeline(steps=[('rf', KNeighborsClassifier())]),
             param_grid={})

In [20]:
train_std = knn_grid.cv_results_['std_test_score'][knn_grid.best_index_]
train_score, test_score = knn_grid.best_score_, knn_grid.score(X_test, y_test)

In [21]:
print('train score:', train_score, 'train std score:', train_std, 'test score:', test_score)

train score: 1.0 train std score: 0.0 test score: 1.0


In [22]:
# save knn_model_for_imputataion
knn_model_for_imputation = knn_grid.best_estimator_
model_path = r'../../model/knn_model_for_imputation.sav'

joblib.dump(knn_model_for_imputation, model_path)

['../../model/knn_model_for_imputation.sav']

# evaluation
With the amount of data and strong correlations from earlier I am not as surprised that we are able to score a 100% accuracy
* default scoring of accuracy metric was used for random forest


In [23]:
# y_pred = rf_model_for_imputation.predict(X_test)
# # confusion_matrix(y_test, y_pred)
# # one_hot_y.inverse_transform(y_pred)#%%
#
# y_labels = np.unique(one_hot_y.inverse_transform(y_test)).tolist()
# plt.figure(figsize=(15,15))
# plot_confusion_matrix(rf_model_for_imputation, X_test, y_test, display_labels=y_labels, cmap='Blues_r')
# plt.show()

# Impute values for unknown mushrooms

In [24]:
X_unknown = unknown_mushroom.drop(['stalk-root', 'class'], axis=1)
y_unknown = unknown_mushroom['stalk-root']

In [25]:
X_unknown = one_hot_X.transform(X_unknown)

In [26]:
# Impute values for missing data
y_pred_unknown = knn_model_for_imputation.predict(X_unknown)
y_pred_unknown = one_hot_y.inverse_transform(y_pred_unknown)
y_pred_unknown

array([['bulbous'],
       ['bulbous'],
       ['equal'],
       ...,
       ['equal'],
       ['bulbous'],
       ['equal']], dtype=object)

# Replace missing data with predicted values

In [27]:
y_pred_unknown


array([['bulbous'],
       ['bulbous'],
       ['equal'],
       ...,
       ['equal'],
       ['bulbous'],
       ['equal']], dtype=object)

In [28]:
unknown_mushroom['stalk-root'] = y_pred_unknown
unknown_mushroom

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
3984,edible,convex,scaly,buff,bruises,none,free,close,broad,red,...,smooth,smooth,red,white,white,two,evanescent,white,clustered,waste
4023,poisonous,convex,scaly,red,no,fishy,free,close,narrow,buff,...,silky,smooth,white,white,white,one,evanescent,white,several,paths
4076,edible,flat,scaly,purple,no,none,free,close,narrow,chocolate,...,smooth,fibrous,white,white,white,one,flaring,chocolate,solitary,woods
4100,poisonous,convex,scaly,red,no,fishy,free,close,narrow,buff,...,silky,smooth,pink,pink,white,one,evanescent,white,several,woods
4104,poisonous,convex,scaly,brown,no,foul,free,close,narrow,buff,...,smooth,smooth,pink,pink,white,one,evanescent,white,several,leaves
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,edible,knobbed,smooth,brown,no,none,attached,close,broad,yellow,...,smooth,smooth,orange,orange,orange,one,pendant,buff,clustered,leaves
8120,edible,convex,smooth,brown,no,none,attached,close,broad,yellow,...,smooth,smooth,orange,orange,brown,one,pendant,buff,several,leaves
8121,edible,flat,smooth,brown,no,none,attached,close,broad,brown,...,smooth,smooth,orange,orange,orange,one,pendant,buff,clustered,leaves
8122,poisonous,knobbed,scaly,brown,no,fishy,free,close,narrow,buff,...,smooth,silky,white,white,white,one,evanescent,white,several,leaves


In [29]:
unknown_mushroom['stalk-root'].isnull().sum()

13

With KNN classifer we were only narrowed down to 13 None values which is better than our random forest model

In [30]:
known_mushroom['stalk-root'].unique(), unknown_mushroom['stalk-root'].unique()

(array(['equal', 'club', 'bulbous', 'rooted'], dtype=object),
 array(['bulbous', 'equal', 'club', None], dtype=object))

In [31]:
unknown_mushroom['stalk-root'].value_counts()

bulbous    2030
equal       407
club         30
Name: stalk-root, dtype: int64

In [32]:
unknown_mushroom['stalk-root'] = unknown_mushroom['stalk-root'].astype('string')

In [33]:
None_mush = unknown_mushroom['stalk-root'].isnull()

unknown_mushroom_2 = unknown_mushroom.loc[None_mush]
known_mushroom_2 = unknown_mushroom.loc[~None_mush]
unknown_mushroom_2

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
6203,edible,knobbed,smooth,white,no,none,free,crowded,broad,gray,...,silky,smooth,white,white,white,two,pendant,white,scattered,grasses
6396,edible,convex,smooth,white,no,none,free,crowded,broad,gray,...,silky,smooth,white,white,white,two,pendant,white,scattered,grasses
7003,edible,convex,smooth,white,no,none,free,crowded,broad,white,...,silky,silky,white,white,white,two,pendant,white,scattered,grasses
7026,edible,convex,smooth,brown,no,none,attached,close,broad,brown,...,smooth,smooth,orange,orange,orange,one,pendant,brown,clustered,leaves
7400,edible,bell,fibrous,white,no,none,free,crowded,broad,gray,...,smooth,silky,white,white,white,two,pendant,white,scattered,grasses
7526,edible,convex,smooth,brown,no,none,attached,close,broad,brown,...,smooth,smooth,orange,orange,brown,one,pendant,brown,clustered,leaves
7584,edible,bell,fibrous,white,no,none,free,crowded,broad,gray,...,silky,smooth,white,white,white,two,pendant,white,scattered,grasses
7623,edible,knobbed,smooth,white,no,none,free,crowded,broad,gray,...,smooth,smooth,white,white,white,two,pendant,white,scattered,grasses
7631,edible,bell,fibrous,white,no,none,free,crowded,broad,white,...,silky,silky,white,white,white,two,pendant,white,scattered,grasses
7640,edible,bell,smooth,white,no,none,free,crowded,broad,pink,...,smooth,smooth,white,white,white,two,pendant,white,scattered,grasses


In [34]:
display(known_mushroom_2)
display(known_mushroom)


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
3984,edible,convex,scaly,buff,bruises,none,free,close,broad,red,...,smooth,smooth,red,white,white,two,evanescent,white,clustered,waste
4023,poisonous,convex,scaly,red,no,fishy,free,close,narrow,buff,...,silky,smooth,white,white,white,one,evanescent,white,several,paths
4076,edible,flat,scaly,purple,no,none,free,close,narrow,chocolate,...,smooth,fibrous,white,white,white,one,flaring,chocolate,solitary,woods
4100,poisonous,convex,scaly,red,no,fishy,free,close,narrow,buff,...,silky,smooth,pink,pink,white,one,evanescent,white,several,woods
4104,poisonous,convex,scaly,brown,no,foul,free,close,narrow,buff,...,smooth,smooth,pink,pink,white,one,evanescent,white,several,leaves
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,edible,knobbed,smooth,brown,no,none,attached,close,broad,yellow,...,smooth,smooth,orange,orange,orange,one,pendant,buff,clustered,leaves
8120,edible,convex,smooth,brown,no,none,attached,close,broad,yellow,...,smooth,smooth,orange,orange,brown,one,pendant,buff,several,leaves
8121,edible,flat,smooth,brown,no,none,attached,close,broad,brown,...,smooth,smooth,orange,orange,orange,one,pendant,buff,clustered,leaves
8122,poisonous,knobbed,scaly,brown,no,fishy,free,close,narrow,buff,...,smooth,silky,white,white,white,one,evanescent,white,several,leaves


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,poisonous,convex,smooth,brown,bruises,pungent,free,close,narrow,black,...,smooth,smooth,white,white,white,one,pendant,black,scattered,urban
1,edible,convex,smooth,yellow,bruises,almond,free,close,broad,black,...,smooth,smooth,white,white,white,one,pendant,brown,numerous,grasses
2,edible,bell,smooth,white,bruises,anise,free,close,broad,brown,...,smooth,smooth,white,white,white,one,pendant,brown,numerous,meadows
3,poisonous,convex,scaly,white,bruises,pungent,free,close,narrow,brown,...,smooth,smooth,white,white,white,one,pendant,black,scattered,urban
4,edible,convex,smooth,gray,no,none,free,crowded,broad,black,...,smooth,smooth,white,white,white,one,evanescent,brown,abundant,grasses
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7986,edible,bell,scaly,brown,no,none,free,close,broad,white,...,scaly,scaly,brown,brown,white,two,pendant,white,solitary,paths
8001,edible,convex,scaly,brown,no,none,free,close,broad,white,...,scaly,scaly,brown,brown,white,two,pendant,white,solitary,paths
8038,edible,convex,scaly,gray,bruises,none,free,close,broad,white,...,smooth,smooth,white,white,white,two,pendant,white,solitary,paths
8095,poisonous,convex,scaly,cinnamon,no,musty,free,close,broad,yellow,...,silky,scaly,cinnamon,cinnamon,white,none,none,white,clustered,woods


In [35]:
known_mushroom = known_mushroom.append(known_mushroom_2)
known_mushroom

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,poisonous,convex,smooth,brown,bruises,pungent,free,close,narrow,black,...,smooth,smooth,white,white,white,one,pendant,black,scattered,urban
1,edible,convex,smooth,yellow,bruises,almond,free,close,broad,black,...,smooth,smooth,white,white,white,one,pendant,brown,numerous,grasses
2,edible,bell,smooth,white,bruises,anise,free,close,broad,brown,...,smooth,smooth,white,white,white,one,pendant,brown,numerous,meadows
3,poisonous,convex,scaly,white,bruises,pungent,free,close,narrow,brown,...,smooth,smooth,white,white,white,one,pendant,black,scattered,urban
4,edible,convex,smooth,gray,no,none,free,crowded,broad,black,...,smooth,smooth,white,white,white,one,evanescent,brown,abundant,grasses
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,edible,knobbed,smooth,brown,no,none,attached,close,broad,yellow,...,smooth,smooth,orange,orange,orange,one,pendant,buff,clustered,leaves
8120,edible,convex,smooth,brown,no,none,attached,close,broad,yellow,...,smooth,smooth,orange,orange,brown,one,pendant,buff,several,leaves
8121,edible,flat,smooth,brown,no,none,attached,close,broad,brown,...,smooth,smooth,orange,orange,orange,one,pendant,buff,clustered,leaves
8122,poisonous,knobbed,scaly,brown,no,fishy,free,close,narrow,buff,...,smooth,silky,white,white,white,one,evanescent,white,several,leaves


# impute with added data
* repeat same steps to predict missing data


In [36]:
# preprocess updated known mushrooms
# also drop class (due to data leakage later in notebook)
X = known_mushroom.drop(['stalk-root', 'class'], axis=1)
y = known_mushroom[['stalk-root']]

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [38]:
X_cols = X_train.columns

one_hot_X = OneHotEncoder(handle_unknown='ignore', sparse=False)
one_hot_y = OneHotEncoder(handle_unknown='ignore', sparse=False)

X_train = one_hot_X.fit_transform(X_train)
X_test = one_hot_X.transform(X_test)
# X_train = one_hot_X.fit_transform(X_train)
# X_test = one_hot_X.transform(X_test)

# encode y_train and fit y_test on that encoder
y_train = one_hot_y.fit_transform(y_train)
y_test = one_hot_y.transform(y_test)

In [39]:
X_train.shape, y_train.shape

((6083, 111), (6083, 4))

In [40]:
knn_grid_2 = GridSearchCV(knn_pipe, knn_param_grid, cv=5)
knn_grid_2.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=Pipeline(steps=[('rf', KNeighborsClassifier())]),
             param_grid={})

In [41]:
train_std = knn_grid_2.cv_results_['std_test_score'][knn_grid_2.best_index_]
train_score, test_score = knn_grid_2.best_score_, knn_grid_2.score(X_test, y_test)

In [42]:
print('train score:', train_score, 'train std score:', train_std, 'test score:', test_score)

train score: 0.9871775396791074 train std score: 0.003311333746172482 test score: 0.985207100591716


In [43]:
# save knn_model_for_imputataion
knn_model_for_imputation_2 = knn_grid_2.best_estimator_
model_path = r'../../model/knn_model_for_imputation_3.sav'

joblib.dump(knn_model_for_imputation_2, model_path)

['../../model/knn_model_for_imputation_3.sav']

## With imputed predicted values our score actually drop, perhaps the predictions were not as accurate?


In [44]:
# Impute values for unknown mushrooms
unknown_mushroom_2

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
6203,edible,knobbed,smooth,white,no,none,free,crowded,broad,gray,...,silky,smooth,white,white,white,two,pendant,white,scattered,grasses
6396,edible,convex,smooth,white,no,none,free,crowded,broad,gray,...,silky,smooth,white,white,white,two,pendant,white,scattered,grasses
7003,edible,convex,smooth,white,no,none,free,crowded,broad,white,...,silky,silky,white,white,white,two,pendant,white,scattered,grasses
7026,edible,convex,smooth,brown,no,none,attached,close,broad,brown,...,smooth,smooth,orange,orange,orange,one,pendant,brown,clustered,leaves
7400,edible,bell,fibrous,white,no,none,free,crowded,broad,gray,...,smooth,silky,white,white,white,two,pendant,white,scattered,grasses
7526,edible,convex,smooth,brown,no,none,attached,close,broad,brown,...,smooth,smooth,orange,orange,brown,one,pendant,brown,clustered,leaves
7584,edible,bell,fibrous,white,no,none,free,crowded,broad,gray,...,silky,smooth,white,white,white,two,pendant,white,scattered,grasses
7623,edible,knobbed,smooth,white,no,none,free,crowded,broad,gray,...,smooth,smooth,white,white,white,two,pendant,white,scattered,grasses
7631,edible,bell,fibrous,white,no,none,free,crowded,broad,white,...,silky,silky,white,white,white,two,pendant,white,scattered,grasses
7640,edible,bell,smooth,white,no,none,free,crowded,broad,pink,...,smooth,smooth,white,white,white,two,pendant,white,scattered,grasses


In [45]:
X_unknown = unknown_mushroom_2.drop(['stalk-root', 'class'], axis=1)
y_unknown = unknown_mushroom_2['stalk-root']

In [46]:
X_unknown = one_hot_X.transform(X_unknown)

In [47]:
# Impute values for missing data
y_pred_unknown = knn_model_for_imputation_2.predict(X_unknown)
y_pred_unknown = one_hot_y.inverse_transform(y_pred_unknown)
# np.unique(y_pred_unknown, return_counts=True)
y_pred_unknown

array([[None],
       ['equal'],
       ['bulbous'],
       ['equal'],
       ['bulbous'],
       ['equal'],
       ['equal'],
       ['equal'],
       ['equal'],
       ['equal'],
       ['equal'],
       ['equal'],
       ['equal']], dtype=object)

# Replace missing data with predicted values

In [48]:
unknown_mushroom_2['stalk-root'] = y_pred_unknown
# np.unique(unknown_mushroom_2['stalk-root'], return_counts=True)
unknown_mushroom_2['stalk-root'].unique()

array([None, 'equal', 'bulbous'], dtype=object)

In [49]:
# append the dataframe back to one form
mushroom_imputed = known_mushroom.append(unknown_mushroom_2)
mushroom_imputed

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,poisonous,convex,smooth,brown,bruises,pungent,free,close,narrow,black,...,smooth,smooth,white,white,white,one,pendant,black,scattered,urban
1,edible,convex,smooth,yellow,bruises,almond,free,close,broad,black,...,smooth,smooth,white,white,white,one,pendant,brown,numerous,grasses
2,edible,bell,smooth,white,bruises,anise,free,close,broad,brown,...,smooth,smooth,white,white,white,one,pendant,brown,numerous,meadows
3,poisonous,convex,scaly,white,bruises,pungent,free,close,narrow,brown,...,smooth,smooth,white,white,white,one,pendant,black,scattered,urban
4,edible,convex,smooth,gray,no,none,free,crowded,broad,black,...,smooth,smooth,white,white,white,one,evanescent,brown,abundant,grasses
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7631,edible,bell,fibrous,white,no,none,free,crowded,broad,white,...,silky,silky,white,white,white,two,pendant,white,scattered,grasses
7640,edible,bell,smooth,white,no,none,free,crowded,broad,pink,...,smooth,smooth,white,white,white,two,pendant,white,scattered,grasses
7725,edible,convex,fibrous,white,no,none,free,crowded,broad,gray,...,smooth,smooth,white,white,white,two,pendant,white,numerous,grasses
7748,edible,convex,fibrous,white,no,none,free,crowded,broad,white,...,smooth,silky,white,white,white,two,pendant,white,numerous,grasses


In [50]:
isna = mushroom_imputed['stalk-root'].isnull()
mushroom_imputed['stalk-root'].loc[isna] = 'missing'
# mushroom_imputed.isnull().sum()
mushroom_imputed['stalk-root'].loc[isna]

6203    missing
Name: stalk-root, dtype: object

In [51]:
# compare the values of missing mushroom dataset with imputed with replacement mushroom

print('original mushroom dataset', np.unique(mushroom['stalk-root'], return_counts=True))
print('imputed mushroom datasaet', np.unique(mushroom_imputed['stalk-root'], return_counts=True))

original mushroom dataset (array(['bulbous', 'club', 'equal', 'missing', 'rooted'], dtype=object), array([3776,  556, 1120, 2480,  192]))
imputed mushroom datasaet (array(['bulbous', 'club', 'equal', 'missing', 'rooted'], dtype=object), array([5808,  586, 1537,    1,  192]))


# Evaluation
Looks like from this imputation using KNN Classifier and One Hot Encoding for imputation,
the imputed dataset shows an increase in 3 categories, bulbous, club, equal; whereas 'rooted' seems least likely.
But even with 2 imputations we still have 1 missing value; we can drop this value as 1 missing data does not appear to be
that significant for our size

In [52]:
# maybe impute it anyways
missing_stalk = mushroom_imputed['stalk-root'] == 'missing'
unknown_mushroom_3 = mushroom_imputed.loc[missing_stalk]
known_mushroom_3 = mushroom_imputed.loc[~missing_stalk]

# impute with added data
* repeat same steps to predict missing data


In [53]:
# preprocess updated known mushrooms
# also drop class (due to data leakage later in notebook)
X = known_mushroom_3.drop(['stalk-root', 'class'], axis=1)
y = known_mushroom_3[['stalk-root']]

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [55]:
X_cols = X_train.columns

one_hot_X = OneHotEncoder(handle_unknown='ignore', sparse=False)
one_hot_y = OneHotEncoder(handle_unknown='ignore', sparse=False)

X_train = one_hot_X.fit_transform(X_train)
X_test = one_hot_X.transform(X_test)
# X_train = one_hot_X.fit_transform(X_train)
# X_test = one_hot_X.transform(X_test)

# encode y_train and fit y_test on that encoder
y_train = one_hot_y.fit_transform(y_train)
y_test = one_hot_y.transform(y_test)

In [56]:
X_train.shape, y_train.shape

((6092, 111), (6092, 4))

In [57]:
knn_grid_3 = GridSearchCV(knn_pipe, knn_param_grid, cv=5)
knn_grid_3.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=Pipeline(steps=[('rf', KNeighborsClassifier())]),
             param_grid={})

In [58]:
train_std = knn_grid_3.cv_results_['std_test_score'][knn_grid_3.best_index_]
train_score, test_score = knn_grid_3.best_score_, knn_grid_3.score(X_test, y_test)

In [59]:
print('train score:', train_score, 'train std score:', train_std, 'test score:', test_score)

train score: 0.9826010175505239 train std score: 0.004064075651419906 test score: 0.9857213195470211


In [60]:
# save knn_model_for_imputataion
knn_model_for_imputation_3 = knn_grid_3.best_estimator_
model_path = r'../../model/knn_model_for_imputation_3.sav'

joblib.dump(knn_model_for_imputation_3, model_path)

['../../model/knn_model_for_imputation_3.sav']

## With imputed predicted values our score actually drop, perhaps the predictions were not as accurate?


In [61]:
# Impute values for unknown mushrooms
unknown_mushroom_3

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
6203,edible,knobbed,smooth,white,no,none,free,crowded,broad,gray,...,silky,smooth,white,white,white,two,pendant,white,scattered,grasses


In [62]:
X_unknown = unknown_mushroom_3.drop(['stalk-root', 'class'], axis=1)
y_unknown = unknown_mushroom_3['stalk-root']

In [63]:
X_unknown = one_hot_X.transform(X_unknown)

In [64]:
# Impute values for missing data
y_pred_unknown = knn_model_for_imputation_3.predict(X_unknown)
y_pred_unknown = one_hot_y.inverse_transform(y_pred_unknown)
np.unique(y_pred_unknown, return_counts=True)
# y_pred_unknown

(array(['equal'], dtype=object), array([1]))

# Replace missing data with predicted values

In [65]:
unknown_mushroom_3['stalk-root'] = y_pred_unknown
# np.unique(unknown_mushroom_3['stalk-root'], return_counts=True)
unknown_mushroom_3['stalk-root'].unique()

array(['equal'], dtype=object)

In [66]:
# append the dataframe back to one form
mushroom_imputed = known_mushroom.append(unknown_mushroom_3)
mushroom_imputed


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,poisonous,convex,smooth,brown,bruises,pungent,free,close,narrow,black,...,smooth,smooth,white,white,white,one,pendant,black,scattered,urban
1,edible,convex,smooth,yellow,bruises,almond,free,close,broad,black,...,smooth,smooth,white,white,white,one,pendant,brown,numerous,grasses
2,edible,bell,smooth,white,bruises,anise,free,close,broad,brown,...,smooth,smooth,white,white,white,one,pendant,brown,numerous,meadows
3,poisonous,convex,scaly,white,bruises,pungent,free,close,narrow,brown,...,smooth,smooth,white,white,white,one,pendant,black,scattered,urban
4,edible,convex,smooth,gray,no,none,free,crowded,broad,black,...,smooth,smooth,white,white,white,one,evanescent,brown,abundant,grasses
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8120,edible,convex,smooth,brown,no,none,attached,close,broad,yellow,...,smooth,smooth,orange,orange,brown,one,pendant,buff,several,leaves
8121,edible,flat,smooth,brown,no,none,attached,close,broad,brown,...,smooth,smooth,orange,orange,orange,one,pendant,buff,clustered,leaves
8122,poisonous,knobbed,scaly,brown,no,fishy,free,close,narrow,buff,...,smooth,silky,white,white,white,one,evanescent,white,several,leaves
8123,edible,convex,smooth,brown,no,none,attached,close,broad,yellow,...,smooth,smooth,orange,orange,orange,one,pendant,orange,clustered,leaves


In [67]:
# compare the values of missing mushroom dataset with imputed with replacement mushroom

print('original mushroom dataset', np.unique(mushroom['stalk-root'], return_counts=True))
print('imputed mushroom datasaet', np.unique(mushroom_imputed['stalk-root'], return_counts=True))

original mushroom dataset (array(['bulbous', 'club', 'equal', 'missing', 'rooted'], dtype=object), array([3776,  556, 1120, 2480,  192]))
imputed mushroom datasaet (array(['bulbous', 'club', 'equal', 'rooted'], dtype=object), array([5806,  586, 1528,  192]))


This model imputer appears most balanced in imputing more variety to the stalk-root compared to the random forest
model; however, the scores for accuracy show that random forest classifier is better by 1% in terms of accuracy; but we
belive that there are other metrics we have not looked at that prove otherwise.
Whichever dataset we choose to use seems suitable for future notebook work

In [68]:
# save this new dataset
path_csv = r'../../data/processed/mushroom_imputed_2.csv'
mushroom_imputed.to_csv(path_csv, index=False)
