## Loading Important Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

## Loading the dataset

In [3]:
data=pd.read_csv('mushrooms.csv')

In [4]:
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [5]:
data.index

RangeIndex(start=0, stop=8124, step=1)

In [6]:
data.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [27]:
data.isnull().sum()     # no null values

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [None]:
# checking for null values by computing the unique values for each column

In [17]:
print(np.unique(data['class']))

['e' 'p']


In [10]:
print(np.unique(data['cap-shape']))

['b' 'c' 'f' 'k' 's' 'x']


In [12]:
print(np.unique(data['cap-surface']))

['f' 'g' 's' 'y']


In [14]:
print(np.unique(data['bruises']))

['f' 't']


In [15]:
print(np.unique(data['cap-color']))

['b' 'c' 'e' 'g' 'n' 'p' 'r' 'u' 'w' 'y']


In [18]:
print(np.unique(data['odor']))

['a' 'c' 'f' 'l' 'm' 'n' 'p' 's' 'y']


In [19]:
print(np.unique(data['gill-attachment']))

['a' 'f']


In [20]:
print(np.unique(data['gill-spacing']))

['c' 'w']


In [22]:
print(np.unique(data['gill-size']))

['b' 'n']


In [23]:
print(np.unique(data['gill-color']))

['b' 'e' 'g' 'h' 'k' 'n' 'o' 'p' 'r' 'u' 'w' 'y']


In [25]:
print(np.unique(data['stalk-shape']))

['e' 't']


In [26]:
print(np.unique(data['stalk-root']))     # '?' null values

['?' 'b' 'c' 'e' 'r']


In [42]:
data['stalk-root'].value_counts()       # 2480 null values

b    3776
?    2480
e    1120
c     556
r     192
Name: stalk-root, dtype: int64

In [51]:
# Dealing with missing values
data['stalk-root'].replace('?', 'b',inplace=True)

In [52]:
data['stalk-root'].value_counts()  

b    6256
e    1120
c     556
r     192
Name: stalk-root, dtype: int64

In [48]:
print(np.unique(data['stalk-surface-above-ring']))

['f' 'k' 's' 'y']


In [30]:
print(np.unique(data['stalk-surface-below-ring']))

['f' 'k' 's' 'y']


In [31]:
print(np.unique(data['stalk-color-above-ring']))

['b' 'c' 'e' 'g' 'n' 'o' 'p' 'w' 'y']


In [32]:
print(np.unique(data['stalk-color-below-ring']))

['b' 'c' 'e' 'g' 'n' 'o' 'p' 'w' 'y']


In [34]:
print(np.unique(data['veil-type']))

['p']


In [35]:
print(np.unique(data['veil-color']))

['n' 'o' 'w' 'y']


In [36]:
print(np.unique(data['ring-number']))

['n' 'o' 't']


In [37]:
print(np.unique(data['ring-type']))

['e' 'f' 'l' 'n' 'p']


In [41]:
print(np.unique(data['spore-print-color']))

['b' 'h' 'k' 'n' 'o' 'r' 'u' 'w' 'y']


In [40]:
print(np.unique(data['population']))

['a' 'c' 'n' 's' 'v' 'y']


In [39]:
print(np.unique(data['habitat']))

['d' 'g' 'l' 'm' 'p' 'u' 'w']


## Data Modeling

In [53]:
for col in data.columns:
  le=LabelEncoder()
  data[col]=le.fit_transform(data[col])

In [54]:
X=data.iloc[:,1:]
y=data['class']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)

In [55]:
# KNN
knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train,y_train)
knn_preds=knn.predict(X_test)
accuracy_score(y_test,knn_preds)

0.9975389663658737

In [56]:
# Decision tree
dec=DecisionTreeClassifier()
dec.fit(X_train,y_train)
dec_preds=dec.predict(X_test)
accuracy_score(y_test,dec_preds)

1.0

In [57]:
# SVM
svc=SVC()
svc.fit(X_train,y_train)
svc_preds=svc.predict(X_test)
accuracy_score(y_test,svc_preds)

0.9881050041017228

In [58]:
# Logistic regression
le=LogisticRegression()
le.fit(X_train,y_train)
le_preds=le.predict(X_test)
accuracy_score(y_test,le_preds)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9573420836751435