## 1.Loading the Libraries

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Silence every warning for the rest of the session so the cell outputs stay uncluttered.
# NOTE(review): this is a blanket filter (no category/module given), so it also hides
# any warnings raised by the pandas / scikit-learn calls further down.
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
# Display settings: never truncate the column list, show at most 100 rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [5]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

## 2.Loading the DataSet

In [6]:
# Location of the Kaggle "mushrooms.csv" file. Set the MUSHROOM_CSV environment variable to
# point at your own copy of the data; the original author's local path is kept only as the
# fallback default so existing setups keep working.
MUSHROOM_CSV=os.environ.get('MUSHROOM_CSV','C:/Users/Sushree Jena/Desktop/imarticus/machine learning/Kaggle/mushroom/mushrooms.csv')
df_mushroom=pd.read_csv(MUSHROOM_CSV)

In [7]:
df_mushroom.shape

(8124, 23)

In [8]:
df_mushroom.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,e,e,s,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,t,e,s,s,w,w,p,w,o,e,n,a,g


## 3.Exploratory Data Analysis

In [9]:
df_mushroom.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

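The output above shows that each column has 8124 non-null entries, i.e. there are no missing (NaN) values in the data. Note that this check only detects NaN; a placeholder category used to encode "unknown" would not be flagged by it.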

In [11]:
df_mushroom.describe(include='object')

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,2,5,4,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,t,b,s,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,4608,3776,5176,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [12]:
# Names of the columns that take exactly two distinct values
# (a NaN, if present, is counted as a value of its own).
bivariate_cols=[
    col for col in df_mushroom.columns
    if df_mushroom[col].nunique(dropna=False)==2
]
bivariate_cols

['class',
 'bruises',
 'gill-attachment',
 'gill-spacing',
 'gill-size',
 'stalk-shape']

## 4.Data Encoding

In [13]:
#label encoding the target variable

In [14]:
le=LabelEncoder()

In [15]:
# Overwrite the 'class' column with integer codes. LabelEncoder numbers the labels in
# sorted order, so 'e' becomes 0 and 'p' becomes 1.
df_mushroom['class']=le.fit_transform(df_mushroom['class'])

In [16]:
df_mushroom['class'].value_counts()

0    4208
1    3916
Name: class, dtype: int64

In [17]:
#dummy encoding the categorical variables

In [18]:
# One-hot encode the remaining object columns; the already-numeric 'class' column passes
# through unchanged. drop_first=True drops one level per feature, so a feature with k levels
# yields k-1 indicator columns (a single-level feature such as 'veil-type' contributes none).
df_mushroom_dummy=pd.get_dummies(df_mushroom,drop_first=True)

In [19]:
df_mushroom_dummy.shape

(8124, 96)

## 5.Separating the dependent and independent variables

In [20]:
# Target vector: the encoded 'class' column. Feature matrix: every other column.
y=df_mushroom_dummy['class']
X=df_mushroom_dummy.drop(columns=['class'])

In [21]:
X.shape,y.shape

((8124, 95), (8124,))

## 6.Splitting the data into train and test set

In [22]:
# Hold out 20% of the rows as the test set; random_state=1 makes the shuffle reproducible.
# NOTE(review): the split is not stratified on y — consider stratify=y if the class
# proportions in train and test need to match exactly.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=1)

In [23]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((6499, 95), (1625, 95), (6499,), (1625,))

## 7.Data Scaling

In [24]:
sc=StandardScaler()

In [25]:
# Fit the scaler on the training set only, then apply that same fitted transformation
# to the test set, so no test-set statistics leak into the fit.
X_train_scaled=sc.fit_transform(X_train)
X_test_scaled=sc.transform(X_test)

## 8.Model Building

### 1.Logistic Regression

In [26]:
lr=LogisticRegression()

In [27]:
lr.fit(X_train_scaled,y_train)

In [28]:
predicted_value=lr.predict(X_test_scaled)

In [29]:
# Confusion matrix of the Logistic Regression predictions on the test set
# (rows: true class, columns: predicted class).
cm=confusion_matrix(y_test,predicted_value)
cm

array([[820,   0],
       [  0, 805]], dtype=int64)

In [30]:
print(classification_report(y_test,predicted_value))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       820
           1       1.00      1.00      1.00       805

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



Logistic Regression achieves 100% accuracy on the test set.

### 2.Decision Tree

In [31]:
# Fix the seed so the fitted tree is reproducible from run to run
# (same seed value as the train/test split).
dt=DecisionTreeClassifier(random_state=1)

In [32]:
dt.fit(X_train_scaled,y_train)

In [33]:
predicted_values=dt.predict(X_test_scaled)

In [34]:
cm1=confusion_matrix(y_test,predicted_values)
cm1

array([[820,   0],
       [  0, 805]], dtype=int64)

In [35]:
print(classification_report(y_test,predicted_values))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       820
           1       1.00      1.00      1.00       805

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



The DecisionTreeClassifier model is also 100% accurate on the test set.

### 3.Random Forest

In [36]:
# Fix the seed so the bootstrap samples and feature subsets - and therefore the fitted
# forest - are reproducible from run to run (same seed value as the train/test split).
rf=RandomForestClassifier(random_state=1)

In [37]:
rf.fit(X_train_scaled,y_train)

In [38]:
predicted_values=rf.predict(X_test_scaled)

In [39]:
cm2=confusion_matrix(y_test,predicted_values)
cm2

array([[820,   0],
       [  0, 805]], dtype=int64)

In [40]:
print(classification_report(y_test,predicted_values))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       820
           1       1.00      1.00      1.00       805

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



The RandomForestClassifier model is also 100% accurate on the test set.

### 4.Support Vector Machine

#### A.Linear Kernel

In [41]:
svc1=SVC(kernel='linear')

In [42]:
svc1.fit(X_train_scaled,y_train)

In [43]:
predicted_values=svc1.predict(X_test_scaled)

In [44]:
cm3=confusion_matrix(y_test,predicted_values)
cm3

array([[820,   0],
       [  0, 805]], dtype=int64)

In [45]:
print(classification_report(y_test,predicted_values))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       820
           1       1.00      1.00      1.00       805

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



#### B.Radial Kernel

In [46]:
svc2=SVC(kernel='rbf')

In [47]:
svc2.fit(X_train_scaled,y_train)

In [48]:
predicted_values=svc2.predict(X_test_scaled)

In [49]:
cm4=confusion_matrix(y_test,predicted_values)
cm4

array([[820,   0],
       [  3, 802]], dtype=int64)

In [50]:
print(classification_report(y_test,predicted_values))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       820
           1       1.00      1.00      1.00       805

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



#### C.Sigmoid Kernel

In [51]:
svc3=SVC(kernel='sigmoid')

In [52]:
svc3.fit(X_train_scaled,y_train)

In [53]:
predicted_values=svc3.predict(X_test_scaled)

In [54]:
cm5=confusion_matrix(y_test,predicted_values)
cm5

array([[813,   7],
       [ 16, 789]], dtype=int64)

In [55]:
print(classification_report(y_test,predicted_values))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       820
           1       0.99      0.98      0.99       805

    accuracy                           0.99      1625
   macro avg       0.99      0.99      0.99      1625
weighted avg       0.99      0.99      0.99      1625



#### D.Poly Kernel

In [56]:
svc4=SVC(kernel='poly')

In [57]:
svc4.fit(X_train_scaled,y_train)

In [58]:
predicted_values=svc4.predict(X_test_scaled)

In [59]:
cm6=confusion_matrix(y_test,predicted_values)
cm6

array([[820,   0],
       [  2, 803]], dtype=int64)

In [60]:
print(classification_report(y_test,predicted_values))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       820
           1       1.00      1.00      1.00       805

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



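The rounded classification reports for SVC look almost identical, but the confusion matrices above show some misclassifications with the Radial (3 samples), Sigmoid (23 samples) and Poly (2 samples) kernels; only the linear kernel classifies every test sample correctly.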

We can conclude that, for this dataset, we can use either the Logistic Regression model, the Decision Tree Classifier model, the Random Forest Classifier model or the SVC (linear kernel) model, as each of these models classifies every sample of the test set correctly.