In [1]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
from scipy import stats
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB

In [2]:
# original mushroom.csv have been saved as source of truth, created a copy mush.csv for analysis
mushroom_raw_df = pd.read_csv("/Users/eupirate/Desktop/CA_data_exp_visdual/mush.csv")

In [3]:
mushroom_raw_df.head()

Unnamed: 0,Mushroom ID,Cap Shape,Cap Color,Gill Size,Stalk Shape,Habitat,Edibility,Market Demand,Spore Print Color,Population,...,Stalk Surface Below Ring,Stalk Color Above Ring,Stalk Color Below Ring,Veil Type,Veil Color,Growth Rate,Inspection Date,Incubation date,Temperature,Measurement
0,1,sunken,brown,narrow,tapering,leaves,edible,1,orange,abundant,...,silky,white,brown,universal,orange,0.973096463,03/07/2021,03/02/2021,49,C
1,?,convex,brown,narrow,tapering,urban,edible,2,brown,several,...,scaly,pink,white,universal,white,?,03/05/2023,03/04/2023,13,F
2,3,flat,brown,broad,enlarging,grasses,edible,8,yellow,scattered,...,?,gray,orange,partial,orange,1.977619113,01/08/2022,01/04/2022,10,C
3,4,sunken,gray,narrow,tapering,?,poisonous,4,orange,scattered,...,scaly,gray,orange,universal,yellow,2.272762762,02/10/2023,02/02/2023,22,C
4,5,flat,red,?,enlarging,urban,edible,7,brown,several,...,fibrous,orange,white,partial,white,0.718873785,01/09/2023,01/04/2023,9,C


In [4]:
mushroom_raw_df.describe()

Unnamed: 0,Temperature
count,1000.0
mean,14.666
std,8.698742
min,0.0
25%,7.0
50%,15.0
75%,22.0
max,49.0


In [5]:
# mushroom_raw_categorical ["Cap Shape","Cap Color","Gill Size","Stalk Shape",
#                            "Habitat","Edibility","Spore Print Color","Population",
#                            "Odor","Bruising","Ring Type","Stalk Surface Above Ring","Stalk Surface Below Ring",
#                            "Stalk Color Above Ring","Stalk Color Below Ring","Veil Type","Veil Color",
#                            "Measurement"]

# mushroom_raw_num["Mushroom ID","Market Demand","Bruising","Growth Rate","Temperature"]
# mushroom_raw_date["Inspection Date","Incubation date",]

In [6]:
# mushroom categorical columns
mushroom_raw_df.replace("?",np.nan,inplace=True)

# Include Edibility Column
'''
df_bool = mushroom_raw_df[["Cap Shape","Cap Color","Gill Size","Stalk Shape",
                            "Habitat","Edibility","Spore Print Color","Population",
                            "Odor","Bruising","Ring Type","Stalk Surface Above Ring","Stalk Surface Below Ring",
                            "Stalk Color Above Ring","Stalk Color Below Ring","Veil Type","Veil Color",
                            "Measurement"]].isna()
#print(df_bool)
df_bool.isnull().sum()
'''


# Exclude Edibility Column
df_bool = mushroom_raw_df[["Cap Shape","Cap Color","Gill Size","Stalk Shape",
                            "Habitat","Spore Print Color","Population",
                            "Odor","Bruising","Ring Type","Stalk Surface Above Ring","Stalk Surface Below Ring",
                            "Stalk Color Above Ring","Stalk Color Below Ring","Veil Type","Veil Color",
                            "Measurement"]].isna()
#print(df_bool)
df_bool.isnull().sum()

Cap Shape                   0
Cap Color                   0
Gill Size                   0
Stalk Shape                 0
Habitat                     0
Spore Print Color           0
Population                  0
Odor                        0
Bruising                    0
Ring Type                   0
Stalk Surface Above Ring    0
Stalk Surface Below Ring    0
Stalk Color Above Ring      0
Stalk Color Below Ring      0
Veil Type                   0
Veil Color                  0
Measurement                 0
dtype: int64

In [7]:
# Ensuring Edibility NaN isn't replaced, will handle seprately
# mushroom_raw_df.head(21)

In [8]:
# include NaN, run describe()
mushroom_raw_df.describe()

Unnamed: 0,Temperature
count,1000.0
mean,14.666
std,8.698742
min,0.0
25%,7.0
50%,15.0
75%,22.0
max,49.0


In [9]:
# now we drop all rows with any NaN values
mushroom_drop_NaN = mushroom_raw_df.dropna()
mushroom_drop_NaN.describe()

Unnamed: 0,Temperature
count,119.0
mean,14.848739
std,8.99966
min,0.0
25%,8.0
50%,14.0
75%,23.0
max,49.0


# Before dropping NaN, we had 1000 rows of data, after dropped NaN, we only have 119 rows of data to work with, which is not a good representation of the dataset. I think we should explore other approaches to replace with NaN values.

# Identified those columns are mushroom's feature categorical columns, we choose to apply mode to replace NaN with the most common value of a column

# With a better understanding of the data, Edibility column is excluded from the mode replacement, as we don't want to misclassify or make inaccurate predictions, we want to ensure predict mushroom edibility safely.

In [10]:
# Exclude Edibility Column, it will be handled seprately soon.
for column in mushroom_raw_df.select_dtypes(include="object").columns:
    if column != "Edibility":
        mushroom_raw_df[column].fillna(mushroom_raw_df[column].mode()[0], inplace=True)
#print(mushroom_raw_df)
mushroom_raw_df.head(21)
#mushroom_raw_df.describe()

Unnamed: 0,Mushroom ID,Cap Shape,Cap Color,Gill Size,Stalk Shape,Habitat,Edibility,Market Demand,Spore Print Color,Population,...,Stalk Surface Below Ring,Stalk Color Above Ring,Stalk Color Below Ring,Veil Type,Veil Color,Growth Rate,Inspection Date,Incubation date,Temperature,Measurement
0,1,sunken,brown,narrow,tapering,leaves,edible,1,orange,abundant,...,silky,white,brown,universal,orange,0.973096463,03/07/2021,03/02/2021,49,C
1,1,convex,brown,narrow,tapering,urban,edible,2,brown,several,...,scaly,pink,white,universal,white,0.502267005,03/05/2023,03/04/2023,13,F
2,3,flat,brown,broad,enlarging,grasses,edible,8,yellow,scattered,...,silky,gray,orange,partial,orange,1.977619113,01/08/2022,01/04/2022,10,C
3,4,sunken,gray,narrow,tapering,urban,poisonous,4,orange,scattered,...,scaly,gray,orange,universal,yellow,2.272762762,02/10/2023,02/02/2023,22,C
4,5,flat,red,narrow,enlarging,urban,edible,7,brown,several,...,fibrous,orange,white,partial,white,0.718873785,01/09/2023,01/04/2023,9,C
5,6,bell,gray,narrow,tapering,grasses,poisonous,6,yellow,several,...,silky,orange,white,universal,orange,14.02415593,04/11/2023,04/05/2023,21,C
6,7,flat,brown,broad,tapering,woods,,5,yellow,scattered,...,scaly,white,white,universal,white,4.320348762,02/08/2023,02/04/2023,13,C
7,8,sunken,red,narrow,enlarging,urban,,3,yellow,several,...,silky,gray,orange,partial,yellow,1.450596518,02/07/2023,02/05/2023,2,C
8,9,conical,yellow,narrow,enlarging,grasses,edible,7,yellow,solitary,...,fibrous,brown,gray,universal,orange,3.545415336,01/07/2023,01/01/2023,8,C
9,10,bell,gray,broad,tapering,paths,poisonous,3,white,numerous,...,scaly,white,white,universal,yellow,3.866757255,04/05/2023,04/01/2023,10,C


# By applying mode to replace NaN, we have successfully preserved 1000 rows in count, mean remained at 14.666, std at 8.698742 as original raw data set.

In [11]:
# We need to address column Mushroom ID's indexing sequence - starting from 1
mushroom_raw_df.index = range(1,len(mushroom_raw_df)+1)
mushroom_raw_df["Mushroom ID"] = mushroom_raw_df.index
print(mushroom_raw_df.index)

#mushroom_raw_df.head()

mushroom_raw_df.describe()

RangeIndex(start=1, stop=1001, step=1)


Unnamed: 0,Mushroom ID,Temperature
count,1000.0,1000.0
mean,500.5,14.666
std,288.819436,8.698742
min,1.0,0.0
25%,250.75,7.0
50%,500.5,15.0
75%,750.25,22.0
max,1000.0,49.0


In [12]:
# Let's verify Market Demand ? has been replaced with mode, 
#print(mushroom_raw_df[["Market Demand"]].head(30))
mushroom_raw_df[["Market Demand"]].isnull().sum()

Market Demand    0
dtype: int64

In [13]:
print(mushroom_raw_df.isnull().sum())
#mushroom_raw_df.head(21)
#mushroom_raw_df.describe()

Mushroom ID                   0
Cap Shape                     0
Cap Color                     0
Gill Size                     0
Stalk Shape                   0
Habitat                       0
Edibility                   113
Market Demand                 0
Spore Print Color             0
Population                    0
Odor                          0
Bruising                      0
Ring Type                     0
Stalk Surface Above Ring      0
Stalk Surface Below Ring      0
Stalk Color Above Ring        0
Stalk Color Below Ring        0
Veil Type                     0
Veil Color                    0
Growth Rate                   0
Inspection Date               0
Incubation date               0
Temperature                   0
Measurement                   0
dtype: int64


In [14]:
# We convert Bruising column's Yes to 1 and No to 0 
mushroom_raw_df["Bruising"] = mushroom_raw_df["Bruising"].replace({"Yes": 1, "No": 0})
# Then we verify Bruising columns Yes or No has been converted to 1 and 0
#print(mushroom_raw_df[["Bruising"]].head(21))
mushroom_raw_df["Bruising"].isnull().sum() 

0

In [15]:
#print(mushroom_raw_df["Growth Rate"])
mushroom_raw_df["Growth Rate"].isnull().sum()
#mushroom_raw_df.head()
mushroom_raw_df.describe()

Unnamed: 0,Mushroom ID,Bruising,Temperature
count,1000.0,1000.0,1000.0
mean,500.5,0.559,14.666
std,288.819436,0.496755,8.698742
min,1.0,0.0,0.0
25%,250.75,0.0,7.0
50%,500.5,1.0,15.0
75%,750.25,1.0,22.0
max,1000.0,1.0,49.0


In [16]:
# Now, let's handle Edibility Column, taking conditional imputation approach about the missing values in Edibility column