In [1]:
# 4th version
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
mushroom_raw_df = pd.read_csv("/Users/eupirate/Desktop/CA_data_exp_visdual/mush.csv")
mushroom_raw_df.head(10)

Unnamed: 0,Mushroom ID,Cap Shape,Cap Color,Gill Size,Stalk Shape,Habitat,Edibility,Market Demand,Spore Print Color,Population,...,Stalk Surface Below Ring,Stalk Color Above Ring,Stalk Color Below Ring,Veil Type,Veil Color,Growth Rate,Inspection Date,Incubation date,Temperature,Measurement
0,1,sunken,brown,narrow,tapering,leaves,edible,1,orange,abundant,...,silky,white,brown,universal,orange,0.973096463,03/07/2021,03/02/2021,49,C
1,?,convex,brown,narrow,tapering,urban,edible,2,brown,several,...,scaly,pink,white,universal,white,?,03/05/2023,03/04/2023,13,F
2,3,flat,brown,broad,enlarging,grasses,edible,8,yellow,scattered,...,?,gray,orange,partial,orange,1.977619113,01/08/2022,01/04/2022,10,C
3,4,sunken,gray,narrow,tapering,?,poisonous,4,orange,scattered,...,scaly,gray,orange,universal,yellow,2.272762762,02/10/2023,02/02/2023,22,C
4,5,flat,red,?,enlarging,urban,edible,7,brown,several,...,fibrous,orange,white,partial,white,0.718873785,01/09/2023,01/04/2023,9,C
5,6,bell,?,narrow,tapering,grasses,poisonous,6,yellow,several,...,?,orange,white,universal,orange,14.02415593,04/11/2023,04/05/2023,21,C
6,7,flat,brown,broad,tapering,woods,?,5,yellow,scattered,...,scaly,white,white,universal,white,4.320348762,02/08/2023,02/04/2023,13,C
7,8,sunken,red,narrow,enlarging,urban,?,3,yellow,several,...,?,gray,orange,partial,yellow,1.450596518,02/07/2023,02/05/2023,2,C
8,9,?,yellow,narrow,?,grasses,edible,7,yellow,solitary,...,fibrous,brown,gray,?,orange,3.545415336,01/07/2023,01/01/2023,8,C
9,10,bell,?,broad,tapering,paths,poisonous,3,white,?,...,scaly,white,white,universal,yellow,3.866757255,04/05/2023,04/01/2023,10,C


In [3]:
# Handle Mushroom ID column indexing
mushroom_raw_df["Mushroom ID"] = mushroom_raw_df.index+1
#print(mushroom_raw_df.iloc[:16])

In [4]:
# mushroom categorical columns
mushroom_raw_df.replace("?",np.nan,inplace=True)

# Include Edibility Column
'''
df_bool = mushroom_raw_df[["Cap Shape","Cap Color","Gill Size","Stalk Shape",
                            "Habitat","Edibility","Spore Print Color","Population",
                            "Odor","Bruising","Ring Type","Stalk Surface Above Ring","Stalk Surface Below Ring",
                            "Stalk Color Above Ring","Stalk Color Below Ring","Veil Type","Veil Color",
                            ]].isna()
#print(df_bool)
df_bool.isnull().sum()
'''


# Exclude Edibility Column
df_bool = mushroom_raw_df[["Cap Shape","Cap Color","Gill Size","Stalk Shape",
                            "Habitat","Spore Print Color","Population",
                            "Odor","Bruising","Ring Type","Stalk Surface Above Ring","Stalk Surface Below Ring",
                            "Stalk Color Above Ring","Stalk Color Below Ring","Veil Type","Veil Color",
                           ]].isna()
#print(df_bool)
df_bool.isnull().sum()

Cap Shape                   0
Cap Color                   0
Gill Size                   0
Stalk Shape                 0
Habitat                     0
Spore Print Color           0
Population                  0
Odor                        0
Bruising                    0
Ring Type                   0
Stalk Surface Above Ring    0
Stalk Surface Below Ring    0
Stalk Color Above Ring      0
Stalk Color Below Ring      0
Veil Type                   0
Veil Color                  0
dtype: int64

In [5]:
# include NaN, run describe()
mushroom_raw_df.describe()

Unnamed: 0,Mushroom ID,Temperature
count,1000.0,1000.0
mean,500.5,14.666
std,288.819436,8.698742
min,1.0,0.0
25%,250.75,7.0
50%,500.5,15.0
75%,750.25,22.0
max,1000.0,49.0


In [6]:
# now we drop all rows with any NaN values
mushroom_drop_NaN = mushroom_raw_df.dropna()
mushroom_drop_NaN.describe()

Unnamed: 0,Mushroom ID,Temperature
count,131.0,131.0
mean,524.580153,14.519084
std,298.132184,8.966048
min,1.0,0.0
25%,233.0,7.5
50%,544.0,13.0
75%,789.0,22.5
max,995.0,49.0


# Before dropping NaN, we had 1000 rows of data, after dropped NaN, we only have 119 rows of data to work with, which is not a good representation of the dataset. I think we should explore other approaches to replace with NaN values.

# Identified those columns are mushroom's feature categorical columns, we choose to apply mode to replace NaN with the most common value of a column.

# With a better understanding of the data, Edibility column is excluded from the mode replacement, as we don't want to misclassify or make inaccurate predictions, we want to ensure predict mushroom edibility safely.

In [7]:
# Exclude Edibility Column, it will be handled seprately soon.
for column in mushroom_raw_df.select_dtypes(include="object").columns:
    if column != "Edibility":
        mushroom_raw_df[column].fillna(mushroom_raw_df[column].mode()[0], inplace=True)
#print(mushroom_raw_df)
#mushroom_raw_df.head(21)
#mushroom_raw_df.describe()

In [8]:
# make a copy of df - which excluded 'Edibility' column
mushroom_raw_df_tr=mushroom_raw_df.copy()
mushroom_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Mushroom ID               1000 non-null   int64 
 1   Cap Shape                 1000 non-null   object
 2   Cap Color                 1000 non-null   object
 3   Gill Size                 1000 non-null   object
 4   Stalk Shape               1000 non-null   object
 5   Habitat                   1000 non-null   object
 6   Edibility                 887 non-null    object
 7   Market Demand             1000 non-null   object
 8   Spore Print Color         1000 non-null   object
 9   Population                1000 non-null   object
 10  Odor                      1000 non-null   object
 11  Bruising                  1000 non-null   object
 12  Ring Type                 1000 non-null   object
 13  Stalk Surface Above Ring  1000 non-null   object
 14  Stalk Surface Below Ring 

In [9]:
# create a treated 'Edibility_Tr1' column hardcode "NotDefined"
'''
mushroom_raw_df_tr=mushroom_raw_df.copy()
mushroom_raw_df_tr["Edibility_Tr1"]=mushroom_raw_df_tr["Edibility"].fillna("NotDefined_1")
mushroom_raw_df_tr.info()
mushroom_raw_df_tr.head(21)
'''

'\nmushroom_raw_df_tr=mushroom_raw_df.copy()\nmushroom_raw_df_tr["Edibility_Tr1"]=mushroom_raw_df_tr["Edibility"].fillna("NotDefined_1")\nmushroom_raw_df_tr.info()\nmushroom_raw_df_tr.head(21)\n'

# We apply SimpleImputer to converted "Edibility" column's missing value "?" to "NotDefined_2". So we can easily identify NotDefined_2 cells are due to be handled when we understand other columns better. First we need to ensure "Mushroom ID" are properly indexed, "Temperature" is converted from Fahrenheit (F) to  Celsius (C).

In [10]:
# Appy SimpleImputer to treat "Edibility" column
from sklearn.impute import SimpleImputer

mushroom_raw_df_tr=mushroom_raw_df.copy()

# Create "NotDefined" with SimpleImputer
imp=SimpleImputer(strategy="constant",fill_value="NotDefined_2")

# create a treated 'Edibility_Tr2' column by using SimpleImputer fit_transform function
mushroom_raw_df_tr[["Edibility_Tr2"]]=imp.fit_transform(mushroom_raw_df_tr[["Edibility"]])
#mushroom_raw_df_tr[["Edibility_Tr2"]]
mushroom_raw_df_tr
#mushroom_raw_df_tr.head(50)

# replace treated "Edibility_Tr2" with "Edibility" column
mushroom_raw_df_tr["Edibility"]=mushroom_raw_df_tr["Edibility_Tr2"]
mushroom_raw_df_tr.drop(["Edibility_Tr2"],axis="columns",inplace=True)
mushroom_raw_df_tr

#----------------------
# Index "Mushroom ID" column
mushroom_raw_df_tr.set_index("Mushroom ID",inplace=True)
#mushroom_raw_df_tr["Mushroom ID"]=pd.to_numeric(mushroom_raw_df_tr["Mushroom ID"], errors="coerce")

# Identify rows with F temperatures and convert them to C ( there are no missing data in Temperature column, we have 0 )
f_rows=mushroom_raw_df_tr['Measurement']=='F'
mushroom_raw_df["Temperature_Tr"]=mushroom_raw_df_tr["Temperature"]
mushroom_raw_df_tr.loc[f_rows,"Temperature"]=(mushroom_raw_df_tr.loc[f_rows,"Temperature"]-32)*5/9

# Update the Measurement column to C for these rows
mushroom_raw_df_tr.loc[f_rows,"Measurement"]="C"

# Ensuring "Mushroom ID" is indexed correctly
#mushroom_raw_df_tr.set_index("Mushroom ID",inplace=True)

# Quickly spot check ensuring the F values have been converted to C values
#temps_index=mushroom_raw_df_tr.loc[[87,88,99,111,141],"Temperature"]
#print(mushroom_raw_df_tr)
#mushroom_raw_df_tr.head(114)

mushroom_raw_df_tr.info() #ensure we now have 1000 rows of data

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 1 to 1000
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Cap Shape                 1000 non-null   object 
 1   Cap Color                 1000 non-null   object 
 2   Gill Size                 1000 non-null   object 
 3   Stalk Shape               1000 non-null   object 
 4   Habitat                   1000 non-null   object 
 5   Edibility                 1000 non-null   object 
 6   Market Demand             1000 non-null   object 
 7   Spore Print Color         1000 non-null   object 
 8   Population                1000 non-null   object 
 9   Odor                      1000 non-null   object 
 10  Bruising                  1000 non-null   object 
 11  Ring Type                 1000 non-null   object 
 12  Stalk Surface Above Ring  1000 non-null   object 
 13  Stalk Surface Below Ring  1000 non-null   object 
 14  Stalk Color A

  -9.44444444  -5.          -8.33333333 -14.44444444 -16.66666667
  -3.88888889 -17.22222222 -11.11111111  -4.44444444  -5.55555556
  -8.88888889 -11.66666667 -16.66666667  -4.44444444  -9.44444444
  -4.44444444 -11.66666667 -11.11111111  -9.44444444 -13.33333333
  -6.11111111 -14.44444444  -2.22222222  -4.44444444  -7.77777778
  -3.33333333 -15.55555556 -16.11111111]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  mushroom_raw_df_tr.loc[f_rows,"Temperature"]=(mushroom_raw_df_tr.loc[f_rows,"Temperature"]-32)*5/9


# We now investigate has Temperature have any impact on mushroom's Growth Rate

In [11]:
corr_temp_growth=mushroom_raw_df_tr[["Growth Rate","Temperature"]].corr()
corr_temp_growth
# The correlation value of 0.049409 is very low, close to 0, indicated a very weak positive correlation between "Growth Rate" and "Temperature" columns.

Unnamed: 0,Growth Rate,Temperature
Growth Rate,1.0,0.049409
Temperature,0.049409,1.0


# With additional research about mushrooms, we need to include more features to analyses the mushrooms.
# We now investigate are other columns combined have strong influence of mushroom's Edibility.
# Identified columns are: "Growth Rate","Temperature","Incubation date","Habitat"

In [12]:
'''
# The only 2 key dated columns in datasets are "Inspection Date" and "Incubation Date", we need to convert them to datetime or numeric format
# "Incubation date"s "date" need to be renamed to "Date"

mushroom_raw_df_tr["Incubation Date"]=mushroom_raw_df_tr["Incubation date"]

mushroom_raw_df_tr["Inspection Date"]=pd.to_datetime(mushroom_raw_df_tr["Inspection Date"]) 
mushroom_raw_df_tr["Incubation Date"]=pd.to_datetime(mushroom_raw_df_tr["Incubation Date"]) 

# Drop "Incubation date" column
mushroom_raw_df_tr.drop(["Incubation date"],axis="columns",inplace=True)

#mushroom_raw_df_tr.info()
#mushroom_raw_df_tr.head(50)
'''

'\n# The only 2 key dated columns in datasets are "Inspection Date" and "Incubation Date", we need to convert them to datetime or numeric format\n# "Incubation date"s "date" need to be renamed to "Date"\n\nmushroom_raw_df_tr["Incubation Date"]=mushroom_raw_df_tr["Incubation date"]\n\nmushroom_raw_df_tr["Inspection Date"]=pd.to_datetime(mushroom_raw_df_tr["Inspection Date"]) \nmushroom_raw_df_tr["Incubation Date"]=pd.to_datetime(mushroom_raw_df_tr["Incubation Date"]) \n\n# Drop "Incubation date" column\nmushroom_raw_df_tr.drop(["Incubation date"],axis="columns",inplace=True)\n\n#mushroom_raw_df_tr.info()\n#mushroom_raw_df_tr.head(50)\n'

In [13]:
# Encode categorical "Habitat" column using OneHotEncoder

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

encoder=LabelEncoder()
mushroom_raw_df_tr["Habitat"]=encoder.fit_transform(mushroom_raw_df_tr["Habitat"])

unique_habitats=mushroom_raw_df_tr["Habitat"].unique()
unique_habitats.sort()

## Validate unique 5 encoding: 0,1,2,3,4
print(unique_habitats)

#mushroom_raw_df_tr.info()
#mushroom_raw_df_tr.head(50)

[0 1 2 3 4]


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder


# The only 2 key dated columns in datasets are "Inspection Date" and "Incubation Date", we need to convert them to datetime or numeric format
# "Incubation date"s "date" need to be renamed to "Date"

mushroom_raw_df_tr["Incubation Date"]=mushroom_raw_df_tr["Incubation date"]

mushroom_raw_df_tr["Inspection Date"]=pd.to_datetime(mushroom_raw_df_tr["Inspection Date"]).astype('int64')
mushroom_raw_df_tr["Incubation Date"]=pd.to_datetime(mushroom_raw_df_tr["Incubation Date"]).astype('int64') 

# Drop "Incubation date" column
mushroom_raw_df_tr.drop(["Incubation date"],axis="columns",inplace=True)

mushroom_raw_df_tr.info()
mushroom_raw_df_tr.head(50)


# Export treated mushroom.csv as mush_Tr.csv
mushroom_raw_df_tr.to_csv('/Users/eupirate/Desktop/CA_data_exp_visdual/mush_Tr.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 1 to 1000
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Cap Shape                 1000 non-null   object 
 1   Cap Color                 1000 non-null   object 
 2   Gill Size                 1000 non-null   object 
 3   Stalk Shape               1000 non-null   object 
 4   Habitat                   1000 non-null   int64  
 5   Edibility                 1000 non-null   object 
 6   Market Demand             1000 non-null   object 
 7   Spore Print Color         1000 non-null   object 
 8   Population                1000 non-null   object 
 9   Odor                      1000 non-null   object 
 10  Bruising                  1000 non-null   object 
 11  Ring Type                 1000 non-null   object 
 12  Stalk Surface Above Ring  1000 non-null   object 
 13  Stalk Surface Below Ring  1000 non-null   object 
 14  Stalk Color A

# We've managed to cleanse our mushroom dataset by processing missing value with various of techniques. 
# Next we can carry out in depth analysis of "Edibility" column with treated new mush_Tr.csv

In [15]:
mush_Tr=pd.read_csv("/Users/eupirate/Desktop/CA_data_exp_visdual/mush_Tr.csv")

mush_Tr_with_id=mush_Tr.copy()

# Index "Mushroom ID" column
mush_Tr_with_id.insert(0,"Mushroom ID",range(1,1+len(mush_Tr_with_id)))
mush_Tr_with_id.set_index("Mushroom ID",inplace=True)
#mush_Tr_with_id.head(50)

# Let's consider the conditional probability of edibility of each categorical features and create bins with numerical features.
# Habitat - habitat
# Growth Rate - GR
# Temperature - temp
# Incubation Date - ID

In [16]:
# Create bins for "Habitat" and calculate the probability 
prob_by_habitat=mush_Tr_with_id[mush_Tr_with_id["Edibility"]!="NotDefined_2"].groupby("Habitat")["Edibility"].apply(lambda x:(x=="edible").mean())
prob_by_habitat

Habitat
0    0.497041
1    0.468966
2    0.470588
3    0.567273
4    0.456790
Name: Edibility, dtype: float64

In [17]:
# Create bins for "Growth Rate" and calculate the probability
# Create 4 quantile bins
GR_bins=pd.qcut(mush_Tr_with_id["Growth Rate"],q=4,duplicates="drop")
prob_by_GR_bins=mush_Tr_with_id[mush_Tr_with_id["Edibility"]!="NotDefined_2"].groupby(GR_bins)["Edibility"].apply(lambda x: (x == "edible").mean())
prob_by_GR_bins

  prob_by_GR_bins=mush_Tr_with_id[mush_Tr_with_id["Edibility"]!="NotDefined_2"].groupby(GR_bins)["Edibility"].apply(lambda x: (x == "edible").mean())


Growth Rate
(0.501, 1.378]     0.477273
(1.378, 2.543]     0.511211
(2.543, 3.747]     0.513393
(3.747, 21.512]    0.509091
Name: Edibility, dtype: float64

In [18]:
# Create bins for "Temperature"and calculate the probability 
# Create 4 quantile bins
temp_bins=pd.qcut(mush_Tr_with_id["Temperature"],q=4,)

prob_by_temp_bins=mush_Tr_with_id[mush_Tr_with_id["Edibility"]!="NotDefined_2"].groupby(temp_bins)["Edibility"].apply(lambda x: (x == "edible").mean())
prob_by_temp_bins

  prob_by_temp_bins=mush_Tr_with_id[mush_Tr_with_id["Edibility"]!="NotDefined_2"].groupby(temp_bins)["Edibility"].apply(lambda x: (x == "edible").mean())


Temperature
(-17.779, 6.0]    0.484716
(6.0, 14.0]       0.482456
(14.0, 22.0]      0.486486
(22.0, 49.0]      0.562500
Name: Edibility, dtype: float64

In [19]:
# Convert "Incubation Date" to "Incubation Month" ensuring there are data for each month and calculate the probability 
mush_Tr_with_id["Incubation Date"]=pd.to_datetime(mush_Tr_with_id["Incubation Date"],errors="coerce")
# Now we have "Incubation Date" as datetime object, extract the mon
mush_Tr_with_id["Incubation Month"]=mush_Tr_with_id["Incubation Date"].dt.month
# Treate any missing values such as NaN and dropna
mush_Tr_with_id.dropna(subset=["Incubation Month"],inplace=True)
# Calculate the probability of each month
prob_by_ID_month=mush_Tr_with_id[mush_Tr_with_id["Edibility"]!="NotDefined_2"].groupby("Incubation Month")["Edibility"].apply(lambda x: (x == "edible").mean())
prob_by_ID_month

Incubation Month
1    0.518519
2    0.485030
3    0.548913
4    0.486034
5    0.476923
Name: Edibility, dtype: float64

In [20]:
mush_Tr_with_id.describe()

Unnamed: 0,Habitat,Market Demand,Growth Rate,Inspection Date,Temperature,Incubation Date,Incubation Month
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000,1000.0
mean,2.105,5.132,2.68384,1.67219e+18,13.874444,2022-12-23 14:25:26.400000,3.054
min,0.0,1.0,0.502267,1.578269e+18,-17.777778,2020-01-01 00:00:00,1.0
25%,1.0,3.0,1.377745,1.673222e+18,6.0,2023-01-04 00:00:00,2.0
50%,2.0,5.0,2.542528,1.677888e+18,14.0,2023-03-01 00:00:00,3.0
75%,3.0,7.0,3.746824,1.68102e+18,22.0,2023-04-04 00:00:00,4.0
max,4.0,10.0,21.511763,1.93104e+18,49.0,2031-03-04 00:00:00,5.0
std,1.392089,2.79184,1.825735,2.787587e+16,9.670373,,1.411764
