In [None]:
# ***visualizations after cell 33 ***

In [None]:
# 6th version
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load the raw mushroom dataset and preview its first ten rows.
raw_csv_path = "/Users/eupirate/Desktop/CA_data_exp_visdual/mush.csv"
mushroom_raw_df = pd.read_csv(raw_csv_path)
mushroom_raw_df.head(n=10)

In [None]:
# Store each row's label plus one in a dedicated "Mushroom ID" column.
mushroom_id_labels = mushroom_raw_df.index + 1
mushroom_raw_df["Mushroom ID"] = mushroom_id_labels

In [None]:
# Treat the "?" placeholder as a missing value across the whole frame.
mushroom_raw_df.replace("?", np.nan, inplace=True)

# Categorical feature columns to audit for missing values.
# "Edibility" is deliberately left out; add it to this list to audit it as well.
categorical_feature_columns = [
    "Cap Shape", "Cap Color", "Gill Size", "Stalk Shape",
    "Habitat", "Spore Print Color", "Population",
    "Odor", "Bruising", "Ring Type",
    "Stalk Surface Above Ring", "Stalk Surface Below Ring",
    "Stalk Color Above Ring", "Stalk Color Below Ring",
    "Veil Type", "Veil Color",
]

# Boolean mask: True wherever a value is missing.
df_bool = mushroom_raw_df[categorical_feature_columns].isna()

# Missing values per column. The mask is already boolean, so it is summed
# directly; calling .isnull() on the mask again would always report 0.
df_bool.sum()

In [None]:
# Summary statistics of the raw frame, taken before any rows with
# missing values are dropped.
mushroom_raw_df.describe()

In [None]:
# Keep only the rows that are complete in every column, then summarise them.
mushroom_drop_NaN = mushroom_raw_df.dropna(axis=0, how="any")
mushroom_drop_NaN.describe()

# Before dropping NaN we had 1000 rows of data; after dropping NaN we only have 119 rows left to work with, which is not a good representation of the dataset. We should explore other approaches to handle the NaN values.

# Since the affected columns are categorical mushroom features, we choose to replace each NaN with the mode (the most common value) of its column.

# With a better understanding of the data, the Edibility column is excluded from the mode replacement: we don't want to misclassify or make inaccurate predictions, because the safety of a mushroom's edibility is the highest priority.

In [None]:
# Replace missing values in every text (object) column with that column's mode.
# "Edibility" is skipped here; it will be handled separately soon.
for column in mushroom_raw_df.select_dtypes(include="object").columns:
    if column == "Edibility":
        continue
    column_modes = mushroom_raw_df[column].mode()
    # mode() is empty when the column holds only NaN: nothing to fill with.
    if column_modes.empty:
        continue
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection: the in-place form acts on an intermediate object
    # and is not guaranteed to update the frame (chained assignment).
    mushroom_raw_df[column] = mushroom_raw_df[column].fillna(column_modes.iloc[0])

In [None]:
# Make a working copy of the mode-filled frame. The copy still contains the
# "Edibility" column; that column was only skipped by the mode replacement.
mushroom_raw_df_tr=mushroom_raw_df.copy()
# Column overview (dtypes and non-null counts) of the original frame.
mushroom_raw_df.info()

In [None]:
# Disabled experiment, kept for reference only (not executed):
# create a treated 'Edibility_Tr1' column with a hard-coded "NotDefined_1" label.
# Written as real comments rather than a triple-quoted string so the cell does
# not echo the text as its output.
#
# mushroom_raw_df_tr=mushroom_raw_df.copy()
# mushroom_raw_df_tr["Edibility_Tr1"]=mushroom_raw_df_tr["Edibility"].fillna("NotDefined_1")
# mushroom_raw_df_tr.info()
# mushroom_raw_df_tr.head(21)

# We apply SimpleImputer to convert the missing values ("?") in the "Edibility" column to "NotDefined_2", so that we can easily identify the NotDefined_2 cells that still need to be handled once we understand the other columns better. First we need to ensure "Mushroom ID" is properly indexed.

In [None]:
# Apply SimpleImputer to treat the "Edibility" column
from sklearn.impute import SimpleImputer

# Start from a fresh copy of the mode-filled frame
mushroom_raw_df_tr = mushroom_raw_df.copy()

# Constant-strategy imputer: every missing value becomes "NotDefined_2"
imp = SimpleImputer(strategy="constant", fill_value="NotDefined_2")

# fit_transform returns a one-column 2-D array; write that single column
# straight back into "Edibility" (no temporary 'Edibility_Tr2' column needed)
edibility_filled = imp.fit_transform(mushroom_raw_df_tr[["Edibility"]])
mushroom_raw_df_tr["Edibility"] = edibility_filled[:, 0]
mushroom_raw_df_tr

# Convert "Temperature" from Fahrenheit F to  Celsius C

In [None]:
#----------Temperature Column------------
# Make "Mushroom ID" the row index. The guard lets this cell be re-run:
# once the column has become the index, set_index would raise a KeyError.
if "Mushroom ID" in mushroom_raw_df_tr.columns:
    mushroom_raw_df_tr.set_index("Mushroom ID", inplace=True)

# Rows whose temperature is recorded in Fahrenheit
# (the original author notes the Temperature column has no missing data).
f_rows = mushroom_raw_df_tr["Measurement"] == "F"

# Keep the temperature values on the raw frame as well. The two frames carry
# different row labels (default labels vs "Mushroom ID"), so the values are
# copied by position; a label-aligned assignment would shift every value by
# one row and leave the first row NaN.
# NOTE(review): this snapshot is taken before the F -> C conversion, as in the
# original code - confirm whether the converted values were intended instead.
mushroom_raw_df["Temperature_Tr"] = mushroom_raw_df_tr["Temperature"].to_numpy()

# The conversion yields fractional values, so make sure the column is float
# first (in case it was read as integers), then convert F to C.
mushroom_raw_df_tr["Temperature"] = mushroom_raw_df_tr["Temperature"].astype(float)
mushroom_raw_df_tr.loc[f_rows, "Temperature"] = (
    mushroom_raw_df_tr.loc[f_rows, "Temperature"] - 32
) * 5 / 9

# Update the Measurement column to C for these rows
mushroom_raw_df_tr.loc[f_rows, "Measurement"] = "C"

# Quick spot check that the F values have been converted to C values
#temps_index=mushroom_raw_df_tr.loc[[87,88,99,111,141],"Temperature"]

mushroom_raw_df_tr.info() #ensure we now have 1000 rows of data

# Investigate whether Temperature has any correlation with the mushroom's Growth Rate.

In [None]:
# Pairwise correlation between growth rate and temperature
growth_and_temperature = mushroom_raw_df_tr.loc[:, ["Growth Rate", "Temperature"]]
corr_temp_growth = growth_and_temperature.corr()
corr_temp_growth

# The correlation value of 0.049409 is very low, indicating a very weak positive correlation between the "Growth Rate" and "Temperature" columns.

In [None]:
# Identify outliers in the "Temperature" column
sns.set(style="whitegrid")

# Growth rate against temperature. Both columns are numeric, so a scatter plot
# is used: a box plot given two numeric axes treats one of them as categorical
# and draws one box per distinct temperature value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x="Temperature", y="Growth Rate", data=mushroom_raw_df_tr, ax=ax)
ax.set_title("Mushroom Growth Rate by Temperature")
ax.set_xlabel("Temperature in C")
ax.set_ylabel("Growth Rate")
plt.show()

# Quartiles of the temperature distribution:
# Q1 - 25% of the values lie below this point
# Q2 - 50%, the median
# Q3 - 75% of the values lie below this point
# The interquartile range (IQR = Q3 - Q1) measures the typical spread; values
# further than 1.5 * IQR outside [Q1, Q3] are flagged as outliers, the same
# rule a box plot uses by default to draw its whiskers.
temperature = mushroom_raw_df_tr["Temperature"]
Q1 = temperature.quantile(0.25)
Q3 = temperature.quantile(0.75)
iqr = Q3 - Q1
# Legacy name kept for any later cell that reads it: it holds the IQR,
# not the outlier rows.
outliers = iqr
print(outliers)

lower_fence = Q1 - 1.5 * iqr
upper_fence = Q3 + 1.5 * iqr
temperature_outliers = mushroom_raw_df_tr[
    (temperature < lower_fence) | (temperature > upper_fence)
]
print(f"Outlier bounds: [{lower_fence}, {upper_fence}]")
print(f"Number of temperature outliers: {len(temperature_outliers)}")

# Distribution of temperature with its outliers. A single colour is passed
# because `palette` only applies together with `hue`, and no legend is
# requested because the plot has no labelled artists.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    x=mushroom_raw_df_tr["Temperature"],
    color=sns.color_palette("coolwarm", 1)[0],
    ax=ax,
)
ax.set_title("Mushroom Temperature Distribution of Outliers")
ax.set_xlabel("Temperature in C")
plt.show()

# info() prints its report itself and returns None, so it is not wrapped in print()
mushroom_raw_df_tr.info()

# With the above mushroom temperature distribution visual, we learned that the majority of mushroom growth happens in the normal temperature range, with only a minority of outliers. It also supports the weak positive correlation between the "Growth Rate" and "Temperature" columns found in the previous correlation analysis.

# With additional research about growing mushrooms in commercial cultivation farms, we need to include more columns to analyse the mushrooms to investigate what key attributes influence mushroom's Edibility.
# Identified columns are: "Growth Rate","Temperature","Incubation date","Habitat".

In [None]:
# Disabled step, kept for reference only (not executed). Written as real
# comments rather than a triple-quoted string so the cell does not echo the
# text as its output.
#
# The only 2 key dated columns in datasets are "Inspection Date" and "Incubation Date", we need to convert them to datetime or numeric format
# "Incubation date"s "date" need to be renamed to "Date"
#
# mushroom_raw_df_tr["Incubation Date"]=mushroom_raw_df_tr["Incubation date"]
#
# mushroom_raw_df_tr["Inspection Date"]=pd.to_datetime(mushroom_raw_df_tr["Inspection Date"])
# mushroom_raw_df_tr["Incubation Date"]=pd.to_datetime(mushroom_raw_df_tr["Incubation Date"])
#
# Drop "Incubation date" column
# mushroom_raw_df_tr.drop(["Incubation date"],axis="columns",inplace=True)
#
# mushroom_raw_df_tr.info()
# mushroom_raw_df_tr.head(50)

In [None]:
# Encode the categorical "Habitat" column as integer codes with LabelEncoder

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

encoder = LabelEncoder()
habitat_codes = encoder.fit_transform(mushroom_raw_df_tr["Habitat"])
mushroom_raw_df_tr["Habitat"] = habitat_codes

# Distinct codes in ascending order
unique_habitats = np.sort(mushroom_raw_df_tr["Habitat"].unique())

## Validate unique 5 encoding: 0,1,2,3,4
print(unique_habitats)

# We've managed to cleanse our mushroom dataset by processing missing values with various techniques.
# Next we can carry out in depth analysis and process of "Edibility" column with treated new mush_Tr.csv

In [None]:
# Load the treated dataset and work on a copy of it
treated_csv_path = "/Users/eupirate/Desktop/CA_data_exp_visdual/mush_Tr.csv"
mush_Tr = pd.read_csv(treated_csv_path)
mush_Tr_with_id = mush_Tr.copy()

# Number the rows 1..N in a leading "Mushroom ID" column, then use it as the index
mushroom_id_values = range(1, len(mush_Tr_with_id) + 1)
mush_Tr_with_id.insert(loc=0, column="Mushroom ID", value=mushroom_id_values)
mush_Tr_with_id.set_index("Mushroom ID", inplace=True)

# Let's consider the conditional probability of edibility of each categorical feature and create bins with numerical features.

# Habitat - habitat
# Growth Rate - GR
# Temperature - temp
# Incubation Date - ID
# Market Demand - MD

In [None]:
# Share of edible mushrooms per "Habitat", using only rows whose edibility is known
habitat_known_rows = mush_Tr_with_id[mush_Tr_with_id["Edibility"] != "NotDefined_2"]
edibility_by_habitat = habitat_known_rows.groupby("Habitat")["Edibility"]
prob_by_habitat = edibility_by_habitat.apply(lambda labels: labels.eq("edible").mean())
prob_by_habitat
# weak influence - we may not have enough data

In [None]:
# Share of edible mushrooms per "Growth Rate" quartile
# Four quantile bins (duplicate bin edges are dropped)
GR_bins = pd.qcut(mush_Tr_with_id["Growth Rate"], q=4, duplicates="drop")
growth_known_rows = mush_Tr_with_id[mush_Tr_with_id["Edibility"] != "NotDefined_2"]
edibility_by_growth = growth_known_rows.groupby(GR_bins)["Edibility"]
prob_by_GR_bins = edibility_by_growth.apply(lambda labels: labels.eq("edible").mean())
prob_by_GR_bins
# weak influence - we may not have enough data

In [None]:
# Temperature spans a wide range, so standardize it before binning
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
standardized_temperature = scaler.fit_transform(mush_Tr_with_id[["Temperature"]])
mush_Tr_with_id["Temperature"] = standardized_temperature

# Share of edible mushrooms per "Temperature" quartile
# Four quantile bins
temp_bins = pd.qcut(mush_Tr_with_id["Temperature"], q=4)
temp_known_rows = mush_Tr_with_id[mush_Tr_with_id["Edibility"] != "NotDefined_2"]
edibility_by_temp = temp_known_rows.groupby(temp_bins)["Edibility"]
prob_by_temp_bins = edibility_by_temp.apply(lambda labels: labels.eq("edible").mean())
prob_by_temp_bins
# weak influence - we may not have enough data

In [None]:
# Derive "Incubation Month" from "Incubation Date" and compute the share of
# edible mushrooms per month
# Parse the dates; values that cannot be parsed become NaT
incubation_dates = pd.to_datetime(mush_Tr_with_id["Incubation Date"], errors="coerce")
mush_Tr_with_id["Incubation Date"] = incubation_dates
# Month number of each parsed date
mush_Tr_with_id["Incubation Month"] = incubation_dates.dt.month
# Drop the rows that ended up without a month
mush_Tr_with_id.dropna(subset=["Incubation Month"], inplace=True)
# Share of edible mushrooms for each month, among rows with known edibility
month_known_rows = mush_Tr_with_id[mush_Tr_with_id["Edibility"] != "NotDefined_2"]
edibility_by_month = month_known_rows.groupby("Incubation Month")["Edibility"]
prob_by_ID_month = edibility_by_month.apply(lambda labels: labels.eq("edible").mean())
prob_by_ID_month
# weak influence - we may not have enough data

In [None]:
# Share of edible mushrooms per "Market Demand" quartile
# Four quantile bins (duplicate bin edges are dropped)
MD_bins = pd.qcut(mush_Tr_with_id["Market Demand"], q=4, duplicates="drop")
demand_known_rows = mush_Tr_with_id[mush_Tr_with_id["Edibility"] != "NotDefined_2"]
edibility_by_demand = demand_known_rows.groupby(MD_bins)["Edibility"]
prob_by_MD_bins = edibility_by_demand.apply(lambda labels: labels.eq("edible").mean())
prob_by_MD_bins
# weak influence - we may not have enough data

In [None]:
# ****From here onwards, will be generated a new version of csv ****

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
import numpy as np

# Read from the latest processed cleansed csv
# NOTE(review): the only export of this file in the notebook is commented out,
# so the file must already exist on disk before this cell runs.
mush_Tr_with_id = pd.read_csv("/Users/eupirate/Desktop/CA_data_exp_visdual/mush_Tr_with_imputed_edibility.csv")

# Label-encode every text column except "Edibility" (the numeric steps that
# follow fail on strings such as 'sunken' in "Cap Shape").
# Each column gets its own freshly fitted encoder, stored under the column
# name, so its codes can later be mapped back to the original labels.
encoders = {}
for column in mush_Tr_with_id.columns:
    if mush_Tr_with_id[column].dtype == object and column != "Edibility":
        column_encoder = LabelEncoder()
        mush_Tr_with_id[column] = column_encoder.fit_transform(
            mush_Tr_with_id[column].astype(str)
        )
        encoders[column] = column_encoder
#encoders

In [None]:
# Replace "NotDefined_2" with NaN
mush_Tr_with_id["Edibility"] = mush_Tr_with_id["Edibility"].replace("NotDefined_2", np.nan)

# KNNImputer only accepts numeric input: fitting it on the frame while it still
# holds the text labels raises
# "ValueError: could not convert string to float: 'edible'".
# "Edibility" is therefore set aside, the remaining columns are imputed, and
# the column is added back afterwards.
edibility_column = mush_Tr_with_id["Edibility"]

# A separately named frame is used instead of rebinding mush_Tr_with_id, so
# this cell can be re-run without a KeyError on the already removed column.
features_without_edibility = mush_Tr_with_id.drop(columns="Edibility")

# Apply KNN imputation to the feature columns
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(features_without_edibility)

# Convert the imputed np array back to a DataFrame
mush_Tr_with_id_imputed = pd.DataFrame(
    imputed_values, columns=features_without_edibility.columns
)

# Add "Edibility" column back to DataFrame (by position)
mush_Tr_with_id_imputed["Edibility"] = edibility_column.values

# Handle Mushroom ID column indexing
mush_Tr_with_id_imputed["Mushroom ID"] = mush_Tr_with_id_imputed.index + 1

# output_file_path = "/Users/eupirate/Desktop/CA_data_exp_visdual/mush_Tr_with_imputed_edibility.csv"
# mush_Tr_with_id_imputed.to_csv(output_file_path, index=False)

# Export the latest csv file as "mush_Tr_with_imputed_edibility.csv"

In [None]:
# Number of rows whose "Edibility" label is still missing
edibility_is_missing = mush_Tr_with_id_imputed["Edibility"].isna()
num_na_edibility = edibility_is_missing.sum()
print(num_na_edibility)

# Now we split the "Edibility" values into 2 parts, known(edible, poisonous) and unknown (NaN)
# Our goal is to apply classification algorithm, use the part with known "Edibility" to train a classifier, and using "Edibility" as the target variable and the other features as predictors.

In [None]:
# Preview the first rows of the imputed frame before splitting it by label.
mush_Tr_with_id_imputed.head()

In [None]:
# Rows whose "Edibility" label is present
has_edibility_label = mush_Tr_with_id_imputed["Edibility"].notna()
known_edibility = mush_Tr_with_id_imputed[has_edibility_label].copy()

# Encode the known labels as integers so they can be used as a classification target
encoder = LabelEncoder()
known_edibility["Edibility_Encoded"] = encoder.fit_transform(known_edibility["Edibility"])

# Rows whose label is still missing
unknown_edibility = mush_Tr_with_id_imputed[~has_edibility_label]
unknown_edibility

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Split the rows with a known label into training and test sets
from sklearn.model_selection import train_test_split

# Predictors: everything except the target in its text and encoded forms...
X = known_edibility.drop(["Edibility", "Edibility_Encoded"], axis=1)
# ...and except "Mushroom ID", which is only a row number and carries no
# information about the mushroom. Scores quoted elsewhere in the notebook
# were produced with the ID still included and will differ.
X = X.drop(columns=["Mushroom ID"], errors="ignore")
y = known_edibility["Edibility_Encoded"]

# 80/20 split with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train KNN and Decision Tree classifiers using the training set
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Create the classifiers. The tree gets a fixed seed: without it each run
# can build a different tree, so its reported scores would not be reproducible.
knn = KNeighborsClassifier()
decision_tree = DecisionTreeClassifier(random_state=42)

# Train the classifiers
knn.fit(X_train, y_train)
decision_tree.fit(X_train, y_train)

# Make predictions on the test set
y_pred_knn = knn.predict(X_test)
y_pred_decision_tree = decision_tree.predict(X_test)

In [None]:
# Evaluate the KNN classifier's scores:
from sklearn.metrics import classification_report,accuracy_score
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_report = classification_report(y_test, y_pred_knn)
print("KNN Accuracy:", knn_accuracy)
print("KNN Classification Report:\n", knn_report)

The KNN accuracy score is 0.4157303370786517, which is low. KNN doesn't perform well on our dataset.

In [None]:
# Evaluate the Decision Tree classifier's scores:
decision_tree_accuracy = accuracy_score(y_test, y_pred_decision_tree)
decision_tree_report = classification_report(y_test, y_pred_decision_tree)
print("Decision Tree Accuracy:", decision_tree_accuracy)
print("Decision Tree Classification Report:\n", decision_tree_report)

The Decision Tree accuracy score is 0.5224719101123596; it performed a little better than the KNN classifier, but not significantly.

In [None]:
# *** visualizations ***

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Plot theme for all visualizations
sns.set(style="whitegrid")
# Label the rows without an edibility value as "Unknown" so they show up in the plots
edibility_labels = mush_Tr_with_id_imputed["Edibility"]
mush_Tr_with_id_imputed["Edibility"] = edibility_labels.where(edibility_labels.notna(), "Unknown")

In [None]:
# Visualization 1
# Distribution of Edible and Poisonous Mushrooms (unclassified rows appear as "Unknown")
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x="Edibility", data=mush_Tr_with_id_imputed, palette="coolwarm", ax=ax)
ax.set_title("Distribution of Edible vs Poisonous Mushrooms")
ax.set_ylabel("Count")
ax.set_xlabel("Edibility Status")
plt.show()

In [None]:
# Visualization 2
# Market Demand by Edibility
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x="Edibility", y="Market Demand", data=mush_Tr_with_id_imputed, palette="coolwarm", ax=ax)
ax.set_title("Market Demand by Edibility")
ax.set_ylabel("Market Demand")
ax.set_xlabel("Edibility Status")
plt.show()

In [None]:
# Visualization 3
# Habitat Distribution of Mushrooms
fig, ax = plt.subplots(figsize=(10, 6))
habitat_counts = mush_Tr_with_id_imputed["Habitat"].value_counts()
habitat_labels = habitat_counts.index
ax.pie(
    habitat_counts,
    labels=habitat_labels,
    autopct="%1.1f%%",
    startangle=140,
    colors=sns.color_palette("Set2"),
)
ax.set_title("Habitat Distribution of Mushrooms")
# Equal aspect ratio so the pie is drawn as a circle
ax.axis("equal")
plt.show()

In [None]:
# Visualization 4
# Temperature Range for Mushroom Growth, stacked by edibility status
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=mush_Tr_with_id_imputed,
    x="Temperature",
    hue="Edibility",
    multiple="stack",
    palette="cool",
    ax=ax,
)
ax.set_title("Temperature Range of Mushroom's Growth")
ax.set_xlabel("Temperature")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Visualization 5 - drop
# Correlation Heatmap over the float64 / int64 columns
fig, ax = plt.subplots(figsize=(14, 10))
numeric_columns = mush_Tr_with_id_imputed.select_dtypes(include=["float64", "int64"])
correlation_matrix = numeric_columns.corr()
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap of Mushroom Attributes")
plt.show()


Conclusion:

Wrong predictions about a mushroom's Edibility can be dangerous, so this risk must be taken into account.
Given only the dataset and its data dictionary, we don't know how the raw data was collected or processed, or whether there were human or sensor errors.

According to the data, mushrooms in the Unknown Edibility category do not sit at the extreme low or high end of Market Demand, so it’s worth exploring them further to uncover their potential commercial value.

From a business standpoint, we recommend holding these mushrooms back from sale pending further analysis, testing and classification.
We recommend consulting domain experts and seeking their advice on these mushrooms' characteristics and on which additional features we should explore and identify for official certification if they turn out to be Edible.

Based on additional research, we recommend the following resources to help this agriculture company seek professional advice, evaluate its current classification process, and obtain ISO certification, in order to prevent unknown, low-quality data from leaking into the rest of the business pipeline.

According to data, China, US and Japan are the highest mushroom producers in the world. Here are some useful resources to assist.
