In [None]:
# 5th version
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load the raw mushroom dataset and preview its first ten rows.
raw_csv_path = "/Users/eupirate/Desktop/CA_data_exp_visdual/mush.csv"
mushroom_raw_df = pd.read_csv(raw_csv_path)
mushroom_raw_df.head(n=10)

In [None]:
# Overwrite "Mushroom ID" with a 1-based sequence derived from the row index.
one_based_ids = mushroom_raw_df.index + 1
mushroom_raw_df["Mushroom ID"] = one_based_ids
# Spot check (disabled): print(mushroom_raw_df.iloc[:16])

In [None]:
# Treat the "?" placeholder as a genuine missing value.
mushroom_raw_df = mushroom_raw_df.replace("?", np.nan)

# Categorical feature columns to audit for missing values. "Edibility" is
# deliberately left out of this list; append it to include it in the audit.
feature_columns = [
    "Cap Shape", "Cap Color", "Gill Size", "Stalk Shape",
    "Habitat", "Spore Print Color", "Population",
    "Odor", "Bruising", "Ring Type",
    "Stalk Surface Above Ring", "Stalk Surface Below Ring",
    "Stalk Color Above Ring", "Stalk Color Below Ring",
    "Veil Type", "Veil Color",
]

# Boolean mask: True wherever a feature value is missing.
df_bool = mushroom_raw_df[feature_columns].isna()

# Number of missing values per column. Summing the mask counts its True
# entries; calling .isnull() on the mask again would always report zero,
# because a boolean mask never contains NaN.
df_bool.sum()

In [None]:
# Summary statistics of the raw frame while it still contains NaN values.
raw_summary = mushroom_raw_df.describe()
raw_summary

In [None]:
# Drop every row that contains at least one NaN and summarise what is left.
mushroom_drop_NaN = mushroom_raw_df.dropna(axis=0, how="any")
mushroom_drop_NaN.describe()

# Before dropping NaN values we had 1000 rows of data; after dropping them, only 119 rows remain, which is not a good representation of the dataset. We should explore other approaches to replacing the NaN values.

# The affected columns are the mushrooms' categorical feature columns, so we replace each NaN with the mode (the most common value) of its column.

# With a better understanding of the data, the Edibility column is excluded from the mode replacement: we do not want to misclassify mushrooms or make inaccurate predictions, and mushroom edibility must be predicted safely.

In [None]:
# Fill missing values in every object (text) column with that column's mode.
# "Edibility" is skipped here; it is handled separately later.
for column in mushroom_raw_df.select_dtypes(include="object").columns:
    if column == "Edibility":
        continue
    column_modes = mushroom_raw_df[column].mode()
    if column_modes.empty:
        # Every value in the column is missing, so there is no mode to fill with.
        continue
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selected column: that form is chained assignment, which pandas warns
    # about and which leaves the frame unchanged under Copy-on-Write.
    mushroom_raw_df[column] = mushroom_raw_df[column].fillna(column_modes.iloc[0])

In [None]:
# Take an independent working copy of the frame. The copy still contains the
# "Edibility" column; only the mode replacement skipped it.
mushroom_raw_df_tr = mushroom_raw_df.copy(deep=True)

# Column dtypes and non-null counts of the source frame.
mushroom_raw_df.info()

In [None]:
# Disabled alternative: create a treated 'Edibility_Tr1' column by filling
# missing "Edibility" values with the hardcoded label "NotDefined_1".
# The snippet is wrapped in a string literal, so none of it is executed.
'''
mushroom_raw_df_tr=mushroom_raw_df.copy()
mushroom_raw_df_tr["Edibility_Tr1"]=mushroom_raw_df_tr["Edibility"].fillna("NotDefined_1")
mushroom_raw_df_tr.info()
mushroom_raw_df_tr.head(21)
'''

# We apply SimpleImputer to convert the missing values (originally "?") in the "Edibility" column to "NotDefined_2", so that NotDefined_2 cells can easily be identified and handled once we understand the other columns better. First we need to ensure that "Mushroom ID" is properly indexed and that "Temperature" is converted from Fahrenheit (F) to Celsius (C).

In [None]:
# Treat "Edibility", normalise "Temperature" to Celsius, and index by "Mushroom ID".
# Start from a fresh copy so the cell can be re-run; the helper column this cell
# writes onto the raw frame is excluded so that a re-run yields the same frame.
mushroom_raw_df_tr = mushroom_raw_df.drop(columns=["Temperature_Tr"], errors="ignore").copy()

# Replace missing "Edibility" values with the sentinel "NotDefined_2" so that
# unlabelled rows stay identifiable instead of being filled with a guess.
imp = SimpleImputer(strategy="constant", fill_value="NotDefined_2")
mushroom_raw_df_tr[["Edibility"]] = imp.fit_transform(mushroom_raw_df_tr[["Edibility"]])

# Rows whose temperature was recorded in Fahrenheit.
f_rows = mushroom_raw_df_tr["Measurement"] == "F"

# Cast to float first so converted values are never written into an integer column.
mushroom_raw_df_tr["Temperature"] = mushroom_raw_df_tr["Temperature"].astype(float)
mushroom_raw_df_tr.loc[f_rows, "Temperature"] = (mushroom_raw_df_tr.loc[f_rows, "Temperature"] - 32) * 5 / 9

# Update the Measurement column to C for the converted rows.
mushroom_raw_df_tr.loc[f_rows, "Measurement"] = "C"

# Keep the treated (Celsius) temperatures on the raw frame as well. This is done
# while both frames still share the same row index: assigning after set_index
# would align labels 1..n against 0..n-1 and shift every value by one row.
# NOTE(review): the "_Tr" suffix suggests the converted values are wanted here;
# confirm, since the previous code copied the column before the conversion.
mushroom_raw_df["Temperature_Tr"] = mushroom_raw_df_tr["Temperature"]

# Index by "Mushroom ID" last, once all row-aligned work is finished.
mushroom_raw_df_tr = mushroom_raw_df_tr.set_index("Mushroom ID")

mushroom_raw_df_tr.info()  # ensure we now have 1000 rows of data

In [None]:
# We now investigate whether Temperature has any impact on a mushroom's Growth Rate.

In [None]:
# Pairwise correlation between growth rate and temperature.
growth_and_temp = mushroom_raw_df_tr[["Growth Rate", "Temperature"]]
corr_temp_growth = growth_and_temp.corr()
corr_temp_growth
# The correlation value of 0.049409 is very low (close to 0), indicating a very weak positive correlation between the "Growth Rate" and "Temperature" columns.

# After additional research about mushrooms, we need to include more features in the analysis.
# We now investigate whether other columns, combined, have a strong influence on a mushroom's Edibility.
# Identified columns are: "Growth Rate","Temperature","Incubation date","Habitat"

In [None]:
# Disabled step: rename "Incubation date" to "Incubation Date" and parse both
# date columns with pd.to_datetime. The snippet is wrapped in a string literal,
# so none of it is executed.
'''
# The only 2 key dated columns in datasets are "Inspection Date" and "Incubation Date", we need to convert them to datetime or numeric format
# "Incubation date"s "date" need to be renamed to "Date"

mushroom_raw_df_tr["Incubation Date"]=mushroom_raw_df_tr["Incubation date"]

mushroom_raw_df_tr["Inspection Date"]=pd.to_datetime(mushroom_raw_df_tr["Inspection Date"]) 
mushroom_raw_df_tr["Incubation Date"]=pd.to_datetime(mushroom_raw_df_tr["Incubation Date"]) 

# Drop "Incubation date" column
mushroom_raw_df_tr.drop(["Incubation date"],axis="columns",inplace=True)

#mushroom_raw_df_tr.info()
#mushroom_raw_df_tr.head(50)
'''

In [None]:
# Label-encode the categorical "Habitat" column: each distinct habitat is
# replaced by one integer code (this is LabelEncoder, not one-hot encoding).

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Encode only while the column still holds the original labels. Re-fitting on
# already-encoded integers would overwrite encoder.classes_ with the codes and
# lose the mapping back to the habitat names.
if not pd.api.types.is_numeric_dtype(mushroom_raw_df_tr["Habitat"]):
    encoder = LabelEncoder()
    mushroom_raw_df_tr["Habitat"] = encoder.fit_transform(mushroom_raw_df_tr["Habitat"])

unique_habitats = mushroom_raw_df_tr["Habitat"].unique()
unique_habitats.sort()

## Validate unique 5 encoding: 0,1,2,3,4
print(unique_habitats)

In [None]:
# Label-encode the categorical "Habitat" column if that has not happened yet,
# then validate the resulting codes.
#
# This cell repeats the encoding step of the cell before it. Fitting a new
# LabelEncoder on already-encoded integers would replace encoder.classes_
# (the habitat names) with the bare codes, so the encoding is skipped when the
# column is already numeric.
if not pd.api.types.is_numeric_dtype(mushroom_raw_df_tr["Habitat"]):
    encoder = LabelEncoder()
    mushroom_raw_df_tr["Habitat"] = encoder.fit_transform(mushroom_raw_df_tr["Habitat"])

unique_habitats = mushroom_raw_df_tr["Habitat"].unique()
unique_habitats.sort()

## Validate unique 5 encoding: 0,1,2,3,4
print(unique_habitats)

# We've managed to cleanse our mushroom dataset by processing missing values with a variety of techniques.
# Next we can carry out an in-depth analysis of the "Edibility" column using the treated dataset, mush_Tr.csv.

In [None]:
# Load the treated dataset from disk.
treated_csv_path = "/Users/eupirate/Desktop/CA_data_exp_visdual/mush_Tr.csv"
mush_Tr = pd.read_csv(treated_csv_path)

# Work on a copy that is indexed by a 1-based "Mushroom ID".
mush_Tr_with_id = mush_Tr.copy()
row_count = len(mush_Tr_with_id)
mush_Tr_with_id.insert(loc=0, column="Mushroom ID", value=range(1, row_count + 1))
mush_Tr_with_id = mush_Tr_with_id.set_index("Mushroom ID")

# Let's consider the conditional probability of edibility for each categorical feature, and create bins for the numerical features.
# Habitat - habitat
# Growth Rate - GR
# Temperature - temp
# Incubation Date - ID

In [None]:
# Share of "edible" mushrooms per habitat, ignoring rows whose edibility is "NotDefined_2".
has_label = mush_Tr_with_id["Edibility"] != "NotDefined_2"
labelled_rows = mush_Tr_with_id[has_label]
prob_by_habitat = labelled_rows.groupby("Habitat")["Edibility"].apply(
    lambda labels: labels.eq("edible").mean()
)
prob_by_habitat

In [None]:
# Probability of "edible" per "Growth Rate" quartile.
# The four quantile bins are computed over all rows; duplicate bin edges are dropped.
GR_bins = pd.qcut(mush_Tr_with_id["Growth Rate"], q=4, duplicates="drop")

# Only rows with a defined edibility label take part in the probability.
labelled_GR_rows = mush_Tr_with_id[mush_Tr_with_id["Edibility"] != "NotDefined_2"]

# observed=False is passed explicitly: grouping by a categorical without it
# relies on a default that pandas has deprecated and plans to change. Passing it
# keeps every bin in the result and silences the FutureWarning.
prob_by_GR_bins = (
    labelled_GR_rows
    .groupby(GR_bins, observed=False)["Edibility"]
    .apply(lambda x: (x == "edible").mean())
)
prob_by_GR_bins

In [None]:
# Probability of "edible" per "Temperature" quartile.
# The four quantile bins are computed over all rows. duplicates="drop" matches
# the "Growth Rate" binning and avoids a ValueError when quantile edges repeat.
temp_bins = pd.qcut(mush_Tr_with_id["Temperature"], q=4, duplicates="drop")

# Only rows with a defined edibility label take part in the probability.
labelled_temp_rows = mush_Tr_with_id[mush_Tr_with_id["Edibility"] != "NotDefined_2"]

# observed=False is passed explicitly: grouping by a categorical without it
# relies on a default that pandas has deprecated and plans to change.
prob_by_temp_bins = (
    labelled_temp_rows
    .groupby(temp_bins, observed=False)["Edibility"]
    .apply(lambda x: (x == "edible").mean())
)
prob_by_temp_bins

In [None]:
# Probability of "edible" per incubation month.
# Parse "Incubation Date"; values that cannot be parsed become NaT instead of raising.
parsed_incubation = pd.to_datetime(mush_Tr_with_id["Incubation Date"], errors="coerce")
mush_Tr_with_id["Incubation Date"] = parsed_incubation

# Extract the month number from the parsed dates.
mush_Tr_with_id["Incubation Month"] = parsed_incubation.dt.month

# Discard rows that ended up without a month.
mush_Tr_with_id = mush_Tr_with_id.dropna(subset=["Incubation Month"])

# Share of "edible" per month among rows with a defined edibility label.
known_edibility = mush_Tr_with_id["Edibility"] != "NotDefined_2"
prob_by_ID_month = (
    mush_Tr_with_id[known_edibility]
    .groupby("Incubation Month")["Edibility"]
    .apply(lambda x: (x == "edible").mean())
)
prob_by_ID_month

In [None]:
# Summary statistics of the treated, ID-indexed frame.
treated_summary = mush_Tr_with_id.describe()
treated_summary