In [None]:
%load_ext autoreload
%autoreload 2


import os
import sys
import matplotlib.pyplot as plt


# Climb towards the repository root: keep moving one directory up while the
# current working directory path still contains any of these marker names.
# NOTE(review): this is a substring test on the whole path, so a directory such
# as "my_projects_backup" also matches — confirm that is intended.
while any(marker in os.getcwd() for marker in ('exercises', 'notebooks', 'students', 'research', 'projects')):
    previous_dir = os.getcwd()
    os.chdir("..")
    if os.getcwd() == previous_dir:
        # ".." no longer changes the directory (filesystem root reached); stop
        # instead of looping forever.
        break

# Register <working directory>/src for imports. The absolute path keeps the entry
# valid if the working directory changes later, and the membership check keeps
# re-runs of this cell from appending duplicate entries.
src_dir = os.path.abspath('src')
if src_dir not in sys.path:
    sys.path.append(src_dir)
os.getcwd()

In [12]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
  
# Download dataset 73 from the UCI ML repository.
mushroom = fetch_ucirepo(id=73)

# Place the feature columns and the target column(s) side by side in one DataFrame.
df = pd.concat(
    (mushroom.data.features, mushroom.data.targets),
    axis="columns",
)

In [None]:
# Overview of the frame: one line per column with its non-null count and dtype.
df.info()

All columns have the same dtype: `object`.

In [None]:
df.describe()
# How to read the rows of the summary above:
#   count  - total number of non-null (non-missing) entries in the column
#   unique - number of distinct categories in the column
#   top    - most frequent category
#   freq   - frequency count of the top category

In [None]:
# Checking the names of the columns
df.columns.tolist()

In [None]:
# Count the missing entries in each column
df.isna().sum()

Conclusion: stalk-root has too many null values, so it should be dropped.

In [None]:
# Rows that are exact copies of an earlier row (the actual duplicate check).
print(f"Duplicated rows: {df.duplicated().sum()}")

# Number of distinct values in each column. This measures cardinality, not
# duplication; it stays as the cell's displayed result.
df.nunique()

**Categorical variability analysis**

In [None]:
def categorical_variability(df):
    """Rank object-dtype columns by how dominant their most common value is.

    For every column of ``df`` with dtype ``object``, compute the share of
    entries taken by the most frequent value (``value_counts(normalize=True)``,
    so missing values are not counted).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns are analysed.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``DominantCategoryProportion``, sorted by the
        proportion in descending order.
    """
    object_columns = df.select_dtypes(include='object').columns

    # Share of the single most frequent value, per column.
    dominant_share = {
        name: df[name].value_counts(normalize=True).max()
        for name in object_columns
    }

    summary = pd.DataFrame(
        {
            "Feature": list(dominant_share),
            "DominantCategoryProportion": list(dominant_share.values()),
        }
    )
    return summary.sort_values("DominantCategoryProportion", ascending=False)


# Measure dominance on the features only: the target column is left out.
variability_df = categorical_variability(df.drop(columns=["poisonous"]))
# A bare last expression is rendered as a rich table by the notebook; print()
# would flatten it to plain text.
variability_df.head(10)


veil-type = 1.0: It has only one category so it tells us nothing about the edibility of the mushrooms.

We will check other categories with high dominance:


In [None]:
# Features whose DominantCategoryProportion is above 0.9.
highly_dominated = variability_df.loc[
    variability_df['DominantCategoryProportion'] > 0.9, 'Feature'
]

for col in highly_dominated:
    print(f"\nTarget Distribution for {col}:")
    # Row-normalised table: within each category of `col`, the share of every
    # value of the 'poisonous' column.
    target_share = pd.crosstab(df[col], df['poisonous'], normalize='index')
    print(target_share)

Because 'veil-type' has only one value, its row simply shows the overall proportion between edible and poisonous mushrooms.

'Veil-color' values 'n' and 'o' are strong indicators of edible mushrooms. The same holds for 'y', which is 100% poisonous. 'w' is a neutral indicator: its split is similar to the overall proportion between edible and poisonous mushrooms.

'Gill attachment': strong indicator -> 'a' (91.4% edible)\
'Ring number': strong indicators -> 'n' (100% poisonous), 't' (88% edible)

The proportions are easier to read in the stacked bar charts below.


In [None]:
# Every non-target column whose dtype is object.
categorical_cols = [col for col in df.columns if col != 'poisonous' and df[col].dtype == 'object']

for col in categorical_cols:
    # Within each category of `col`, the share of every value of 'poisonous'.
    class_share = pd.crosstab(df[col], df['poisonous'], normalize='index')

    # Draw on an explicit Axes instead of relying on pyplot's "current figure".
    fig, ax = plt.subplots()
    # Colours follow the crosstab's column order.
    # NOTE(review): assumes exactly two target classes, ordered edible then poisonous — confirm.
    class_share.plot.bar(stacked=True, color=['green', 'red'], ax=ax)
    ax.set_title(f'Poisonous Proportion by {col}')
    ax.set_ylabel('Proportion')
    plt.show()
    # One figure is created per feature; close it so figures do not pile up in memory.
    plt.close(fig)

# Summary #
1. All columns have the same dtype: `object`.
2. Stalk-root has too many null values, so it should be dropped.
3. The dominant category proportion analysis gave veil-type = 1.0: it has only one category, so it tells us nothing about the edibility of the mushrooms (it can be dropped).
4. Plots:\
Some feature values are so distinctive that they appear in only one class (only in e or only in p):

**The mushroom is 100% poisonous if:**\
spore print color = 'r'\
ring type = 'l', 'n'\
ring number = 'n'\
veil-color = 'y'\
stalk color above AND below ring = 'b', 'c', 'y'\
gill color = 'b', 'r'\
odor = 'c', 'f', 'm', 'p', 's', 'y'\
cap surface = 'g'\
cap shape = 'c'

**The mushroom is 100% edible if:**\
habitat = 'w'\
population = 'a', 'n'\
spore print color = 'b', 'o', 'u', 'y'\
ring type = 'r'\
veil color = 'n', 'o'\
stalk color above AND below ring = 'e', 'g', 'o'\
stalk root = 'r'\
gill color = 'e', 'o'\
odor = 'a', 'l'\
cap color = 'r', 'u'\
cap shape = 's'