In [None]:
import os, zipfile, numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay, classification_report
import joblib

# Paths: the raw UCI file is expected beside this script; figures go to ./figures.
ROOT = Path(".")
DATA = ROOT / "agaricus-lepiota.data"
FIG = ROOT / "figures"
FIG.mkdir(exist_ok=True)

# Column names in file order: the target first, then the 22 attributes.
cols = [
    "class",
    "cap-shape", "cap-surface", "cap-color",
    "bruises", "odor",
    "gill-attachment", "gill-spacing", "gill-size", "gill-color",
    "stalk-shape", "stalk-root",
    "stalk-surface-above-ring", "stalk-surface-below-ring",
    "stalk-color-above-ring", "stalk-color-below-ring",
    "veil-type", "veil-color",
    "ring-number", "ring-type",
    "spore-print-color", "population", "habitat",
]

# The file has no header row, so the names above are supplied explicitly.
df = pd.read_csv(DATA, names=cols, header=None)

# A veil-type column with a single distinct value cannot help any model; drop it in that case.
if df["veil-type"].nunique() == 1:
    df = df.drop("veil-type", axis=1)

# Features as plain strings; target is 1 for poisonous ("p") and 0 otherwise.
X_full = df.drop("class", axis=1).astype(str)
y = df["class"].eq("p").astype(int)

# 1) EDA plots
# Every chart is drawn on an explicitly created Axes. Without `ax=`, Series.plot
# draws onto whatever Axes is current when a figure is already open (for example
# one left behind by an earlier cell), which would end up in the saved PNG.
fig, ax = plt.subplots()
class_counts = df["class"].replace({"e":"edible","p":"poisonous"}).value_counts()
class_counts.plot(kind="bar", ax=ax)
ax.set_title("Class Distribution")
ax.set_xlabel("Class")
ax.set_ylabel("Count")
fig.tight_layout()
fig.savefig(FIG/"class_distribution.png")
plt.close(fig)

for col in ["odor","bruises","habitat","cap-color"]:
    # Rows: levels of `col`; columns: class codes; cells: number of samples.
    counts = df.groupby([col,"class"]).size().unstack(fill_value=0)
    # Divide each row by its total so every bar sums to 1.
    shares = counts.div(counts.sum(axis=1), axis=0)

    fig, ax = plt.subplots()
    shares.plot(kind="bar", stacked=True, ax=ax)
    ax.set_title(f"{col} vs class (row-normalized)")
    ax.set_ylabel("Share of samples")
    fig.tight_layout()
    fig.savefig(FIG/f"{col}_vs_class.png")
    # Close explicitly so figures do not accumulate across loop iterations.
    plt.close(fig)

# 2) Two missing-value strategies
def impute_mode(series, missing="?"):
    """Replace the missing-value marker in ``series`` with the most frequent observed value.

    Parameters
    ----------
    series : pandas.Series
        Column to impute. It is never modified in place.
    missing : object, default "?"
        Marker that denotes a missing entry.

    Returns
    -------
    pandas.Series
        ``series`` itself when it contains no marker, or when there is no
        observed value to impute from (e.g. every entry is the marker);
        otherwise a new Series with each marker replaced by the mode of the
        remaining values. Ties are resolved by taking the first value returned
        by ``Series.mode()``.
    """
    is_missing = series.eq(missing)
    if not is_missing.any():
        return series

    observed_modes = series[~is_missing].mode()
    if observed_modes.empty:
        # Nothing was observed, so there is no mode to fill with; leave the
        # column untouched instead of failing on an empty lookup.
        return series

    return series.where(~is_missing, observed_modes.iloc[0])

X_s1 = X_full.copy()                      # S1: keep '?' as level
X_s2 = X_full.copy()                      # S2: impute mode for stalk-root
X_s2["stalk-root"] = impute_mode(X_s2["stalk-root"])

# Encoders
# One-hot encoders: categories not seen during fit become an all-zero block.
ct1 = ColumnTransformer([("oh", OneHotEncoder(handle_unknown="ignore"), list(X_s1.columns))])
ct2 = ColumnTransformer([("oh", OneHotEncoder(handle_unknown="ignore"), list(X_s2.columns))])

# Ordinal encoder feeding CategoricalNB. CategoricalNB only accepts non-negative
# integer codes, so unseen categories must not be mapped to -1. Instead:
#   * the vocabulary of every feature is fixed up front (feature labels only, no
#     target information), so codes do not depend on which rows a CV fold holds;
#   * anything outside that vocabulary gets UNKNOWN_CODE, an index one past the
#     largest vocabulary, which CategoricalNB is told to reserve via min_categories.
# S2 simply never uses the '?' level of stalk-root; smoothing covers the empty bin.
cat_levels = [sorted(X_s1[c].unique()) for c in X_s1.columns]
UNKNOWN_CODE = max((len(levels) for levels in cat_levels), default=0)
ord_enc1 = ColumnTransformer([
    ("ord",
     OrdinalEncoder(categories=cat_levels,
                    handle_unknown="use_encoded_value",
                    unknown_value=UNKNOWN_CODE),
     list(X_s1.columns)),
])

# Models
models = {
    "KNN(k=5)": KNeighborsClassifier(n_neighbors=5),
    "DecisionTree": DecisionTreeClassifier(random_state=0),
    # min_categories makes room for every known level plus the unknown bucket.
    "CategoricalNB": CategoricalNB(min_categories=UNKNOWN_CODE + 1),
}

# Shuffled, seeded folds so repeated runs score the same splits.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

def evaluate(Xdata, transformer, use_cnb=False):
    """Cross-validate every entry of the module-level ``models`` on ``Xdata``.

    Parameters
    ----------
    Xdata : pandas.DataFrame
        Feature table scored against the module-level target ``y``.
    transformer : transformer
        Encoder placed in front of every model except "CategoricalNB", which
        always uses the module-level ``ord_enc1``.
    use_cnb : bool, default False
        Accepted for call compatibility; it has no effect on the result.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns "Model", "Mean Acc" and "Std": the mean
        and standard deviation of the per-fold accuracies over the module-level
        ``cv`` splitter.
    """
    rows = []
    for name, clf in models.items():
        encoder = ord_enc1 if name == "CategoricalNB" else transformer
        candidate = Pipeline([("enc", encoder), ("clf", clf)])
        fold_acc = cross_validate(candidate, Xdata, y, cv=cv, scoring="accuracy")["test_score"]
        rows.append((name, fold_acc.mean(), fold_acc.std()))
    return pd.DataFrame(rows, columns=["Model","Mean Acc","Std"])

tbl_s1 = evaluate(X_s1, ct1, use_cnb=True).assign(Strategy="S1_keep_missing_as_level")
tbl_s2 = evaluate(X_s2, ct2, use_cnb=True).assign(Strategy="S2_mode_impute")
# Stable sort: rows with equal accuracy keep their concat order (S1 before S2),
# so the winner is deterministic when several candidates tie.
summary = (
    pd.concat([tbl_s1, tbl_s2], ignore_index=True)
    .sort_values(["Mean Acc"], ascending=False, kind="mergesort")
)
print(summary)

# 3) Fit the best (model, strategy) pair and export
# Model and strategy are read from the SAME top row, so the exported pipeline is
# exactly the combination that achieved the reported accuracy.
winner = summary.iloc[0]
if pd.isna(winner["Mean Acc"]):
    # NaNs sort last, so a NaN in the top row means no candidate was scored.
    raise RuntimeError("Cross-validation produced no valid accuracy; nothing to export.")
best_name = winner["Model"]
best_strategy = winner["Strategy"]

use_X = X_s1 if best_strategy.startswith("S1") else X_s2
use_ct = ct1 if best_strategy.startswith("S1") else ct2
best_clf = models[best_name]
# CategoricalNB needs integer codes, every other model takes the one-hot encoding.
pipe = Pipeline([("enc", use_ct if best_name!="CategoricalNB" else ord_enc1), ("clf", best_clf)])
pipe.fit(use_X, y)

# Persist the fitted pipeline together with the strategy and the feature order it expects.
joblib.dump({"pipeline": pipe, "strategy": best_strategy, "features": list(use_X.columns)}, "mushroom_model.joblib")
print("Saved model: mushroom_model.joblib")

In [6]:
import streamlit as st, pandas as pd, joblib

# NOTE(review): this cell is a Streamlit script; such apps are normally started with
# `streamlit run <file>.py`, so consider exporting it to its own file.
st.set_page_config(page_title="Mushroom Classifier", layout="centered")
st.title("Mushroom Edibility Classifier")

# NOTE(security): joblib.load unpickles arbitrary Python objects. Only load a
# model file that you produced yourself with the training code.
bundle = joblib.load("mushroom_model.joblib")
pipe = bundle["pipeline"]; features = bundle["features"]


def _training_codes(pipeline, feature_names):
    """Return {feature: [codes known to the fitted encoder]}, or {} if they cannot be read."""
    try:
        # First entry of the fitted ColumnTransformer is the categorical encoder.
        encoder = pipeline.named_steps["enc"].transformers_[0][1]
        levels = encoder.categories_
    except (AttributeError, KeyError, IndexError, TypeError):
        return {}
    if len(levels) != len(feature_names):
        return {}
    return {name: [str(code) for code in codes] for name, codes in zip(feature_names, levels)}


codes_by_feature = _training_codes(pipe, features)

# Simple UI: build a row of inputs
st.write("Enter attributes (as in the UCI codes):")
row = {}
for f in features:
    choices = codes_by_feature.get(f)
    if choices:
        # Offer only codes the model knows. The leading blank entry forces an
        # explicit choice instead of silently predicting on a default value.
        row[f] = st.selectbox(f, [""] + choices)
    else:
        # Fallback when the encoder's vocabulary is unavailable.
        row[f] = st.text_input(f, value="").strip()

if st.button("Predict"):
    unanswered = [f for f in features if not row[f]]
    if unanswered:
        # An empty field would be encoded as "unknown" and still yield a verdict,
        # which is unsafe for an edibility call; refuse to predict instead.
        st.error("Please provide a value for: " + ", ".join(unanswered))
    else:
        X = pd.DataFrame([row])[features].astype(str)
        yhat = pipe.predict(X)[0]
        proba = pipe.predict_proba(X)[0] if hasattr(pipe, "predict_proba") else None
        label = "poisonous" if yhat==1 else "edible"
        st.subheader(f"Prediction: **{label.upper()}**")
        if proba is not None:
            # Column 0 is class 0 (edible), column 1 is class 1 (poisonous).
            st.write(f"Probabilities → edible: {proba[0]:.3f}, poisonous: {proba[1]:.3f}")

st.markdown("""
**Model card (summary)**  
- Data: UCI Mushroom (all categorical; two binary as per variables table; `stalk-root` had '?' missing).  
- Algorithm: pipeline chosen by 10-fold CV among KNN / DecisionTree / CategoricalNB.  
- Caveat: Foraging safety requires human expert verification.
""")



DeltaGenerator()