<a href="https://colab.research.google.com/github/zjkaminska/NWB-mushrooms/blob/main/copy%20NWB_mushroom_edibility_prediction_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [70]:
import warnings
# Install a blanket "ignore" filter: no category is given, so every warning
# raised after this point is silenced for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below — consider narrowing the filter to the specific category being silenced.
warnings.simplefilter(action="ignore")

In [60]:
# Inspect the frame as it currently stands.
# info() writes its report itself and returns None, so it is not wrapped in print().
df.info()
print(df.isnull().sum())

# The data arrives as a single column whose name is the semicolon-joined header
# and whose values are semicolon-joined records.
# NOTE(review): presumably the file was read with the default separator; passing
# sep=';' in the (not visible) loading cell would make this reshaping unnecessary.
#
# Split only while the frame is still in that single-column form. Without the
# guard, re-running this cell fails with a KeyError because `df` has already been
# replaced by the split frame.
if df.shape[1] == 1:
    raw_column = df.columns[0]
    df = df[raw_column].str.split(';', expand=True)
    # The header string carries the real column names ('class', 'cap-diameter', ..., 'season').
    df.columns = raw_column.split(';')

# Convert the measurement columns to numeric, coercing unparsable values to NaN
numeric_cols = ['cap-diameter', 'stem-height', 'stem-width']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Inspect missing values again after splitting and type conversion.
# Fields that are empty in the raw record come out of the split as '' rather than
# NaN, so isnull()/dropna() do not count them: they stay in the data and become
# their own category in the one-hot encoding below.
print(df.isnull().sum())

# Drop rows that contain NaN (in practice: rows whose numeric fields failed to parse).
# Re-binding instead of inplace=True keeps the step explicit and chain-friendly.
df = df.dropna()

# Categorical columns are the remaining object-dtype columns ...
categorical_cols = df.select_dtypes(include='object').columns.tolist()
# ... minus the target, which must not be one-hot encoded
categorical_cols.remove('class')

# One-hot encode the categorical features; drop_first removes one level per
# feature, so that level is represented by all of the feature's indicators being 0.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

display(df_encoded.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61069 entries, 0 to 61068
Data columns (total 1 columns):
 #   Column                                                                                                                                                                                                                                             Non-Null Count  Dtype 
---  ------                                                                                                                                                                                                                                             --------------  ----- 
 0   class;cap-diameter;cap-shape;cap-surface;cap-color;does-bruise-or-bleed;gill-attachment;gill-spacing;gill-color;stem-height;stem-width;stem-root;stem-surface;stem-color;veil-type;veil-color;has-ring;ring-type;spore-print-color;habitat;season  61069 non-null  object
dtypes: object(1)
memory usage: 477.2+ KB
None
class;cap-diameter;cap-shape;cap

Unnamed: 0,class,cap-diameter,stem-height,stem-width,cap-shape_c,cap-shape_f,cap-shape_o,cap-shape_p,cap-shape_s,cap-shape_x,...,habitat_g,habitat_h,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w,season_s,season_u,season_w
0,p,15.26,16.95,17.09,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,True
1,p,16.6,17.99,18.19,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
2,p,14.07,17.8,17.74,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,True
3,p,14.17,15.77,15.98,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,p,14.64,16.53,17.2,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,True


In [61]:
from sklearn.model_selection import train_test_split

# The label is the 'class' column; the features are every other encoded column.
y = df_encoded['class']
X = df_encoded.drop(columns=['class'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each partition (features, then labels).
for split_label, split_features, split_labels in (
    ("Training set shape:", X_train, y_train),
    ("Testing set shape:", X_test, y_test),
):
    print(split_label, split_features.shape, split_labels.shape)

Training set shape: (48855, 111) (48855,)
Testing set shape: (12214, 111) (12214,)


In [62]:
from sklearn.ensemble import RandomForestClassifier

# Default hyper-parameters; only the seed is pinned.
# fit() hands back the estimator, so construction and training are chained.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Echo the fitted estimator as the cell's output
model

In [63]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predicted labels for the held-out rows
y_pred = model.predict(X_test)

# Accuracy needs no positive class; the other three scores are computed with
# 'p' (assumed to mean poisonous) as the positive label.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, pos_label='p')
    for metric in (precision_score, recall_score, f1_score)
)

# Report each score to four decimal places
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-score: 1.0000


In [64]:
import ipywidgets as widgets
from IPython.display import display

# Every object-dtype column except the target counts as a categorical feature.
categorical_features = list(df.select_dtypes(include='object').columns)
categorical_features.remove('class')

# Per feature: each distinct value -> an integer code, numbered in the order
# the values come back from unique().
category_mapping = {
    col: {value: code for code, value in enumerate(df[col].unique().tolist())}
    for col in categorical_features
}

print("Categorical Feature Mapping:")
for col in category_mapping:
    print(f"{col}: {category_mapping[col]}")

Categorical Feature Mapping:
cap-shape: {'x': 0, 'f': 1, 'p': 2, 'b': 3, 'c': 4, 's': 5, 'o': 6}
cap-surface: {'g': 0, 'h': 1, '': 2, 't': 3, 'y': 4, 'e': 5, 's': 6, 'l': 7, 'd': 8, 'w': 9, 'i': 10, 'k': 11}
cap-color: {'o': 0, 'e': 1, 'n': 2, 'g': 3, 'r': 4, 'w': 5, 'y': 6, 'p': 7, 'u': 8, 'b': 9, 'l': 10, 'k': 11}
does-bruise-or-bleed: {'f': 0, 't': 1}
gill-attachment: {'e': 0, '': 1, 'a': 2, 'd': 3, 's': 4, 'x': 5, 'p': 6, 'f': 7}
gill-spacing: {'': 0, 'c': 1, 'd': 2, 'f': 3}
gill-color: {'w': 0, 'n': 1, 'p': 2, 'u': 3, 'b': 4, 'g': 5, 'y': 6, 'r': 7, 'e': 8, 'o': 9, 'k': 10, 'f': 11}
stem-root: {'s': 0, '': 1, 'b': 2, 'r': 3, 'c': 4, 'f': 5}
stem-surface: {'y': 0, '': 1, 's': 2, 'k': 3, 'i': 4, 'h': 5, 't': 6, 'g': 7, 'f': 8}
stem-color: {'w': 0, 'y': 1, 'n': 2, 'u': 3, 'b': 4, 'l': 5, 'r': 6, 'p': 7, 'e': 8, 'k': 9, 'g': 10, 'o': 11, 'f': 12}
veil-type: {'u': 0, '': 1}
veil-color: {'w': 0, 'y': 1, '': 2, 'n': 3, 'e': 4, 'u': 5, 'k': 6}
has-ring: {'t': 0, 'f': 1}
ring-type: {'g': 0

In [65]:
# Human-readable label for every single-letter code, keyed by feature name.
# The empty string stands for a field that was blank in the raw record.
descriptive_mapping = {
    'cap-shape': {'b': 'bell', 'c': 'conical', 'x': 'convex', 'f': 'flat', 's': 'sunken', 'p': 'spherical', 'o': 'others'},
    'cap-surface': {'i': 'fibrous', 'g': 'grooves', 'y': 'scaly', 's': 'smooth', 'h': 'shiny', 'l': 'leathery', 'k': 'silky', 't': 'sticky', 'w': 'wrinkled', 'e': 'fleshy', '': 'unknown'},
    'cap-color': {'n': 'brown', 'b': 'buff', 'g': 'gray', 'r': 'green', 'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow', 'l': 'blue', 'o': 'orange', 'k': 'black'},
    'does-bruise-or-bleed': {'t': 'yes', 'f': 'no'},
    'gill-attachment': {'a': 'adnate', 'x': 'adnexed', 'd': 'decurrent', 'e': 'free', 's': 'sinuate', 'p': 'pores', 'f': 'none', '': 'unknown'},
    'gill-spacing': {'c': 'close', 'd': 'distant', 'f': 'none', '': 'unknown'},
    'gill-color': {'n': 'brown', 'b': 'buff', 'g': 'gray', 'r': 'green', 'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow', 'l': 'blue', 'o': 'orange', 'k': 'black', 'f': 'none', '': 'unknown'},
    'stem-root': {'b': 'bulbous', 's': 'swollen', 'c': 'club', 'u': 'cup', 'e': 'equal', 'z': 'rhizomorphs', 'r': 'rooted', 'f': 'none', '': 'unknown'},
    'stem-surface': {'i': 'fibrous', 'g': 'grooves', 'y': 'scaly', 's': 'smooth', 'h': 'shiny', 'l': 'leathery', 'k': 'silky', 't': 'sticky', 'w': 'wrinkled', 'e': 'fleshy', 'f': 'none', '': 'unknown'},
    'stem-color': {'n': 'brown', 'b': 'buff', 'g': 'gray', 'r': 'green', 'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow', 'l': 'blue', 'o': 'orange', 'k': 'black', 'f': 'none', '': 'unknown'},
    'veil-type': {'p': 'partial', 'u': 'universal', '': 'unknown'},
    'veil-color': {'n': 'brown', 'b': 'buff', 'g': 'gray', 'r': 'green', 'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow', 'l': 'blue', 'o': 'orange', 'k': 'black', '': 'unknown'},
    'has-ring': {'t': 'ring', 'f': 'none'},
    'ring-type': {'c': 'cobwebby', 'e': 'evanescent', 'r': 'flaring', 'g': 'grooved', 'l': 'large', 'p': 'pendant', 's': 'sheathing', 'z': 'zone', 'y': 'scaly', 'm': 'movable', 'f': 'none', '': 'unknown'},
    'spore-print-color': {'n': 'brown', 'b': 'buff', 'g': 'gray', 'r': 'green', 'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow', 'l': 'blue', 'o': 'orange', 'k': 'black', '': 'unknown'},
    'habitat': {'g': 'grasses', 'l': 'leaves', 'm': 'meadows', 'p': 'paths', 'h': 'heaths', 'u': 'urban', 'w': 'waste', 'd': 'woods'},
    'season': {'s': 'spring', 'u': 'summer', 'a': 'autumn', 'w': 'winter'}
}

# Numeric measurements get no dropdown.
# NOTE(review): categorical_features is built from object-dtype columns only, so
# these three names should never be in it and this filter removes nothing.
excluded_features = ['cap-diameter', 'stem-height', 'stem-width']
input_categorical_features = [
    name for name in categorical_features if name not in excluded_features
]

# One dropdown per categorical feature, keyed by the feature's column name.
feature_widgets = {}
for col in input_categorical_features:
    # Codes without a descriptive label are shown as the raw code itself.
    labels = descriptive_mapping.get(col, {})

    # 'unsure' always comes first, then one (label, code) pair per value seen in the data.
    options = [('unsure', 'unsure')]
    options.extend(
        (labels.get(value, value), value) for value in df[col].unique().tolist()
    )

    feature_widgets[col] = widgets.Dropdown(description=f'{col.replace("-", " ").title()}:', options=options)

# Button that triggers a prediction
predict_button = widgets.Button(description='Predict Edibility')

# Area the prediction text is written into
output_widget = widgets.Output()

In [66]:
# Stack the dropdowns in feature order, then the button, then the output area.
input_widgets_list = list(map(feature_widgets.__getitem__, input_categorical_features))
ui = widgets.VBox([*input_widgets_list, predict_button, output_widget])

# Render the assembled form
display(ui)

VBox(children=(Dropdown(description='Cap Shape:', options=(('unsure', 'unsure'), ('convex', 'x'), ('flat', 'f'…

In [67]:
import numpy as np

def _encode_selection(selections, training_columns):
    """Build a one-row feature frame, in the training column layout, from dropdown values.

    Args:
        selections: mapping of feature name -> selected code (or 'unsure').
        training_columns: the columns the model was fitted on, in order.

    Returns:
        A single-row DataFrame with exactly `training_columns`, holding 1 in the
        indicator column of every selected code and 0 everywhere else.
    """
    encoded = pd.DataFrame(0, index=[0], columns=training_columns)
    for feature, value in selections.items():
        if value == 'unsure':
            # No information for this feature: leave all of its indicators at 0.
            continue
        # Same naming scheme pd.get_dummies used for the training data.
        indicator = f"{feature}_{value}"
        # A code without a training column is the level removed by drop_first
        # during training (or an unseen code); all-zero indicators represent it.
        if indicator in encoded.columns:
            encoded.loc[0, indicator] = 1
    return encoded


def on_predict_button_clicked(b):
    """Predict edibility from the current dropdown selections and print the result.

    The selections are turned into a row with the training column layout, passed
    to `model`, and the predicted class plus its probability are printed into
    `output_widget`. Any exception is caught and reported there as text.

    The previous implementation ran pd.get_dummies(..., drop_first=True) on the
    one-row input. With a single row each column has exactly one level, and
    drop_first removes it, so every indicator was lost and the model always
    received an all-zero row whatever the user picked. The row is now built
    directly against the training columns.

    Caveats:
        - Columns with no dropdown (the numeric measurements) stay at 0.
        - 'unsure' is encoded as all zeros, which is the same encoding as the
          level dropped by drop_first at training time.

    Args:
        b: the clicked button, as passed by the widget framework; unused.
    """
    with output_widget:
        output_widget.clear_output()
        try:
            # Current value of every dropdown, keyed by feature name
            selections = {col: widget.value for col, widget in feature_widgets.items()}

            # One row, same columns and order as the training data
            input_df_encoded = _encode_selection(selections, X_train.columns)

            # Make prediction and get probability estimates
            prediction = model.predict(input_df_encoded)
            prediction_proba = model.predict_proba(input_df_encoded)

            # Highest class probability for the single input row, as a percentage
            predicted_class_proba = np.max(prediction_proba) * 100

            # Display the prediction with certainty
            predicted_class = 'Poisonous' if prediction[0] == 'p' else 'Edible'
            print(f"Prediction: {predicted_class} ({predicted_class_proba:.2f}% certainty)")

        except Exception as e:
            # Shown in the output area instead of raising, so the form stays usable.
            print(f"Error during prediction: {e}")

# Register the prediction function as the button's click callback, so a click
# on predict_button invokes on_predict_button_clicked with the button as argument.
predict_button.on_click(on_predict_button_clicked)

In [69]:
# Map each categorical value to an integer code.
# NOTE(review): this repeats the mapping cell earlier in the notebook; only the
# guarded removal of the target column differs.
# Built from the split, type-converted frame so that the numeric columns are
# no longer object-dtype and are therefore not picked up here.
categorical_features = df.select_dtypes(include='object').columns.tolist()
# The target is not a feature; drop it when present
if 'class' in categorical_features:
    categorical_features.remove('class')

category_mapping = {}
for col in categorical_features:
    distinct_values = df[col].unique().tolist()
    # Codes are assigned in the order the distinct values are listed
    category_mapping[col] = dict(zip(distinct_values, range(len(distinct_values))))

print("Categorical Feature Mapping:")
for col, mapping in category_mapping.items():
    print(f"{col}: {mapping}")

Categorical Feature Mapping:
cap-shape: {'x': 0, 'f': 1, 'p': 2, 'b': 3, 'c': 4, 's': 5, 'o': 6}
cap-surface: {'g': 0, 'h': 1, '': 2, 't': 3, 'y': 4, 'e': 5, 's': 6, 'l': 7, 'd': 8, 'w': 9, 'i': 10, 'k': 11}
cap-color: {'o': 0, 'e': 1, 'n': 2, 'g': 3, 'r': 4, 'w': 5, 'y': 6, 'p': 7, 'u': 8, 'b': 9, 'l': 10, 'k': 11}
does-bruise-or-bleed: {'f': 0, 't': 1}
gill-attachment: {'e': 0, '': 1, 'a': 2, 'd': 3, 's': 4, 'x': 5, 'p': 6, 'f': 7}
gill-spacing: {'': 0, 'c': 1, 'd': 2, 'f': 3}
gill-color: {'w': 0, 'n': 1, 'p': 2, 'u': 3, 'b': 4, 'g': 5, 'y': 6, 'r': 7, 'e': 8, 'o': 9, 'k': 10, 'f': 11}
stem-root: {'s': 0, '': 1, 'b': 2, 'r': 3, 'c': 4, 'f': 5}
stem-surface: {'y': 0, '': 1, 's': 2, 'k': 3, 'i': 4, 'h': 5, 't': 6, 'g': 7, 'f': 8}
stem-color: {'w': 0, 'y': 1, 'n': 2, 'u': 3, 'b': 4, 'l': 5, 'r': 6, 'p': 7, 'e': 8, 'k': 9, 'g': 10, 'o': 11, 'f': 12}
veil-type: {'u': 0, '': 1}
veil-color: {'w': 0, 'y': 1, '': 2, 'n': 3, 'e': 4, 'u': 5, 'k': 6}
has-ring: {'t': 0, 'f': 1}
ring-type: {'g': 0