In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import kagglehub
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso

In [3]:
# Download the mushroom dataset via kagglehub; the returned location is used
# below as the directory that contains mushrooms.csv.
path = kagglehub.dataset_download("uciml/mushroom-classification")
csv_file = f"{path}/mushrooms.csv"
data = pd.read_csv(csv_file)

# Quick look at the first rows to confirm the file was parsed as expected.
preview = data.head()
print(preview)

  class cap-shape cap-surface  ... spore-print-color population habitat
0     p         x           s  ...                 k          s       u
1     e         x           s  ...                 n          n       g
2     e         b           s  ...                 n          n       m
3     p         x           y  ...                 k          s       u
4     e         x           s  ...                 n          a       g

[5 rows x 23 columns]


In [8]:
# This is a supervised learning problem, so first identify the target column.
print(data.columns)

# "class" is the target: it marks each mushroom as edible (e) or poisonous (p).
# Everything else is a feature.
TARGET = "class"
y = data[TARGET]
X = data.drop(columns=[TARGET])
print(X.head())

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')
  cap-shape cap-surface cap-color  ... spore-print-color population habitat
0         x           s         n  ...                 k          s       u
1         x           s         y  ...                 n          n       g
2         b           s         w  ...                 n          n       m
3         x           y         w  ...                 k          s       u
4         x           s         g  ...                 n          a       g

[5 rows x 22 columns]


In [9]:
# Every column holds single-letter categories, so turn them into integer codes.
#
# Each feature gets its OWN LabelEncoder, stored in `feature_encoders`, so the
# letter <-> integer mapping of any column can be inverted later
# (feature_encoders[col].inverse_transform) or re-applied to new rows.
# Re-fitting a single shared encoder would keep only the last mapping it saw.
#
# NOTE: integer codes impose an arbitrary order on nominal categories. Tree-based
# models are largely insensitive to that, but linear models and neural networks
# usually do better with one-hot encoding.
feature_encoders = {}
for col in X.columns:
    feature_encoders[col] = LabelEncoder()
    X[col] = feature_encoders[col].fit_transform(X[col])

# `le` is the encoder for the target only; le.classes_ gives the original labels.
le = LabelEncoder()
y = le.fit_transform(y)
print(X.head())

   cap-shape  cap-surface  cap-color  ...  spore-print-color  population  habitat
0          5            2          4  ...                  2           3        5
1          5            2          9  ...                  3           2        1
2          0            2          8  ...                  3           2        3
3          5            3          8  ...                  2           3        5
4          5            2          3  ...                  3           0        1

[5 rows x 22 columns]


In [10]:
# The models compared below are logistic regression, a decision tree, XGBoost
# and finally a small neural network. They all share the same split.
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Standardize the features with statistics learned from the training rows only,
# then apply that same transformation to both the training and the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
# --- Logistic Regression ---
# Model: p = sigmoid(w * x + b)
#   w are the weights, x the input features and b the bias term.
# The sigmoid function is 1 / (1 + exp(-z)), where z = w * x + b is the linear
# combination of weights and features. Its output lies between 0 and 1 and is
# interpreted as the probability of class 1.
# For binary classification we pick a threshold (e.g. 0.5): if the output is
# greater than or equal to the threshold the instance is labelled class 1,
# otherwise class 0.
#
# Training uses maximum likelihood estimation: find the weights and bias that
# maximize the likelihood of the observed labels. Equivalently, minimize the
# log-loss (binary cross-entropy), which measures the gap between the predicted
# probabilities and the actual labels:
#   LogLoss = - (1/N) * Σ [y * log(p) + (1 - y) * log(1 - p)]
# where N is the number of instances, y is the actual class label (0 or 1) and
# p is the predicted probability of class 1. The penalty grows on a logarithmic
# scale the further p is from y.
# The loss is minimized with an iterative optimizer (gradient descent or a
# similar method) that keeps updating the weights and bias until convergence.
# Once trained, the model predicts new instances by applying the sigmoid to the
# linear combination and comparing the result with the threshold.


def print_evaluation(model_name, y_true, y_pred, prefix=""):
    """Print accuracy, classification report and confusion matrix for one model.

    model_name: label used on the accuracy line.
    y_true / y_pred: true and predicted class labels.
    prefix: text put in front of every heading (e.g. "Balanced ").
    """
    print(f"{prefix}{model_name} Accuracy:", accuracy_score(y_true, y_pred))
    print(f"{prefix}Classification Report:\n", classification_report(y_true, y_pred))
    print(f"{prefix}Confusion Matrix:\n", confusion_matrix(y_true, y_pred))


log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
print_evaluation("Logistic Regression", y_test, y_pred_log_reg)

# Accuracy is about 95%.
# How to read the confusion matrix: rows are the true class and columns the
# predicted class, in label order. The top-left 799 is therefore the number of
# class-0 mushrooms (edible: LabelEncoder sorts 'e' before 'p') that were
# correctly predicted as class 0. It is large simply because the test set
# contains 843 class-0 rows.
# The classes are close to balanced (843 vs 782 test rows), so class imbalance
# is NOT what limits this model. A more likely limitation is that a linear
# model is being fed arbitrary integer codes for nominal categories.
#
# For reference, when a dataset IS imbalanced the usual remedies are:
#   - resampling (oversample the minority class or undersample the majority),
#   - judging the model by precision, recall and F1-score instead of accuracy,
#   - algorithms or options designed for imbalance, such as class_weight.
# class_weight='balanced' gives more weight to the less frequent class. It is
# tried below for comparison; with nearly balanced classes expect little change.
log_reg_balanced = LogisticRegression(class_weight='balanced')
log_reg_balanced.fit(X_train, y_train)
y_pred_log_reg_balanced = log_reg_balanced.predict(X_test)
print_evaluation("Logistic Regression", y_test, y_pred_log_reg_balanced, prefix="Balanced ")

# In the recorded run the balanced variant is only marginally better
# (0.955 vs 0.952 accuracy), as expected for nearly balanced classes.



Logistic Regression Accuracy: 0.952
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.95      0.95       843
           1       0.94      0.96      0.95       782

    accuracy                           0.95      1625
   macro avg       0.95      0.95      0.95      1625
weighted avg       0.95      0.95      0.95      1625

Confusion Matrix:
 [[799  44]
 [ 34 748]]
Balanced Logistic Regression Accuracy: 0.955076923076923
Balanced Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.95      0.96       843
           1       0.94      0.96      0.95       782

    accuracy                           0.96      1625
   macro avg       0.95      0.96      0.96      1625
weighted avg       0.96      0.96      0.96      1625

Balanced Confusion Matrix:
 [[799  44]
 [ 29 753]]


In [15]:
# Now, let's move onto Decision Tree which is a little bit harder.
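# If you have met tree structures in a data structures and algorithms (DSA) course, the idea is the same:
# a tree is made of a root node, internal nodes and leaves.
# Each internal node asks a question about one feature, and the leaf a sample ends up in decides the predicted outcome.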
# The decision tree algorithm works by recursively splitting the data into subsets based on the feature that provides the best separation between classes.
# The splitting is done using a criterion such as Gini impurity or information gain.
# So, what is information gain and Gini impurity?
# Information gain measures the reduction in entropy (uncertainty) after a dataset is split on a particular feature.
# Entropy is a measure of the randomness or impurity in the dataset.
# The formula for entropy is:
# Entropy(S) = - Σ (p(x) * log2(p(x))), where p(x) is the proportion of instances in class x.
# For instance, we have a dataset with 10 instances, 6 of which belong to class A and 4 belong to class B.
# The entropy of this dataset would be:
# Entropy(S) = - [(6/10) * log2(6/10) + (4/10) * log2(4/10)] = 0.97095
# And, from that, we know that impurity of the dataset is 0.97095.
# Now, when we split the dataset on a feature, we can calculate the entropy of each subset and the weighted average entropy of the subsets.
# The information gain is then calculated as the difference between the entropy of the original dataset and the weighted average entropy of the subsets.
# Say that we split the dataset on a feature that results in two subsets: one with 4 instances of class A and 1 instance of class B, and another with 2 instances of class A and 3 instances of class B.
# The entropy of the first subset would be:
# Entropy(S1) = - [(4/5) * log2(4/5) + (1/5) * log2(1/5)] = 0.72193
# The entropy of the second subset would be:
# Entropy(S2) = - [(2/5) * log2(2/5) + (3/5) * log2(3/5)] = 0.97095
# The weighted average entropy of the subsets would be:
# Weighted Entropy = (5/10) * Entropy(S1) + (5/10) * Entropy(S2) = 0.84644
# The information gain from splitting on this feature would be:
# Information Gain = Entropy(S) - Weighted Entropy = 0.97095 - 0.84644 = 0.12451
# Gini impurity is another measure of impurity used in decision trees.
# It measures the probability of incorrectly classifying a randomly chosen instance if it were randomly labeled according
# to the distribution of class labels in the dataset.
# The formula for Gini impurity is:
# Gini(S) = 1 - Σ (p(x)^2), where p(x) is the proportion of instances in class x.
# For the same dataset with 6 instances of class A and 4 instances of class B, the Gini impurity would be:
#  Gini(S) = 1 - [(6/10)^2 + (4/10)^2] = 0.48
# When we split the dataset on a feature, we can calculate the Gini impurity of each subset and the weighted average Gini impurity of the subsets.
# The quality of the split is then judged by that weighted average: the lower it is compared with the Gini impurity of the original dataset, the better the split.
# Using the same subsets as before, the Gini impurity of the first subset would be:
# Gini(S1) = 1 - [(4/5)^2 + (1/5)^2] = 0.32
# The Gini impurity of the second subset would be:
# Gini(S2) = 1 - [(2/5)^2 + (3/5)^2] = 0.48
# The weighted average Gini impurity of the subsets would be:
# Weighted Gini = (5/10) * Gini(S1) + (5/10) * Gini(S2) = 0.4
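# So this split lowers the Gini impurity from 0.48 to 0.4, a reduction of 0.08.
# The goal of the decision tree algorithm is to find, at each node of the tree, the feature that provides the highest information gain or, when using Gini, the lowest weighted Gini impurity after the split.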


In [19]:
# --- Decision Tree ---
# According to the original notes, an unconstrained tree scored 100% here.
# Decision trees can easily overfit when they are allowed to grow deep without
# constraints: the model learns the noise and details of the training data to
# the point where it hurts performance on new, unseen data.
# Ways to limit that: pruning the tree, setting a maximum depth, or requiring a
# minimum number of samples per leaf node.
# Here the depth is capped at 5, and the tree still reaches roughly 97-98%.
# random_state is fixed so that re-running the cell gives the same tree.
dt_clf = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_clf.fit(X_train, y_train)
y_pred_dt = dt_clf.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Decision Tree Classification Report:\n", classification_report(y_test, y_pred_dt))
print("Decision Tree Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))

# --- XGBoost ---
# XGBoost uses decision trees as its base learners.
# Each tree is built sequentially, with each new tree attempting to correct the
# errors made by the previous trees.
# The final prediction combines the predictions of all the trees, typically
# through a weighted sum.
# XGBoost also includes regularization to prevent overfitting, such as
# L1 (Lasso) and L2 (Ridge) regularization, and it is engineered for speed.
# `use_label_encoder` is not passed: the recorded run reported it as an unused
# parameter, and y is already integer-encoded.
xgb_clf = XGBClassifier(eval_metric='logloss')
xgb_clf.fit(X_train, y_train)
y_pred_xgb = xgb_clf.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))
print("XGBoost Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))

# The accuracy is 100% again.
# XGBoost is a powerful algorithm that achieves high accuracy on many datasets,
# especially when the data is well-preprocessed and the model is properly tuned.
# A perfect score on a real-world dataset is rare, so it deserves a second look
# (for example for data leakage). Note, however, that this score is measured on
# the held-out test set. Overfitting means learning the training data too well,
# including its noise, which leads to poor generalization to new, unseen data;
# it shows up as a gap between training and test performance, not as a high
# test score by itself.
# To guard against overfitting, use techniques such as cross-validation,
# regularization, and early stopping during training.
# Evaluating with metrics beyond accuracy, such as precision, recall and
# F1-score, gives a more comprehensive picture of model performance,
# especially in cases of imbalanced datasets.
# Finally, let's implement a simple Neural Network using Keras

Decision Tree Accuracy: 0.9766153846153847
Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98       843
           1       0.96      0.99      0.98       782

    accuracy                           0.98      1625
   macro avg       0.98      0.98      0.98      1625
weighted avg       0.98      0.98      0.98      1625

Decision Tree Confusion Matrix:
 [[814  29]
 [  9 773]]
XGBoost Accuracy: 1.0
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       843
           1       1.00      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

XGBoost Confusion Matrix:
 [[843   0]
 [  0 782]]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [20]:
# Moving onto Neural Networks
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Architecture: an input layer, two hidden layers and an output layer.
# The input is declared with an explicit keras.Input object rather than an
# `input_shape` argument on the first Dense layer; this is presumably what
# triggered the warning in the recorded run.
# The input width equals the number of features in the dataset.
model = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Why 64 and 32 units?
# The number of units in each layer is a hyperparameter that can be tuned.
# 64 and 32 are somewhat arbitrary choices and can be adjusted based on the
# complexity of the problem and the size of the dataset.
# More units let the model learn more complex patterns, but also increase the
# risk of overfitting.
# A common approach is to start with a small number of units and gradually
# increase it while monitoring performance on a validation set.
#
# Why ReLU?
# ReLU (Rectified Linear Unit) is commonly used in hidden layers.
# The formula is f(x) = max(0, x): negative inputs become 0 and positive inputs
# pass through unchanged.
# It introduces non-linearity into the model, allowing it to learn complex
# patterns in the data.
# ReLU is computationally efficient and helps mitigate the vanishing gradient
# problem, which can occur with other activations like sigmoid or tanh.
# The choice is largely empirical: it has been found to work well in many
# scenarios. Alternatives such as Leaky ReLU, ELU or SELU can be considered
# based on the specific requirements of the problem.
#
# How this maps onto the mushroom problem:
# This is a binary classification task: predict whether a mushroom is edible or
# poisonous from its features. The features are categorical variables
# describing characteristics such as cap shape, cap color, gill size and habitat.
# The first hidden layer has 64 neurons and the second has 32, both with ReLU.
# The output layer has a single neuron with a sigmoid activation, because a
# binary classifier needs one probability value between 0 and 1.

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# 20% of the training rows are held back by Keras as a validation set.
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# model.predict returns one probability per row with shape (n, 1); threshold at
# 0.5 and flatten to a 1-D label vector like the other models' predictions.
y_pred_nn = (model.predict(X_test) > 0.5).astype("int32").ravel()
print("Neural Network Accuracy:", accuracy_score(y_test, y_pred_nn))
print("Neural Network Classification Report:\n", classification_report(y_test, y_pred_nn))
print("Neural Network Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nn))


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9134 - loss: 0.2447 - val_accuracy: 0.9785 - val_loss: 0.0895
Epoch 2/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9894 - loss: 0.0497 - val_accuracy: 0.9954 - val_loss: 0.0235
Epoch 3/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9975 - loss: 0.0152 - val_accuracy: 0.9985 - val_loss: 0.0099
Epoch 4/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9988 - loss: 0.0072 - val_accuracy: 1.0000 - val_loss: 0.0046
Epoch 5/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9998 - loss: 0.0035 - val_accuracy: 1.0000 - val_loss: 0.0028
Epoch 6/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9998 - loss: 0.0024 - val_accuracy: 1.0000 - val_loss: 0.0016
Epoch 7/50
[1m163/163[0m [32m━━━━━━━

In [None]:
# CRAZY! The accuracy is 100% again!
# Well, neural networks are powerful models that can learn complex patterns in data, especially when the data is well-preprocessed and the model is properly tuned.
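# However, achieving 100% accuracy on a real-world dataset is quite rare, so it is worth double-checking
# (for example for data leakage). Overfitting occurs when the model learns the training data too well, including its noise,
# which can lead to poor generalization to new, unseen data. Here the score is measured on the held-out test set, and XGBoost reached the same result on it, so the features most likely separate the two classes cleanly.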
# To mitigate overfitting, it's important to use techniques such as cross-validation,
# regularization, and early stopping during training.
# But, hey, at least we got a good model!