# **Mushroom Classification** - using One-Hot Encoding

### 1. Import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

### 2. Load dataset and handle missing data

In [None]:
# Location of the raw data file; it is read without a header row below
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'

# Load the dataset into a DataFrame; header=None makes pandas label the columns 0, 1, 2, ...
df = pd.read_csv(url, header=None)

# '?' is the placeholder used for missing entries: turn it into NaN so pandas
# treats it as missing. Rebinding the result avoids inplace mutation.
df = df.replace('?', np.nan)

# Fill missing values in column 11 with its mode (most frequent value).
# The result is assigned back to the frame explicitly: calling
# fillna(inplace=True) on the intermediate object returned by df.iloc[:, 11]
# only modifies a temporary under pandas Copy-on-Write (the default from
# pandas 3.0), which would leave df unchanged.
# NOTE(review): only column 11 is imputed here - confirm no other column contains '?'.
df[11] = df[11].fillna(df[11].mode()[0])

# Sanity check: the imputed column must not contain missing values any more
assert not df[11].isna().any(), "column 11 still contains missing values"

# Print the first few rows of the dataset
print("First few rows of the dataset after handling missing data:")
print(df.head())

First few rows of the dataset after handling missing data:
  0  1  2  3  4  5  6  7  8  9   ... 13 14 15 16 17 18 19 20 21 22
0  p  x  s  n  t  p  f  c  n  k  ...  s  w  w  p  w  o  p  k  s  u
1  e  x  s  y  t  a  f  c  b  k  ...  s  w  w  p  w  o  p  n  n  g
2  e  b  s  w  t  l  f  c  b  n  ...  s  w  w  p  w  o  p  n  n  m
3  p  x  y  w  t  p  f  c  n  n  ...  s  w  w  p  w  o  p  k  s  u
4  e  x  s  g  f  n  f  w  b  k  ...  s  w  w  p  w  o  e  n  a  g

[5 rows x 23 columns]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.iloc[:, 11].fillna(df.iloc[:, 11].mode()[0], inplace=True)


### 3. Encode categorical features

In [None]:
# Accuracy bookkeeping, keyed by model name; the model sections fill these in
train_accuracies = {}
test_accuracies = {}

# Expand every column (the target in column 0 included) into indicator columns,
# dropping the first level of each column.
df_encoded = pd.get_dummies(df, columns=df.columns, drop_first=True)

# Show a preview of the encoded frame
print("\nFirst few rows after one-hot encoding:", df_encoded.head(), sep="\n")

# Target: the original, un-encoded first column of the raw frame
y = df.iloc[:, 0]

# Features: everything after the first encoded column, which is the
# indicator derived from the target column.
X = df_encoded.iloc[:, 1:]


First few rows after one-hot encoding:
     0_p    1_c    1_f    1_k    1_s    1_x    2_g    2_s    2_y    3_c  ...  \
0   True  False  False  False  False   True  False   True  False  False  ...   
1  False  False  False  False  False   True  False   True  False  False  ...   
2  False  False  False  False  False  False  False   True  False  False  ...   
3   True  False  False  False  False   True  False  False   True  False  ...   
4  False  False  False  False  False   True  False   True  False  False  ...   

    21_n   21_s   21_v   21_y   22_g   22_l   22_m   22_p   22_u   22_w  
0  False   True  False  False  False  False  False  False   True  False  
1   True  False  False  False   True  False  False  False  False  False  
2   True  False  False  False  False  False   True  False  False  False  
3  False   True  False  False  False  False  False  False   True  False  
4  False  False  False  False   True  False  False  False  False  False  

[5 rows x 95 columns]


### 4. Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the samples for evaluating Logistic Regression
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training split only, then apply it to both splits
scaler_lr = StandardScaler()
scaler_lr.fit(X_train_lr)
X_train_lr_scaled = scaler_lr.transform(X_train_lr)
X_test_lr_scaled = scaler_lr.transform(X_test_lr)

# PCA keeping enough components to retain 95% of the variance;
# the projection is fitted on the training split and reused for the test split
pca_lr = PCA(n_components=0.95)
X_train_lr_pca = pca_lr.fit_transform(X_train_lr_scaled)
X_test_lr_pca = pca_lr.transform(X_test_lr_scaled)

# Fit the classifier on the reduced training data and predict both splits
logistic_reg = LogisticRegression(random_state=42)
logistic_reg.fit(X_train_lr_pca, y_train_lr)
y_pred_lr = logistic_reg.predict(X_test_lr_pca)
y_train_pred_lr = logistic_reg.predict(X_train_lr_pca)

# Record train/test accuracy for the summary section
train_accuracies['Logistic Regression'] = accuracy_score(y_train_lr, y_train_pred_lr)
test_accuracies['Logistic Regression'] = accuracy_score(y_test_lr, y_pred_lr)

# Per-class metrics and confusion matrix on the held-out split
report_lr = classification_report(y_test_lr, y_pred_lr)
cm_lr = confusion_matrix(y_test_lr, y_pred_lr)
print("\nLogistic Regression Report:", report_lr, sep="\n")
print("Confusion Matrix:", cm_lr, sep="\n")


Logistic Regression Report:
              precision    recall  f1-score   support

           e       1.00      1.00      1.00       843
           p       1.00      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

Confusion Matrix:
[[843   0]
 [  0 782]]


### 5. Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Hold out 20% of the samples for evaluating the Decision Tree
X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=24,
)

# Fit the tree directly on the one-hot features (no scaling or PCA in this section)
dt_classifier = DecisionTreeClassifier(random_state=24)
dt_classifier.fit(X_train_dt, y_train_dt)
y_pred_dt = dt_classifier.predict(X_test_dt)
y_train_pred_dt = dt_classifier.predict(X_train_dt)

# Record train/test accuracy for the summary section
train_accuracies['Decision Tree'] = accuracy_score(y_train_dt, y_train_pred_dt)
test_accuracies['Decision Tree'] = accuracy_score(y_test_dt, y_pred_dt)

# Per-class metrics and confusion matrix on the held-out split
report_dt = classification_report(y_test_dt, y_pred_dt)
cm_dt = confusion_matrix(y_test_dt, y_pred_dt)
print("\nDecision Tree Classifier Report:", report_dt, sep="\n")
print("Confusion Matrix:", cm_dt, sep="\n")


Decision Tree Classifier Report:
              precision    recall  f1-score   support

           e       1.00      1.00      1.00       869
           p       1.00      1.00      1.00       756

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

Confusion Matrix:
[[869   0]
 [  0 756]]


### 6. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hold out 20% of the samples for evaluating the Random Forest
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=33,
)

# Fit a 100-tree forest directly on the one-hot features (no scaling or PCA in this section)
rf_classifier = RandomForestClassifier(random_state=33, n_estimators=100)
rf_classifier.fit(X_train_rf, y_train_rf)
y_pred_rf = rf_classifier.predict(X_test_rf)
y_train_pred_rf = rf_classifier.predict(X_train_rf)

# Record train/test accuracy for the summary section
train_accuracies['Random Forest'] = accuracy_score(y_train_rf, y_train_pred_rf)
test_accuracies['Random Forest'] = accuracy_score(y_test_rf, y_pred_rf)

# Per-class metrics and confusion matrix on the held-out split
report_rf = classification_report(y_test_rf, y_pred_rf)
cm_rf = confusion_matrix(y_test_rf, y_pred_rf)
print("\nRandom Forest Classifier Report:", report_rf, sep="\n")
print("Confusion Matrix:", cm_rf, sep="\n")


Random Forest Classifier Report:
              precision    recall  f1-score   support

           e       1.00      1.00      1.00       831
           p       1.00      1.00      1.00       794

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

Confusion Matrix:
[[831   0]
 [  0 794]]


### 7. SVM Classifier

In [None]:
from sklearn.svm import SVC

# Hold out 20% of the samples for evaluating the SVM
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=51,
)

# Fit the scaler on the training split only, then apply it to both splits
scaler_svm = StandardScaler()
scaler_svm.fit(X_train_svm)
X_train_svm_scaled = scaler_svm.transform(X_train_svm)
X_test_svm_scaled = scaler_svm.transform(X_test_svm)

# PCA keeping enough components to retain 95% of the variance;
# the projection is fitted on the training split and reused for the test split
pca_svm = PCA(n_components=0.95)
X_train_svm_pca = pca_svm.fit_transform(X_train_svm_scaled)
X_test_svm_pca = pca_svm.transform(X_test_svm_scaled)

# Fit the classifier on the reduced training data and predict both splits
svm_classifier = SVC(random_state=51)
svm_classifier.fit(X_train_svm_pca, y_train_svm)
y_pred_svm = svm_classifier.predict(X_test_svm_pca)
y_train_pred_svm = svm_classifier.predict(X_train_svm_pca)

# Record train/test accuracy for the summary section
train_accuracies['SVM'] = accuracy_score(y_train_svm, y_train_pred_svm)
test_accuracies['SVM'] = accuracy_score(y_test_svm, y_pred_svm)

# Per-class metrics and confusion matrix on the held-out split
report_svm = classification_report(y_test_svm, y_pred_svm)
cm_svm = confusion_matrix(y_test_svm, y_pred_svm)
print("\nSVM Classifier Report:", report_svm, sep="\n")
print("Confusion Matrix:", cm_svm, sep="\n")


SVM Classifier Report:
              precision    recall  f1-score   support

           e       1.00      1.00      1.00       855
           p       1.00      1.00      1.00       770

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

Confusion Matrix:
[[855   0]
 [  1 769]]


### 8. SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

# Hold out 20% of the samples for evaluating the SGD classifier
X_train_sgd, X_test_sgd, y_train_sgd, y_test_sgd = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=77,
)

# Fit the scaler on the training split only, then apply it to both splits
scaler_sgd = StandardScaler()
scaler_sgd.fit(X_train_sgd)
X_train_sgd_scaled = scaler_sgd.transform(X_train_sgd)
X_test_sgd_scaled = scaler_sgd.transform(X_test_sgd)

# PCA keeping enough components to retain 95% of the variance;
# the projection is fitted on the training split and reused for the test split
pca_sgd = PCA(n_components=0.95)
X_train_sgd_pca = pca_sgd.fit_transform(X_train_sgd_scaled)
X_test_sgd_pca = pca_sgd.transform(X_test_sgd_scaled)

# Fit the classifier on the reduced training data and predict both splits
sgd_classifier = SGDClassifier(random_state=77)
sgd_classifier.fit(X_train_sgd_pca, y_train_sgd)
y_pred_sgd = sgd_classifier.predict(X_test_sgd_pca)
y_train_pred_sgd = sgd_classifier.predict(X_train_sgd_pca)

# Record train/test accuracy for the summary section
train_accuracies['SGD'] = accuracy_score(y_train_sgd, y_train_pred_sgd)
test_accuracies['SGD'] = accuracy_score(y_test_sgd, y_pred_sgd)

# Per-class metrics and confusion matrix on the held-out split
report_sgd = classification_report(y_test_sgd, y_pred_sgd)
cm_sgd = confusion_matrix(y_test_sgd, y_pred_sgd)
print("\nSGD Classifier Report:", report_sgd, sep="\n")
print("Confusion Matrix:", cm_sgd, sep="\n")


SGD Classifier Report:
              precision    recall  f1-score   support

           e       1.00      1.00      1.00       868
           p       1.00      1.00      1.00       757

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

Confusion Matrix:
[[868   0]
 [  0 757]]


### 9. Print all accuracies and check for overfitting

In [None]:
# Report the recorded training and testing accuracy of every model
print("\nModel Accuracies:")
for model, train_acc in train_accuracies.items():
    test_acc = test_accuracies[model]
    print(f"{model} - Training Accuracy: {train_acc:.4f}, Testing Accuracy: {test_acc:.4f}")

# Flag a model when its training accuracy exceeds its testing accuracy
# by more than 0.05 (an arbitrary threshold for overfitting)
print("\nOverfitting Check:")
for model, train_acc in train_accuracies.items():
    test_acc = test_accuracies[model]
    verdict = "might be overfitting" if train_acc > test_acc + 0.05 else "is not overfitting"
    print(f"{model} {verdict}. Training Accuracy: {train_acc:.4f}, Testing Accuracy: {test_acc:.4f}")



Model Accuracies:
Logistic Regression - Training Accuracy: 0.9998, Testing Accuracy: 1.0000
Decision Tree - Training Accuracy: 1.0000, Testing Accuracy: 1.0000
Random Forest - Training Accuracy: 1.0000, Testing Accuracy: 1.0000
SVM - Training Accuracy: 0.9995, Testing Accuracy: 0.9994
SGD - Training Accuracy: 0.9997, Testing Accuracy: 1.0000

Overfitting Check:
Logistic Regression is not overfitting. Training Accuracy: 0.9998, Testing Accuracy: 1.0000
Decision Tree is not overfitting. Training Accuracy: 1.0000, Testing Accuracy: 1.0000
Random Forest is not overfitting. Training Accuracy: 1.0000, Testing Accuracy: 1.0000
SVM is not overfitting. Training Accuracy: 0.9995, Testing Accuracy: 0.9994
SGD is not overfitting. Training Accuracy: 0.9997, Testing Accuracy: 1.0000
