#**Mushroom Classification** - Excluding the feature with the highest absolute correlation

##**-Without GridSearchCV Hyperparameter Tuning**

###1. Import libraries, load dataset and handle missing data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

# Load the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
df = pd.read_csv(url, header=None)

# Handle missing values by replacing '?' with NaN and filling with mode
df.replace('?', np.nan, inplace=True)
df.iloc[:, 11].fillna(df.iloc[:, 11].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.iloc[:, 11].fillna(df.iloc[:, 11].mode()[0], inplace=True)


###2. Encode categorical features

In [2]:
# Encode all categorical features using LabelEncoder
label_encoder = LabelEncoder()
for column in df.columns:
    df[column] = label_encoder.fit_transform(df[column])

# Separate features and target
X = df.iloc[:, 1:]  # All columns except the first one as features
y = df.iloc[:, 0]   # The first column as the target

###3. Drop the feature with the highest absolute correlation

In [3]:
# Calculate correlation of features with the target
correlation_matrix = df.corr()
correlation_with_target = correlation_matrix.iloc[0, 1:]  # Correlation of all features with the target
print("\nCorrelation of features with the target:")
print(correlation_with_target)

# Identify the feature with the highest absolute correlation
highest_correlation_feature = correlation_with_target.abs().idxmax()
print(f"\nFeature with the highest absolute correlation: {highest_correlation_feature}")

# Drop the feature with the highest absolute correlation
X = X.drop(columns=[highest_correlation_feature])
print(f"\nShape of dataset after removing the feature '{highest_correlation_feature}': {X.shape}")


Correlation of features with the target:
1     0.052951
2     0.178446
3    -0.031384
4    -0.501530
5    -0.093552
6     0.129200
7    -0.348387
8     0.540024
9    -0.530566
10   -0.102019
11   -0.324194
12   -0.334593
13   -0.298801
14   -0.154003
15   -0.146730
16         NaN
17    0.145142
18   -0.214366
19   -0.411771
20    0.171961
21    0.298686
22    0.217179
Name: 0, dtype: float64

Feature with the highest absolute correlation: 8

Shape of dataset after removing the feature '8': (8124, 21)


In [4]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize dictionaries to store accuracies
train_accuracies = {}
test_accuracies = {}

###4. Logistic Regression Model

In [5]:
# Logistic Regression
logistic_reg = LogisticRegression(random_state=42)
logistic_reg.fit(X_train, y_train)
y_pred_lr = logistic_reg.predict(X_test)
train_accuracies['Logistic Regression'] = accuracy_score(y_train, logistic_reg.predict(X_train))
test_accuracies['Logistic Regression'] = accuracy_score(y_test, y_pred_lr)
print("\nLogistic Regression Report:")
print(classification_report(y_test, y_pred_lr))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))


Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.88      0.94      0.91       843
           1       0.93      0.86      0.89       782

    accuracy                           0.90      1625
   macro avg       0.90      0.90      0.90      1625
weighted avg       0.90      0.90      0.90      1625

Confusion Matrix:
[[789  54]
 [108 674]]


###5. Decision Tree Classifier

In [6]:
# Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=24)
dt_classifier.fit(X_train, y_train)
y_pred_dt = dt_classifier.predict(X_test)
train_accuracies['Decision Tree'] = accuracy_score(y_train, dt_classifier.predict(X_train))
test_accuracies['Decision Tree'] = accuracy_score(y_test, y_pred_dt)
print("\nDecision Tree Classifier Report:")
print(classification_report(y_test, y_pred_dt))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt))


Decision Tree Classifier Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       843
           1       1.00      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

Confusion Matrix:
[[843   0]
 [  0 782]]


###6. Random Forest Classifier

In [7]:
# Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=33, n_estimators=100)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)
train_accuracies['Random Forest'] = accuracy_score(y_train, rf_classifier.predict(X_train))
test_accuracies['Random Forest'] = accuracy_score(y_test, y_pred_rf)
print("\nRandom Forest Classifier Report:")
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))


Random Forest Classifier Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       843
           1       1.00      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

Confusion Matrix:
[[843   0]
 [  0 782]]


###7. SVM Classifier

In [8]:
# SVM Classifier
svm_classifier = SVC(random_state=51)
svm_classifier.fit(X_train, y_train)
y_pred_svm = svm_classifier.predict(X_test)
train_accuracies['SVM'] = accuracy_score(y_train, svm_classifier.predict(X_train))
test_accuracies['SVM'] = accuracy_score(y_test, y_pred_svm)
print("\nSVM Classifier Report:")
print(classification_report(y_test, y_pred_svm))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))


SVM Classifier Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       843
           1       1.00      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

Confusion Matrix:
[[843   0]
 [  0 782]]


###8. SGD Classifier

In [9]:
# SGD Classifier
sgd_classifier = SGDClassifier(random_state=77)
sgd_classifier.fit(X_train, y_train)
y_pred_sgd = sgd_classifier.predict(X_test)
train_accuracies['SGD'] = accuracy_score(y_train, sgd_classifier.predict(X_train))
test_accuracies['SGD'] = accuracy_score(y_test, y_pred_sgd)
print("\nSGD Classifier Report:")
print(classification_report(y_test, y_pred_sgd))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_sgd))


SGD Classifier Report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       843
           1       0.94      0.85      0.90       782

    accuracy                           0.90      1625
   macro avg       0.91      0.90      0.90      1625
weighted avg       0.91      0.90      0.90      1625

Confusion Matrix:
[[802  41]
 [115 667]]


###9. Print all accuracies

In [10]:
# Print all accuracies
print("\nModel Accuracies:")
for model in train_accuracies.keys():
    print(f"{model} - Training Accuracy: {train_accuracies[model]:.4f}, Testing Accuracy: {test_accuracies[model]:.4f}")

# Check for overfitting
print("\nOverfitting Check:")
for model in train_accuracies.keys():
    if train_accuracies[model] > test_accuracies[model] + 0.05:  # Threshold for overfitting
        print(f"{model} might be overfitting. Training Accuracy: {train_accuracies[model]:.4f}, Testing Accuracy: {test_accuracies[model]:.4f}")
    else:
        print(f"{model} is not overfitting. Training Accuracy: {train_accuracies[model]:.4f}, Testing Accuracy: {test_accuracies[model]:.4f}")


Model Accuracies:
Logistic Regression - Training Accuracy: 0.9117, Testing Accuracy: 0.9003
Decision Tree - Training Accuracy: 1.0000, Testing Accuracy: 1.0000
Random Forest - Training Accuracy: 1.0000, Testing Accuracy: 1.0000
SVM - Training Accuracy: 1.0000, Testing Accuracy: 1.0000
SGD - Training Accuracy: 0.9097, Testing Accuracy: 0.9040

Overfitting Check:
Logistic Regression is not overfitting. Training Accuracy: 0.9117, Testing Accuracy: 0.9003
Decision Tree is not overfitting. Training Accuracy: 1.0000, Testing Accuracy: 1.0000
Random Forest is not overfitting. Training Accuracy: 1.0000, Testing Accuracy: 1.0000
SVM is not overfitting. Training Accuracy: 1.0000, Testing Accuracy: 1.0000
SGD is not overfitting. Training Accuracy: 0.9097, Testing Accuracy: 0.9040


##**-With GridSearchCV Hyperparameter Tuning**

###1. Import libraries, load dataset and handle missing data

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

# Load the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
df = pd.read_csv(url, header=None)

# Handle missing values by replacing '?' with NaN and filling with mode
df.replace('?', np.nan, inplace=True)
df.iloc[:, 11].fillna(df.iloc[:, 11].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.iloc[:, 11].fillna(df.iloc[:, 11].mode()[0], inplace=True)


###2. Encode categorical features

In [12]:
# Encode all categorical features using LabelEncoder
label_encoder = LabelEncoder()
for column in df.columns:
    df[column] = label_encoder.fit_transform(df[column])

# Separate features and target
X = df.iloc[:, 1:]  # All columns except the first one as features
y = df.iloc[:, 0]   # The first column as the target

###3. Drop the feature with the highest absolute correlation

In [13]:
# Calculate correlation of features with the target
correlation_matrix = df.corr()
correlation_with_target = correlation_matrix.iloc[0, 1:]  # Correlation of all features with the target
print("\nCorrelation of features with the target:")
print(correlation_with_target)

# Identify the feature with the highest absolute correlation
highest_correlation_feature = correlation_with_target.abs().idxmax()
print(f"\nFeature with the highest absolute correlation: {highest_correlation_feature}")

# Drop the feature with the highest absolute correlation
X = X.drop(columns=[highest_correlation_feature])
print(f"\nShape of dataset after removing the feature '{highest_correlation_feature}': {X.shape}")


Correlation of features with the target:
1     0.052951
2     0.178446
3    -0.031384
4    -0.501530
5    -0.093552
6     0.129200
7    -0.348387
8     0.540024
9    -0.530566
10   -0.102019
11   -0.324194
12   -0.334593
13   -0.298801
14   -0.154003
15   -0.146730
16         NaN
17    0.145142
18   -0.214366
19   -0.411771
20    0.171961
21    0.298686
22    0.217179
Name: 0, dtype: float64

Feature with the highest absolute correlation: 8

Shape of dataset after removing the feature '8': (8124, 21)


In [14]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

###4. Define the models' parameter grids for GridSearchCV

In [15]:
# Define models and their parameter grids for GridSearchCV
models = {
    'Logistic Regression': (
        LogisticRegression(max_iter=1000),
        {'C': [0.1, 1, 10], 'solver': ['lbfgs', 'liblinear']}
    ),
    'Decision Tree': (
        DecisionTreeClassifier(),
        {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]}
    ),
    'Random Forest': (
        RandomForestClassifier(random_state=33),
        {'n_estimators': [50, 100, 150], 'max_depth': [None, 10, 20]}
    ),
    'SVM': (
        SVC(),
        {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
    ),
    'SGD': (
        SGDClassifier(max_iter=1000, tol=1e-3),
        {'loss': ['hinge', 'log'], 'alpha': [0.0001, 0.001, 0.01]}
    )
}

# Initialize dictionaries to store accuracies
train_accuracies = {}
test_accuracies = {}

###5. Apply GridsearchCV for each model

In [16]:
# Apply GridSearchCV for each model
for name, (model, param_grid) in models.items():
    print(f"\nTraining {name} with GridSearchCV...")
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Best model from GridSearchCV
    best_model = grid_search.best_estimator_
    print(f"Best parameters for {name}: {grid_search.best_params_}")

    # Training and testing accuracy
    train_accuracies[name] = accuracy_score(y_train, best_model.predict(X_train))
    test_accuracies[name] = accuracy_score(y_test, best_model.predict(X_test))

    # Metrics
    print(f"\n{name} Report:")
    print(classification_report(y_test, best_model.predict(X_test)))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, best_model.predict(X_test)))


Training Logistic Regression with GridSearchCV...
Best parameters for Logistic Regression: {'C': 1, 'solver': 'liblinear'}

Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.88      0.94      0.91       843
           1       0.93      0.86      0.89       782

    accuracy                           0.90      1625
   macro avg       0.90      0.90      0.90      1625
weighted avg       0.90      0.90      0.90      1625

Confusion Matrix:
[[789  54]
 [108 674]]

Training Decision Tree with GridSearchCV...
Best parameters for Decision Tree: {'max_depth': None, 'min_samples_split': 2}

Decision Tree Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       843
           1       1.00      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

Confu

15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
7 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
sklear

###6. Print all accuracies

In [17]:
# Print all accuracies
print("\nModel Accuracies:")
for model in train_accuracies.keys():
    print(f"{model} - Training Accuracy: {train_accuracies[model]:.4f}, Testing Accuracy: {test_accuracies[model]:.4f}")

# Check for overfitting
print("\nOverfitting Check:")
for model in train_accuracies.keys():
    if train_accuracies[model] > test_accuracies[model] + 0.05:  # Threshold for overfitting
        print(f"{model} might be overfitting. Training Accuracy: {train_accuracies[model]:.4f}, Testing Accuracy: {test_accuracies[model]:.4f}")
    else:
        print(f"{model} is not overfitting. Training Accuracy: {train_accuracies[model]:.4f}, Testing Accuracy: {test_accuracies[model]:.4f}")


Model Accuracies:
Logistic Regression - Training Accuracy: 0.9118, Testing Accuracy: 0.9003
Decision Tree - Training Accuracy: 1.0000, Testing Accuracy: 1.0000
Random Forest - Training Accuracy: 1.0000, Testing Accuracy: 1.0000
SVM - Training Accuracy: 1.0000, Testing Accuracy: 1.0000
SGD - Training Accuracy: 0.9180, Testing Accuracy: 0.9095

Overfitting Check:
Logistic Regression is not overfitting. Training Accuracy: 0.9118, Testing Accuracy: 0.9003
Decision Tree is not overfitting. Training Accuracy: 1.0000, Testing Accuracy: 1.0000
Random Forest is not overfitting. Training Accuracy: 1.0000, Testing Accuracy: 1.0000
SVM is not overfitting. Training Accuracy: 1.0000, Testing Accuracy: 1.0000
SGD is not overfitting. Training Accuracy: 0.9180, Testing Accuracy: 0.9095
