#**Mushroom Classification** - Selecting only features with absolute correlation > 0.5

##**-Without GridSearchCV Hyperparameter Tuning**

###1. Import libraries, load dataset and handle missing data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

# Load the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
df = pd.read_csv(url, header=None)

# Handle missing values by replacing '?' with NaN and filling with mode
df.replace('?', np.nan, inplace=True)
df.iloc[:, 11].fillna(df.iloc[:, 11].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.iloc[:, 11].fillna(df.iloc[:, 11].mode()[0], inplace=True)


###2. Encode categorical features

In [2]:
# Encode all categorical features using LabelEncoder
label_encoder = LabelEncoder()
for column in df.columns:
    df[column] = label_encoder.fit_transform(df[column])

# Separate features and target
X = df.iloc[:, 1:]  # All columns except the first one as features
y = df.iloc[:, 0]   # The first column as the target

###3. Select features with absolute correlation > 0.5

In [3]:
# Calculate correlation of features with the target
correlation_matrix = df.corr()
correlation_with_target = correlation_matrix.iloc[0, 1:]  # Correlation of all features with the target
print("\nCorrelation of features with the target:")
print(correlation_with_target)

# Select features with absolute correlation > 0.5
threshold = 0.5
selected_features = correlation_with_target[abs(correlation_with_target) > threshold].index
print(f"\nFeatures selected based on threshold ({threshold}): {list(selected_features)}")

# Retain only the selected features
X = X[selected_features]
print(f"\nShape of dataset after feature selection: {X.shape}")


Correlation of features with the target:
1     0.052951
2     0.178446
3    -0.031384
4    -0.501530
5    -0.093552
6     0.129200
7    -0.348387
8     0.540024
9    -0.530566
10   -0.102019
11   -0.324194
12   -0.334593
13   -0.298801
14   -0.154003
15   -0.146730
16         NaN
17    0.145142
18   -0.214366
19   -0.411771
20    0.171961
21    0.298686
22    0.217179
Name: 0, dtype: float64

Features selected based on threshold (0.5): [4, 8, 9]

Shape of dataset after feature selection: (8124, 3)


In [4]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

###4. Initialize the models

In [5]:
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(random_state=33, n_estimators=100),
    'SVM': SVC(),
    'SGD': SGDClassifier(max_iter=1000, tol=1e-3)
}

# Initialize dictionaries to store accuracies
train_accuracies = {}
test_accuracies = {}

###5. Train and evaluate each model

In [6]:
# Train and evaluate each model
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)

    # Training and testing accuracy
    train_accuracies[name] = accuracy_score(y_train, model.predict(X_train))
    test_accuracies[name] = accuracy_score(y_test, model.predict(X_test))

    # Metrics
    print(f"\n{name} Report:")
    print(classification_report(y_test, model.predict(X_test)))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, model.predict(X_test)))


Training Logistic Regression...

Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80       843
           1       0.78      0.78      0.78       782

    accuracy                           0.79      1625
   macro avg       0.79      0.79      0.79      1625
weighted avg       0.79      0.79      0.79      1625

Confusion Matrix:
[[673 170]
 [171 611]]

Training Decision Tree...

Decision Tree Report:
              precision    recall  f1-score   support

           0       0.92      0.81      0.86       843
           1       0.82      0.92      0.87       782

    accuracy                           0.87      1625
   macro avg       0.87      0.87      0.87      1625
weighted avg       0.87      0.87      0.87      1625

Confusion Matrix:
[[684 159]
 [ 60 722]]

Training Random Forest...

Random Forest Report:
              precision    recall  f1-score   support

           0       0.92      0.81      0.86    

###6. Print all accuracies

In [7]:
# Print all accuracies
print("\nModel Accuracies:")
for model in train_accuracies.keys():
    print(f"{model} - Training Accuracy: {train_accuracies[model]:.4f}, Testing Accuracy: {test_accuracies[model]:.4f}")

# Check for overfitting
print("\nOverfitting Check:")
for model in train_accuracies.keys():
    if train_accuracies[model] > test_accuracies[model] + 0.05:  # Threshold for overfitting
        print(f"{model} might be overfitting. Training Accuracy: {train_accuracies[model]:.4f}, Testing Accuracy: {test_accuracies[model]:.4f}")
    else:
        print(f"{model} is not overfitting. Training Accuracy: {train_accuracies[model]:.4f}, Testing Accuracy: {test_accuracies[model]:.4f}")


Model Accuracies:
Logistic Regression - Training Accuracy: 0.8007, Testing Accuracy: 0.7902
Decision Tree - Training Accuracy: 0.8675, Testing Accuracy: 0.8652
Random Forest - Training Accuracy: 0.8675, Testing Accuracy: 0.8652
SVM - Training Accuracy: 0.8001, Testing Accuracy: 0.7877
SGD - Training Accuracy: 0.7507, Testing Accuracy: 0.7452

Overfitting Check:
Logistic Regression is not overfitting. Training Accuracy: 0.8007, Testing Accuracy: 0.7902
Decision Tree is not overfitting. Training Accuracy: 0.8675, Testing Accuracy: 0.8652
Random Forest is not overfitting. Training Accuracy: 0.8675, Testing Accuracy: 0.8652
SVM is not overfitting. Training Accuracy: 0.8001, Testing Accuracy: 0.7877
SGD is not overfitting. Training Accuracy: 0.7507, Testing Accuracy: 0.7452


##**-With GridSearchCV Hyperparameter Tuning**

###1. Import libraries, load dataset and handle missing data

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

# Load the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
df = pd.read_csv(url, header=None)

# Handle missing values by replacing '?' with NaN and filling with mode
df.replace('?', np.nan, inplace=True)
df.iloc[:, 11].fillna(df.iloc[:, 11].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.iloc[:, 11].fillna(df.iloc[:, 11].mode()[0], inplace=True)


###2. Encode categorical features

In [9]:
# Encode all categorical features using LabelEncoder
label_encoder = LabelEncoder()
for column in df.columns:
    df[column] = label_encoder.fit_transform(df[column])

# Separate features and target
X = df.iloc[:, 1:]  # All columns except the first one as features
y = df.iloc[:, 0]   # The first column as the target

###3. Select features with absolute correlation > 0.5

In [10]:
# Calculate correlation of features with the target
correlation_matrix = df.corr()
correlation_with_target = correlation_matrix.iloc[0, 1:]  # Correlation of all features with the target
print("\nCorrelation of features with the target:")
print(correlation_with_target)

# Select features with absolute correlation > 0.5
threshold = 0.5
selected_features = correlation_with_target[abs(correlation_with_target) > threshold].index
print(f"\nFeatures selected based on threshold ({threshold}): {list(selected_features)}")

# Retain only the selected features
X = X[selected_features]
print(f"\nShape of dataset after feature selection: {X.shape}")


Correlation of features with the target:
1     0.052951
2     0.178446
3    -0.031384
4    -0.501530
5    -0.093552
6     0.129200
7    -0.348387
8     0.540024
9    -0.530566
10   -0.102019
11   -0.324194
12   -0.334593
13   -0.298801
14   -0.154003
15   -0.146730
16         NaN
17    0.145142
18   -0.214366
19   -0.411771
20    0.171961
21    0.298686
22    0.217179
Name: 0, dtype: float64

Features selected based on threshold (0.5): [4, 8, 9]

Shape of dataset after feature selection: (8124, 3)


In [11]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

###4. Define the models' parameter grids for GridSearchCV

In [12]:
# Define models and their parameter grids for GridSearchCV
models = {
    'Logistic Regression': (
        LogisticRegression(max_iter=1000),
        {'C': [0.1, 1, 10], 'solver': ['lbfgs', 'liblinear']}
    ),
    'Decision Tree': (
        DecisionTreeClassifier(),
        {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]}
    ),
    'Random Forest': (
        RandomForestClassifier(random_state=33),
        {'n_estimators': [50, 100, 150], 'max_depth': [None, 10, 20]}
    ),
    'SVM': (
        SVC(),
        {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
    ),
    'SGD': (
        SGDClassifier(max_iter=1000, tol=1e-3),
        {'loss': ['hinge', 'log'], 'alpha': [0.0001, 0.001, 0.01]}
    )
}

# Initialize dictionaries to store accuracies
train_accuracies = {}
test_accuracies = {}

###5. Apply GridSearchCV for each model

In [13]:
# Apply GridSearchCV for each model
for name, (model, param_grid) in models.items():
    print(f"\nTraining {name} with GridSearchCV...")
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Best model from GridSearchCV
    best_model = grid_search.best_estimator_
    print(f"Best parameters for {name}: {grid_search.best_params_}")

    # Training and testing accuracy
    train_accuracies[name] = accuracy_score(y_train, best_model.predict(X_train))
    test_accuracies[name] = accuracy_score(y_test, best_model.predict(X_test))

    # Metrics
    print(f"\n{name} Report:")
    print(classification_report(y_test, best_model.predict(X_test)))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, best_model.predict(X_test)))


Training Logistic Regression with GridSearchCV...
Best parameters for Logistic Regression: {'C': 0.1, 'solver': 'lbfgs'}

Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80       843
           1       0.78      0.78      0.78       782

    accuracy                           0.79      1625
   macro avg       0.79      0.79      0.79      1625
weighted avg       0.79      0.79      0.79      1625

Confusion Matrix:
[[673 170]
 [171 611]]

Training Decision Tree with GridSearchCV...
Best parameters for Decision Tree: {'max_depth': None, 'min_samples_split': 2}

Decision Tree Report:
              precision    recall  f1-score   support

           0       0.92      0.81      0.86       843
           1       0.82      0.92      0.87       782

    accuracy                           0.87      1625
   macro avg       0.87      0.87      0.87      1625
weighted avg       0.87      0.87      0.87      1625

Confusi

15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
7 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
sklear

###6. Print all accuracies

In [14]:
# Print all accuracies
print("\nModel Accuracies:")
for model in train_accuracies.keys():
    print(f"{model} - Training Accuracy: {train_accuracies[model]:.4f}, Testing Accuracy: {test_accuracies[model]:.4f}")

# Check for overfitting
print("\nOverfitting Check:")
for model in train_accuracies.keys():
    if train_accuracies[model] > test_accuracies[model] + 0.05:  # Threshold for overfitting
        print(f"{model} might be overfitting. Training Accuracy: {train_accuracies[model]:.4f}, Testing Accuracy: {test_accuracies[model]:.4f}")
    else:
        print(f"{model} is not overfitting. Training Accuracy: {train_accuracies[model]:.4f}, Testing Accuracy: {test_accuracies[model]:.4f}")


Model Accuracies:
Logistic Regression - Training Accuracy: 0.8007, Testing Accuracy: 0.7902
Decision Tree - Training Accuracy: 0.8675, Testing Accuracy: 0.8652
Random Forest - Training Accuracy: 0.8675, Testing Accuracy: 0.8652
SVM - Training Accuracy: 0.8280, Testing Accuracy: 0.8240
SGD - Training Accuracy: 0.7601, Testing Accuracy: 0.7409

Overfitting Check:
Logistic Regression is not overfitting. Training Accuracy: 0.8007, Testing Accuracy: 0.7902
Decision Tree is not overfitting. Training Accuracy: 0.8675, Testing Accuracy: 0.8652
Random Forest is not overfitting. Training Accuracy: 0.8675, Testing Accuracy: 0.8652
SVM is not overfitting. Training Accuracy: 0.8280, Testing Accuracy: 0.8240
SGD is not overfitting. Training Accuracy: 0.7601, Testing Accuracy: 0.7409
