In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell execution so edits to imported .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2

# Imports

In [2]:
import os

# Switch the working directory to the project folder so that the relative data
# paths used below resolve, no matter which sub-folder the notebook was
# launched from.
#
# The search walks up from the current directory until it finds an ancestor
# that actually contains the project folder. This avoids relying on substring
# matches against the path (an unrelated ancestor such as "/home/students"
# would otherwise send the search above the repository root), and re-running
# the cell lands in the same place.
PROJECT_SUBDIR = os.path.join("projects", "proj_1_team_1")

search_root = os.getcwd()
while not os.path.isdir(os.path.join(search_root, PROJECT_SUBDIR)):
    parent_dir = os.path.dirname(search_root)
    if parent_dir == search_root:
        # Reached the filesystem root without finding the project folder.
        raise FileNotFoundError(
            f"Could not find '{PROJECT_SUBDIR}' in {os.getcwd()!r} or any parent directory"
        )
    search_root = parent_dir

os.chdir(os.path.join(search_root, PROJECT_SUBDIR))

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# Load the preprocessed DataFrame from the .csv file

In [4]:
# Read the preprocessed data; the first CSV column is used as the index.
csv_path = "mushrooms_preprocessed.csv"
df = pd.read_csv(csv_path, index_col=0)

In [None]:
# Quick sanity check of the loaded data: show the first five rows.
df.head(n=5)

In [6]:
# Separate the predictors from the "poisonous" target, then hold out 20% of
# the rows as a test set. The fixed random_state keeps the split reproducible.
features = df.drop("poisonous", axis=1)
target = df["poisonous"]

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [None]:
# Depth-limited decision tree: max_depth=5 caps model complexity to reduce the
# risk of overfitting. random_state is fixed because the tree breaks ties
# between equally good splits at random; without a seed the fitted tree (and
# every metric computed from it below) can change from run to run.
tree = DecisionTreeClassifier(max_depth=5, random_state=42)
pipeline = make_pipeline(tree)

pipeline.fit(X_train, y_train)

In [None]:
# Predict on the held-out test set and display the fraction of correct labels.
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Per-class precision, recall and F1 on the test set.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Confusion matrix on the test set, labelled so the table is self-explanatory:
# rows are the actual classes, columns are the predicted classes. Passing
# `labels` pins the row/column order to the classes the model was fitted on.
class_labels = pipeline.classes_
pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=class_labels),
    index=pd.Index(class_labels, name="actual"),
    columns=pd.Index(class_labels, name="predicted"),
)

In [None]:
# 10-fold cross-validation on the training split only; the test set stays
# untouched.
scores = cross_val_score(estimator=pipeline, X=X_train, y=y_train, cv=10)
mean_score = scores.mean()

print("Cross-validation scores:", scores)
print("Mean cross-validation accuracy:", mean_score)

# Checking for data leakage

In general, odor can be a strong predictor of mushroom edibility. To make sure the model isn't relying only on this single feature, we remove all "odor" columns and retrain. If the accuracy stays high, the model is learning from a combination of features rather than depending on one shortcut.

In [12]:
# Build a copy of the data without any odor information. The columns are
# selected by their "odor_" prefix rather than a hard-coded list, so the cell
# keeps working if preprocessing adds or removes an odor category.
odor_columns = [column for column in df.columns if str(column).startswith("odor_")]

# Guard against silently running the ablation with nothing removed.
assert odor_columns, "expected at least one 'odor_*' column in df"

df_no_odor = df.drop(columns=odor_columns)

In [13]:
# Same 80/20 split settings (test_size and random_state) as for the full
# feature set, applied to the odor-free frame.
features_no_odor = df_no_odor.drop("poisonous", axis=1)
target_no_odor = df_no_odor["poisonous"]

X_train_no_odor, X_test_no_odor, y_train_no_odor, y_test_no_odor = train_test_split(
    features_no_odor, target_no_odor, test_size=0.2, random_state=42
)

In [None]:
# Same depth-limited tree as the full-feature model (max_depth=5 to limit
# overfitting), trained without the odor columns. random_state is fixed
# because the tree breaks ties between equally good splits at random; without
# a seed the comparison with the full model would not be reproducible.
tree_no_odor = DecisionTreeClassifier(max_depth=5, random_state=42)
pipeline_no_odor = make_pipeline(tree_no_odor)

pipeline_no_odor.fit(X_train_no_odor, y_train_no_odor)

In [None]:
# Test-set accuracy of the model trained without the odor columns.
y_pred_no_odor = pipeline_no_odor.predict(X_test_no_odor)
accuracy_score(y_true=y_test_no_odor, y_pred=y_pred_no_odor)

In [None]:
# Per-class precision, recall and F1 for the odor-free model.
report_no_odor = classification_report(y_test_no_odor, y_pred_no_odor)
print(report_no_odor)

In [None]:
# Confusion matrix for the odor-free model, labelled so the table is
# self-explanatory: rows are the actual classes, columns are the predicted
# classes. Passing `labels` pins the order to the classes the model was
# fitted on.
class_labels_no_odor = pipeline_no_odor.classes_
pd.DataFrame(
    confusion_matrix(y_test_no_odor, y_pred_no_odor, labels=class_labels_no_odor),
    index=pd.Index(class_labels_no_odor, name="actual"),
    columns=pd.Index(class_labels_no_odor, name="predicted"),
)

In [None]:
# 10-fold cross-validation of the odor-free model on its training split.
scores_no_odor = cross_val_score(
    estimator=pipeline_no_odor,
    X=X_train_no_odor,
    y=y_train_no_odor,
    cv=10,
)
mean_score_no_odor = scores_no_odor.mean()

print("Cross-validation scores:", scores_no_odor)
print("Mean cross-validation accuracy:", mean_score_no_odor)

There is no major change in accuracy, so we can say that the model does not depend on the "odor" feature alone and that this feature does not cause data leakage.

# Learning curve

To visualize how our model's performance changes as the amount of training data increases, we can plot a learning curve.
It can help us understand if our model might be overfitting.

In [19]:
# Score the pipeline with 10-fold CV at ten training-set sizes, from 10% to
# 100% of the data available for training.
size_fractions = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

train_sizes, train_scores, valid_scores = learning_curve(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    train_sizes=size_fractions,
    cv=10,
    scoring='accuracy',
)

In [20]:
# Collapse the per-fold scores (axis=1) into a mean and a standard deviation
# for each training size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
valid_mean, valid_std = valid_scores.mean(axis=1), valid_scores.std(axis=1)

In [None]:
# Express the training sizes as a percentage of the full training split.
# NOTE(review): learning_curve returns absolute sample counts per CV training
# fold; with cv=10 each fold holds roughly 90% of X_train, so this axis likely
# tops out near 90 rather than 100 - confirm this is the intended scale.
train_sizes_percent = (train_sizes / len(X_train)) * 100

fig, ax = plt.subplots(figsize=(10, 6))

# One line (mean accuracy) plus a shaded +/- one standard deviation band for
# each of the training and validation curves.
curves = (
    (train_mean, train_std, 'Training Accuracy', 'blue'),
    (valid_mean, valid_std, 'Validation Accuracy', 'green'),
)
for curve_mean, curve_std, curve_label, curve_color in curves:
    ax.plot(train_sizes_percent, curve_mean, label=curve_label, color=curve_color)
    ax.fill_between(
        train_sizes_percent,
        curve_mean - curve_std,
        curve_mean + curve_std,
        color=curve_color,
        alpha=0.2,
    )

ax.set_title('Learning Curve for Decision Tree')
ax.set_xlabel('Training Size (%)')
ax.set_ylabel('Accuracy')
ax.legend(loc='best')
ax.grid(True)
plt.show()

The model is performing really well with 99%-100% accuracy on both training and validation sets, suggesting it’s learning the patterns in the data without overfitting. The consistent results across different training sizes confirm the model is stable.