In [8]:
# Cell 1: Imports

import numpy as np
import pandas as pd
import os

from skimage.transform import resize
from skimage.io import imread

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)

import matplotlib.pyplot as plt
import seaborn as sns

RANDOM_STATE = 42
IMG_SIZE = (150, 150, 3)


In [10]:
# Cell 2: Load image data into a DataFrame

target = []
images = []       # Images as matrices (optional)
flat_data = []    # Flattened vectors

datadir = '../Data/AppleData'

categories = [
    'Apple___Apple_scab',
    'Apple___Black_rot',
    'Apple___Cedar_apple_rust',
    'Apple___healthy',
    'Background_without_leaves'
]

for category in categories:
  path= datadir + '/' + category
  for img in os.listdir(path):
    img_matrix=imread(os.path.join(path,img))
    img_resized=resize(img_matrix,(150,150,3))
    flat_data.append(img_resized.flatten())
    images.append(img_resized)
    target.append(category)

df = pd.DataFrame(flat_data)
df['Target'] = target

print("DataFrame shape:", df.shape)
df.head()


FileNotFoundError: [Errno 2] No such file or directory: '../Data/AppleData/Apple___Apple_scab'

In [None]:
# Cell 3: Train/test split

X = df.iloc[:, :-1].values
y = df['Target'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)
pd.Series(y_train).value_counts()


In [None]:
# Cell 4: Decision Tree + GridSearchCV

param_grid = {
    "max_depth": [5, 10, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "criterion": ["gini"]
}

dt = DecisionTreeClassifier(random_state=RANDOM_STATE)

cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE
)

grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=cv,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print("Best CV accuracy:", grid_search.best_score_)
print("Best parameters:", grid_search.best_params_)

best_dt = grid_search.best_estimator_


In [None]:
# Cell 5: Evaluate best Decision Tree on test set

y_pred = best_dt.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred, labels=categories)

cm_df = pd.DataFrame(cm, index=categories, columns=categories)
cm_df


In [None]:
# Cell 6: Confusion matrix plot

plt.figure(figsize=(6,6))
sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Decision Tree Confusion Matrix")
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Cell 7: CV results table

cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results_sorted = cv_results.sort_values("mean_test_score", ascending=False)

cv_results_sorted[[
    "mean_test_score",
    "std_test_score",
    "param_max_depth",
    "param_min_samples_split",
    "param_min_samples_leaf"
]].head(10)


In [None]:
# Cell 8: Tree complexity and feature importances

print("Tree depth:", best_dt.get_depth())
print("Number of leaves:", best_dt.get_n_leaves())

importances = best_dt.feature_importances_
nonzero_importances = importances[importances > 0]

print("Non-zero feature count:", len(nonzero_importances))

# Show top 20
top_idx = np.argsort(importances)[-20:][::-1]
top_vals = importances[top_idx]

pd.DataFrame({
    "feature_index": top_idx,
    "importance": top_vals
})
