In [1]:
# Cell 1: Imports

import os
import numpy as np
import pandas as pd

from skimage.transform import resize
from skimage.io import imread

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)

sns.set()
RANDOM_STATE = 42


In [2]:
target = []
images = []   # In matrix format
flat_data = []  # In vector format

datadir = '../Data/AppleData'
categories = ['Apple___Apple_scab',
              'Apple___Black_rot',
              'Apple___Cedar_apple_rust',
              'Apple___healthy',
              'Background_without_leaves']

for category in categories:
  path= datadir + '/' + category
  for img in os.listdir(path):
    img_matrix=imread(os.path.join(path,img))
    img_resized=resize(img_matrix,(150,150,3))
    flat_data.append(img_resized.flatten())
    images.append(img_resized)
    target.append(category)

In [3]:
import pandas as pd
df=pd.DataFrame(flat_data)
df['Target']=target

In [5]:
# Cell 3: Scale, PCA, and train/test split

y = df["Target"].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.drop(columns=["Target"]).values)

pca = PCA(
    n_components=500,        # you can try 300â€“800 if you want
    svd_solver='randomized',
    random_state=RANDOM_STATE
)
X_pca = pca.fit_transform(X_scaled)

print("PCA-transformed feature dimension:", X_pca.shape[1])

# Train/test split using PCA features
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print(pd.Series(y_train).value_counts())


PCA-transformed feature dimension: 500
X_train shape: (4630, 500)
X_test shape: (1158, 500)
Apple___healthy              1316
Background_without_leaves     914
Apple___Apple_scab            800
Apple___Cedar_apple_rust      800
Apple___Black_rot             800
Name: count, dtype: int64


In [6]:
# Cell 4: Decision Tree with aggressive RandomizedSearchCV (focus on accuracy)

dt = DecisionTreeClassifier(
    class_weight="balanced",   # helps a bit with slight imbalance
    random_state=RANDOM_STATE
)

param_dist = {
    "max_depth": [None] + list(range(10, 61, 5)),          # allow very deep trees
    "min_samples_split": [2, 5, 10, 20, 50, 100],
    "min_samples_leaf": [1, 2, 4, 8, 16, 32],
    "max_features": [None, "sqrt", "log2", 0.1, 0.25, 0.5],
    "criterion": ["gini", "entropy", "log_loss"],
    "min_impurity_decrease": [0.0, 1e-7, 1e-6, 1e-5]
}

rand_search = RandomizedSearchCV(
    estimator=dt,
    param_distributions=param_dist,
    n_iter=200,          # crank this up if you want; Rivanna can handle 200+
    cv=5,                # strong CV
    scoring="accuracy",
    n_jobs=-1,           # use all cores
    verbose=2,
    random_state=RANDOM_STATE
)

rand_search.fit(X_train, y_train)

print("Best CV accuracy:", rand_search.best_score_)
print("Best parameters:", rand_search.best_params_)

best_dt = rand_search.best_estimator_


Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CV] END criterion=log_loss, max_depth=20, max_features=log2, min_impurity_decrease=1e-07, min_samples_leaf=1, min_samples_split=10; total time=   0.1s
[CV] END criterion=entropy, max_depth=35, max_features=sqrt, min_impurity_decrease=1e-06, min_samples_leaf=16, min_samples_split=20; total time=   0.2s
[CV] END criterion=gini, max_depth=35, max_features=sqrt, min_impurity_decrease=1e-07, min_samples_leaf=16, min_samples_split=10; total time=   0.1s
[CV] END criterion=gini, max_depth=35, max_features=sqrt, min_impurity_decrease=1e-07, min_samples_leaf=16, min_samples_split=10; total time=   0.1s
[CV] END criterion=log_loss, max_depth=55, max_features=log2, min_impurity_decrease=1e-05, min_samples_leaf=32, min_samples_split=2; total time=   0.1s
[CV] END criterion=log_loss, max_depth=None, max_features=0.5, min_impurity_decrease=1e-05, min_samples_leaf=2, min_samples_split=100; total time=   1.6s
[CV] END criterion=entropy, 

In [None]:
# Cell X: Random Forest with RandomizedSearchCV (aiming for higher accuracy)

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=400,
    random_state=RANDOM_STATE,
    class_weight="balanced_subsample",  # helps with class imbalance
    n_jobs=-1
    
)

rf_param_dist = {
    "n_estimators": [200, 400, 600, 800],
    "max_depth": [None, 20, 40, 60, 80],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 8],
    "max_features": ["sqrt", "log2", 0.25, 0.5, None],
    "bootstrap": [True, False]
}

rf_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_param_dist,
    n_iter=40,             # increase to 60â€“80 if you want even more search
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2,
    random_state=RANDOM_STATE
)

rf_search.fit(X_train, y_train)

print("Best Random Forest CV accuracy:", rf_search.best_score_)
print("Best RF parameters:", rf_search.best_params_)

best_rf = rf_search.best_estimator_


Fitting 3 folds for each of 40 candidates, totalling 120 fits


In [None]:
# Cell 5: Evaluate Decision Tree on test set

y_pred = best_dt.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print(f"Decision Tree Test Accuracy: {test_accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred, labels=categories)
cm_df = pd.DataFrame(cm, index=categories, columns=categories)

plt.figure(figsize=(6, 6))
sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Decision Tree (PCA + RandomizedSearchCV)")
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Cell X+1: Evaluate Random Forest on test set

rf_y_pred = best_rf.predict(X_test)

rf_test_accuracy = accuracy_score(y_test, rf_y_pred)
print(f"Random Forest Test Accuracy: {rf_test_accuracy:.4f}\n")

print("Random Forest Classification Report:")
print(classification_report(y_test, rf_y_pred))

rf_cm = confusion_matrix(y_test, rf_y_pred, labels=categories)
rf_cm_df = pd.DataFrame(rf_cm, index=categories, columns=categories)

plt.figure(figsize=(6, 6))
sns.heatmap(rf_cm_df, annot=True, fmt='d', cmap='Blues')
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Random Forest (PCA features)")
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Cell 6: Tree complexity summary

print("Final tree depth:", best_dt.get_depth())
print("Number of leaves:", best_dt.get_n_leaves())


In [None]:
# Cell 7: CV results table

cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results_sorted = cv_results.sort_values("mean_test_score", ascending=False)

cv_results_sorted[[
    "mean_test_score",
    "std_test_score",
    "param_max_depth",
    "param_min_samples_split",
    "param_min_samples_leaf"
]].head(10)


In [None]:
# Cell 8: Confusion matrix plot

plt.figure(figsize=(6,6))
sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Decision Tree Confusion Matrix")
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Cell 9: Tree complexity and feature importances

print("Tree depth:", best_dt.get_depth())
print("Number of leaves:", best_dt.get_n_leaves())

importances = best_dt.feature_importances_
nonzero_importances = importances[importances > 0]

print("Non-zero feature count:", len(nonzero_importances))

# Show top 20
top_idx = np.argsort(importances)[-20:][::-1]
top_vals = importances[top_idx]

pd.DataFrame({
    "feature_index": top_idx,
    "importance": top_vals
})
