# Tomato Leaf Diseases Prediction

## Overview
This notebook contains a tomato leaf disease classification model that follows the technique introduced in the research paper below, using machine learning with RGB and CMY color features.

## References

### Research Paper
- **Title**: Comparative Analysis of the Performance of the Decision Tree and K-Nearest Neighbors Methods in Classifying Coffee Leaf Diseases
- **Authors**: Adie Suryadi, Murhaban Murhaban, Rivansyah Suhendra
- **Published in**: Department of Information Technology, Teuku Umar University, Indonesia
- **URL**: [https://aptikom-journal.id/conferenceseries/article/view/649/272](https://aptikom-journal.id/conferenceseries/article/view/649/272)

### Dataset
- **Dataset**: Tomato Leaf Diseases
- **Source**: Kaggle
- **URL**: [https://www.kaggle.com/code/samanfatima7/tomato-leaf-disease-94-accuracy](https://www.kaggle.com/code/samanfatima7/tomato-leaf-disease-94-accuracy)

## Methodology
This implementation extracts color-based features from tomato leaf images:
- **RGB features**: Mean and standard deviation for each R, G, B channel (6 features)
- **CMY features**: Mean and standard deviation for each C, M, Y channel (6 features)
- **Total**: 12 color-based features per image

The features are then used to classify tomato leaves into six categories:
- Tomato_mosaic_virus
- Tomato_Yellow_Leaf_Curl_Virus
- healthy
- Septoria_leaf_spot
- Target_Spot
- Spider_mites Two-spotted_spider_mite

## Preprocessing Data

In [None]:
import numpy as np
from PIL import Image
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [None]:

def rgb_to_cmy(rgb_image):
    """Return the CMY complement of an RGB image.

    Every value is mapped to ``1.0 - value``. The caller in this file passes
    pixel data that has already been scaled to the [0, 1] range.
    """
    return 1.0 - rgb_image

def extract_color_features(image):
    """Compute the 12 colour statistics used as the model's feature vector.

    The result is ordered as mean/std pairs for the R, G and B channels,
    followed by mean/std pairs for the C, M and Y channels obtained from
    ``rgb_to_cmy``.

    Args:
        image: Array indexed as ``[row, column, channel]``; only channels
            0-2 are read.

    Returns:
        1-D NumPy array with 12 entries.
    """
    def _channel_stats(pixels):
        # Mean, then standard deviation, for each of the first three channels.
        stats = []
        for idx in range(3):
            plane = pixels[:, :, idx]
            stats.extend((np.mean(plane), np.std(plane)))
        return stats

    rgb_stats = _channel_stats(image)
    cmy_stats = _channel_stats(rgb_to_cmy(image))
    return np.array(rgb_stats + cmy_stats)

def load_and_extract_features(image_dir, labels_df):
    """Load every image listed in ``labels_df`` and build its feature vector.

    Args:
        image_dir: Directory that the ``id`` values are resolved against.
        labels_df: DataFrame with an ``id`` column. Integer ids are mapped to
            ``"<id>.jpg"``; any other value is used as a relative path as-is.

    Returns:
        Tuple ``(features_array, labels)`` where ``features_array`` stacks one
        feature vector per image that was found and ``labels`` holds the
        matching rows of ``labels_df`` with a fresh 0..n-1 index. Rows whose
        image file is missing are reported with a warning and skipped.
    """
    features_list = []
    valid_indices = []

    for idx, row in labels_df.iterrows():
        raw_id = row['id']
        # pandas may hand back NumPy integer scalars, which are not ``int``.
        if isinstance(raw_id, (int, np.integer)):
            img_id = f"{raw_id}.jpg"
        else:
            img_id = raw_id
        img_path = os.path.join(image_dir, img_id)

        if not os.path.exists(img_path):
            print(f"Warning: {img_path} not found")
            continue

        # Close the file handle promptly and force three colour channels so
        # grayscale, palette or RGBA files still yield valid RGB features.
        with Image.open(img_path) as img:
            rgb_pixels = np.array(img.convert('RGB'))
        img_array = rgb_pixels.astype('float32') / 255.0  # scale to [0, 1]

        features_list.append(extract_color_features(img_array))
        valid_indices.append(idx)

    features_array = np.array(features_list)
    labels = labels_df.loc[valid_indices].reset_index(drop=True)

    return features_array, labels

def convert_to_single_label(row):
    """Collapse the one-hot disease flags of ``row`` into a single label.

    The flags are checked in the order ``miner``, ``phoma``, ``rust`` and the
    first one equal to 1 wins; if none is set the label is ``'nodisease'``.

    NOTE(review): no caller is visible in this notebook - presumably a
    leftover from a dataset with one-hot disease columns; confirm before
    relying on it.
    """
    for disease in ('miner', 'phoma', 'rust'):
        if row[disease] == 1:
            return disease
    return 'nodisease'

In [None]:
# Build an index of every image: its path relative to the dataset root and
# the class label derived from the folder it lives in.
data = []
tomato_dataset_dir = 'dataset/tomato-leaf-diseases'
# Sort the listings: os.listdir returns entries in arbitrary order, and a
# stable row order keeps the later seeded train/validation split reproducible.
for dir_name in sorted(os.listdir(tomato_dataset_dir)):
    dir_path = os.path.join(tomato_dataset_dir, dir_name)
    if not os.path.isdir(dir_path):
        # Skip stray files sitting next to the class folders.
        continue
    # Strip the folder prefix so the label is the bare class name
    # (replacing it with ' ' would leave a leading space in every label).
    disease_name = dir_name.replace('Tomato___', '')
    for img_name in sorted(os.listdir(dir_path)):
        data.append({'id': os.path.join(dir_name, img_name), 'label': disease_name})

# Name the columns explicitly so they exist even when no image was found.
tomato_df = pd.DataFrame(data, columns=['id', 'label'])

In [None]:
# Extract the colour features for every listed image, then hold out 20% of
# the samples for validation, stratified on the class label.
tomato_features, tomato_labels = load_and_extract_features(
    image_dir=tomato_dataset_dir,
    labels_df=tomato_df,
)
(
    X_train_tomato,
    X_valid_tomato,
    y_train_tomato,
    y_valid_tomato,
) = train_test_split(
    tomato_features,
    tomato_labels['label'],
    random_state=123,
    stratify=tomato_labels['label'],
    test_size=0.2,
)

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both the training and the validation features.
scaler = StandardScaler().fit(X_train_tomato)
X_train_tomato_scaled = scaler.transform(X_train_tomato)
X_valid_tomato_scaled = scaler.transform(X_valid_tomato)

# Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# ---------- Decision Tree ----------
# Exhaustive 10-fold search over split criterion, tree depth and the
# minimum sample counts for splits and leaves.
param_grid_dt = dict(
    criterion=['gini', 'entropy'],
    max_depth=[*range(1, 21), None],
    min_samples_split=range(2, 11),
    min_samples_leaf=range(1, 6),
)
dt = DecisionTreeClassifier(splitter='best', max_features=None, random_state=123)
grid_search_dt = GridSearchCV(estimator=dt, param_grid=param_grid_dt, cv=10)
grid_search_dt.fit(X_train_tomato_scaled, y_train_tomato)

best_model_dt = grid_search_dt.best_estimator_
print(grid_search_dt.best_params_)
print(grid_search_dt.best_score_)

# ---------- KNN ----------
# Exhaustive 10-fold search over the distance metric and neighbour count.
param_grid_knn = dict(
    metric=['euclidean', 'manhattan'],
    n_neighbors=range(1, 21),
)
knn = KNeighborsClassifier(
    weights='uniform',
    algorithm='auto',
    leaf_size=30,
    p=2,
    n_jobs=-1,
)
grid_search_knn = GridSearchCV(estimator=knn, param_grid=param_grid_knn, cv=10)
grid_search_knn.fit(X_train_tomato_scaled, y_train_tomato)

best_model_knn = grid_search_knn.best_estimator_
print(grid_search_knn.best_params_)
print(grid_search_knn.best_score_)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def show_classification_metrics(y_true, y_pred):
    """Print accuracy plus micro/macro precision, recall and F1-score.

    Every value is rounded to four decimals. Precision, recall and F1 are
    computed with ``zero_division=0``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    print("Accuracy (subset accuracy):", round(accuracy_score(y_true, y_pred), 4))

    averaged_metrics = (
        ("Precision (micro, macro):", precision_score),
        ("Recall (micro, macro):", recall_score),
        ("F1-score (micro, macro):", f1_score),
    )
    for heading, metric in averaged_metrics:
        micro = round(metric(y_true, y_pred, average='micro', zero_division=0), 4)
        macro = round(metric(y_true, y_pred, average='macro', zero_division=0), 4)
        print(heading, micro, macro)

# Evaluate outcomes

In [None]:
# ---------- Decision Tree ----------
# Refit the tuned tree on the scaled training split and score the validation split.
y_pred_valid_tomato_dt = (
    best_model_dt
    .fit(X_train_tomato_scaled, y_train_tomato)
    .predict(X_valid_tomato_scaled)
)

print("\n=== Decision Tree Overall Metrics ===")
show_classification_metrics(y_valid_tomato, y_pred_valid_tomato_dt)

# ---------- KNN ----------
# Same procedure for the tuned k-nearest-neighbours model.
y_pred_valid_tomato_knn = (
    best_model_knn
    .fit(X_train_tomato_scaled, y_train_tomato)
    .predict(X_valid_tomato_scaled)
)

print("\n=== KNN Overall Metrics ===")
show_classification_metrics(y_valid_tomato, y_pred_valid_tomato_knn)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
# Build one label list from everything that occurs in the ground truth or in
# either model's predictions, and pass it to confusion_matrix explicitly so
# the matrix rows/columns always line up with the tick labels.
labels = sorted(
    set(y_valid_tomato)
    | set(y_pred_valid_tomato_dt)
    | set(y_pred_valid_tomato_knn)
)
fig, ax = plt.subplots(1, 2, figsize=(15, 6))

# Decision Tree
cm_valid_dt = confusion_matrix(y_valid_tomato, y_pred_valid_tomato_dt, labels=labels)
# KNN
cm_valid_knn = confusion_matrix(y_valid_tomato, y_pred_valid_tomato_knn, labels=labels)

# Draw both heatmaps with identical settings apart from colour map and title.
panels = (
    (ax[0], cm_valid_dt, 'Greens', 'Decision Tree'),
    (ax[1], cm_valid_knn, 'Blues', 'KNN'),
)
for axis, matrix, color_map, model_name in panels:
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap=color_map,
        xticklabels=labels,
        yticklabels=labels,
        ax=axis,
    )
    axis.set_xlabel('Predicted')
    axis.set_ylabel('Actual')
    axis.set_title(f'Confusion Matrix of X_valid - {model_name}')

plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
# One-vs-rest ROC curves: each class column of the binarized ground truth is
# compared against the matching column of the model's predicted probabilities.

# Decision Tree
y_score_valid_tomato_dt = best_model_dt.predict_proba(X_valid_tomato_scaled)
y_bin_valid_tomato_dt = label_binarize(y_valid_tomato, classes=best_model_dt.classes_)
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
for i, class_name in enumerate(best_model_dt.classes_):
    fpr_dt, tpr_dt, _ = roc_curve(y_bin_valid_tomato_dt[:, i], y_score_valid_tomato_dt[:, i])
    roc_auc_dt = auc(fpr_dt, tpr_dt)
    ax[0].plot(
        fpr_dt,
        tpr_dt,
        lw=2,
        label=f'{class_name} (AUC = {roc_auc_dt:.4f})'
    )
# Diagonal reference line from (0, 0) to (1, 1).
ax[0].plot([0, 1], [0, 1], 'k--', lw=1)
ax[0].set_xlabel('False Positive Rate')
ax[0].set_ylabel('True Positive Rate')
ax[0].set_title('ROC Curve for Predicting X_valid - Decision Tree')
ax[0].legend()

# KNN
y_score_valid_tomato_knn = best_model_knn.predict_proba(X_valid_tomato_scaled)
y_bin_valid_tomato_knn = label_binarize(y_valid_tomato, classes=best_model_knn.classes_)
for i, class_name in enumerate(best_model_knn.classes_):
    fpr_knn, tpr_knn, _ = roc_curve(y_bin_valid_tomato_knn[:, i], y_score_valid_tomato_knn[:, i])
    roc_auc_knn = auc(fpr_knn, tpr_knn)
    ax[1].plot(
        fpr_knn,
        tpr_knn,
        lw=2,
        label=f'{class_name} (AUC = {roc_auc_knn:.4f})'
    )
ax[1].plot([0, 1], [0, 1], 'k--', lw=1)
ax[1].set_xlabel('False Positive Rate')
ax[1].set_ylabel('True Positive Rate')
ax[1].set_title('ROC Curve for Predicting X_valid - KNN')
ax[1].legend()

# Lay out and display the figure explicitly, matching the confusion-matrix
# cell, so it also appears when the code is run outside a notebook.
plt.tight_layout()
plt.show()

# Interpretation
By applying the same technique used in the paper to a different type of dataset, we achieved a KNN macro F1-score of 0.7692. In addition, the AUC scores are high, especially for the KNN model. This suggests that this technique may have potential for predicting many kinds of leaf diseases.