# Coffee Leaf Diseases Prediction

## Overview
This notebook is a reproduction of the coffee leaf disease classification method described in the research paper below, using machine learning techniques with RGB and CMY color features.

## References

### Research Paper
- **Title**: Comparative Analysis of the Performance of the Decision Tree and K-Nearest Neighbors Methods in Classifying Coffee Leaf Diseases
- **Authors**: Adie Suryadi, Murhaban Murhaban, Rivansyah Suhendra
- **Affiliation**: Department of Information Technology, Teuku Umar University, Indonesia
- **URL**: [https://aptikom-journal.id/conferenceseries/article/view/649/272](https://aptikom-journal.id/conferenceseries/article/view/649/272)

### Dataset
- **Dataset**: Coffee Leaf Diseases
- **Source**: Kaggle
- **URL**: [https://www.kaggle.com/datasets/badasstechie/coffee-leaf-diseases](https://www.kaggle.com/datasets/badasstechie/coffee-leaf-diseases)

## Methodology
This implementation extracts color-based features from coffee leaf images:
- **RGB features**: Mean and standard deviation for each R, G, B channel (6 features)
- **CMY features**: Mean and standard deviation for each C, M, Y channel (6 features)
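- **Total**: 12 color-based features per image

> **Note:** The data-loading cell below calls `load_and_extract_features(..., use_raw_data=True)`, and the pipelines search PCA sizes of 10, 20 and 50 components. PCA cannot return more components than it has input features, so the models in this notebook appear to be trained on a higher-dimensional input than the 12 summary statistics listed above; see `utils.py` for the exact feature layout.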

The features are then used to classify coffee leaves into four categories:
- Miner
- Phoma
- Rust
- No disease

## Preprocessing Data

In [None]:
# Import utilities from utils.py
from utils import (
    load_and_extract_features,
    show_evaluation_results,
    plot_confusion_matrix,
    plot_roc_curve,
    CustomMultiOutputEstimator
)

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline as SKPipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.base import clone
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Both image folders go through the same extraction settings.
extraction_options = {'resize_shape': (100, 50), 'use_raw_data': True}
train_features, train_labels = load_and_extract_features('train', **extraction_options)
test_features, test_labels = load_and_extract_features('test', **extraction_options)

# The 'test' data is kept whole for the final evaluation.
X_test, y_test = test_features, test_labels

# Carve a stratified 20% validation split out of the 'train' data.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_features,
    train_labels,
    test_size=0.2,
    stratify=train_labels,
    random_state=123,
)
print(f"\nShape of X_train: {X_train.shape}")

In [None]:
# Column-wise sums over the full (pre-split) training label table.
label_counts = train_labels.values.sum(axis=0)
print("Label distribution in training set:")
for idx, label in enumerate(train_labels.columns):
    count = label_counts[idx]
    print(f"{label}: {count} samples")

## Building and Evaluating Models

### Without SMOTE

In [None]:
def build_model(model_type, model, param_grid, X=None, y=None):
    """Grid-search a PCA -> StandardScaler -> MultiOutputClassifier pipeline.

    Args:
        model_type: Display name used in the printed summary.
        model: Unfitted base classifier wrapped by ``MultiOutputClassifier``.
        param_grid: ``GridSearchCV`` grid keyed by pipeline step, e.g.
            ``pca__n_components`` or ``multi_output__estimator__max_depth``.
        X: Training features. Defaults to the notebook-level ``X_train``.
        y: Multi-label targets. Defaults to the notebook-level ``y_train``.

    Returns:
        The best pipeline found by the search (``grid.best_estimator_``).
    """
    # Fall back to the notebook's training split so existing calls keep working.
    X_fit = X_train if X is None else X
    y_fit = y_train if y is None else y

    pipeline = SKPipeline([
        ('pca', PCA(random_state=123)),
        ('scaler', StandardScaler()),
        ('multi_output', MultiOutputClassifier(model, n_jobs=-1))
    ])

    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring='f1_macro',
        cv=10,
        n_jobs=-1
    )

    grid.fit(X_fit, y_fit)
    print(f"Best parameters for {model_type}: {grid.best_params_}")
    print(f"Best F1 Macro Score for {model_type}: {grid.best_score_}")

    return grid.best_estimator_

# ---------- Decision Tree ----------
# random_state is pinned so the tuned tree is reproducible, matching the seed
# already used for PCA and the train/validation split in this notebook.
best_multilable_dt = build_model(
    'Decision Tree',
    DecisionTreeClassifier(random_state=123),
    {
        'pca__n_components': [10, 20, 50],
        'multi_output__estimator__criterion': ['gini', 'entropy'],
        'multi_output__estimator__max_depth': [5, 8, 13, 18, None],
        'multi_output__estimator__min_samples_split': [2, 5],
        'multi_output__estimator__min_samples_leaf': [1, 3],
        'multi_output__estimator__class_weight': ['balanced', None],
        'multi_output__estimator__min_impurity_decrease': [0.0, 0.001, 0.01]
    }
)

# ---------- KNN ----------
best_multilable_knn = build_model(
    'KNN',
    KNeighborsClassifier(),
    {
        'pca__n_components': [10, 20, 50],
        'multi_output__estimator__n_neighbors': [1, 3, 5, 7, 9],
        'multi_output__estimator__metric': ['euclidean', 'manhattan', 'cosine'],
        'multi_output__estimator__weights': ['uniform', 'distance']
    }
)

#### Evaluation on Validation Set

In [None]:
# Decision Tree: predict on the validation split, then print its metrics.
y_pred_valid_multilabel_dt = best_multilable_dt.predict(X_valid)
show_evaluation_results("Decision Tree", y_pred_valid_multilabel_dt, y_valid)

# KNN: same two steps on the same split.
y_pred_valid_multilabel_knn = best_multilable_knn.predict(X_valid)
show_evaluation_results("KNN", y_pred_valid_multilabel_knn, y_valid)

##### Confusion Matrix Heatmap

In [None]:
# Confusion matrices on the validation split, Decision Tree first, then KNN.
for model_name, predictions in (
    ('Decision Tree', y_pred_valid_multilabel_dt),
    ('KNN', y_pred_valid_multilabel_knn),
):
    plot_confusion_matrix(model_name, predictions, y_valid)

##### ROC-AUC Curves

In [None]:
# ROC curves on the validation split, Decision Tree first, then KNN.
for model_name, fitted_model in (
    ('Decision Tree', best_multilable_dt),
    ('KNN', best_multilable_knn),
):
    plot_roc_curve(model_name, fitted_model, X_valid, y_valid, 'Validation Set')

#### Evaluation on Test Set


In [None]:
# Decision Tree: predict on the test set, then print its metrics.
y_pred_test_multilabel_dt = best_multilable_dt.predict(X_test)
show_evaluation_results("Decision Tree", y_pred_test_multilabel_dt, y_test)

# KNN: same two steps on the same test set.
y_pred_test_multilabel_knn = best_multilable_knn.predict(X_test)
show_evaluation_results("KNN", y_pred_test_multilabel_knn, y_test)

##### Confusion Matrix Heatmap

In [None]:
# Confusion matrices on the test set, Decision Tree first, then KNN.
for model_name, predictions in (
    ('Decision Tree', y_pred_test_multilabel_dt),
    ('KNN', y_pred_test_multilabel_knn),
):
    plot_confusion_matrix(model_name, predictions, y_test)

##### ROC-AUC Curves

In [None]:
# ROC curves on the test set, Decision Tree first, then KNN.
for model_name, fitted_model in (
    ('Decision Tree', best_multilable_dt),
    ('KNN', best_multilable_knn),
):
    plot_roc_curve(model_name, fitted_model, X_test, y_test, 'Test Set')

### Apply SMOTE

In [None]:
def build_model_smote(model_type, model, param_grid, X=None, y=None):
    """Tune one PCA -> StandardScaler -> SMOTE -> model pipeline per label column.

    Each label column is treated as its own binary problem and tuned with a
    randomized search scored by F1.

    Args:
        model_type: Display name used in the printed summaries.
        model: Unfitted base classifier; a fresh clone is tuned for each label.
        param_grid: ``RandomizedSearchCV`` distributions keyed by pipeline step
            (``pca__...``, ``smote__...``, ``model__...``).
        X: Training features. Defaults to the notebook-level ``X_train``.
        y: Multi-label targets, one column per label. Defaults to the
            notebook-level ``y_train``.

    Returns:
        A ``CustomMultiOutputEstimator`` built from the best pipeline of every
        label that was tuned. Labels holding a single class are skipped, so the
        list can be shorter than the number of label columns.
    """
    # Fall back to the notebook's training split so existing calls keep working.
    X_train_np = np.array(X_train if X is None else X)
    y_train_np = np.array(y_train if y is None else y)
    n_labels = y_train_np.shape[1]

    # Best pipeline and best cross-validated F1, one entry per tuned label.
    best_estimators_list = []
    best_label_scores = []
    skipped_labels = []

    # The splitter is seeded and identical for every label, so build it once.
    cv_splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

    # Loop through each label for multi-label classification
    for i in range(n_labels):
        print(f"Processing label {i+1}/{n_labels}")

        y_label = y_train_np[:, i]

        if len(np.unique(y_label)) < 2:
            print(f"Skipping label {i+1} since it has only one class.")
            skipped_labels.append(i + 1)
            continue

        # Define pipeline for single label
        single_label_pipeline = ImbPipeline([
            ('pca', PCA(random_state=123)),
            ('scaler', StandardScaler()),
            ('smote', SMOTE(random_state=123)),
            ('model', clone(model))
        ])

        grid = RandomizedSearchCV(
            estimator=single_label_pipeline,
            param_distributions=param_grid,
            n_iter=200,
            scoring='f1',
            cv=cv_splitter,
            n_jobs=-1,
            random_state=123
        )

        grid.fit(X_train_np, y_label)

        print(f"Best parameters for {model_type} label {i+1}: {grid.best_params_}")
        print(f"Best F1 score for label {i+1}: {grid.best_score_}")
        best_estimators_list.append(grid.best_estimator_)
        best_label_scores.append(grid.best_score_)

    # Average of the per-label F1 scores, as a single summary figure.
    if best_label_scores:
        print(f"Mean best F1 score across labels for {model_type}: {np.mean(best_label_scores)}")

    # Make the gap explicit: later cells compare predictions against every
    # label column, so a shorter estimator list needs attention.
    if skipped_labels:
        print(
            f"Warning: label(s) {skipped_labels} were skipped for {model_type}; "
            f"only {len(best_estimators_list)} of {n_labels} per-label estimators were fitted."
        )

    return CustomMultiOutputEstimator(best_estimators_list)

# ---------- Decision Tree ----------
# random_state is pinned so the tuned trees are reproducible, matching the
# seed already used for PCA, SMOTE and the CV splitter in this notebook.
best_multilable_dt_smote = build_model_smote(
    'Decision Tree',
    DecisionTreeClassifier(random_state=123),
    {
        'pca__n_components': [10, 20, 50],
        'smote__k_neighbors': [3, 5, 7, 11],
        'model__criterion': ['gini', 'entropy'],
        'model__max_depth': [5, 8, 13, 18, None],
        'model__min_samples_split': [2, 5],
        'model__min_samples_leaf': [1, 3],
        'model__class_weight': ['balanced', None],
        'model__min_impurity_decrease': [0.0, 0.001, 0.01]
    }
)

# ---------- KNN ----------
best_multilable_knn_smote = build_model_smote(
    'KNN',
    KNeighborsClassifier(),
    {
        'pca__n_components': [10, 20, 50],
        'smote__k_neighbors': [3, 5, 7, 11],
        'model__n_neighbors': [1, 3, 5, 7, 9],
        'model__metric': ['euclidean', 'manhattan', 'cosine'],
        'model__weights': ['uniform', 'distance']
    }
)

#### Evaluation on Validation Set

In [None]:
# Decision Tree (SMOTE): predict on the validation split, then print its metrics.
y_pred_valid_multilabel_dt_smote = best_multilable_dt_smote.predict(X_valid)
show_evaluation_results("Decision Tree", y_pred_valid_multilabel_dt_smote, y_valid)

# KNN (SMOTE): same two steps on the same split.
y_pred_valid_multilabel_knn_smote = best_multilable_knn_smote.predict(X_valid)
show_evaluation_results("KNN", y_pred_valid_multilabel_knn_smote, y_valid)

##### Confusion Matrix Heatmap

In [None]:
# Confusion matrices for the SMOTE models on the validation split.
for model_name, predictions in (
    ('Decision Tree', y_pred_valid_multilabel_dt_smote),
    ('KNN', y_pred_valid_multilabel_knn_smote),
):
    plot_confusion_matrix(model_name, predictions, y_valid)

##### ROC-AUC Curves

In [None]:
# ROC curves for the SMOTE models on the validation split.
for model_name, fitted_model in (
    ('Decision Tree', best_multilable_dt_smote),
    ('KNN', best_multilable_knn_smote),
):
    plot_roc_curve(model_name, fitted_model, X_valid, y_valid, 'Validation Set')

#### Evaluation on Test Set

In [None]:
# Decision Tree (SMOTE): predict on the test set, then print its metrics.
y_pred_test_multilabel_dt_smote = best_multilable_dt_smote.predict(X_test)
show_evaluation_results("Decision Tree", y_pred_test_multilabel_dt_smote, y_test)

# KNN (SMOTE): same two steps on the same test set.
y_pred_test_multilabel_knn_smote = best_multilable_knn_smote.predict(X_test)
show_evaluation_results("KNN", y_pred_test_multilabel_knn_smote, y_test)

##### Confusion Matrix Heatmap

In [None]:
# Confusion matrices for the SMOTE models on the test set.
for model_name, predictions in (
    ('Decision Tree', y_pred_test_multilabel_dt_smote),
    ('KNN', y_pred_test_multilabel_knn_smote),
):
    plot_confusion_matrix(model_name, predictions, y_test)

##### ROC-AUC Curves

In [None]:
# ROC curves for the SMOTE models on the test set.
for model_name, fitted_model in (
    ('Decision Tree', best_multilable_dt_smote),
    ('KNN', best_multilable_knn_smote),
):
    plot_roc_curve(model_name, fitted_model, X_test, y_test, 'Test Set')

## Combine All Train and Test Images

### Checking label distribution

In [None]:
label_cols = ['miner', 'phoma', 'rust']


def _label_percentages(labels, dataset_name):
    """Return a Label/Percentage/Dataset frame: each column sum as a % of the row count."""
    shares = (labels[label_cols].sum() / len(labels) * 100).reset_index()
    shares.columns = ['Label', 'Percentage']
    shares['Dataset'] = dataset_name
    return shares


train_perc = _label_percentages(train_labels, 'Train')
test_perc = _label_percentages(test_labels, 'Test')
combined_perc = pd.concat([train_perc, test_perc])

# Side-by-side bars per label, one hue per dataset.
plt.figure(figsize=(10, 6))
sns.barplot(
    data=combined_perc,
    x='Label',
    y='Percentage',
    hue='Dataset',
    palette='magma',
)
plt.title('Label Percentage Comparison: Train vs Test')
plt.ylabel('Percentage of Samples (%)')
plt.show()

### Preprocessing Data

In [None]:
# Pool the original train and test data into a single dataset.
all_features = np.vstack((train_features, test_features))
all_labels = pd.concat([train_labels, test_labels])

# Draw a fresh stratified 80/20 split from the pooled data.
X_all_train, X_all_test, y_all_train, y_all_test = train_test_split(
    all_features, all_labels, test_size=0.2, stratify=all_labels, random_state=123
)

### Building and Evaluating Models

#### Without SMOTE

In [None]:
def build_model_all_data(model_type, model, param_grid, X=None, y=None):
    """Grid-search a PCA -> StandardScaler -> MultiOutputClassifier pipeline on the pooled split.

    Args:
        model_type: Display name used in the printed summary.
        model: Unfitted base classifier wrapped by ``MultiOutputClassifier``.
        param_grid: ``GridSearchCV`` grid keyed by pipeline step, e.g.
            ``pca__n_components`` or ``multi_output__estimator__max_depth``.
        X: Training features. Defaults to the notebook-level ``X_all_train``.
        y: Multi-label targets. Defaults to the notebook-level ``y_all_train``.

    Returns:
        The best pipeline found by the search (``grid.best_estimator_``).
    """
    # Fall back to the pooled training split so existing calls keep working.
    X_fit = X_all_train if X is None else X
    y_fit = y_all_train if y is None else y

    pipeline = SKPipeline([
        ('pca', PCA(random_state=123)),
        ('scaler', StandardScaler()),
        ('multi_output', MultiOutputClassifier(model, n_jobs=-1))
    ])

    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring='f1_macro',
        cv=10,
        n_jobs=-1
    )

    grid.fit(X_fit, y_fit)
    print(f"Best parameters for {model_type}: {grid.best_params_}")
    print(f"Best F1 Macro Score for {model_type}: {grid.best_score_}")

    return grid.best_estimator_

# ---------- Decision Tree ----------
# random_state is pinned so the tuned tree is reproducible, matching the seed
# already used for PCA and the pooled train/test split in this notebook.
best_all_multilable_dt = build_model_all_data(
    'Decision Tree',
    DecisionTreeClassifier(random_state=123),
    {
        'pca__n_components': [10, 20, 50],
        'multi_output__estimator__criterion': ['gini', 'entropy'],
        'multi_output__estimator__max_depth': [5, 8, 13, 18, None],
        'multi_output__estimator__min_samples_split': [2, 5],
        'multi_output__estimator__min_samples_leaf': [1, 3],
        'multi_output__estimator__class_weight': ['balanced', None],
        'multi_output__estimator__min_impurity_decrease': [0.0, 0.001, 0.01]
    }
)

# ---------- KNN ----------
best_all_multilable_knn = build_model_all_data(
    'KNN',
    KNeighborsClassifier(),
    {
        'pca__n_components': [10, 20, 50],
        'multi_output__estimator__n_neighbors': [1, 3, 5, 7, 9],
        'multi_output__estimator__metric': ['euclidean', 'manhattan', 'cosine'],
        'multi_output__estimator__weights': ['uniform', 'distance']
    }
)

In [None]:
# Decision Tree: predict on the pooled-data test split, then print its metrics.
y_all_pred_multilabel_dt = best_all_multilable_dt.predict(X_all_test)
show_evaluation_results("Decision Tree", y_all_pred_multilabel_dt, y_all_test)

# KNN: same two steps on the same split.
y_all_pred_multilabel_knn = best_all_multilable_knn.predict(X_all_test)
show_evaluation_results("KNN", y_all_pred_multilabel_knn, y_all_test)

##### Confusion Matrix Heatmap

In [None]:
# Confusion matrices on the pooled-data test split.
for model_name, predictions in (
    ('Decision Tree', y_all_pred_multilabel_dt),
    ('KNN', y_all_pred_multilabel_knn),
):
    plot_confusion_matrix(model_name, predictions, y_all_test)

##### ROC-AUC Curves

In [None]:
# ROC curves on the pooled-data test split.
for model_name, fitted_model in (
    ('Decision Tree', best_all_multilable_dt),
    ('KNN', best_all_multilable_knn),
):
    plot_roc_curve(
        model_name, fitted_model, X_all_test, y_all_test, 'Test Set (New 20% Split)'
    )

#### Apply SMOTE

In [None]:
def build_model_all_data_smote(model_type, model, param_grid, X=None, y=None):
    """Tune one PCA -> StandardScaler -> SMOTE -> model pipeline per label on the pooled split.

    Each label column is treated as its own binary problem and tuned with a
    randomized search scored by F1.

    Args:
        model_type: Display name used in the printed summaries.
        model: Unfitted base classifier; a fresh clone is tuned for each label.
        param_grid: ``RandomizedSearchCV`` distributions keyed by pipeline step
            (``pca__...``, ``smote__...``, ``model__...``).
        X: Training features. Defaults to the notebook-level ``X_all_train``.
        y: Multi-label targets, one column per label. Defaults to the
            notebook-level ``y_all_train``.

    Returns:
        A ``CustomMultiOutputEstimator`` built from the best pipeline of every
        label that was tuned. Labels holding a single class are skipped, so the
        list can be shorter than the number of label columns.
    """
    # Fall back to the pooled training split so existing calls keep working.
    X_all_train_np = np.array(X_all_train if X is None else X)
    y_all_train_np = np.array(y_all_train if y is None else y)
    n_labels = y_all_train_np.shape[1]

    # Best pipeline and best cross-validated F1, one entry per tuned label.
    best_estimators_list_all_data = []
    best_label_scores_all_data = []
    skipped_labels = []

    # The splitter is seeded and identical for every label, so build it once.
    cv_splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

    # Loop through each label for multi-label classification
    for i in range(n_labels):
        print(f"Processing label {i+1}/{n_labels}")

        y_all_label = y_all_train_np[:, i]

        if len(np.unique(y_all_label)) < 2:
            print(f"Skipping label {i+1} since it has only one class.")
            skipped_labels.append(i + 1)
            continue

        # Define pipeline for single label
        single_label_all_data_pipeline = ImbPipeline([
            ('pca', PCA(random_state=123)),
            ('scaler', StandardScaler()),
            ('smote', SMOTE(random_state=123)),
            ('model', clone(model))
        ])

        grid = RandomizedSearchCV(
            estimator=single_label_all_data_pipeline,
            param_distributions=param_grid,
            n_iter=200,
            scoring='f1',
            cv=cv_splitter,
            n_jobs=-1,
            random_state=123
        )

        grid.fit(X_all_train_np, y_all_label)

        print(f"Best parameters for {model_type} label {i+1}: {grid.best_params_}")
        print(f"Best F1 score for label {i+1}: {grid.best_score_}")
        best_estimators_list_all_data.append(grid.best_estimator_)
        best_label_scores_all_data.append(grid.best_score_)

    # Average of the per-label F1 scores, as a single summary figure.
    if best_label_scores_all_data:
        print(
            f"Mean best F1 score across labels for {model_type}: "
            f"{np.mean(best_label_scores_all_data)}"
        )

    # Make the gap explicit: later cells compare predictions against every
    # label column, so a shorter estimator list needs attention.
    if skipped_labels:
        print(
            f"Warning: label(s) {skipped_labels} were skipped for {model_type}; "
            f"only {len(best_estimators_list_all_data)} of {n_labels} per-label estimators were fitted."
        )

    return CustomMultiOutputEstimator(best_estimators_list_all_data)

# ---------- Decision Tree ----------
# random_state is pinned so the tuned trees are reproducible, matching the
# seed already used for PCA, SMOTE and the CV splitter in this notebook.
best_all_multilable_dt_smote = build_model_all_data_smote(
    'Decision Tree',
    DecisionTreeClassifier(random_state=123),
    {
        'pca__n_components': [10, 20, 50],
        'smote__k_neighbors': [3, 5, 7, 11],
        'model__criterion': ['gini', 'entropy'],
        'model__max_depth': [5, 8, 13, 18, None],
        'model__min_samples_split': [2, 5],
        'model__min_samples_leaf': [1, 3],
        'model__class_weight': ['balanced', None],
        'model__min_impurity_decrease': [0.0, 0.001, 0.01]
    }
)

# ---------- KNN ----------
best_all_multilable_knn_smote = build_model_all_data_smote(
    'KNN',
    KNeighborsClassifier(),
    {
        'pca__n_components': [10, 20, 50],
        'smote__k_neighbors': [3, 5, 7, 11],
        'model__n_neighbors': [1, 3, 5, 7, 9],
        'model__metric': ['euclidean', 'manhattan', 'cosine'],
        'model__weights': ['uniform', 'distance']
    }
)

In [None]:
# Decision Tree (SMOTE): predict on the pooled-data test split, then print its metrics.
y_all_pred_multilabel_dt_smote = best_all_multilable_dt_smote.predict(X_all_test)
show_evaluation_results("Decision Tree", y_all_pred_multilabel_dt_smote, y_all_test)

# KNN (SMOTE): same two steps on the same split.
y_all_pred_multilabel_knn_smote = best_all_multilable_knn_smote.predict(X_all_test)
show_evaluation_results("KNN", y_all_pred_multilabel_knn_smote, y_all_test)

##### Confusion Matrix Heatmap

In [None]:
# Confusion matrices for the SMOTE models on the pooled-data test split.
for model_name, predictions in (
    ('Decision Tree', y_all_pred_multilabel_dt_smote),
    ('KNN', y_all_pred_multilabel_knn_smote),
):
    plot_confusion_matrix(model_name, predictions, y_all_test)

##### ROC-AUC Curves

In [None]:
# ROC curves for the SMOTE models on the pooled-data test split.
for model_name, fitted_model in (
    ('Decision Tree', best_all_multilable_dt_smote),
    ('KNN', best_all_multilable_knn_smote),
):
    plot_roc_curve(
        model_name, fitted_model, X_all_test, y_all_test, 'Test Set (New 20% Split)'
    )

## Save models

To save the fitted scikit-learn models we use `joblib`, which serializes objects that hold large NumPy arrays more efficiently than the standard `pickle` module:

In [None]:
import os

import joblib

# joblib.dump does not create missing directories, so make sure the output
# folder exists before the first file is written.
os.makedirs('models', exist_ok=True)

# Target path -> fitted model, listed in the order the files are written.
models_to_save = {
    'models/best_multilabel_dt.pkl': best_multilable_dt,
    'models/best_multilabel_knn.pkl': best_multilable_knn,
    'models/best_all_multilabel_dt.pkl': best_all_multilable_dt,
    'models/best_all_multilabel_knn.pkl': best_all_multilable_knn,
    'models/best_multilabel_dt_smote.pkl': best_multilable_dt_smote,
    'models/best_multilabel_knn_smote.pkl': best_multilable_knn_smote,
    'models/best_all_multilabel_dt_smote.pkl': best_all_multilable_dt_smote,
    'models/best_all_multilabel_knn_smote.pkl': best_all_multilable_knn_smote,
}

for model_path, fitted_model in models_to_save.items():
    joblib.dump(fitted_model, model_path)