# Coffee Leaf Diseases Prediction

## Overview
This notebook is a reproduction of the coffee leaf disease classification method described in the research paper below, using machine learning techniques with RGB and CMY color features.

## References

### Research Paper
- **Title**: Comparative Analysis of the Performance of the Decision Tree and K-Nearest Neighbors Methods in Classifying Coffee Leaf Diseases
- **Authors**: Adie Suryadi, Murhaban Murhaban, Rivansyah Suhendra
- **Published in**: Department of Information Technology, Teuku Umar University, Indonesia
- **URL**: [https://aptikom-journal.id/conferenceseries/article/view/649/272](https://aptikom-journal.id/conferenceseries/article/view/649/272)

### Dataset
- **Dataset**: Coffee Leaf Diseases
- **Source**: Kaggle
- **URL**: [https://www.kaggle.com/datasets/badasstechie/coffee-leaf-diseases/code](https://www.kaggle.com/datasets/badasstechie/coffee-leaf-diseases/code)

## Methodology
This implementation extracts color-based features from coffee leaf images:
- **RGB features**: Mean and standard deviation for each R, G, B channel (6 features)
- **CMY features**: Mean and standard deviation for each C, M, Y channel (6 features)
- **Total**: 12 color-based features per image

The features are then used to classify coffee leaves into four categories:
- Miner
- Phoma
- Rust
- No disease

## Preprocessing Data

In [None]:
import numpy as np
from PIL import Image
import os

def rgb_to_cmy(rgb_image):
    """Convert RGB intensities to the CMY colour model.

    CMY is the complement of RGB, so every value is mapped to ``1 - value``.
    The result is only a meaningful CMY image when ``rgb_image`` has already
    been scaled to the range [0, 1].

    Args:
        rgb_image: NumPy array (or scalar) of RGB intensities.

    Returns:
        ``1.0 - rgb_image``; for an array input the shape is unchanged.
    """
    return 1.0 - rgb_image

def extract_color_features(image):
    """Compute the 12 colour statistics used as classifier input.

    The vector is laid out as mean/std pairs, first for the three RGB
    channels and then for the three CMY channels produced by ``rgb_to_cmy``::

        [R_mean, R_std, G_mean, G_std, B_mean, B_std,
         C_mean, C_std, M_mean, M_std, Y_mean, Y_std]

    Args:
        image: Array indexed as ``image[row, col, channel]``; only channels
            0, 1 and 2 are read.

    Returns:
        One-dimensional NumPy array holding the 12 values.
    """
    def _mean_std_pairs(pixels):
        # Mean followed by standard deviation for channels 0, 1 and 2.
        pairs = []
        for idx in range(3):
            plane = pixels[:, :, idx]
            pairs.extend((np.mean(plane), np.std(plane)))
        return pairs

    rgb_stats = _mean_std_pairs(image)
    cmy_stats = _mean_std_pairs(rgb_to_cmy(image))
    return np.array(rgb_stats + cmy_stats)

def load_and_extract_features(image_dir, labels_df):
    """Load every labelled image and build the colour-feature matrix.

    For each row of ``labels_df`` the file ``<image_dir>/<id>.jpg`` is opened,
    converted to RGB, resized to 410x205 (width x height), scaled to [0, 1]
    and passed to ``extract_color_features``.  Rows whose image file does not
    exist are reported with a warning and skipped.

    Args:
        image_dir: Directory that contains the ``.jpg`` images.
        labels_df: DataFrame with an ``id`` column naming each image file
            (without extension).

    Returns:
        A tuple ``(features_array, labels)`` where ``features_array`` holds
        one feature vector per image that was found and ``labels`` is the
        matching subset of ``labels_df`` with its index reset, so row ``i``
        of both objects refers to the same image.
    """
    features_list = []
    valid_indices = []

    for idx, row in labels_df.iterrows():
        img_path = os.path.join(image_dir, f"{row['id']}.jpg")

        if not os.path.exists(img_path):
            print(f"Warning: {img_path} not found")
            continue

        # The context manager releases the file handle as soon as the pixels
        # have been copied, instead of leaving one open handle per image.
        with Image.open(img_path) as img:
            # Force exactly three RGB channels: a grayscale file would
            # otherwise yield a 2-D array (and fail on channel indexing) and
            # a CMYK file would be silently misread as RGB.
            img_resized = img.convert('RGB').resize((410, 205))
            # Scale 8-bit intensities to [0, 1] so that CMY = 1 - RGB holds.
            img_array = np.array(img_resized).astype('float32') / 255.0

        features_list.append(extract_color_features(img_array))
        valid_indices.append(idx)

    features_array = np.array(features_list)
    labels = labels_df.loc[valid_indices].reset_index(drop=True)

    return features_array, labels

def convert_to_single_label(row):
    """Collapse the per-disease indicator columns into one class name.

    The diseases are checked in the fixed priority order miner, phoma, rust;
    the first one whose flag equals 1 wins.  When none of them is set the
    leaf is labelled ``'nodisease'``.

    Args:
        row: Mapping-like object (for example a DataFrame row) with the keys
            ``'miner'``, ``'phoma'`` and ``'rust'``.

    Returns:
        One of ``'miner'``, ``'phoma'``, ``'rust'`` or ``'nodisease'``.
    """
    for disease in ('miner', 'phoma', 'rust'):
        if row[disease] == 1:
            return disease
    return 'nodisease'

In [None]:
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder 
import pandas as pd

# Load the per-image class flags and collapse them into a single 'label'
# column (miner / phoma / rust / nodisease).
train_label_df = pd.read_csv('dataset/train_classes.csv')
train_label_df['label'] = train_label_df.apply(convert_to_single_label, axis=1)

# Build the colour-feature matrix; train_labels contains only the rows whose
# image file was found, aligned row-by-row with train_features.
train_features, train_labels = load_and_extract_features('dataset/coffee-leaf-diseases/train/images', train_label_df)

# Hold out 20% of the data for validation.  Stratifying on the label keeps the
# class proportions equal in both splits; random_state makes the split
# reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_features, 
    train_labels['label'],
    test_size=0.2,
    stratify=train_labels['label'],
    random_state=123
)

# Integer-encode the class names.  The encoder is fitted on the training
# labels only and then applied to the validation labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_valid_encoded = label_encoder.transform(y_valid)

# Standardise the features.  The scaler statistics come from the training
# split only, so no information from the validation split leaks into them.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Each search below runs 10-fold cross-validation on the scaled training
# split, ranks candidates by accuracy, and evaluates them in parallel
# (n_jobs=-1).  The fitted search exposes the refitted winner as
# best_estimator_, which is kept as best_model_<name> for later cells.

# ---------- Decision Tree ----------
dt = DecisionTreeClassifier(max_features=None, random_state=123, splitter='best')
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': list(range(1, 21)) + [None],
    'min_samples_split': range(2, 11),
    'min_samples_leaf': range(1, 6)
}
# 2 * 21 * 9 * 5 = 1890 candidates x 10 folds, so parallel execution matters
# most for this search.
grid_search_dt = GridSearchCV(dt, param_grid_dt, cv=10, scoring='accuracy', n_jobs=-1)
grid_search_dt.fit(X_train_scaled, y_train)

best_model_dt = grid_search_dt.best_estimator_
print(grid_search_dt.best_params_)
print(grid_search_dt.best_score_)

# ---------- KNN ----------
# p is the Minkowski power parameter; the grid sets the metric explicitly to
# 'euclidean' or 'manhattan', so the metric choice comes from the grid.
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, n_jobs=-1, p=2, weights='uniform')
param_grid_knn = {
    'metric': ['euclidean', 'manhattan'],
    'n_neighbors': range(1, 21),
}
grid_search_knn = GridSearchCV(knn, param_grid_knn, cv=10, scoring='accuracy', n_jobs=-1)
grid_search_knn.fit(X_train_scaled, y_train)

best_model_knn = grid_search_knn.best_estimator_
print(grid_search_knn.best_params_)
print(grid_search_knn.best_score_)

# ---------- Logistic Regression ----------
# max_iter is raised to 1000 because 500 iterations were not enough.
lr = LogisticRegression(max_iter=1000, random_state=123)
param_grid_lr = {
    'solver': ['lbfgs', 'saga'],
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2']
}
grid_search_lr = GridSearchCV(lr, param_grid_lr, cv=10, scoring='accuracy', n_jobs=-1)
grid_search_lr.fit(X_train_scaled, y_train)

best_model_lr = grid_search_lr.best_estimator_
print(grid_search_lr.best_params_)
print(grid_search_lr.best_score_)

# ---------- Neural network ----------
nn = MLPClassifier(max_iter=500, random_state=123)
# NOTE: learning_rate_init is only used by the 'sgd' and 'adam' solvers, so
# the 'lbfgs' candidates that differ only in that value behave identically.
param_grid_nn = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50)],
    'solver': ['adam', 'lbfgs'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate_init': [0.001, 0.01]
}
grid_search_nn = GridSearchCV(estimator=nn, param_grid=param_grid_nn, cv=10, scoring='accuracy', n_jobs=-1)
grid_search_nn.fit(X_train_scaled, y_train)

best_model_nn = grid_search_nn.best_estimator_
print(grid_search_nn.best_params_)
print(grid_search_nn.best_score_)

The best Decision Tree model found by the grid search uses the following parameters:
- criterion: 'entropy'
- max_depth: 13
- min_samples_leaf: 1
- min_samples_split: 2

The best KNN model found by the grid search uses the following parameters:
- metric: 'euclidean'
- n_neighbors: 1

The best Logistic Regression model found by the grid search uses the following parameters:
- C: 10
- penalty: 'l2'
- solver: 'saga'

The best Neural Network model found by the grid search uses the following parameters:
- alpha: 0.001
- hidden_layer_sizes: (50,)
- learning_rate_init: 0.001 (not used by the 'lbfgs' solver)
- solver: 'lbfgs'

## Find the Best Model
### Using the parameters described in the paper

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def _print_overall_metrics(model_name, y_true, y_pred):
    """Print accuracy plus micro- and macro-averaged precision, recall and F1."""
    print(f"\n=== {model_name} Overall Metrics ===")
    print("Accuracy (subset accuracy):", accuracy_score(y_true, y_pred))
    for average in ('micro', 'macro'):
        print(f"Precision ({average}):", precision_score(y_true, y_pred, average=average, zero_division=0))
        print(f"Recall ({average}):", recall_score(y_true, y_pred, average=average, zero_division=0))
        print(f"F1-score ({average}):", f1_score(y_true, y_pred, average=average, zero_division=0))


# Each model below is fitted on the scaled training split and scored on the
# validation split.

# ---------- Decision Tree ----------
dt = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=None,
    max_features=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=123
)
dt.fit(X_train_scaled, y_train)
y_pred_valid_dt = dt.predict(X_valid_scaled)
_print_overall_metrics("Decision Tree", y_valid, y_pred_valid_dt)

# ---------- KNN ----------
# Minkowski distance with p=2 is the Euclidean distance.
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
    metric='minkowski',
    p=2,
    algorithm='auto',
    leaf_size=30,
    n_jobs=-1
)
knn.fit(X_train_scaled, y_train)
y_pred_valid_knn = knn.predict(X_valid_scaled)
_print_overall_metrics("KNN", y_valid, y_pred_valid_knn)

# ---------- Logistic Regression ----------
lr = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=123
)
lr.fit(X_train_scaled, y_train)
y_pred_valid_lr = lr.predict(X_valid_scaled)
_print_overall_metrics("Logistic Regression", y_valid, y_pred_valid_lr)

# ---------- Neural Network ----------
nn = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=123
)
nn.fit(X_train_scaled, y_train)
y_pred_valid_nn = nn.predict(X_valid_scaled)
_print_overall_metrics("Neural Network", y_valid, y_pred_valid_nn)

#### Confusion Matrix Heatmap

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Class names in sorted order.  The same list is passed to confusion_matrix
# and used for the tick labels, so row/column i is guaranteed to be class i.
labels = sorted(y_valid.unique())


def _plot_confusion_heatmap(cm, cmap, title):
    """Draw one annotated confusion-matrix heatmap (rows = actual class)."""
    plt.figure(figsize=(6, 5))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap=cmap,
        xticklabels=labels,
        yticklabels=labels
    )
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(title)
    plt.show()


# Decision Tree
cm_valid_dt = confusion_matrix(y_valid, y_pred_valid_dt, labels=labels)
_plot_confusion_heatmap(cm_valid_dt, 'Greens', 'Confusion Matrix of X_valid - Decision Tree')

# KNN
cm_valid_knn = confusion_matrix(y_valid, y_pred_valid_knn, labels=labels)
_plot_confusion_heatmap(cm_valid_knn, 'Blues', 'Confusion Matrix of X_valid - KNN')

# Logistic Regression: compare the actual labels with the labels predicted
# by the Logistic Regression model.
cm_valid_lr = confusion_matrix(y_valid, y_pred_valid_lr, labels=labels)
_plot_confusion_heatmap(cm_valid_lr, 'Purples', 'Confusion Matrix of X_valid - Logistic Regression')

# Neural Network
cm_valid_nn = confusion_matrix(y_valid, y_pred_valid_nn, labels=labels)
_plot_confusion_heatmap(cm_valid_nn, 'Oranges', 'Confusion Matrix of X_valid - Neural Network')

#### ROC-AUC Curves

In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc

# Per-model class probabilities on the validation split, and the matching
# one-vs-rest indicator matrix (one column per class, in classes_ order).
y_score_valid_dt = dt.predict_proba(X_valid_scaled)
y_bin_valid_dt = label_binarize(y_valid, classes=dt.classes_)

y_score_valid_knn = knn.predict_proba(X_valid_scaled)
y_bin_valid_knn = label_binarize(y_valid, classes=knn.classes_)

y_score_valid_lr = lr.predict_proba(X_valid_scaled)
y_bin_valid_lr = label_binarize(y_valid, classes=lr.classes_)

y_score_valid_nn = nn.predict_proba(X_valid_scaled)
y_bin_valid_nn = label_binarize(y_valid, classes=nn.classes_)

roc_inputs = (
    ('Decision Tree', dt.classes_, y_bin_valid_dt, y_score_valid_dt),
    ('KNN', knn.classes_, y_bin_valid_knn, y_score_valid_knn),
    ('Logistic Regression', lr.classes_, y_bin_valid_lr, y_score_valid_lr),
    ('Neural Network', nn.classes_, y_bin_valid_nn, y_score_valid_nn),
)

# One figure per model, one one-vs-rest ROC curve per class.
for model_name, class_names, y_bin, y_score in roc_inputs:
    plt.figure(figsize=(8, 6))
    for i, class_name in enumerate(class_names):
        fpr, tpr, _ = roc_curve(y_bin[:, i], y_score[:, i])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=2, label=f'{class_name} (AUC = {roc_auc:.4f})')
    # Diagonal reference line (true positive rate equal to false positive rate).
    plt.plot([0, 1], [0, 1], 'k--', lw=1)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve for Predicting X_valid - {model_name}')
    plt.legend()
    plt.show()

### Using the best parameters from CV

In [None]:
# Predictions of the grid-search winners on the validation split.  The
# y_pred_valid_*_best names are reused by the confusion-matrix cell below.
y_pred_valid_dt_best = best_model_dt.predict(X_valid_scaled)
y_pred_valid_knn_best = best_model_knn.predict(X_valid_scaled)
y_pred_valid_lr_best = best_model_lr.predict(X_valid_scaled)
y_pred_valid_nn_best = best_model_nn.predict(X_valid_scaled)

tuned_predictions = [
    ("Decision Tree", y_pred_valid_dt_best),
    ("KNN", y_pred_valid_knn_best),
    ("Logistic Regression", y_pred_valid_lr_best),
    ("Neural Network", y_pred_valid_nn_best),  # header previously misspelled "Nueral"
]

# Same report for every model: accuracy, then micro- and macro-averaged
# precision / recall / F1.
for model_name, y_pred in tuned_predictions:
    print(f"\n=== {model_name} Overall Metrics ===")
    print("Accuracy (subset accuracy):", accuracy_score(y_valid, y_pred))
    for average in ('micro', 'macro'):
        print(f"Precision ({average}):", precision_score(y_valid, y_pred, average=average, zero_division=0))
        print(f"Recall ({average}):", recall_score(y_valid, y_pred, average=average, zero_division=0))
        print(f"F1-score ({average}):", f1_score(y_valid, y_pred, average=average, zero_division=0))

#### Confusion Matrix Heatmap

In [None]:
# Confusion matrices of the grid-search winners on the validation split.
cm_valid_dt_best = confusion_matrix(y_valid, y_pred_valid_dt_best)
cm_valid_knn_best = confusion_matrix(y_valid, y_pred_valid_knn_best)
cm_valid_lr_best = confusion_matrix(y_valid, y_pred_valid_lr_best)
cm_valid_nn_best = confusion_matrix(y_valid, y_pred_valid_nn_best)

best_model_heatmaps = (
    ('Decision Tree', cm_valid_dt_best, 'Greens'),
    ('KNN', cm_valid_knn_best, 'Blues'),
    ('Logistic Regression', cm_valid_lr_best, 'Purples'),
    ('Neural Network', cm_valid_nn_best, 'Oranges'),
)

# One annotated heatmap per model; rows are actual classes, columns predicted.
for model_name, matrix, colormap in best_model_heatmaps:
    plt.figure(figsize=(6, 5))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap=colormap,
        xticklabels=labels,
        yticklabels=labels
    )
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix of X_valid Using the Best Model - {model_name}')
    plt.show()

#### ROC-AUC Curves

In [None]:
# Class probabilities of the grid-search winners on the validation split, and
# the matching one-vs-rest indicator matrices (columns follow classes_).
y_score_valid_dt_best = best_model_dt.predict_proba(X_valid_scaled)
y_bin_valid_dt_best = label_binarize(y_valid, classes=best_model_dt.classes_)

y_score_valid_knn_best = best_model_knn.predict_proba(X_valid_scaled)
y_bin_valid_knn_best = label_binarize(y_valid, classes=best_model_knn.classes_)

y_score_valid_lr_best = best_model_lr.predict_proba(X_valid_scaled)
y_bin_valid_lr_best = label_binarize(y_valid, classes=best_model_lr.classes_)

y_score_valid_nn_best = best_model_nn.predict_proba(X_valid_scaled)
y_bin_valid_nn_best = label_binarize(y_valid, classes=best_model_nn.classes_)

best_roc_inputs = (
    ('Decision Tree', best_model_dt.classes_, y_bin_valid_dt_best, y_score_valid_dt_best),
    ('KNN', best_model_knn.classes_, y_bin_valid_knn_best, y_score_valid_knn_best),
    ('Logistic Regression', best_model_lr.classes_, y_bin_valid_lr_best, y_score_valid_lr_best),
    ('Neural Network', best_model_nn.classes_, y_bin_valid_nn_best, y_score_valid_nn_best),
)

# One figure per model, one one-vs-rest ROC curve per class.
for model_name, class_names, y_bin, y_score in best_roc_inputs:
    plt.figure(figsize=(8, 6))
    for i, class_name in enumerate(class_names):
        fpr, tpr, _ = roc_curve(y_bin[:, i], y_score[:, i])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=2, label=f'{class_name} (AUC = {roc_auc:.4f})')
    # Diagonal reference line (true positive rate equal to false positive rate).
    plt.plot([0, 1], [0, 1], 'k--', lw=1)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve for Predicting X_valid Using the Best Model - {model_name}')
    plt.legend()
    plt.show()

## Save models

To save the scikit-learn models we use `joblib`, which is more efficient than plain `pickle` for objects that contain large NumPy arrays:

In [None]:
import joblib

# Tuned models selected by GridSearchCV.
joblib.dump(best_model_knn, 'best_model_knn.pkl')
joblib.dump(best_model_dt, 'best_model_dt.pkl')
joblib.dump(best_model_lr, 'best_model_lr.pkl')
# Previously written to 'best_model_dt.pkl', which overwrote the tuned tree.
joblib.dump(best_model_nn, 'best_model_nn.pkl')

# Models trained with the parameters described in the paper.
joblib.dump(dt, 'decision_tree_model.pkl')
joblib.dump(knn, 'knn_model.pkl')
joblib.dump(lr, 'logistic_regression_model.pkl')
# Previously dumped `knn` here, so the neural network was never saved.
joblib.dump(nn, 'nn_model.pkl')

# Every model above was fitted on StandardScaler-transformed features, so the
# fitted scaler is required to prepare new inputs for the reloaded models.
joblib.dump(scaler, 'scaler.pkl')

## Option - Hierarchical Clustering (Unsupervised learning)

- GridSearchCV is used to tune hyperparameters in supervised learning. Since hierarchical clustering is an unsupervised method, we cannot search for optimal parameters using GridSearchCV with cross-validation.
- ROC-AUC is also intended for supervised learning. Because AgglomerativeClustering does not provide predict_proba, we cannot plot ROC-AUC.
- For clustering algorithms, we instead use evaluation metrics such as Adjusted Rand Index (ARI) or Normalized Mutual Information (NMI).


With `linkage='complete'`, hierarchical clustering reaches its best accuracy so far, about 34%. This is lower than the supervised models because clustering does not use any label information.

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import label_binarize
from sklearn.metrics import confusion_matrix, classification_report
from scipy.cluster.hierarchy import dendrogram, linkage

# Imported here so this cell stays self-contained.
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# Hierarchical clustering with one cluster per known class.
n_clusters = len(label_encoder.classes_)
hc = AgglomerativeClustering(
    n_clusters=n_clusters,
    linkage='complete'  # alternatives: 'ward', 'single', 'average'
)

# Cluster assignment for every validation sample.
y_pred_valid_hc = hc.fit_predict(X_valid_scaled)

# ARI and NMI do not depend on how the clusters happen to be numbered, so
# they are the appropriate agreement measures for a clustering result.
print("\n=== Clustering Agreement (Hierarchical Clustering - Valid Data) ===")
print("Adjusted Rand Index:", adjusted_rand_score(y_valid_encoded, y_pred_valid_hc))
print("Normalized Mutual Information:", normalized_mutual_info_score(y_valid_encoded, y_pred_valid_hc))

# Contingency table of actual class (rows) versus assigned cluster (columns).
# Fixing the label list keeps the table n_clusters x n_clusters.
cm_valid_hc = confusion_matrix(
    y_valid_encoded,
    y_pred_valid_hc,
    labels=list(range(n_clusters))
)
print("\n=== Confusion Matrix (Hierarchical Clustering - Valid Data) ===")

plt.figure(figsize=(6, 5))
sns.heatmap(
    cm_valid_hc,
    annot=True,
    fmt='d',
    cmap='Greys',
    # Cluster ids are arbitrary, so the columns are labelled by id rather
    # than by class name.
    xticklabels=[f'cluster {i}' for i in range(n_clusters)],
    yticklabels=label_encoder.classes_
)
plt.xlabel('Predicted Cluster')
plt.ylabel('Actual Class')
plt.title('Confusion Matrix of X_valid - Hierarchical Clustering')
plt.show()

# NOTE: this report treats cluster id i as class i.  Because the cluster
# numbering is arbitrary, its per-class figures and accuracy depend on that
# numbering; prefer the ARI / NMI values printed above.
print("\n=== Classification Report (Hierarchical Clustering - Valid Data) ===")
print(classification_report(y_valid_encoded, y_pred_valid_hc, target_names=label_encoder.classes_))

# Linkage matrix for the dendrogram (method can be 'ward', 'complete',
# 'average', ...); 'complete' matches the clustering above.
Z = linkage(X_valid_scaled, method='complete')

# Dendrogram with each leaf labelled by its true class.
plt.figure(figsize=(10, 6))
dendrogram(Z, labels=y_valid.values, leaf_rotation=90, leaf_font_size=10)
plt.title("Hierarchical Clustering Dendrogram")
plt.xlabel("Samples")
plt.ylabel("Distance")
plt.show()