In [None]:
import numpy as np
from PIL import Image
import os

def rgb_to_cmy(rgb_image):
    # CMY = 1 - RGB
    cmy_image = 1.0 - rgb_image
    return cmy_image

def extract_color_features(image):
    features = []
    
    # RGB features (6)
    for channel in range(3):  # R, G, B
        channel_data = image[:, :, channel]
        features.append(np.mean(channel_data))  # Mean
        features.append(np.std(channel_data))   # Standard deviation
    
    # CMY features (6)
    cmy_image = rgb_to_cmy(image)
    for channel in range(3):  # C, M, Y
        channel_data = cmy_image[:, :, channel]
        features.append(np.mean(channel_data))  # Mean
        features.append(np.std(channel_data))   # Standard deviation
    
    return np.array(features)

def load_and_extract_features(image_dir, labels_df):
    features_list = []
    valid_indices = []
    
    for idx, row in labels_df.iterrows():
        img_path = os.path.join(image_dir, f"{row['id']}.jpg")
        
        if os.path.exists(img_path):
            img = Image.open(img_path)
            img_resized = img.resize((410, 205))
            img_array = np.array(img_resized).astype('float32') / 255.0
            
            features = extract_color_features(img_array)
            features_list.append(features)
            valid_indices.append(idx)
        else:
            print(f"Warning: {img_path} not found")
    
    features_array = np.array(features_list)
    labels = labels_df.loc[valid_indices].reset_index(drop=True)
    
    return features_array, labels

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

train_label_df = pd.read_csv('dataset/train_classes.csv')
test_label_df = pd.read_csv('dataset/test_classes.csv')
train_label_df['nodisease'] = 1 - (train_label_df[['miner', 'rust', 'phoma']].sum(axis=1) > 0).astype(int)
test_label_df['nodisease'] = 1 - (test_label_df[['miner', 'rust', 'phoma']].sum(axis=1) > 0).astype(int)
train_label_df.loc[train_label_df['miner'] == 1, 'rust'] = 0
test_label_df.loc[test_label_df['miner'] == 1, 'rust'] = 0

train_features, train_labels = load_and_extract_features('dataset/coffee-leaf-diseases/train/images', train_label_df)
X_test, test_labels = load_and_extract_features('dataset/coffee-leaf-diseases/test/images', test_label_df)

y_test = test_labels[['nodisease', 'miner', 'rust', 'phoma']]

X_train, X_valid, y_train, y_valid = train_test_split(
    train_features, 
    train_labels[['nodisease', 'miner', 'phoma', 'rust']],
    test_size=0.2,
    stratify=train_labels[['nodisease', 'miner', 'phoma', 'rust']],
    random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

In [2]:
import numpy as np
from PIL import Image
import os

def rgb_to_cmy(rgb_image):
    # CMY = 1 - RGB
    cmy_image = 1.0 - rgb_image
    return cmy_image

def extract_color_features(image):
    features = []
    
    # RGB features (6)
    for channel in range(3):  # R, G, B
        channel_data = image[:, :, channel]
        features.append(np.mean(channel_data))  # Mean
        features.append(np.std(channel_data))   # Standard deviation
    
    # CMY features (6)
    cmy_image = rgb_to_cmy(image)
    for channel in range(3):  # C, M, Y
        channel_data = cmy_image[:, :, channel]
        features.append(np.mean(channel_data))  # Mean
        features.append(np.std(channel_data))   # Standard deviation
    
    return np.array(features)

def load_and_extract_features(image_dir, labels_df):
    features_list = []
    valid_indices = []
    
    for idx, row in labels_df.iterrows():
        img_path = os.path.join(image_dir, f"{row['id']}.jpg")
        
        if os.path.exists(img_path):
            img = Image.open(img_path)
            img_resized = img.resize((410, 205))
            img_array = np.array(img_resized).astype('float32') / 255.0
            
            features = extract_color_features(img_array)
            features_list.append(features)
            valid_indices.append(idx)
        else:
            print(f"Warning: {img_path} not found")
    
    features_array = np.array(features_list)
    labels = labels_df.loc[valid_indices].reset_index(drop=True)
    
    return features_array, labels

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

train_label_df = pd.read_csv('dataset/train_classes.csv')
test_label_df = pd.read_csv('dataset/test_classes.csv')

train_features, train_labels = load_and_extract_features('dataset/coffee-leaf-diseases/train/images', train_label_df)
X_test, test_labels = load_and_extract_features('dataset/coffee-leaf-diseases/test/images', test_label_df)

y_test = test_labels[['miner', 'rust', 'phoma']]

X_train, X_valid, y_train, y_valid = train_test_split(
    train_features, 
    train_labels[['miner', 'rust', 'phoma']],  # または適切なラベル列名
    test_size=0.2,
    stratify=train_labels[['miner', 'rust', 'phoma']],
    random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
print(X_train_scaled.shape)

(1011, 12)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ---------- Decision Tree ----------
dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=None,
    max_features=None,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=123,
    splitter='best'
)
multi_dt = MultiOutputClassifier(dt)
multi_dt.fit(X_train_scaled, y_train)
y_pred_dt = multi_dt.predict(X_test_scaled)

# Evaluate the model
print("\n=== Evaluation Decision Tree Metrics per label ===")
for i, label in enumerate(labels):
    y_true = y_test[label] if hasattr(y_test, 'columns') else y_test[:, i]
    y_pred = y_pred_dt[:, i]
    
    acc_dt = accuracy_score(y_true, y_pred)
    prec_dt = precision_score(y_true, y_pred, zero_division=0)
    rec_dt = recall_score(y_true, y_pred, zero_division=0)
    f1_dt = f1_score(y_true, y_pred, zero_division=0)
    
    print(f"{label}: Accuracy={acc_dt:.4f}, Precision={prec_dt:.4f}, Recall={rec_dt:.4f}, F1-score={f1_dt:.4f}")

print("\n=== Decision Tree Overall Metrics ===")
print("Accuracy (subset accuracy):", accuracy_score(y_test, y_pred_dt))
print("Precision (micro):", precision_score(y_test, y_pred_dt, average='micro', zero_division=0))
print("Recall (micro):", recall_score(y_test, y_pred_dt, average='micro', zero_division=0))
print("F1-score (micro):", f1_score(y_test, y_pred_dt, average='micro', zero_division=0))
print("Precision (macro):", precision_score(y_test, y_pred_dt, average='macro', zero_division=0))
print("Recall (macro):", recall_score(y_test, y_pred_dt, average='macro', zero_division=0))
print("F1-score (macro):", f1_score(y_test, y_pred_dt, average='macro', zero_division=0))

# ---------- KNN ----------
knn = KNeighborsClassifier(
    algorithm='auto',
    leaf_size=30,
    metric='minkowski',
    n_jobs=-1,
    n_neighbors=5,
    p=2,
    weights='uniform'
)
multi_knn = MultiOutputClassifier(knn)
multi_knn.fit(X_train_scaled, y_train)
y_pred_knn = multi_knn.predict(X_test_scaled)

# Evaluate the model
print("\n=== Evaluation KNN Metrics per label ===")
for i, label in enumerate(labels):
    y_true = y_test[label] if hasattr(y_test, 'columns') else y_test[:, i]
    y_pred = y_pred_knn[:, i]
    
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    
    print(f"{label}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1-score={f1:.4f}")

print("\n=== KNN Overall Metrics ===")
print("Accuracy (subset accuracy):", accuracy_score(y_test, y_pred_knn))
print("Precision (micro):", precision_score(y_test, y_pred_knn, average='micro', zero_division=0))
print("Recall (micro):", recall_score(y_test, y_pred_knn, average='micro', zero_division=0))
print("F1-score (micro):", f1_score(y_test, y_pred_knn, average='micro', zero_division=0))
print("Precision (macro):", precision_score(y_test, y_pred_knn, average='macro', zero_division=0))
print("Recall (macro):", recall_score(y_test, y_pred_knn, average='macro', zero_division=0))
print("F1-score (macro):", f1_score(y_test, y_pred_knn, average='macro', zero_division=0))


=== Evaluation Decision Tree Metrics per label ===
miner: Accuracy=0.7200, Precision=0.5667, Recall=0.5312, F1-score=0.5484
rust: Accuracy=0.7700, Precision=0.4074, Recall=0.6111, F1-score=0.4889
phoma: Accuracy=0.8900, Precision=0.7097, Recall=0.9167, F1-score=0.8000

=== Decision Tree Overall Metrics ===
Accuracy (subset accuracy): 0.49
Precision (micro): 0.5681818181818182
Recall (micro): 0.6756756756756757
F1-score (micro): 0.6172839506172839
Precision (macro): 0.5612504978096376
Recall (macro): 0.6863425925925926
F1-score (macro): 0.6124253285543608

=== Evaluation KNN Metrics per label ===
miner: Accuracy=0.7600, Precision=0.6667, Recall=0.5000, F1-score=0.5714
rust: Accuracy=0.8200, Precision=0.5000, Recall=0.6111, F1-score=0.5500
phoma: Accuracy=0.9200, Precision=0.7500, Recall=1.0000, F1-score=0.8571

=== KNN Overall Metrics ===
Accuracy (subset accuracy): 0.58
Precision (micro): 0.6538461538461539
Recall (micro): 0.6891891891891891
F1-score (micro): 0.6710526315789473
Precis

In [None]:
from sklearn.model_selection import GridSearchCV

# ---------- Decision Tree ----------
dt = DecisionTreeClassifier(max_features=None, random_state=123, splitter='best')
multi_dt = MultiOutputClassifier(dt)
param_grid_dt = {
    'estimator__criterion': ['gini', 'entropy'],
    'estimator__max_depth': range(1, 21),
    'estimator__min_samples_split': range(2, 11),
    'estimator__min_samples_leaf': range(1, 6)
}
grid_search_dt = GridSearchCV(multi_dt, param_grid_dt, cv=5)
grid_search_dt.fit(X_train_features, y_train_multi)

print(grid_search_dt.best_params_)
print(grid_search_dt.best_score_)

# ---------- KNN ----------
knn = KNeighborsClassifier(algorithm='auto', n_jobs=-1)
multi_knn = MultiOutputClassifier(knn)
param_grid_knn = {
    'estimator__leaf_size': range(25, 36),
    'estimator__metric': ['euclidean', 'manhattan'],
    'estimator__n_neighbors': range(1, 21),
    'estimator__p': [1, 2],
    'estimator__weights': ['uniform', 'distance'],
}
grid_search_knn = GridSearchCV(multi_knn, param_grid_knn, cv=5)
grid_search_knn.fit(X_train_features, y_train_multi)

print(grid_search_knn.best_params_)
print(grid_search_knn.best_score_)

  _data = np.array(data, dtype=dtype, copy=copy,


{'estimator__criterion': 'entropy', 'estimator__max_depth': 7, 'estimator__min_samples_leaf': 4, 'estimator__min_samples_split': 2}
0.36944601292427376
{'estimator__leaf_size': 25, 'estimator__metric': 'euclidean', 'estimator__n_neighbors': 8, 'estimator__p': 1, 'estimator__weights': 'uniform'}
0.411267959094046


  _data = np.array(data, dtype=dtype, copy=copy,
