In [1]:
# %% [markdown]
# # Hybrid Feature Engineered Model: Training Notebook
# This notebook demonstrates how to extract hybrid features (color, texture, shape) from rice plant images and train a Random Forest classifier for disease classification, leveraging EDA insights.

# %% [markdown]
# ## 1. Install Required Packages

# %%
%pip install opencv-python-headless scikit-learn scikit-image joblib tqdm

Note: you may need to restart the kernel to use updated packages.


In [None]:
# %% [markdown]
# ## 2. Import Libraries

# %%
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from skimage.feature import local_binary_pattern
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# %% [markdown]
# ## 3. Feature Extraction Functions

# %%
def extract_features(img):
    """
    Extract hybrid features: color histogram (HSV), texture (LBP), and shape (contour area).
    """
    features = []
    # Color: HSV histogram
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    hist_h = cv2.calcHist([hsv], [0], None, [8], [0, 180]).flatten()
    hist_s = cv2.calcHist([hsv], [1], None, [8], [0, 256]).flatten()
    hist_v = cv2.calcHist([hsv], [2], None, [8], [0, 256]).flatten()
    features.extend(hist_h)
    features.extend(hist_s)
    features.extend(hist_v)
    # Texture: Local Binary Pattern (LBP)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    lbp = local_binary_pattern(gray, P=8, R=1, method='uniform')
    (hist_lbp, _) = np.histogram(lbp.ravel(), bins=np.arange(0, 11), range=(0, 10))
    features.extend(hist_lbp)
    # Shape: Largest contour area
    contours, _ = cv2.findContours(gray, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    largest_area = max([cv2.contourArea(cnt) for cnt in contours], default=0)
    features.append(largest_area)
    return np.array(features, dtype=np.float32)

# %% [markdown]
# ## 4. Data Preparation

# %%
# Load cleaned metadata (after EDA cleaning)
meta = pd.read_csv("Dataset/meta_train.csv")
# If you have already removed duplicates/unreliable images as in your EDA, use the cleaned DataFrame

# Map labels to integer indices for classification
label2idx = {label: idx for idx, label in enumerate(sorted(meta['label'].unique()))}
idx2label = {v: k for k, v in label2idx.items()}
meta['label_idx'] = meta['label'].map(label2idx)

# %% [markdown]
# ## 5. Extract Features from Images

# %%
IMAGE_SIZE = (224, 224)

features = []
labels = []

for i, row in tqdm(meta.iterrows(), total=len(meta)):
    img_path = f"Dataset/train_images/{row['label']}/{row['image_id']}"
    img = cv2.imread(img_path)
    if img is None:
        continue  # Skip missing/corrupted images
    img = cv2.resize(img, IMAGE_SIZE)
    # Green channel emphasis (from EDA)
    img[:, :, 1] = img[:, :, 1] * 0.6
    feats = extract_features(img)
    features.append(feats)
    labels.append(row['label_idx'])

features = np.array(features)
labels = np.array(labels)

print("Feature shape:", features.shape)
print("Labels shape:", labels.shape)

# %% [markdown]
# ## 6. Train/Test Split

# %%
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, stratify=labels, random_state=42
)

# %% [markdown]
# ## 7. Handle Class Imbalance (Optional)

# %%
# Compute class weights from label distribution (from EDA)
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
class_weight_dict = {i: w for i, w in enumerate(class_weights)}
print("Class weights:", class_weight_dict)

# %% [markdown]
# ## 8. Train Random Forest Classifier

# %%
rf = RandomForestClassifier(
    n_estimators=150,
    class_weight=class_weight_dict,
    n_jobs=-1,
    random_state=42,
    verbose=1
)
rf.fit(X_train, y_train)

# %% [markdown]
# ## 9. Evaluate Model

# %%
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=[idx2label[i] for i in sorted(idx2label)]))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# %% [markdown]
# ## 10. Save Model for Later Use

# %%
joblib.dump(rf, "hybrid_rf_model_training3.pkl")
print("Model saved as hybrid_rf_model_training3.pkl")

# %% [markdown]
# ## 11. How to Use the Model for Inference

# %%
# Example: Predict on a new image
def predict_image(img_path, model, label_map):
    img = cv2.imread(img_path)
    img = cv2.resize(img, IMAGE_SIZE)
    img[:, :, 1] = img[:, :, 1] * 0.6  # Green emphasis
    feats = extract_features(img).reshape(1, -1)
    pred_idx = model.predict(feats)[0]
    return label_map[pred_idx]


100%|██████████| 10407/10407 [01:27<00:00, 118.90it/s]
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.1s


Feature shape: (10407, 35)
Labels shape: (10407,)
Class weights: {0: 2.172651356993737, 1: 2.7386842105263156, 2: 3.0881305637982197, 3: 0.5987917146144994, 4: 1.0784455958549224, 5: 0.7217059639389737, 6: 1.6785483870967741, 7: 0.6528858218318695, 8: 0.5899659863945578, 9: 0.9565257352941177}
                          precision    recall  f1-score   support

   bacterial_leaf_blight       0.92      0.83      0.87        96
   bacterial_leaf_streak       0.90      0.92      0.91        76
bacterial_panicle_blight       0.92      0.72      0.81        67
                   blast       0.88      0.94      0.91       348
              brown_spot       0.93      0.84      0.89       193
              dead_heart       0.85      0.93      0.89       288
            downy_mildew       0.94      0.85      0.89       124
                   hispa       0.91      0.92      0.92       319
                  normal       0.93      0.93      0.93       353
                  tungro       0.91      0.8

[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    0.5s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 150 out of 150 | elapsed:    0.0s finished


In [8]:
print("Feature shape:", features.shape)
print("Labels shape:", labels.shape)

# Print the first 5 rows of features
print("\nFirst 5 rows of features:")
print(features[:5])

# Print the first 5 labels
print("\nFirst 5 labels:")
print(labels[:5])

Feature shape: (10407, 35)
Labels shape: (10407,)

First 5 rows of features:
[[3.0589e+04 1.7571e+04 7.3000e+01 5.0000e+00 1.0000e+01 3.3000e+01
  2.7400e+02 1.6210e+03 7.0000e+00 5.4000e+01 1.6460e+03 5.3340e+03
  4.2170e+03 5.0670e+03 7.3900e+03 2.6461e+04 2.5380e+03 5.0950e+03
  7.1710e+03 9.1970e+03 1.1037e+04 8.7970e+03 4.8910e+03 1.4500e+03
  3.6510e+03 4.3790e+03 2.4470e+03 4.6160e+03 1.0375e+04 5.5260e+03
  3.4930e+03 4.3890e+03 4.2380e+03 7.0620e+03 4.9729e+04]
 [3.3928e+04 1.3710e+04 2.8000e+01 4.0000e+00 1.0000e+00 7.0000e+00
  4.9100e+02 2.0070e+03 2.0000e+00 2.6000e+01 1.6450e+03 4.6410e+03
  5.0870e+03 6.2760e+03 8.3540e+03 2.4145e+04 1.1300e+03 4.0070e+03
  7.5840e+03 9.3970e+03 9.1380e+03 1.0188e+04 7.0300e+03 1.7020e+03
  4.0160e+03 4.6810e+03 2.2330e+03 4.4610e+03 9.5200e+03 5.2280e+03
  3.0360e+03 4.7200e+03 4.5350e+03 7.7460e+03 4.9729e+04]
 [2.5955e+04 2.1236e+04 2.2200e+02 1.1000e+01 2.1000e+01 5.6000e+01
  3.8300e+02 2.2920e+03 2.0000e+01 1.7500e+02 2.8980e+03 5.

In [None]:
# Usage:
loaded_model = joblib.load("hybrid_rf_model_training3.pkl")
result = predict_image("Dataset/train_images/tungro/100011.jpg", loaded_model, idx2label)
print("Predicted label:", result)

Predicted label: tungro


[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 150 out of 150 | elapsed:    0.0s finished
