# CREATING ORGANIZED TOMATO POTATO DATASET

In [1]:
import os
import shutil

# --- CONFIGURATION ---
SOURCE_DIRECTORY = '/kaggle/input/combined-dataset1to4-modified/Combined_Dataset1to4'
OUTPUT_DIRECTORY = 'Organized_Dataset_Tomato_Potato' # New output folder

# ❗ KEY CHANGE: Define a list of the only plant categories you want to process.
# The script will ignore any folder that doesn't belong to these categories.
CLASSES_TO_PROCESS = ['Tomato', 'Potato']

# --- SCRIPT LOGIC ---

PLANT_MAP = {
    'Apple': 'Apple', 'Cotton': 'Cotton', 'Rice': 'Rice',
    'Blueberry': 'Blueberry', 'Cherry': 'Cherry', 'Corn': 'Corn', 'Maize': 'Corn',
    'Grape': 'Grape', 'Orange': 'Orange', 'Peach': 'Peach', 'Pepper': 'Pepper',
    'Potato': 'Potato', 'Raspberry': 'Raspberry', 'Soybean': 'Soybean',
    'Squash': 'Squash', 'Strawberry': 'Strawberry', 'Sugarcane': 'Sugarcane',
    'Tomato': 'Tomato', 'Wheat': 'Wheat'
}

def get_new_names(original_name):
    """Parses the original folder name to get the new category and subfolder name."""
    name_lower = original_name.lower()

    if '_on_' in name_lower:
        parts = original_name.split('_on_')
        return parts[1].capitalize(), parts[0].replace('_', ' ')
    if '_in_' in name_lower:
        parts = original_name.split('_in_')
        return parts[1].capitalize(), parts[0].replace('_', ' ')
    
    for keyword, category in PLANT_MAP.items():
        if keyword.lower() in name_lower:
            new_subfolder_name = original_name.replace(keyword, '').replace('___', '_').strip(' _')
            new_subfolder_name = new_subfolder_name.replace('(maize)', '').replace('(including_sour)', '').strip(' _')
            
            if not new_subfolder_name or new_subfolder_name.lower() == 'healthy':
                new_subfolder_name = 'healthy'
                
            return category, new_subfolder_name.replace('_', ' ').capitalize()

    return "Unclassified", original_name

def main():
    """Main function to create the new structure and copy files."""
    print(f"Preparing to organize folders for {CLASSES_TO_PROCESS}...")
    print(f"Source: '{SOURCE_DIRECTORY}'")

    try:
        all_folders = [f for f in os.listdir(SOURCE_DIRECTORY) if os.path.isdir(os.path.join(SOURCE_DIRECTORY, f))]
        print(f"Found {len(all_folders)} total folders to check.")
    except FileNotFoundError:
        print(f"❌ Error: The source directory was not found: '{SOURCE_DIRECTORY}'")
        return
        
    os.makedirs(OUTPUT_DIRECTORY, exist_ok=True)
    
    processed_count = 0
    for original_folder_name in all_folders:
        source_path = os.path.join(SOURCE_DIRECTORY, original_folder_name)
        
        category, new_subfolder_name = get_new_names(original_folder_name)
        
        # ❗ KEY CHANGE: Check if the detected category is in our target list.
        if category in CLASSES_TO_PROCESS:
            # If it is, proceed with copying the folder.
            destination_path = os.path.join(OUTPUT_DIRECTORY, category, new_subfolder_name)
            
            print(f"Copying: '{original_folder_name}'  ->  '{category}/{new_subfolder_name}'")
            
            try:
                shutil.copytree(source_path, destination_path)
                processed_count += 1
            except FileExistsError:
                print(f"    - Skipped: Destination folder already exists.")
            except Exception as e:
                print(f"    - ❌ Error copying '{original_folder_name}': {e}")
        # If the category is not 'Tomato' or 'Potato', the script simply ignores it
        # and moves to the next folder.
            
    print(f"\n✅ Done! Processed and copied {processed_count} folders related to {CLASSES_TO_PROCESS}.")
    print(f"Your new, organized dataset is ready in the '{OUTPUT_DIRECTORY}' folder.")


if __name__ == "__main__":
    main()

Preparing to organize folders for ['Tomato', 'Potato']...
Source: '/kaggle/input/combined-dataset1to4-modified/Combined_Dataset1to4'
Found 74 total folders to check.
Copying: 'Tomato___Late_blight'  ->  'Tomato/Late blight'
Copying: 'Tomato___healthy'  ->  'Tomato/Healthy'
Copying: 'Tomato___Spider_mites_Two-spotted_spider_mite'  ->  'Tomato/Spider mites two-spotted spider mite'
Copying: 'Potato_Late_blight'  ->  'Potato/Late blight'
Copying: 'Tomato___Early_blight'  ->  'Tomato/Early blight'
Copying: 'Tomato___Septoria_leaf_spot'  ->  'Tomato/Septoria leaf spot'
Copying: 'Potato_healthy'  ->  'Potato/Healthy'
Copying: 'Tomato___Tomato_Yellow_Leaf_Curl_Virus'  ->  'Tomato/Yellow leaf curl virus'
Copying: 'Tomato___Bacterial_spot'  ->  'Tomato/Bacterial spot'
Copying: 'Tomato___Target_Spot'  ->  'Tomato/Target spot'
Copying: 'Potato_Early_blight'  ->  'Potato/Early blight'
Copying: 'Tomato___Tomato_mosaic_virus'  ->  'Tomato/Mosaic virus'
Copying: 'Tomato___Leaf_Mold'  ->  'Tomato/Leaf 

# Create Stage 1 Splits (Plant Classification)

In [2]:
import os
import shutil
import random

# --- CONFIGURATION ---
# The root directory of the previously organized dataset (Stage 0 output)
SOURCE_ROOT = '/kaggle/working/Organized_Dataset_Tomato_Potato'
# The root directory for the final Stage 1 (Coarse Classification) splits
DESTINATION_ROOT = 'Stage_1_Splits'

# Define the plant categories to process (must match the folder names in SOURCE_ROOT)
PLANT_CATEGORIES = ['Tomato', 'Potato']

# Define the desired split ratios (must sum to 1.0)
SPLIT_RATIOS = {
    'train': 0.70,
    'validation': 0.15,
    'test': 0.15
}

def split_data():
    """
    Combines all disease images per plant, shuffles them, and splits them
    into the final train, validation, and test directories for Stage 1.
    """
    if sum(SPLIT_RATIOS.values()) != 1.0:
        print("❌ Error: Split ratios must sum exactly to 1.0. Check your configuration.")
        return

    print(f"Starting data split process for Stage 1 (Train: {SPLIT_RATIOS['train']:.0%}, Valid: {SPLIT_RATIOS['validation']:.0%}, Test: {SPLIT_RATIOS['test']:.0%})")

    # 1. Create the necessary destination directories
    for split_type in SPLIT_RATIOS.keys():
        for category in PLANT_CATEGORIES:
            os.makedirs(os.path.join(DESTINATION_ROOT, split_type, category), exist_ok=True)

    total_images_processed = 0

    # 2. Process each plant category
    for category in PLANT_CATEGORIES:
        source_category_path = os.path.join(SOURCE_ROOT, category)

        if not os.path.exists(source_category_path):
            print(f"⚠️ Warning: Source folder not found for {category} at {source_category_path}. Skipping.")
            continue

        print(f"\n--- Processing {category} ---")

        # Collect all image file paths across all disease subfolders
        all_image_paths = []
        for root, _, files in os.walk(source_category_path):
            for file in files:
                # Basic check to ensure we only process image files
                if file.lower().endswith(('.png', '.jpg', '.jpeg')):
                    all_image_paths.append(os.path.join(root, file))

        # 3. Shuffle and split the paths
        random.shuffle(all_image_paths)
        total_count = len(all_image_paths)
        print(f"Total images found: {total_count}")

        if total_count == 0:
             print(f"Skipping {category}: No images found.")
             continue

        # Calculate split indices
        train_end = int(total_count * SPLIT_RATIOS['train'])
        validation_end = train_end + int(total_count * SPLIT_RATIOS['validation'])

        # Split the list
        train_files = all_image_paths[:train_end]
        validation_files = all_image_paths[train_end:validation_end]
        # Test takes the remainder to ensure all files are used
        test_files = all_image_paths[validation_end:] 

        # Store files to copy by split type
        split_files = {
            'train': train_files,
            'validation': validation_files,
            'test': test_files
        }

        # 4. Copy files to the final destination structure
        for split_type, file_list in split_files.items():
            destination_dir = os.path.join(DESTINATION_ROOT, split_type, category)
            print(f"  - Copying {len(file_list):5d} files to {split_type}/{category}...")

            for src_path in file_list:
                # Use os.path.basename to get only the filename (flattening the disease structure)
                dst_path = os.path.join(destination_dir, os.path.basename(src_path))
                try:
                    shutil.copy2(src_path, dst_path) # copy2 preserves metadata
                    total_images_processed += 1
                except Exception as e:
                     print(f"    - ❌ Error copying {os.path.basename(src_path)}: {e}")


    print(f"\n✅ Data splitting complete. Total images copied: {total_images_processed}")
    print(f"The Stage 1 dataset is ready in the '{DESTINATION_ROOT}' folder.")

if __name__ == "__main__":
    split_data()

Starting data split process for Stage 1 (Train: 70%, Valid: 15%, Test: 15%)

--- Processing Tomato ---
Total images found: 19006
  - Copying 13304 files to train/Tomato...
  - Copying  2850 files to validation/Tomato...
  - Copying  2852 files to test/Tomato...

--- Processing Potato ---
Total images found: 2344
  - Copying  1640 files to train/Potato...
  - Copying   351 files to validation/Potato...
  - Copying   353 files to test/Potato...

✅ Data splitting complete. Total images copied: 21350
The Stage 1 dataset is ready in the 'Stage_1_Splits' folder.


# Create Stage 2 Splits (Disease Classification)

In [3]:
import os
import shutil
import random

# --- CONFIGURATION ---
# SOURCE_ROOT: Source Directory, which is the output from the initial organization script
SOURCE_ROOT = '/kaggle/working/Organized_Dataset_Tomato_Potato'
# DESTINATION_ROOT: Destination Directory for the final Stage 2 split dataset
DESTINATION_ROOT = 'Stage_2_Splits'

# Plant Categories to process (must match the top-level folders in SOURCE_ROOT)
PLANT_CATEGORIES = ['Tomato', 'Potato']

# Desired split ratios: Train / Validation / Test (must sum to 1.0)
SPLIT_RATIOS = {
    'train': 0.70,
    'validation': 0.15,
    'test': 0.15
}

def split_disease_data():
    """
    For each plant category, this script divides the data by disease into separate
    train, validation, and test directories. This prepares the dataset for
    the specialized Stage 2 models.
    """
    if sum(SPLIT_RATIOS.values()) != 1.0:
        print("❌ Error: Split ratios must sum exactly to 1.0. Please check the configuration.")
        return

    print(f"Starting data split process for Stage 2 (Train: {SPLIT_RATIOS['train']:.0%}, Validation: {SPLIT_RATIOS['validation']:.0%}, Test: {SPLIT_RATIOS['test']:.0%})")

    total_images_processed = 0

    # 1. Loop through each plant category (e.g., 'Tomato', 'Potato')
    for category in PLANT_CATEGORIES:
        source_category_path = os.path.join(SOURCE_ROOT, category)

        if not os.path.exists(source_category_path):
            print(f"⚠️ Warning: Source folder not found for {category} at {source_category_path}. Skipping.")
            continue

        print(f"\n--- Processing {category} ---")

        # 2. Identify disease subfolders (which serve as the specific labels)
        disease_folders = [d for d in os.listdir(source_category_path)
                           if os.path.isdir(os.path.join(source_category_path, d))]

        if not disease_folders:
            print(f"  - No disease subfolders found for {category}. Skipping.")
            continue

        for disease_name in disease_folders:
            source_disease_path = os.path.join(source_category_path, disease_name)

            # 3. Collect all image file paths for this disease
            all_image_paths = [os.path.join(source_disease_path, f)
                               for f in os.listdir(source_disease_path)
                               if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

            random.shuffle(all_image_paths)
            total_count = len(all_image_paths)

            if total_count == 0:
                 print(f"  - No images found for disease '{disease_name}'. Skipping.")
                 continue

            print(f"  - Disease '{disease_name}': Total {total_count} images.")

            # 4. Calculate Split Indices
            train_end = int(total_count * SPLIT_RATIOS['train'])
            validation_end = train_end + int(total_count * SPLIT_RATIOS['validation'])

            train_files = all_image_paths[:train_end]
            validation_files = all_image_paths[train_end:validation_end]
             # Test takes the remainder to ensure all files are used
            test_files = all_image_paths[validation_end:]

            split_files = {
                'train': train_files,
                'validation': validation_files,
                'test': test_files
            }

            # 5. Copy files to the destination structure
            for split_type, file_list in split_files.items():
                # Destination Path: DESTINATION_ROOT / PLANT / SPLIT_TYPE / DISEASE
                destination_dir = os.path.join(DESTINATION_ROOT, category, split_type, disease_name)
                os.makedirs(destination_dir, exist_ok=True)

                print(f"    - Copying {len(file_list):4d} images to {split_type}/{disease_name}...")

                for src_path in file_list:
                    dst_path = os.path.join(destination_dir, os.path.basename(src_path))
                    try:
                        shutil.copy2(src_path, dst_path)
                        total_images_processed += 1
                    except Exception as e:
                        print(f"      - ❌ Error copying {os.path.basename(src_path)}: {e}")


    print(f"\n✅ Data splitting complete. Total images copied: {total_images_processed}")
    print(f"The Stage 2 dataset is ready in the '{DESTINATION_ROOT}' folder.")

if __name__ == "__main__":
    split_disease_data()

Starting data split process for Stage 2 (Train: 70%, Validation: 15%, Test: 15%)

--- Processing Tomato ---
  - Disease 'Leaf mold': Total 1061 images.
    - Copying  742 images to train/Leaf mold...
    - Copying  159 images to validation/Leaf mold...
    - Copying  160 images to test/Leaf mold...
  - Disease 'Target spot': Total 1422 images.
    - Copying  995 images to train/Target spot...
    - Copying  213 images to validation/Target spot...
    - Copying  214 images to test/Target spot...
  - Disease 'Bacterial spot': Total 2234 images.
    - Copying 1563 images to train/Bacterial spot...
    - Copying  335 images to validation/Bacterial spot...
    - Copying  336 images to test/Bacterial spot...
  - Disease 'Mosaic virus': Total 452 images.
    - Copying  316 images to train/Mosaic virus...
    - Copying   67 images to validation/Mosaic virus...
    - Copying   69 images to test/Mosaic virus...
  - Disease 'Yellow leaf curl virus': Total 5423 images.
    - Copying 3796 images to

In [4]:
!pip install -U transformers

Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transformers-4.57.1-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m92.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading huggingface_hub-0.36.0-py3-none-any.whl (566 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K

In [5]:
from huggingface_hub import login
login(new_session=False)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
from transformers import AutoModel
import torch, torchvision.transforms as T
from PIL import Image
import requests

MODEL_ID = "facebook/dinov3-convnext-base-pretrain-lvd1689m"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# load model
model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True).to(DEVICE).eval()

# preprocessing
transform = T.Compose([
    T.Resize(256),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
])

# test image
url = "https://huggingface.co/datasets/mishig/sample_images/resolve/main/tiger.jpg"
img = Image.open(requests.get(url, stream=True).raw).convert("RGB")
x = transform(img).unsqueeze(0).to(DEVICE)

with torch.no_grad():
    out = model(pixel_values=x)

print("CLS embedding:", out.last_hidden_state[:,0,:].shape)

config.json:   0%|          | 0.00/449 [00:00<?, ?B/s]

2025-10-28 09:22:14.469303: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761643334.666729      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761643334.727652      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/350M [00:00<?, ?B/s]

CLS embedding: torch.Size([1, 1024])


# STAGE 1 TRAINING

In [8]:
from pathlib import Path
import numpy as np, json, torch
from PIL import Image
import torchvision.transforms as T
from transformers import AutoModel
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login
from tqdm.auto import tqdm

# -------- CONFIG: adjust if needed --------
SPLIT_ROOT = Path("/kaggle/working/Stage_1_Splits")
OUT = Path("/kaggle/working/embeddings"); OUT.mkdir(parents=True, exist_ok=True)
BATCH_SIZE = 8
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "facebook/dinov3-convnext-base-pretrain-lvd1689m"
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".bmp"}
# ------------------------------------------

# HF auth (must have HF_TOKEN in Kaggle Secrets)
try:
    hf_token = UserSecretsClient().get_secret("HF_TOKEN")
    login(token=hf_token, new_session=False)
except:
    print("HF_TOKEN not found in Kaggle Secrets. Proceeding without login, which might fail for some models.")
    hf_token=None

# load model
model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True, token=hf_token).to(DEVICE).eval()

transform = T.Compose([
    T.Resize(256),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
])

# Collect class list from the 'train' subdirectory
train_dir = SPLIT_ROOT / "train"
if not train_dir.exists():
    raise SystemExit(f"The directory {train_dir} does not exist. Please check your file structure.")

# --- MODIFICATION START ---
# Original line that finds all folders:
# classes = sorted([p.name for p in train_dir.iterdir() if p.is_dir()])

# New line to only use specific folders:
classes = ["Potato", "Tomato"]
# --- MODIFICATION END ---

if not classes:
    raise SystemExit(f"No class folders found in {train_dir}. Expected structure: {SPLIT_ROOT}/train/<CLASS>")

cls2idx = {c:i for i,c in enumerate(classes)}
print("Processing only these classes:", len(classes), classes)

def collect_from_split(split_root, split_name):
    """Return (paths, labels) for the given split.
       Expects structure: split_root/<split_name>/<CLASS>/*.jpg
    """
    paths, labels = [], []
    split_folder = split_root / split_name
    if not split_folder.exists() or not split_folder.is_dir():
        return paths, np.array(labels, dtype=np.int32)

    for c in classes:
        class_folder = split_folder / c
        if not class_folder.exists() or not class_folder.is_dir():
            continue
        for f in sorted(class_folder.iterdir()):
            if f.is_file() and f.suffix.lower() in IMAGE_EXTS:
                paths.append(str(f))
                labels.append(cls2idx[c])
    return paths, np.array(labels, dtype=np.int32)

train_paths, y_train = collect_from_split(SPLIT_ROOT, "train")
val_paths,   y_val   = collect_from_split(SPLIT_ROOT, "validation")
test_paths,  y_test  = collect_from_split(SPLIT_ROOT, "test")

print(f"Images — train: {len(train_paths)}, val: {len(val_paths)}, test: {len(test_paths)}")

def batch_embed(paths, batch_size=BATCH_SIZE):
    embs = []
    model.eval()
    for i in tqdm(range(0, len(paths), batch_size), desc="Embedding batches"):
        batch = paths[i:i+batch_size]
        imgs = []
        for p in batch:
            try:
                imgs.append(transform(Image.open(p).convert("RGB")))
            except Exception as e:
                print("skip:", p, "err:", e)
        if not imgs:
            continue
        x = torch.stack(imgs).to(DEVICE)
        with torch.no_grad():
            out = model(pixel_values=x)
        embs.append(out.last_hidden_state[:,0,:].cpu().numpy())
    return np.vstack(embs) if embs else np.zeros((0,0), dtype=np.float32)

# Create embeddings if they don't exist
if (OUT/"X_train.npy").exists() and (OUT/"X_val.npy").exists() and (OUT/"X_test.npy").exists():
    print("Embeddings already exist — loading.")
    X_train = np.load(OUT/"X_train.npy"); y_train = np.load(OUT/"y_train.npy")
    X_val   = np.load(OUT/"X_val.npy");   y_val   = np.load(OUT/"y_val.npy")
    X_test  = np.load(OUT/"X_test.npy");  y_test  = np.load(OUT/"y_test.npy")
else:
    print("Extracting train embeddings...")
    X_train = batch_embed(train_paths)
    np.save(OUT/"X_train.npy", X_train); np.save(OUT/"y_train.npy", y_train)
    print("Saved X_train", X_train.shape)

    print("Extracting val embeddings...")
    X_val = batch_embed(val_paths)
    np.save(OUT/"X_val.npy", X_val); np.save(OUT/"y_val.npy", y_val)
    print("Saved X_val", X_val.shape)

    print("Extracting test embeddings...")
    X_test = batch_embed(test_paths)
    np.save(OUT/"X_test.npy", X_test); np.save(OUT/"y_test.npy", y_test)
    print("Saved X_test", X_test.shape)

print("Done. Shapes:", np.load(OUT/"X_train.npy").shape, np.load(OUT/"X_val.npy").shape, np.load(OUT/"X_test.npy").shape)
print("Embeddings saved to:", OUT.resolve())

Processing only these classes: 2 ['Potato', 'Tomato']
Images — train: 14938, val: 3201, test: 3205
Extracting train embeddings...


Embedding batches:   0%|          | 0/1868 [00:00<?, ?it/s]

Saved X_train (14938, 1024)
Extracting val embeddings...


Embedding batches:   0%|          | 0/401 [00:00<?, ?it/s]

Saved X_val (3201, 1024)
Extracting test embeddings...


Embedding batches:   0%|          | 0/401 [00:00<?, ?it/s]

Saved X_test (3205, 1024)
Done. Shapes: (14938, 1024) (3201, 1024) (3205, 1024)
Embeddings saved to: /kaggle/working/embeddings


In [9]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, normalize
import joblib
import os

EMB_DIR = "/kaggle/working/embeddings"
MODEL_OUT = os.path.join(EMB_DIR, "stage1_classifier_model.joblib")


# load
X_train = np.load(os.path.join(EMB_DIR, "X_train.npy"))
y_train = np.load(os.path.join(EMB_DIR, "y_train.npy"))
X_val   = np.load(os.path.join(EMB_DIR, "X_val.npy"))
y_val   = np.load(os.path.join(EMB_DIR, "y_val.npy"))

print("Shapes loaded — X_train:", X_train.shape, "y_train:", y_train.shape, "X_val:", X_val.shape, "y_val:", y_val.shape)

# basic sanity
if X_train.size == 0 or X_val.size == 0:
    raise SystemExit("Empty embeddings — check previous step. Exiting.")

if X_train.shape[0] != y_train.shape[0] or X_val.shape[0] != y_val.shape[0]:
    raise SystemExit("Mismatch between number of embeddings and labels. Exiting.")

# Option A: Standardize (common). with_mean=True normally okay unless huge memmap / sparse
sc = StandardScaler(with_mean=True, with_std=True)
X_train_s = sc.fit_transform(X_train)
X_val_s   = sc.transform(X_val)

# Option B (alternative, often good for cosine-like): L2 normalize
# X_train_s = normalize(X_train, norm='l2')
# X_val_s   = normalize(X_val, norm='l2')

# Train logistic regression (use class_weight='balanced' if classes imbalanced)
clf = LogisticRegression(max_iter=2000, n_jobs=-1, C=1.0, class_weight=None, random_state=42)
clf.fit(X_train_s, y_train)

# Predict & evaluate
y_pred = clf.predict(X_val_s)
acc = accuracy_score(y_val, y_pred)
print(f"Validation accuracy: {acc:.4f}\n")
print("Classification report:\n", classification_report(y_val, y_pred))

# Save scaler + model
joblib.dump((sc, clf), MODEL_OUT)
print("Saved model to:", MODEL_OUT)


Shapes loaded — X_train: (14938, 1024) y_train: (14938,) X_val: (3201, 1024) y_val: (3201,)
Validation accuracy: 0.9916

Classification report:
               precision    recall  f1-score   support

           0       0.97      0.95      0.96       351
           1       0.99      1.00      1.00      2850

    accuracy                           0.99      3201
   macro avg       0.98      0.98      0.98      3201
weighted avg       0.99      0.99      0.99      3201

Saved model to: /kaggle/working/embeddings/stage1_classifier_model.joblib


# STAGE 2 TOMATO 

In [10]:
from pathlib import Path
import numpy as np, torch
from PIL import Image
import torchvision.transforms as T
from transformers import AutoModel
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login
from tqdm.auto import tqdm

# -------- CONFIG --------
SPLIT_ROOT = Path("/kaggle/working/Stage_2_Splits/Tomato")
OUT = Path("/kaggle/working/embeddings_tomato"); OUT.mkdir(parents=True, exist_ok=True)
BATCH_SIZE = 8
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "facebook/dinov3-convnext-base-pretrain-lvd1689m"
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".bmp"}
# -------------------------

# Hugging Face auth
try:
    hf_token = UserSecretsClient().get_secret("HF_TOKEN")
    login(token=hf_token, new_session=False)
except:
    print("HF_TOKEN not found in Kaggle Secrets — proceeding without it.")
    hf_token = None

# Load pretrained DINOv2 model
model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True, token=hf_token).to(DEVICE).eval()

# Image transform
transform = T.Compose([
    T.Resize(256),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
])

# ---------------------------------------------------------------
# Collect class folders under Tomato/train
# ---------------------------------------------------------------
train_dir = SPLIT_ROOT / "train"
if not train_dir.exists():
    raise SystemExit(f"The directory {train_dir} does not exist.")

classes = sorted([p.name for p in train_dir.iterdir() if p.is_dir()])
if not classes:
    raise SystemExit("No disease folders found inside Tomato/train.")

cls2idx = {c:i for i,c in enumerate(classes)}
print(f"Found {len(classes)} Tomato disease classes:\n", classes)

# ---------------------------------------------------------------
# Collect image paths and labels for each split
# ---------------------------------------------------------------
def collect_from_split(split_root, split_name):
    paths, labels = [], []
    split_folder = split_root / split_name
    if not split_folder.exists():
        return paths, np.array(labels, dtype=np.int32)

    for c in classes:
        class_folder = split_folder / c
        if not class_folder.exists():
            continue
        for f in sorted(class_folder.iterdir()):
            if f.is_file() and f.suffix.lower() in IMAGE_EXTS:
                paths.append(str(f))
                labels.append(cls2idx[c])
    return paths, np.array(labels, dtype=np.int32)

train_paths, y_train = collect_from_split(SPLIT_ROOT, "train")
val_paths,   y_val   = collect_from_split(SPLIT_ROOT, "validation")
test_paths,  y_test  = collect_from_split(SPLIT_ROOT, "test")

print(f"Images — train: {len(train_paths)}, val: {len(val_paths)}, test: {len(test_paths)}")

# ---------------------------------------------------------------
# Embedding extraction
# ---------------------------------------------------------------
def batch_embed(paths, batch_size=BATCH_SIZE):
    embs = []
    model.eval()
    for i in tqdm(range(0, len(paths), batch_size), desc="Embedding batches"):
        batch = paths[i:i+batch_size]
        imgs = []
        for p in batch:
            try:
                imgs.append(transform(Image.open(p).convert("RGB")))
            except Exception as e:
                print("skip:", p, "err:", e)
        if not imgs:
            continue
        x = torch.stack(imgs).to(DEVICE)
        with torch.no_grad():
            out = model(pixel_values=x)
        embs.append(out.last_hidden_state[:,0,:].cpu().numpy())
    return np.vstack(embs) if embs else np.zeros((0,0), dtype=np.float32)

# ---------------------------------------------------------------
# Save embeddings
# ---------------------------------------------------------------
if (OUT/"X_train.npy").exists():
    print("Embeddings already exist — loading.")
    X_train = np.load(OUT/"X_train.npy"); y_train = np.load(OUT/"y_train.npy")
    X_val   = np.load(OUT/"X_val.npy");   y_val   = np.load(OUT/"y_val.npy")
    X_test  = np.load(OUT/"X_test.npy");  y_test  = np.load(OUT/"y_test.npy")
else:
    print("Extracting train embeddings...")
    X_train = batch_embed(train_paths)
    np.save(OUT/"X_train.npy", X_train); np.save(OUT/"y_train.npy", y_train)
    print("Saved X_train", X_train.shape)

    print("Extracting val embeddings...")
    X_val = batch_embed(val_paths)
    np.save(OUT/"X_val.npy", X_val); np.save(OUT/"y_val.npy", y_val)
    print("Saved X_val", X_val.shape)

    print("Extracting test embeddings...")
    X_test = batch_embed(test_paths)
    np.save(OUT/"X_test.npy", X_test); np.save(OUT/"y_test.npy", y_test)
    print("Saved X_test", X_test.shape)

print("Done. Shapes:")
print("  Train:", np.load(OUT/'X_train.npy').shape)
print("  Val:  ", np.load(OUT/'X_val.npy').shape)
print("  Test: ", np.load(OUT/'X_test.npy').shape)
print("Tomato embeddings saved to:", OUT.resolve())


Found 10 Tomato disease classes:
 ['Bacterial spot', 'Early blight', 'Healthy', 'Late blight', 'Leaf mold', 'Mosaic virus', 'Septoria leaf spot', 'Spider mites two-spotted spider mite', 'Target spot', 'Yellow leaf curl virus']
Images — train: 13300, val: 2847, test: 2859
Extracting train embeddings...


Embedding batches:   0%|          | 0/1663 [00:00<?, ?it/s]

Saved X_train (13300, 1024)
Extracting val embeddings...


Embedding batches:   0%|          | 0/356 [00:00<?, ?it/s]

Saved X_val (2847, 1024)
Extracting test embeddings...


Embedding batches:   0%|          | 0/358 [00:00<?, ?it/s]

Saved X_test (2859, 1024)
Done. Shapes:
  Train: (13300, 1024)
  Val:   (2847, 1024)
  Test:  (2859, 1024)
Tomato embeddings saved to: /kaggle/working/embeddings_tomato


In [11]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, normalize
import joblib
import os

EMB_DIR = "/kaggle/working/embeddings_tomato"
MODEL_OUT = os.path.join(EMB_DIR, "stage2_classifier_model_tomato.joblib")


# load
X_train = np.load(os.path.join(EMB_DIR, "X_train.npy"))
y_train = np.load(os.path.join(EMB_DIR, "y_train.npy"))
X_val   = np.load(os.path.join(EMB_DIR, "X_val.npy"))
y_val   = np.load(os.path.join(EMB_DIR, "y_val.npy"))

print("Shapes loaded — X_train:", X_train.shape, "y_train:", y_train.shape, "X_val:", X_val.shape, "y_val:", y_val.shape)

# basic sanity
if X_train.size == 0 or X_val.size == 0:
    raise SystemExit("Empty embeddings — check previous step. Exiting.")

if X_train.shape[0] != y_train.shape[0] or X_val.shape[0] != y_val.shape[0]:
    raise SystemExit("Mismatch between number of embeddings and labels. Exiting.")

# Option A: Standardize (common). with_mean=True normally okay unless huge memmap / sparse
sc = StandardScaler(with_mean=True, with_std=True)
X_train_s = sc.fit_transform(X_train)
X_val_s   = sc.transform(X_val)

# Option B (alternative, often good for cosine-like): L2 normalize
# X_train_s = normalize(X_train, norm='l2')
# X_val_s   = normalize(X_val, norm='l2')

# Train logistic regression (use class_weight='balanced' if classes imbalanced)
clf = LogisticRegression(max_iter=2000, n_jobs=-1, C=1.0, class_weight=None, random_state=42)
clf.fit(X_train_s, y_train)

# Predict & evaluate
y_pred = clf.predict(X_val_s)
acc = accuracy_score(y_val, y_pred)
print(f"Validation accuracy: {acc:.4f}\n")
print("Classification report:\n", classification_report(y_val, y_pred))

# Save scaler + model
joblib.dump((sc, clf), MODEL_OUT)
print("Saved model to:", MODEL_OUT)


Shapes loaded — X_train: (13300, 1024) y_train: (13300,) X_val: (2847, 1024) y_val: (2847,)
Validation accuracy: 0.9456

Classification report:
               precision    recall  f1-score   support

           0       0.93      0.92      0.92       335
           1       0.83      0.83      0.83       165
           2       0.96      0.96      0.96       252
           3       0.93      0.92      0.93       301
           4       0.91      0.94      0.93       159
           5       0.94      0.88      0.91        67
           6       0.94      0.94      0.94       291
           7       0.94      0.96      0.95       251
           8       0.94      0.93      0.94       213
           9       0.99      0.99      0.99       813

    accuracy                           0.95      2847
   macro avg       0.93      0.93      0.93      2847
weighted avg       0.95      0.95      0.95      2847

Saved model to: /kaggle/working/embeddings_tomato/stage2_classifier_model_tomato.joblib


# STAGE 2 POTATO

In [12]:
from pathlib import Path
import numpy as np, torch
from PIL import Image
import torchvision.transforms as T
from transformers import AutoModel
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login
from tqdm.auto import tqdm

# -------- CONFIG --------
SPLIT_ROOT = Path("/kaggle/working/Stage_2_Splits/Potato")
OUT = Path("/kaggle/working/embeddings_potato"); OUT.mkdir(parents=True, exist_ok=True)
BATCH_SIZE = 8
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "facebook/dinov3-convnext-base-pretrain-lvd1689m"
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".bmp"}
# -------------------------

# Hugging Face auth
try:
    hf_token = UserSecretsClient().get_secret("HF_TOKEN")
    login(token=hf_token, new_session=False)
except:
    print("HF_TOKEN not found in Kaggle Secrets — proceeding without it.")
    hf_token = None

# Load pretrained DINOv2 model
model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True, token=hf_token).to(DEVICE).eval()

# Image transform
transform = T.Compose([
    T.Resize(256),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
])

# ---------------------------------------------------------------
# Collect class folders under Potato/train
# ---------------------------------------------------------------
train_dir = SPLIT_ROOT / "train"
if not train_dir.exists():
    raise SystemExit(f"The directory {train_dir} does not exist.")

classes = sorted([p.name for p in train_dir.iterdir() if p.is_dir()])
if not classes:
    raise SystemExit("No disease folders found inside Potato/train.")

cls2idx = {c:i for i,c in enumerate(classes)}
print(f"Found {len(classes)} Potato disease classes:\n", classes)

# ---------------------------------------------------------------
# Collect image paths and labels for each split
# ---------------------------------------------------------------
def collect_from_split(split_root, split_name):
    paths, labels = [], []
    split_folder = split_root / split_name
    if not split_folder.exists():
        return paths, np.array(labels, dtype=np.int32)

    for c in classes:
        class_folder = split_folder / c
        if not class_folder.exists():
            continue
        for f in sorted(class_folder.iterdir()):
            if f.is_file() and f.suffix.lower() in IMAGE_EXTS:
                paths.append(str(f))
                labels.append(cls2idx[c])
    return paths, np.array(labels, dtype=np.int32)

train_paths, y_train = collect_from_split(SPLIT_ROOT, "train")
val_paths,   y_val   = collect_from_split(SPLIT_ROOT, "validation")
test_paths,  y_test  = collect_from_split(SPLIT_ROOT, "test")

print(f"Images — train: {len(train_paths)}, val: {len(val_paths)}, test: {len(test_paths)}")

# ---------------------------------------------------------------
# Embedding extraction
# ---------------------------------------------------------------
def batch_embed(paths, batch_size=BATCH_SIZE):
    embs = []
    model.eval()
    for i in tqdm(range(0, len(paths), batch_size), desc="Embedding batches"):
        batch = paths[i:i+batch_size]
        imgs = []
        for p in batch:
            try:
                imgs.append(transform(Image.open(p).convert("RGB")))
            except Exception as e:
                print("skip:", p, "err:", e)
        if not imgs:
            continue
        x = torch.stack(imgs).to(DEVICE)
        with torch.no_grad():
            out = model(pixel_values=x)
        embs.append(out.last_hidden_state[:,0,:].cpu().numpy())
    return np.vstack(embs) if embs else np.zeros((0,0), dtype=np.float32)

# ---------------------------------------------------------------
# Save embeddings
# ---------------------------------------------------------------
if (OUT/"X_train.npy").exists():
    print("Embeddings already exist — loading.")
    X_train = np.load(OUT/"X_train.npy"); y_train = np.load(OUT/"y_train.npy")
    X_val   = np.load(OUT/"X_val.npy");   y_val   = np.load(OUT/"y_val.npy")
    X_test  = np.load(OUT/"X_test.npy");  y_test  = np.load(OUT/"y_test.npy")
else:
    print("Extracting train embeddings...")
    X_train = batch_embed(train_paths)
    np.save(OUT/"X_train.npy", X_train); np.save(OUT/"y_train.npy", y_train)
    print("Saved X_train", X_train.shape)

    print("Extracting val embeddings...")
    X_val = batch_embed(val_paths)
    np.save(OUT/"X_val.npy", X_val); np.save(OUT/"y_val.npy", y_val)
    print("Saved X_val", X_val.shape)

    print("Extracting test embeddings...")
    X_test = batch_embed(test_paths)
    np.save(OUT/"X_test.npy", X_test); np.save(OUT/"y_test.npy", y_test)
    print("Saved X_test", X_test.shape)

print("Done. Shapes:")
print("  Train:", np.load(OUT/'X_train.npy').shape)
print("  Val:  ", np.load(OUT/'X_val.npy').shape)
print("  Test: ", np.load(OUT/'X_test.npy').shape)
print("Potato embeddings saved to:", OUT.resolve())


Found 3 Potato disease classes:
 ['Early blight', 'Healthy', 'Late blight']
Images — train: 1639, val: 350, test: 355
Extracting train embeddings...


Embedding batches:   0%|          | 0/205 [00:00<?, ?it/s]

Saved X_train (1639, 1024)
Extracting val embeddings...


Embedding batches:   0%|          | 0/44 [00:00<?, ?it/s]

Saved X_val (350, 1024)
Extracting test embeddings...


Embedding batches:   0%|          | 0/45 [00:00<?, ?it/s]

Saved X_test (355, 1024)
Done. Shapes:
  Train: (1639, 1024)
  Val:   (350, 1024)
  Test:  (355, 1024)
Potato embeddings saved to: /kaggle/working/embeddings_potato


In [13]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, normalize
import joblib
import os

EMB_DIR = "/kaggle/working/embeddings_potato"
MODEL_OUT = os.path.join(EMB_DIR, "stage2_classifier_model_potato.joblib")


# load
X_train = np.load(os.path.join(EMB_DIR, "X_train.npy"))
y_train = np.load(os.path.join(EMB_DIR, "y_train.npy"))
X_val   = np.load(os.path.join(EMB_DIR, "X_val.npy"))
y_val   = np.load(os.path.join(EMB_DIR, "y_val.npy"))

print("Shapes loaded — X_train:", X_train.shape, "y_train:", y_train.shape, "X_val:", X_val.shape, "y_val:", y_val.shape)

# basic sanity
if X_train.size == 0 or X_val.size == 0:
    raise SystemExit("Empty embeddings — check previous step. Exiting.")

if X_train.shape[0] != y_train.shape[0] or X_val.shape[0] != y_val.shape[0]:
    raise SystemExit("Mismatch between number of embeddings and labels. Exiting.")

# Option A: Standardize (common). with_mean=True normally okay unless huge memmap / sparse
sc = StandardScaler(with_mean=True, with_std=True)
X_train_s = sc.fit_transform(X_train)
X_val_s   = sc.transform(X_val)

# Option B (alternative, often good for cosine-like): L2 normalize
# X_train_s = normalize(X_train, norm='l2')
# X_val_s   = normalize(X_val, norm='l2')

# Train logistic regression (use class_weight='balanced' if classes imbalanced)
clf = LogisticRegression(max_iter=2000, n_jobs=-1, C=1.0, class_weight=None, random_state=42)
clf.fit(X_train_s, y_train)

# Predict & evaluate
y_pred = clf.predict(X_val_s)
acc = accuracy_score(y_val, y_pred)
print(f"Validation accuracy: {acc:.4f}\n")
print("Classification report:\n", classification_report(y_val, y_pred))

# Save scaler + model
joblib.dump((sc, clf), MODEL_OUT)
print("Saved model to:", MODEL_OUT)

Shapes loaded — X_train: (1639, 1024) y_train: (1639,) X_val: (350, 1024) y_val: (350,)
Validation accuracy: 0.9457

Classification report:
               precision    recall  f1-score   support

           0       0.96      0.94      0.95       164
           1       0.95      0.91      0.93        22
           2       0.93      0.96      0.94       164

    accuracy                           0.95       350
   macro avg       0.95      0.94      0.94       350
weighted avg       0.95      0.95      0.95       350

Saved model to: /kaggle/working/embeddings_potato/stage2_classifier_model_potato.joblib


# 2 STAGE INFERENCE PIPELINE

In [21]:
import os
import numpy as np
import pandas as pd
import joblib
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# ================== PATHS ==================
STAGE1_DIR = "/kaggle/working/embeddings"
STAGE2_TOMATO_DIR = "/kaggle/working/embeddings_tomato"
STAGE2_POTATO_DIR = "/kaggle/working/embeddings_potato"

STAGE1_MODEL_PATH = os.path.join(STAGE1_DIR, "stage1_classifier_model.joblib")
STAGE2_TOMATO_MODEL_PATH = os.path.join(STAGE2_TOMATO_DIR, "stage2_classifier_model_tomato.joblib")
STAGE2_POTATO_MODEL_PATH = os.path.join(STAGE2_POTATO_DIR, "stage2_classifier_model_potato.joblib")

OUTPUT_CSV = "/kaggle/working/final_inference_results.csv"

# ================== UTILS ==================
def load_model_safely(path):
    """Handles joblib files that may store scaler+model tuples or dicts."""
    obj = joblib.load(path)

    # Case 1: tuple (scaler, model)
    if isinstance(obj, tuple):
        if len(obj) == 2 and hasattr(obj[1], "predict"):
            return obj  # (scaler, model)
        else:
            return obj[0]

    # Case 2: dict format
    if isinstance(obj, dict):
        if "scaler" in obj and "model" in obj:
            return (obj["scaler"], obj["model"])
        elif "model" in obj:
            return obj["model"]

    # Case 3: direct model
    return obj


def predict_with_optional_scaler(model_obj, X):
    """Applies scaler if available and returns predictions."""
    if isinstance(model_obj, tuple):
        scaler, model = model_obj
        X = scaler.transform(X)
        return model.predict(X), model
    else:
        return model_obj.predict(X), model_obj


def predict_proba_with_optional_scaler(model_obj, X):
    """Predicts probabilities (for multi-class models)."""
    if isinstance(model_obj, tuple):
        scaler, model = model_obj
        X = scaler.transform(X)
        return model.predict_proba(X), model
    else:
        return model_obj.predict_proba(X), model_obj


# ================== LOAD MODELS ==================
print(" Loading trained models...")
stage1_model = load_model_safely(STAGE1_MODEL_PATH)
stage2_tomato = load_model_safely(STAGE2_TOMATO_MODEL_PATH)
stage2_potato = load_model_safely(STAGE2_POTATO_MODEL_PATH)
print("All models loaded successfully.\n")

# ================== LOAD EMBEDDINGS ==================
print(" Loading precomputed embeddings...")

X_stage1 = np.load(os.path.join(STAGE1_DIR, "X_test.npy"))
y_stage1 = np.load(os.path.join(STAGE1_DIR, "y_test.npy"))

X_tomato = np.load(os.path.join(STAGE2_TOMATO_DIR, "X_test.npy"))
y_tomato = np.load(os.path.join(STAGE2_TOMATO_DIR, "y_test.npy"))

X_potato = np.load(os.path.join(STAGE2_POTATO_DIR, "X_test.npy"))
y_potato = np.load(os.path.join(STAGE2_POTATO_DIR, "y_test.npy"))

print(f"Stage1 embeddings: {X_stage1.shape}")
print(f"Tomato embeddings: {X_tomato.shape}")
print(f"Potato embeddings: {X_potato.shape}\n")

# ================== STAGE 1 INFERENCE ==================
print(" Running Stage 1: Plant type classification...")
stage1_preds, stage1_clf = predict_with_optional_scaler(stage1_model, X_stage1)
stage1_labels = np.array(["Tomato" if p == 1 else "Potato" for p in stage1_preds])
stage1_acc = accuracy_score(y_stage1, stage1_preds)
print(f"Stage 1 Accuracy: {stage1_acc*100:.2f}%\n")

# ================== STAGE 2 INFERENCE ==================
print(" Running Stage 2: Disease classification...")

# Tomato disease prediction
print(" Predicting Tomato diseases...")
tomato_probs, tomato_clf = predict_proba_with_optional_scaler(stage2_tomato, X_tomato)
tomato_preds = tomato_clf.classes_[np.argmax(tomato_probs, axis=1)]
tomato_acc = accuracy_score(y_tomato, tomato_preds)
print(f" Tomato Disease Accuracy: {tomato_acc*100:.2f}%\n")

# Potato disease prediction
print(" Predicting Potato diseases...")
potato_probs, potato_clf = predict_proba_with_optional_scaler(stage2_potato, X_potato)
potato_preds = potato_clf.classes_[np.argmax(potato_probs, axis=1)]
potato_acc = accuracy_score(y_potato, potato_preds)
print(f" Potato Disease Accuracy: {potato_acc*100:.2f}%\n")

# ================== SAVE RESULTS ==================
print("Saving predictions...")

df_stage1 = pd.DataFrame({
    "true_plant": y_stage1,
    "predicted_plant": stage1_labels
})

df_tomato = pd.DataFrame({
    "plant": "Tomato",
    "true_disease": y_tomato,
    "predicted_disease": tomato_preds
})

df_potato = pd.DataFrame({
    "plant": "Potato",
    "true_disease": y_potato,
    "predicted_disease": potato_preds
})

final_df = pd.concat([df_stage1, df_tomato, df_potato], axis=0, ignore_index=True)
final_df.to_csv(OUTPUT_CSV, index=False)

print(f" All predictions saved to {OUTPUT_CSV}")
print(f"\n Summary:")
print(f"Stage1 Accuracy: {stage1_acc*100:.2f}%")
print(f"Tomato Disease Accuracy: {tomato_acc*100:.2f}%")
print(f"Potato Disease Accuracy: {potato_acc*100:.2f}%")


 Loading trained models...
All models loaded successfully.

 Loading precomputed embeddings...
Stage1 embeddings: (3205, 1024)
Tomato embeddings: (2859, 1024)
Potato embeddings: (355, 1024)

 Running Stage 1: Plant type classification...
Stage 1 Accuracy: 99.16%

 Running Stage 2: Disease classification...
 Predicting Tomato diseases...
 Tomato Disease Accuracy: 95.21%

 Predicting Potato diseases...
 Potato Disease Accuracy: 96.34%

Saving predictions...
 All predictions saved to /kaggle/working/final_inference_results.csv

 Summary:
Stage1 Accuracy: 99.16%
Tomato Disease Accuracy: 95.21%
Potato Disease Accuracy: 96.34%


In [28]:
import os
import numpy as np
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score

# ---------------- CONFIG ----------------
STAGE1_DIR = "/kaggle/working/embeddings"
STAGE2_TOMATO_DIR = "/kaggle/working/embeddings_tomato"
STAGE2_POTATO_DIR = "/kaggle/working/embeddings_potato"

STAGE1_MODEL_PATH = os.path.join(STAGE1_DIR, "stage1_classifier_model.joblib")
STAGE2_TOMATO_MODEL_PATH = os.path.join(STAGE2_TOMATO_DIR, "stage2_classifier_model_tomato.joblib")
STAGE2_POTATO_MODEL_PATH = os.path.join(STAGE2_POTATO_DIR, "stage2_classifier_model_potato.joblib")

OUTPUT_CSV = "/kaggle/working/final_pipeline_results_named.csv"

# ---------------- HELPERS ----------------
def load_model(path):
    obj = joblib.load(path)
    if isinstance(obj, tuple):
        return obj
    if isinstance(obj, dict):
        return (obj.get("scaler", None), obj.get("model", None))
    return (None, obj)

def predict_with_scaler(model_tuple, X):
    scaler, model = model_tuple
    if scaler is not None:
        X = scaler.transform(X)
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X), model
    return model.predict(X), model

def get_labels(probs_or_preds, classes_map):
    if probs_or_preds.ndim > 1 and probs_or_preds.shape[1] > 1:
        preds = probs_or_preds.argmax(axis=1)
    else:
        preds = probs_or_preds.flatten()
    return np.array([classes_map[p] for p in preds]), preds

# ---------------- LOAD MODELS ----------------
stage1_model = load_model(STAGE1_MODEL_PATH)
stage2_tomato_model = load_model(STAGE2_TOMATO_MODEL_PATH)
stage2_potato_model = load_model(STAGE2_POTATO_MODEL_PATH)

# ---------------- LOAD EMBEDDINGS ----------------
X_stage1 = np.load(os.path.join(STAGE1_DIR, "X_test.npy"))
y_stage1 = np.load(os.path.join(STAGE1_DIR, "y_test.npy"))

X_tomato = np.load(os.path.join(STAGE2_TOMATO_DIR, "X_test.npy"))
y_tomato = np.load(os.path.join(STAGE2_TOMATO_DIR, "y_test.npy"))

X_potato = np.load(os.path.join(STAGE2_POTATO_DIR, "X_test.npy"))
y_potato = np.load(os.path.join(STAGE2_POTATO_DIR, "y_test.npy"))

STAGE1_CLASSES_MAP = {0: "Potato", 1: "Tomato"}

# ---------------- STAGE 1 ----------------
stage1_probs, _ = predict_with_scaler(stage1_model, X_stage1)
stage1_labels, stage1_preds = get_labels(stage1_probs, STAGE1_CLASSES_MAP)

# ---------------- STAGE 2 ----------------
# Define actual class names
tomato_class_names = ['Bacterial spot', 'Early blight', 'Healthy', 'Late blight', 'Leaf mold', 
                      'Mosaic virus', 'Septoria leaf spot', 'Spider mites two-spotted spider mite', 
                      'Target spot', 'Yellow leaf curl virus']
tomato_classes_map = {i: name for i, name in enumerate(tomato_class_names)}

potato_class_names = ['Early blight', 'Healthy', 'Late blight']
potato_classes_map = {i: name for i, name in enumerate(potato_class_names)}

# Predict Stage 2
tomato_probs, _ = predict_with_scaler(stage2_tomato_model, X_tomato)
tomato_labels, _ = get_labels(tomato_probs, tomato_classes_map)
y_tomato_named = np.array([tomato_classes_map[i] for i in y_tomato])

potato_probs, _ = predict_with_scaler(stage2_potato_model, X_potato)
potato_labels, _ = get_labels(potato_probs, potato_classes_map)
y_potato_named = np.array([potato_classes_map[i] for i in y_potato])

# ---------------- EVALUATION ----------------
total = correct_stage1 = correct_stage2 = 0
class_stats = []
pipeline_results = []

tomato_index = 0
potato_index = 0

for i in range(len(stage1_labels)):
    true_plant = STAGE1_CLASSES_MAP[y_stage1[i]]
    pred_plant = stage1_labels[i]

    # Assign correct disease
    if true_plant == "Tomato":
        true_disease = y_tomato_named[tomato_index]
        pred_disease = tomato_labels[tomato_index]
        tomato_index += 1
    else:
        true_disease = y_potato_named[potato_index]
        pred_disease = potato_labels[potato_index]
        potato_index += 1

    # Per-class stats
    cls_key = f"{true_plant}: {true_disease}"
    found = next((x for x in class_stats if x["Class"] == cls_key), None)
    if found is None:
        class_stats.append({"Class": cls_key, "Correct_Stage1": 0, "Correct_Stage2": 0, "Total": 0})
        found = class_stats[-1]

    found["Total"] += 1
    total += 1

    if pred_plant == true_plant:
        correct_stage1 += 1
        found["Correct_Stage1"] += 1
        if pred_disease == true_disease:
            correct_stage2 += 1
            found["Correct_Stage2"] += 1

    pipeline_results.append({
        "true_plant": true_plant,
        "predicted_plant": pred_plant,
        "true_disease": true_disease,
        "predicted_disease": pred_disease,
        "pipeline_correct": (pred_plant == true_plant and pred_disease == true_disease)
    })

# ---------------- OVERALL ACCURACY ----------------
stage1_acc = (correct_stage1 / total) * 100
stage2_acc = (correct_stage2 / total) * 100

print(f"1. Stage 1 (Plant Type) Accuracy: {stage1_acc:.2f}% ({correct_stage1}/{total})")
print(f"2. Overall Pipeline (Disease) Accuracy: {stage2_acc:.2f}% ({correct_stage2}/{total})\n")

# ---------------- PER-CLASS BREAKDOWN ----------------
print("[PER-CLASS ACCURACY BREAKDOWN]")
print(f"{'Class':<50} {'Stage1✅':>9} {'Stage2✅':>9} {'Total':>9} {'Acc Stage1%':>15} {'Acc Stage2%':>15}")
print("-" * 100)
for stats in sorted(class_stats, key=lambda x: x["Class"]):
    s1c = stats["Correct_Stage1"]
    s2c = stats["Correct_Stage2"]
    t = stats["Total"]
    s1a = (s1c / t) * 100 if t else 0
    s2a = (s2c / t) * 100 if t else 0
    print(f"{stats['Class']:<50} {s1c:>9} {s2c:>9} {t:>9} {s1a:>15.2f} {s2a:>15.2f}")

# ---------------- SAVE FULL RESULTS CSV ----------------
df_final = pd.DataFrame(pipeline_results)
df_final.to_csv(OUTPUT_CSV, index=False)
print(f"\nFull predictions with disease names saved to {OUTPUT_CSV}")


1. Stage 1 (Plant Type) Accuracy: 99.16% (3178/3205)
2. Overall Pipeline (Disease) Accuracy: 94.51% (3029/3205)

[PER-CLASS ACCURACY BREAKDOWN]
Class                                                Stage1✅   Stage2✅     Total     Acc Stage1%     Acc Stage2%
----------------------------------------------------------------------------------------------------
Potato: Early blight                                     165       157       166           99.40           94.58
Potato: Healthy                                           23        22        24           95.83           91.67
Potato: Late blight                                      154       151       163           94.48           92.64
Tomato: Bacterial spot                                   336       308       336          100.00           91.67
Tomato: Early blight                                     167       142       167          100.00           85.03
Tomato: Healthy                                          252       241       

# TRAIL