---
## 1. Install Dependencies and Setup

In [None]:
# Install required packages (notebook shell magics; -q keeps pip output quiet)
!pip install -q kagglehub
!pip install -q ultralytics
!pip install -q scikit-learn

# Imports shared by the cells below
import os
import yaml
from pathlib import Path
import kagglehub

print("‚úÖ Dependencies installed successfully")

---
## 2. Download Dataset from KaggleHub

In [None]:
# Fetch the YOLO-format PlantVillage dataset through KaggleHub.
# The download is cached locally and the call hands back its location.
dataset_handle = "sebastianpalaciob/plantvillage-for-object-detection-yolo"
path = kagglehub.dataset_download(dataset_handle)

# Keep the location as a Path object; later cells build their sub-paths from it
dataset_root = Path(path)

print("Path to dataset files:", path)
print("\nüìÅ Dataset downloaded successfully!")
print(f"Dataset root: {dataset_root}")

---
## 3. Verify Dataset Structure and Locate Files

In [None]:
# Inspect the downloaded dataset and resolve the paths the later cells rely on
print("üîç Exploring dataset structure...\n")


def _print_entries(folder):
    """Print every entry of *folder*, prefixed with a directory or file marker."""
    for entry in folder.iterdir():
        marker = "üìÅ" if entry.is_dir() else "üìÑ"
        print(f"{marker} {entry.name}")


# Top level of the download
print("üìÅ Top-level contents:")
_print_entries(dataset_root)

print("\nüîç Checking for dataset directories...")

# Use the 'Dataset' sub-folder when present, otherwise fall back to the root
nested_dir = dataset_root / "Dataset"
if not nested_dir.exists():
    print("‚ö†Ô∏è 'Dataset' folder not found, using root as dataset directory")
    dataset_dir = dataset_root
else:
    print("‚úÖ Found 'Dataset' folder")
    dataset_dir = nested_dir

    print("\nüìÅ Contents of Dataset folder:")
    _print_entries(dataset_dir)

# Both the images and the labels folder are required
images_dir = dataset_dir / "images"
labels_dir = dataset_dir / "labels"

if not (images_dir.exists() and labels_dir.exists()):
    raise ValueError("‚ùå images or labels folder not found!")

num_images = len(list(images_dir.glob('*.*')))
num_labels = len(list(labels_dir.glob('*.txt')))
print(f"\n‚úÖ Found images: {num_images}")
print(f"‚úÖ Found labels: {num_labels}")

# classes.yaml is looked up in the download root first, then in the dataset folder
classes_candidates = (dataset_root / "classes.yaml", dataset_dir / "classes.yaml")
classes_file = next((c for c in classes_candidates if c.exists()), None)

if classes_file is None:
    raise ValueError("‚ùå classes.yaml not found!")
print(f"\n‚úÖ Found classes.yaml at: {classes_file}")

print("\n" + "="*60)
print("‚úÖ Dataset structure verified!")
print("="*60)

---
## 4. Read Classes and Split Dataset

In [None]:
# Read classes from classes.yaml
import shutil
from sklearn.model_selection import train_test_split
import random

print("üìñ Reading classes.yaml...\n")

# Read classes (explicit encoding so the result does not depend on the platform default)
with open(classes_file, 'r', encoding='utf-8') as f:
    classes_data = yaml.safe_load(f)

print("Classes data:")
print(classes_data)


def _as_name_list(names):
    """Return class names as a plain list whose position is the class id.

    Args:
        names: Either a list/tuple of names, or a mapping of class id to
            name such as ``{0: 'healthy', 1: 'rust'}``.

    Returns:
        A new list of names. Mappings with integer keys are ordered by key;
        other mappings keep their insertion order.

    Raises:
        ValueError: If *names* is neither a sequence nor a mapping
            (for example when the YAML file is empty).
    """
    if isinstance(names, dict):
        keys = list(names)
        if all(isinstance(key, int) for key in keys):
            keys.sort()
        return [names[key] for key in keys]
    if isinstance(names, (list, tuple)):
        return list(names)
    raise ValueError(f"Unsupported class list in classes.yaml: {names!r}")


# Extract class names: prefer a 'names' entry, otherwise treat the whole
# document as the class list. The result is always a list, so the slicing
# below and the positional lookups in later cells work for mapping-style
# 'names' as well.
if isinstance(classes_data, dict) and 'names' in classes_data:
    class_names = _as_name_list(classes_data['names'])
else:
    class_names = _as_name_list(classes_data)

num_classes = len(class_names)
print(f"\nüìä Total classes: {num_classes}")
print(f"First 5 classes: {class_names[:5]}")

# Get all image files.
# Extensions are compared case-insensitively: a plain glob('*.jpg') does not
# match 'leaf.JPG' on case-sensitive filesystems, which would silently drop
# those images from the split.
image_suffixes = {'.jpg', '.jpeg', '.png'}
image_files = sorted(
    entry for entry in images_dir.iterdir()
    if entry.is_file() and entry.suffix.lower() in image_suffixes
)
print(f"\nüì∏ Total images found: {len(image_files)}")

if not image_files:
    raise ValueError("‚ùå No image files found! Check the dataset structure.")

# Build the <split>/images and <split>/labels folders for every split
print("\nüìÅ Creating train/val/test directory structure...")

for split in ('train', 'valid', 'test'):
    for kind in ('images', 'labels'):
        (dataset_dir / split / kind).mkdir(parents=True, exist_ok=True)

print("‚úÖ Directories created")

# Split dataset: 70% train, 20% valid, 10% test
random.seed(42)
image_names = [img.name for img in image_files]

# First split: 70% train, 30% held out
train_names, temp_names = train_test_split(image_names, test_size=0.3, random_state=42)

# Second split: the held-out 30% becomes 20% valid + 10% test, so exactly one
# third of it goes to test (0.33 would leave the test share just under 10%).
valid_names, test_names = train_test_split(temp_names, test_size=1/3, random_state=42)

print(f"\nüìä Dataset split:")
print(f"  Train: {len(train_names)} images ({len(train_names)/len(image_names)*100:.1f}%)")
print(f"  Valid: {len(valid_names)} images ({len(valid_names)/len(image_names)*100:.1f}%)")
print(f"  Test:  {len(test_names)} images ({len(test_names)/len(image_names)*100:.1f}%)")

# Announce the copy step that follows
print("\nüì¶ Copying files to train/valid/test folders...")
print("‚è≥ This may take a few minutes...\n")

def copy_files(file_names, split, images_src=None, labels_src=None, dest_root=None):
    """Copy images and their YOLO label files into one split folder.

    Args:
        file_names: Image file names (not full paths) to copy.
        split: Name of the split folder, e.g. ``'train'``.
        images_src: Folder holding the source images. Defaults to the
            notebook-level ``images_dir``.
        labels_src: Folder holding the source ``.txt`` label files.
            Defaults to the notebook-level ``labels_dir``.
        dest_root: Folder that contains the split folders. Defaults to the
            notebook-level ``dataset_dir``.

    Returns:
        Number of image/label pairs copied. An image without a label file
        is still copied but not counted; a missing image is skipped.
    """
    # Fall back to the notebook globals only when no explicit folder is given
    images_src = Path(images_dir if images_src is None else images_src)
    labels_src = Path(labels_dir if labels_src is None else labels_src)
    dest_root = Path(dataset_dir if dest_root is None else dest_root)

    dst_images = dest_root / split / 'images'
    dst_labels = dest_root / split / 'labels'
    # Create the targets here so the function also works on a fresh folder
    dst_images.mkdir(parents=True, exist_ok=True)
    dst_labels.mkdir(parents=True, exist_ok=True)

    copied = 0
    for name in file_names:
        src_img = images_src / name
        if not src_img.exists():
            continue
        shutil.copy2(src_img, dst_images / name)

        # The label shares the image's stem: image.jpg -> image.txt
        label_name = src_img.stem + '.txt'
        src_lbl = labels_src / label_name
        if src_lbl.exists():
            shutil.copy2(src_lbl, dst_labels / label_name)
            copied += 1
    return copied

# Fill the three split folders in order; copy_files returns the number of
# image/label pairs it copied for the split
train_copied, valid_copied, test_copied = [
    copy_files(names, split_name)
    for names, split_name in (
        (train_names, 'train'),
        (valid_names, 'valid'),
        (test_names, 'test'),
    )
]

print(f"  ‚úÖ Train: {train_copied} image-label pairs copied")
print(f"  ‚úÖ Valid: {valid_copied} image-label pairs copied")
print(f"  ‚úÖ Test:  {test_copied} image-label pairs copied")
print("\n‚úÖ Dataset split complete!")

---
## 5. Create data.yaml Configuration File

In [None]:
# Write the data.yaml dataset description used for YOLOv8 training
print("üìù Creating data.yaml configuration...\n")

# 'path' is the absolute dataset folder; the split entries are given relative to it
data_config = {
    'path': str(dataset_dir.absolute()),
    'train': 'train/images',
    'val': 'valid/images',
    'test': 'test/images',
    'nc': num_classes,
    'names': class_names
}

# Serialize once, then show and store the very same text
print("data.yaml configuration:")
yaml_text = yaml.dump(data_config, default_flow_style=False, sort_keys=False)
print(yaml_text)

yaml_path = dataset_dir / "data.yaml"
with open(yaml_path, 'w') as f:
    f.write(yaml_text)

print(f"\n‚úÖ data.yaml created at: {yaml_path}")
print(f"\nüìä Dataset configuration:")
print(f"  Classes: {num_classes}")
print(f"  Path: {data_config['path']}")
print(f"  Train: {data_config['train']}")
print(f"  Val: {data_config['val']}")
print(f"  Test: {data_config['test']}")

---
## 6. Verify GPU Availability

In [None]:
# Check GPU availability using nvidia-smi
print("üñ•Ô∏è Checking GPU availability...\n")
!nvidia-smi

# Verify PyTorch can access GPU
import torch
print(f"\nüî• PyTorch version: {torch.__version__}")
print(f"‚úÖ CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("‚ö†Ô∏è Warning: No GPU detected. Training will be slow on CPU.")

---
## 7. Train YOLOv8 with OOM Handling

In [None]:
# Train YOLOv8 with automatic batch size reduction on OOM
from ultralytics import YOLO
import gc
import torch  # imported here as well so the cell does not depend on the GPU-check cell

# Training parameters
model_name = 'yolov8n.pt'  # Lightweight YOLOv8 nano model
img_size = 640
epochs = 50
batch_sizes = [16, 8, 4]  # Try these batch sizes in order if OOM occurs

print(f"üöÄ Starting YOLOv8 training...\n")
print(f"Model: {model_name}")
print(f"Image size: {img_size}")
print(f"Epochs: {epochs}")
print(f"Dataset: {yaml_path}\n")

# Try training with different batch sizes if OOM occurs
trained = False
model = None
for batch_size in batch_sizes:
    print(f"\nüì¶ Attempting training with batch size: {batch_size}")

    # Release whatever a previous attempt left behind. Order matters: the old
    # model has to be unreferenced and garbage-collected first, otherwise
    # empty_cache() cannot hand its memory back to the GPU.
    model = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    try:
        # Start every attempt from fresh pretrained weights
        model = YOLO(model_name)

        results = model.train(
            data=str(yaml_path),
            epochs=epochs,
            imgsz=img_size,
            batch=batch_size,
            patience=10,  # Early stopping patience
            save=True,
            project='runs/detect',
            name='train',
            exist_ok=True,
            pretrained=True,
            optimizer='auto',
            verbose=True,
            seed=42,
            deterministic=False,
            single_cls=False,
            rect=False,
            cos_lr=False,
            close_mosaic=10,
            resume=False,
            amp=True,  # Automatic Mixed Precision
            fraction=1.0,
            profile=False,
            overlap_mask=True,
            mask_ratio=4,
            dropout=0.0,
            val=True,
        )
    except RuntimeError as e:
        # Only out-of-memory failures are retried; anything else is re-raised
        if "out of memory" not in str(e).lower():
            print(f"\n‚ùå Training error: {e}")
            raise

        print(f"\n‚ö†Ô∏è OOM Error with batch size {batch_size}")
        if batch_size == batch_sizes[-1]:
            print("\n‚ùå Failed with smallest batch size. Cannot continue.")
            raise
        print(f"Retrying with smaller batch size...")
        continue

    print(f"\n‚úÖ Training completed successfully with batch size {batch_size}!")
    trained = True
    break

if not trained:
    print("\n‚ùå Training failed. Please check the error messages above.")

---
## 8. Display Training Results

In [None]:
# Show the plots and val_batch images found in the training output folder
from IPython.display import Image, display
import glob

print("üìä Training Results\n")

# Folder the training cell writes to (project='runs/detect', name='train')
results_dir = Path('runs/detect/train')

if not results_dir.exists():
    print("‚ö†Ô∏è Results directory not found. Training may have failed.")
else:
    # Metric plots, shown only when the file is present
    plot_files = ['results.png', 'confusion_matrix.png', 'F1_curve.png', 'PR_curve.png', 'P_curve.png', 'R_curve.png']
    existing_plots = [results_dir / fname for fname in plot_files if (results_dir / fname).exists()]
    for plot_path in existing_plots:
        print(f"\nüìà {plot_path.name}")
        display(Image(filename=str(plot_path)))

    # Up to three of the val_batch*.jpg images, if any exist
    val_batch_files = list(results_dir.glob('val_batch*.jpg'))
    if val_batch_files:
        print(f"\nüñºÔ∏è Sample validation predictions:")
        for batch_image in val_batch_files[:3]:
            print(f"\n{batch_image.name}")
            display(Image(filename=str(batch_image)))

---
## 9. Run Validation and Display Metrics

In [None]:
# Load the best trained model and run validation
from ultralytics import YOLO

# Best checkpoint written by the training cell
best_model_path = 'runs/detect/train/weights/best.pt'

if Path(best_model_path).exists():
    print("üîç Running validation on best model...\n")
    
    # Load best model
    model = YOLO(best_model_path)
    
    # Run validation on the 'val' split declared in data.yaml
    metrics = model.val(data=str(yaml_path), split='val')
    
    # Display key metrics
    print("\n" + "="*60)
    print("üìä VALIDATION METRICS")
    print("="*60)
    print(f"mAP50-95:  {metrics.box.map:.4f}")
    print(f"mAP50:     {metrics.box.map50:.4f}")
    print(f"mAP75:     {metrics.box.map75:.4f}")
    print(f"Precision: {metrics.box.mp:.4f}")
    print(f"Recall:    {metrics.box.mr:.4f}")
    print("="*60)
    
    # Display per-class metrics if available.
    # metrics.box.maps holds the mAP50-95 of each class (not mAP50), so the
    # heading names that metric.
    if hasattr(metrics.box, 'maps'):
        print("\nüìã Per-class mAP50-95 (first 10 classes):")
        for i, map_value in enumerate(metrics.box.maps[:10]):
            # Fall back to a generic label if the index has no name
            class_name = class_names[i] if i < len(class_names) else f"Class {i}"
            print(f"  {class_name}: {map_value:.4f}")
else:
    print(f"‚ùå Best model not found at {best_model_path}")
    print("Training may not have completed successfully.")

---
## 10. Run Inference on Test Images

In [None]:
# Run predictions on test images with low confidence threshold
import random
from IPython.display import Image, display

if Path(best_model_path).exists():
    print("üéØ Running inference on test images...\n")
    
    # Load best model
    model = YOLO(best_model_path)
    
    # Get test images. Accept the same extensions as the split step, in any
    # letter case: plain '*.jpg' / '*.png' globs would skip '.jpeg' files and
    # upper-case extensions on case-sensitive filesystems.
    test_images_dir = dataset_dir / 'test' / 'images'
    image_suffixes = {'.jpg', '.jpeg', '.png'}
    test_images = []
    if test_images_dir.exists():
        test_images = sorted(
            p for p in test_images_dir.iterdir()
            if p.is_file() and p.suffix.lower() in image_suffixes
        )
    
    if test_images:
        # Select random test images
        num_samples = min(5, len(test_images))
        sample_images = random.sample(test_images, num_samples)
        
        print(f"Selected {num_samples} random test images for prediction\n")
        
        # Run predictions
        results = model.predict(
            source=sample_images,
            conf=0.10,  # Low confidence threshold to catch all detections
            iou=0.45,
            save=True,
            project='runs/detect',
            name='test_predictions',
            exist_ok=True,
            save_txt=True,
            save_conf=True,
            show_labels=True,
            show_conf=True,
            line_width=2,
        )
        
        print(f"\n‚úÖ Predictions completed!\n")
        
        # Display prediction results
        pred_dir = Path('runs/detect/test_predictions')
        if pred_dir.exists():
            saved_images = sorted(
                p for p in pred_dir.iterdir()
                if p.is_file() and p.suffix.lower() in image_suffixes
            )
            # exist_ok=True lets earlier runs leave files in this folder, so
            # prefer the outputs whose names match the images sampled now and
            # fall back to everything only if none match.
            sample_stems = {img.stem for img in sample_images}
            pred_images = [p for p in saved_images if p.stem in sample_stems] or saved_images
            print(f"üñºÔ∏è Displaying prediction results:\n")
            for pred_img in pred_images[:5]:
                print(f"\n{pred_img.name}")
                display(Image(filename=str(pred_img), width=800))
        
        # Print detection statistics (results are in the same order as sample_images)
        print("\n" + "="*60)
        print("üìä DETECTION STATISTICS")
        print("="*60)
        for i, result in enumerate(results):
            num_detections = len(result.boxes)
            img_name = sample_images[i].name
            print(f"{img_name}: {num_detections} detections")
            
            if num_detections > 0:
                for box in result.boxes:
                    cls = int(box.cls[0])
                    conf = float(box.conf[0])
                    class_name = class_names[cls] if cls < len(class_names) else f"Class {cls}"
                    print(f"  - {class_name}: {conf:.3f}")
        print("="*60)
        
    else:
        print(f"‚ö†Ô∏è No test images found in {test_images_dir}")
else:
    print(f"‚ùå Best model not found at {best_model_path}")

---
## 11. Download Trained Weights and Results

In [None]:
# Download selected result files plus a zip of the whole training run
# (uses google.colab.files, so this cell needs a Colab runtime)
import shutil
from google.colab import files

print("üì¶ Preparing files for download...\n")

# (path, human-readable description) pairs
download_files = [
    ('runs/detect/train/weights/best.pt', 'Best model weights'),
    ('runs/detect/train/weights/last.pt', 'Last epoch weights'),
    ('runs/detect/train/results.png', 'Training results plot'),
    ('runs/detect/train/results.csv', 'Training results CSV'),
    ('runs/detect/train/confusion_matrix.png', 'Confusion matrix'),
]

downloaded_count = 0
for file_path, description in download_files:
    if not Path(file_path).exists():
        print(f"  ‚ö†Ô∏è File not found: {file_path}")
        continue

    print(f"‚¨áÔ∏è Downloading: {description} ({file_path})")
    try:
        files.download(file_path)
    except Exception as e:
        # Best effort: report the failure and move on to the next file
        print(f"  ‚ö†Ô∏è Failed to download: {e}")
    else:
        downloaded_count += 1

print(f"\n‚úÖ Downloaded {downloaded_count}/{len(download_files)} files")

# Bundle the complete run folder into one archive
print("\nüì¶ Creating zip archive of all results...")
if not Path('runs/detect/train').exists():
    print("‚ö†Ô∏è Training results directory not found")
else:
    shutil.make_archive('yolov8_training_results', 'zip', 'runs/detect/train')
    print("‚¨áÔ∏è Downloading complete training results archive...")
    files.download('yolov8_training_results.zip')
    print("‚úÖ Archive downloaded successfully!")

---
## 12. Troubleshooting Notes

### Common Issues and Solutions:

#### 1. Empty Detections
**Symptoms:** The model detects nothing, or only very few objects

**Solutions:**
- **Lower confidence threshold:** Change `conf=0.10` to `conf=0.05` or even `conf=0.01`
- **Check label format:** YOLO labels must be normalized (0..1 range)
  ```python
  # Verify label format
  with open('path/to/label.txt', 'r') as f:
      for line in f:
          values = line.strip().split()
          # Format: class_id x_center y_center width height
          # All coordinates should be between 0 and 1
          print(values)
  ```
- **Verify image-label pairs:** Ensure each image has a corresponding label file
- **Check class indices:** Class IDs in labels must match the names list in data.yaml (0-indexed)

#### 2. Training Fails or Poor Performance
**Solutions:**
- **Insufficient training data:** Ensure you have enough samples per class (minimum 50-100)
- **Increase epochs:** Try 100-200 epochs for better convergence
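- **Adjust learning rate:** Set the initial learning rate with `lr0` (e.g. `lr0=0.01`) together with an explicit optimizer such as `optimizer='SGD'`; with `optimizer='auto'` (used above) Ultralytics chooses the learning rate itself and ignores `lr0`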
- **Data augmentation:** Ultralytics applies augmentation by default

#### 3. OOM (Out of Memory) Errors
**Solutions:**
- **Reduce batch size:** The code already handles this automatically (16→8→4)
- **Reduce image size:** Change `imgsz=640` to `imgsz=416` or `imgsz=320`
- **Use smaller model:** Keep using `yolov8n.pt` (nano is smallest)

#### 4. Dataset Format Issues
**Solutions:**
- **Verify YOLO format:** Each label file should contain:
  ```
  class_id x_center y_center width height
  ```
  All values normalized to 0-1 range
- **Matching filenames:** image.jpg should have corresponding label image.txt

#### 5. Low mAP Scores
**Solutions:**
- **Increase training time:** More epochs often improve mAP
- **Use larger model:** Upgrade from nano (n) to small (s): `yolov8s.pt`
- **Review confusion matrix:** Identify which classes are confused

### Additional Tips:
- **Resume training:** If training is interrupted:
  ```python
  model = YOLO('runs/detect/train/weights/last.pt')
  model.train(resume=True)
  ```
- **Export model:** Convert to other formats:
  ```python
  model.export(format='onnx')  # or 'tflite', 'coreml', etc.
  ```

---
## Quick Diagnostic Cell
Run this cell if you encounter issues

In [None]:
# Quick diagnostic checks
print("üîß Running diagnostics...\n")

# 1. File counts for every split
print("1Ô∏è‚É£ Dataset Structure:")
for split in ('train', 'valid', 'test'):
    img_dir = dataset_dir / split / 'images'
    lbl_dir = dataset_dir / split / 'labels'
    if not (img_dir.exists() and lbl_dir.exists()):
        print(f"  {split}: ‚ö†Ô∏è Missing directories")
        continue
    num_images = len(list(img_dir.glob('*.*')))
    num_labels = len(list(lbl_dir.glob('*.txt')))
    print(f"  {split}: {num_images} images, {num_labels} labels")

# 2. First three lines of one training label file
print("\n2Ô∏è‚É£ Sample Label Format:")
label_files = list((dataset_dir / 'train' / 'labels').glob('*.txt'))
if not label_files:
    print("  ‚ö†Ô∏è No label files found")
else:
    with open(label_files[0], 'r') as f:
        head_lines = f.readlines()[:3]
    for raw_line in head_lines:
        fields = raw_line.strip().split()
        # Lines with fewer than five fields are not reported
        if len(fields) < 5:
            continue
        cls, x, y, w, h = fields[:5]
        # The four box values are expected to lie within 0..1
        in_range = all(0 <= float(coord) <= 1 for coord in (x, y, w, h))
        status = "‚úÖ" if in_range else "‚ùå"
        print(f"  {status} class={cls}, x={x}, y={y}, w={w}, h={h}")

# 3. Key entries of data.yaml
print("\n3Ô∏è‚É£ data.yaml Configuration:")
if not yaml_path.exists():
    print("  ‚ùå data.yaml not found")
else:
    with open(yaml_path, 'r') as f:
        config = yaml.safe_load(f)
    # Use 'nc' when present, otherwise count the entries of 'names'
    class_count = config.get('nc', len(config.get('names', [])))
    print(f"  Path: {config.get('path', 'NOT SET')}")
    print(f"  Classes: {class_count}")
    print(f"  Train: {config.get('train', 'NOT SET')}")
    print(f"  Val: {config.get('val', 'NOT SET')}")

# 4. GPU visibility from PyTorch
print("\n4Ô∏è‚É£ GPU Status:")
import torch
if not torch.cuda.is_available():
    print("  ‚ö†Ô∏è No GPU available")
else:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"  ‚úÖ GPU: {torch.cuda.get_device_name(0)}")
    print(f"  Memory: {gpu_props.total_memory / 1e9:.2f} GB")

print("\n‚úÖ Diagnostics complete")