In [4]:
# =========================================================================
# SECTION 1: PACKAGE INSTALLATION AND SETUP
# =========================================================================
import subprocess
import sys
import os
from pathlib import Path
import time
from datetime import datetime
import warnings
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import yaml
import json
import psutil
from ultralytics import YOLO
import optuna
import albumentations as A
from sklearn.model_selection import train_test_split
import shutil
import glob
import random

warnings.filterwarnings('ignore')

# --- CONFIGURATION ---
# Define the source and target directories relative to the notebook location
# Assuming your previously combined (80k) dataset is locally available here:
SOURCE_DIR = Path("combined_balanced_dataset") 

# Target for the new, smaller 24k-image dataset
TARGET_DIR = Path("rdd_sampled_24k_local")

# Target training set size (excluding validation split)
TARGET_TRAIN_SIZE = 24000 

# Define the classes (must match your dataset.yaml)
CLASS_NAMES = {
    0: 'longitudinal crack',
    1: 'transverse crack',
    2: 'alligator crack',
    3: 'other corruption',
    4: 'Pothole'
}

# Install necessary packages if missing (laptop-specific path handling not needed here)
def install_packages():
    packages = ["ultralytics", "optuna", "kaggle", "opencv-python", "albumentations", "psutil", "scikit-learn"]
    for package in packages:
        try:
            __import__(package.replace('-', '_'))
        except ImportError:
            print(f"Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
install_packages()

print("All libraries imported and packages checked.")
print(f"PyTorch version: {torch.__version__}, CUDA available: {torch.cuda.is_available()}")

# =========================================================================
# SECTION 2: RESOURCE MONITOR AND CONSERVATIVE OPTIMIZATION (FOR LAPTOP)
# =========================================================================

class LaptopResourceMonitor:
    """Monitor and manage laptop resources during training"""
    
    def __init__(self, max_training_hours=8): # Increased max hours for a CPU/slower-GPU laptop run
        self.max_training_hours = max_training_hours
        self.start_time = time.time()
        self.gpu_available = torch.cuda.is_available()
        self.initial_setup()
    
    def initial_setup(self):
        print("\nAnalyzing laptop capabilities (Conservative Settings)...")
        cpu_count = psutil.cpu_count(logical=False) or psutil.cpu_count()
        memory = psutil.virtual_memory()
        
        print(f"CPU: {cpu_count} logical cores")
        print(f"RAM: {memory.total / (1024**3):.1f}GB")
        
        if self.gpu_available and torch.cuda.get_device_properties(0).total_memory > 2 * (1024**3):
            gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
            print(f"GPU: {torch.cuda.get_device_name(0)}, Memory: {gpu_memory:.1f}GB")
            
            # Moderate GPU settings
            self.batch_size = 8
            self.image_size = 640
            print("Moderate GPU detected - using balanced settings.")
        else:
            print("No powerful GPU detected - **CPU or Low-VRAM training**.")
            # Highly conservative settings for stability
            self.batch_size = 2 # VERY small batch for VRAM/RAM stability
            self.image_size = 416
        
        self.workers = min(cpu_count // 2, 4) # Max 4 workers for CPU bound tasks
        
        print(f"\nOptimized settings for stability:")
        print(f"  Batch Size: {self.batch_size}")
        print(f"  Image Size: {self.image_size}")
        print(f"  Workers: {self.workers}")
    
    def get_optimized_config(self):
        return {
            'batch_size': self.batch_size,
            'image_size': self.image_size,
            'workers': self.workers,
            'epochs': min(100, int(self.max_training_hours * 8)), # Max 8 epochs per hour estimate
            'patience': 20,
            'amp': self.gpu_available, # Only use mixed precision if GPU is available
            'cache': False, # Keep off for large dataset on HDD/SSD to save RAM
            'save_period': -1,
        }

monitor = LaptopResourceMonitor()

  from .autonotebook import tqdm as notebook_tqdm


Installing opencv-python...
Installing scikit-learn...
All libraries imported and packages checked.
PyTorch version: 2.9.0+cpu, CUDA available: False

Analyzing laptop capabilities (Conservative Settings)...
CPU: 4 logical cores
RAM: 15.8GB
No powerful GPU detected - **CPU or Low-VRAM training**.

Optimized settings for stability:
  Batch Size: 2
  Image Size: 416
  Workers: 2


In [2]:
# =========================================================================
# SECTION 3: CLASS-BALANCED SAMPLING (24K IMAGES)
# =========================================================================

class ClassBalancedSampler:
    """Samples images to meet a target count while preserving class distribution."""
    
    def __init__(self, source_dir, target_dir, target_train_size):
        self.source_dir = Path(source_dir)
        self.target_dir = Path(target_dir)
        self.target_train_size = target_train_size
        self.class_images = {}
        # Assuming the source structure is 'images/train' and 'labels/train'
        self.val_images = list((self.source_dir / 'images' / 'val').glob('*'))
        self.train_images = list((self.source_dir / 'images' / 'train').glob('*'))

    def _analyze_labels(self):
        print("Analyzing current training labels for class distribution...")
        
        for class_id in CLASS_NAMES.keys():
            self.class_images[class_id] = []
            
        for img_path in self.train_images:
            label_path = self.source_dir / 'labels' / 'train' / (img_path.stem + '.txt')
            
            if label_path.exists():
                try:
                    with open(label_path, 'r') as f:
                        content = f.read().strip()
                        if content:
                            # Use set comprehension for speed and correct parsing
                            present_classes = set(int(line.split()[0]) 
                                                  for line in content.split('\n') if line.strip())
                            
                            for class_id in present_classes:
                                self.class_images[class_id].append(img_path)
                except: continue
        
        print("Analysis complete.")
        current_counts = {CLASS_NAMES[k]: len(v) for k, v in self.class_images.items()}
        print(f"Images per class (Train Source): {current_counts}")
        return current_counts

    def _select_sample_images(self, current_counts):
        """Selects a unique list of images while maintaining class balance."""
        
        total_images_in_source = len(self.train_images)
        if total_images_in_source == 0:
            print("ERROR: No images found in source training folder.")
            return []
            
        # Target number of images for each class based on the smallest class, scaled up
        # We ensure every class gets at least 'target_per_class' samples (images that contain that class)
        
        # Calculate proportional target per class based on current distribution
        total_annotations = sum(c['count'] for c in monitor_combiner.rdd_distribution.values() if 'count' in c)
        
        # Simple proportionate sampling based on current distribution (more robust than min-count)
        # We use the existing class imbalance knowledge to guide sampling.
        selected_images = set()
        
        for class_id, count_dict in monitor_combiner.rdd_distribution.items():
            if 'count' not in count_dict: continue
                
            class_annotations = count_dict['count']
            
            # Sample image count proportional to its annotation count relative to the whole dataset
            if total_annotations > 0:
                proportion = class_annotations / total_annotations
            else:
                proportion = 1 / len(CLASS_NAMES) # Equal weight if no annotations found
                
            # Max number of images to try to select for this class based on overall target
            target_image_count_for_class = int(self.target_train_size * proportion * 1.5) # 1.5 multiplier for overlap

            img_list = self.class_images.get(class_id, [])
            random.shuffle(img_list)
            
            count = 0
            for img_path in img_list:
                if img_path not in selected_images and count < target_image_count_for_class:
                    selected_images.add(img_path)
                    count += 1
                
                if len(selected_images) >= self.target_train_size:
                    break
            
            if len(selected_images) >= self.target_train_size:
                break
        
        # Fill up remaining if the target wasn't met through class-specific sampling
        remaining_to_sample = self.target_train_size - len(selected_images)
        if remaining_to_sample > 0:
            print(f"Need {remaining_to_sample} more images to hit target. Filling randomly.")
            unselected_images = list(set(self.train_images) - selected_images)
            random.shuffle(unselected_images)
            selected_images.update(unselected_images[:remaining_to_sample])

        print(f"Final sampled training image count: {len(selected_images)}")
        return list(selected_images)


    def _copy_dataset(self, selected_images):
        """Creates the new, smaller dataset structure."""
        print("Creating new dataset structure and copying files...")
        
        if self.target_dir.exists():
             shutil.rmtree(self.target_dir) # Remove old directory
        
        for split in ['train', 'val']:
            (self.target_dir / 'images' / split).mkdir(parents=True, exist_ok=True)
            (self.target_dir / 'labels' / split).mkdir(parents=True, exist_ok=True)
            
        # Copy selected training data
        print(f"Copying {len(selected_images)} training files...")
        for img_path in selected_images:
            shutil.copy2(img_path, self.target_dir / 'images' / 'train' / img_path.name)
            label_path = self.source_dir / 'labels' / 'train' / (img_path.stem + '.txt')
            if label_path.exists():
                shutil.copy2(label_path, self.target_dir / 'labels' / 'train' / (img_path.stem + '.txt'))
        
        # Copy *all* validation data (to keep test consistent)
        print(f"Copying {len(self.val_images)} validation files...")
        for img_path in self.val_images:
            shutil.copy2(img_path, self.target_dir / 'images' / 'val' / img_path.name)
            label_path = self.source_dir / 'labels' / 'val' / (img_path.stem + '.txt')
            if label_path.exists():
                shutil.copy2(label_path, self.target_dir / 'labels' / 'val' / (img_path.stem + '.txt'))
                
        print("Copying complete.")

    def _create_yaml(self):
        """Creates the new dataset YAML file."""
        yaml_config = {
            'path': str(self.target_dir.absolute()), 
            'train': 'images/train',
            'val': 'images/val',
            'nc': len(CLASS_NAMES),
            'names': CLASS_NAMES
        }
        
        yaml_file = self.target_dir / 'dataset.yaml'
        with open(yaml_file, 'w') as f:
            yaml.dump(yaml_config, f, sort_keys=False)
            
        return str(yaml_file)

    def run_sampling(self):
        """Executes the full sampling pipeline."""
        current_counts = self._analyze_labels()
        selected_images = self._select_sample_images(current_counts)
        self._copy_dataset(selected_images)
        new_yaml_file = self._create_yaml()
        
        print("\n Sampling SUCCESS! ")
        print(f"New dataset location: {self.target_dir}")
        print(f"New YAML file: {new_yaml_file}")
        
        return new_yaml_file

# --- HACK: Re-using the class distribution analysis from the COMBINER for better proportional sampling ---
# NOTE: This assumes you have the distribution knowledge from your previous run.
# If you don't have this, you'll need to re-run the full data analysis step first.

class DummyCombinerMonitor:
    """Simulates the distribution analysis result to enable proportional sampling."""
    def __init__(self):
        # Using the distribution you saw in the Colab output (approximate annotations count)
        self.rdd_distribution = {
            0: {'count': 22091}, # longitudinal crack (high)
            1: {'count': 10155}, # transverse crack (moderate)
            2: {'count': 9080},  # alligator crack (moderate)
            3: {'count': 9118},  # other corruption (moderate)
            4: {'count': 27429}, # Pothole (high)
        }
monitor_combiner = DummyCombinerMonitor()

# --- EXECUTE THE SAMPLING ---
print("STARTING CLASS-BALANCED SAMPLING...")
sampler = ClassBalancedSampler(SOURCE_DIR, TARGET_DIR, TARGET_TRAIN_SIZE)
yaml_file = sampler.run_sampling()

STARTING CLASS-BALANCED SAMPLING...
Analyzing current training labels for class distribution...
Analysis complete.
Images per class (Train Source): {'longitudinal crack': 9457, 'transverse crack': 5433, 'alligator crack': 5976, 'other corruption': 5247, 'Pothole': 8896}
Final sampled training image count: 24000
Creating new dataset structure and copying files...
Copying 24000 training files...
Copying 7333 validation files...
Copying complete.

 Sampling SUCCESS! 
New dataset location: rdd_sampled_24k_local
New YAML file: rdd_sampled_24k_local\dataset.yaml


In [3]:
# =========================================================================
# SECTION 4: BAYESIAN HYPERPARAMETER OPTIMIZATION (FAST)
# =========================================================================

class BayesianOptimizer:
    """Efficient Bayesian optimization for laptop training"""
    
    def __init__(self, dataset_yaml, resource_monitor):
        self.dataset_yaml = dataset_yaml
        self.monitor = resource_monitor
        self.best_params = {}
    
    def objective(self, trial):
        """Optimization objective function (short 5 epoch trial)"""
        
        # Suggest hyperparameters (using corrected names)
        lr0 = trial.suggest_float('lr0', 0.005, 0.02)
        box = trial.suggest_float('box', 0.02, 0.08) # CORRECTED NAME
        cls = trial.suggest_float('cls', 0.3, 0.7)   # CORRECTED NAME
        warmup_epochs = trial.suggest_int('warmup_epochs', 3, 8)
        degrees = trial.suggest_float('degrees', 0, 10)
        scale = trial.suggest_float('scale', 0.1, 0.4)
        
        try:
            model = YOLO('yolov8s.pt')
            config = self.monitor.get_optimized_config()
            
            results = model.train(
                data=self.dataset_yaml,
                epochs=5,  # VERY short run for trials on laptop
                batch=config['batch_size'],
                imgsz=config['image_size'],
                lr0=lr0,
                box=box, # CORRECTED
                cls=cls, # CORRECTED
                warmup_epochs=warmup_epochs,
                degrees=degrees,
                scale=scale,
                patience=3,
                workers=min(config['workers'], 1), # Max 1 worker for stable Optuna trials
                amp=config['amp'],
                cache=False,
                verbose=False, plots=False,
                project="optuna_trials_local", name=f"trial_{trial.number}",
                save=False, exist_ok=True
            )
            
            # Use mAP50 for speed
            mAP = results.results_dict.get('metrics/mAP50(B)', 0) 
            return mAP
            
        except Exception as e:
            print(f"Trial {trial.number} failed: {e}")
            return 0.0
    
    def optimize(self, n_trials=8, timeout_minutes=15): # Max 8 trials, 15 min total
        print("\nStarting FAST Bayesian hyperparameter optimization...")
        study = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner(n_startup_trials=2, n_warmup_steps=2))
        
        try:
            study.optimize(self.objective, n_trials=n_trials, timeout=timeout_minutes * 60, show_progress_bar=True)
            self.best_params = study.best_params
            
            print(f"\nOptimization complete! Best mAP@0.5: {study.best_value:.3f}")
            print(f"Best parameters: {self.best_params}")
            return self.best_params
            
        except Exception as e:
            print(f"Optimization interrupted: {e}")
            return {}

# Execute optimization
best_params = {}
if 'yaml_file' in locals() and Path(yaml_file).exists():
    optimizer = BayesianOptimizer(yaml_file, monitor)
    best_params = optimizer.optimize() 
else:
    print("Skipping optimization - sampled dataset not ready.")

# Ensure we have defaults if optimization failed
if not best_params:
    print("Using conservative defaults for main training.")
    best_params = {'lr0': 0.01, 'box': 0.05, 'cls': 0.5, 'warmup_epochs': 5, 'degrees': 10, 'scale': 0.3}

[I 2025-11-10 14:44:36,772] A new study created in memory with name: no-name-c556a685-6cf7-40b4-a21e-cb61199d6e8a



Starting FAST Bayesian hyperparameter optimization...


  0%|          | 0/8 [00:00<?, ?it/s]

New https://pypi.org/project/ultralytics/8.3.227 available  Update with 'pip install -U ultralytics'
Ultralytics 8.3.221  Python-3.10.0 torch-2.9.0+cpu CPU (Intel Core i7-8705G 3.10GHz)
[34m[1mengine\trainer: [0magnostic_nms=False, amp=False, augment=False, auto_augment=randaugment, batch=2, bgr=0.0, box=0.05508430131440244, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.4358569455220789, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=rdd_sampled_24k_local\dataset.yaml, degrees=1.2894838326758262, deterministic=True, device=cpu, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=5, erasing=0.4, exist_ok=True, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=416, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.017255863620831164, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolov8s.pt, momentum=

Best trial: 0. Best value: 0.380245:  12%|█▎        | 1/8 [18:56:24<132:34:49, 68184.26s/it, 68184.26/900 seconds]

[I 2025-11-11 09:41:01,037] Trial 0 finished with value: 0.380245434046406 and parameters: {'lr0': 0.017255863620831164, 'box': 0.05508430131440244, 'cls': 0.4358569455220789, 'warmup_epochs': 6, 'degrees': 1.2894838326758262, 'scale': 0.3708657319520028}. Best is trial 0 with value: 0.380245434046406.

Optimization complete! Best mAP@0.5: 0.380
Best parameters: {'lr0': 0.017255863620831164, 'box': 0.05508430131440244, 'cls': 0.4358569455220789, 'warmup_epochs': 6, 'degrees': 1.2894838326758262, 'scale': 0.3708657319520028}





In [6]:
# =========================================================================
# SECTION 5: OPTIMIZED TRAINING AND EVALUATION
# =========================================================================


class OptimizedTrainer:
    """Optimized model training with resource management"""
    
    def __init__(self, dataset_yaml, resource_monitor, best_params):
        self.dataset_yaml = dataset_yaml
        self.monitor = resource_monitor
        self.best_params = best_params
        self.model = None
        self.results = None
        self.run_name = f'rdd_40epoch_laptop_{datetime.now().strftime("%Y%m%d_%H%M%S")}'
        # Overriding the max epochs for a faster run plan
        self.MAX_EPOCHS = 40 
        self.PATIENCE = 10 # Aggressive early stopping
        self.val_images = list((Path(self.dataset_yaml).parent / 'images' / 'val').glob('*'))


    def train_model(self):
        print(f"\nTraining optimized YOLOv8 model ({self.MAX_EPOCHS} epochs max)...")
        
        model_size = 'yolov8s'
        self.model = YOLO(f'{model_size}.pt')
        config = self.monitor.get_optimized_config()
        
        train_params = {
            'data': self.dataset_yaml,
            'epochs': self.MAX_EPOCHS,
            'batch': config['batch_size'],
            'imgsz': config['image_size'],
            
            # Optimized hyperparameters
            'lr0': self.best_params.get('lr0', 0.01),
            'box': self.best_params.get('box', 0.05),
            'cls': self.best_params.get('cls', 0.5),
            'warmup_epochs': self.best_params.get('warmup_epochs', 5),
            'degrees': self.best_params.get('degrees', 10),
            'scale': self.best_params.get('scale', 0.3),
            
            # Training efficiency & Stability
            'patience': self.PATIENCE, # Use faster patience
            'workers': config['workers'],
            'amp': config['amp'],
            'cache': config['cache'],
            'save_period': config['save_period'],
            'cos_lr': True,
            
            # Augmentation 
            'fliplr': 0.5, 'mosaic': 0.8, 'mixup': 0.1,
            
            'project': 'runs/detect_final_local',
            'name': self.run_name,
            'plots': True, 'verbose': True, 'exist_ok': True
        }
        
        print(f"Training parameters: Batch={config['batch_size']}, Workers={config['workers']}, GPU/AMP={config['amp']}")
        
        try:
            self.results = self.model.train(**train_params)
            print("\n Training completed successfully!")
            return True
        except Exception as e:
            print(f" Training failed: {e}")
            return False
    
    def evaluate_model(self):
        """Comprehensive model evaluation on the final model"""
        print("\n==================================================")
        print("COMPREHENSIVE MODEL EVALUATION")
        print("==================================================")

        if not self.results:
            print("No training results available.")
            return None

        # Rerunning validation to get clean metrics object
        val_results = self.model.val(data=self.dataset_yaml, verbose=True)
        
        # --- Core Performance Metrics ---
        performance = {
            'mAP50': val_results.box.map50,
            'mAP50_95': val_results.box.map,
            'precision': val_results.box.mp,
            'recall': val_results.box.mr,
            'epochs_target': self.MAX_EPOCHS,
            'epochs_trained': val_results.box.map.size, # Use AP array size as proxy for classes/epochs count
        }
        
        print(f"Performance Metrics (Validation Set):")
        print(f"  mAP@0.5:    {performance['mAP50']:.3f} ({performance['mAP50']:.1%})")
        print(f"  mAP@0.5:0.95: {performance['mAP50_95']:.3f} ({performance['mAP50_95']:.1%})")
        print(f"  Precision:  {performance['precision']:.3f} ({performance['precision']:.1%})")
        print(f"  Recall:     {performance['recall']:.3f} ({performance['recall']:.1%})")
        
        # --- Per-Class Performance ---
        print("\nPer-Class Performance (mAP@0.5):")
        class_results = val_results.box.ap_class_index
        class_maps = val_results.box.ap
        
        for i, class_index in enumerate(class_results):
            class_name = CLASS_NAMES.get(class_index, f'Class {class_index}')
            print(f"  - {class_name:<20}: {class_maps[i]:.3f}")
            
        # --- Improvement Analysis ---
        baseline_mAP = 0.417 
        improvement = performance['mAP50'] - baseline_mAP
        print(f"\nImprovement vs Baseline (41.7%): {improvement:+.1%}")
        
        return performance

# --- EXECUTE FINAL TRAINING AND EVALUATION ---
final_performance = None
if 'yaml_file' in locals() and Path(yaml_file).exists():
    trainer = OptimizedTrainer(yaml_file, monitor, best_params)
    
    if trainer.train_model():
        final_performance = trainer.evaluate_model()

    # Save summary to file
    summary = {
        'timestamp': datetime.now().isoformat(),
        'dataset_size': f"{trainer.MAX_EPOCHS} train + {len(trainer.val_images)} val",
        'performance': final_performance,
        'hyperparameters': best_params,
        'model_path': f"runs/detect_final_local/{trainer.run_name}/weights/best.pt"
    }
    with open('laptop_training_summary.json', 'w') as f:
        json.dump(summary, f, indent=2)
    print("\nTraining summary saved to: laptop_training_summary.json")
    print(f"Best model saved to: {summary['model_path']}")
else:
    print("FATAL ERROR: Training YAML file not found. Pipeline stopped.")

FATAL ERROR: Training YAML file not found. Pipeline stopped.
