In [2]:
import random
import shutil
from pathlib import Path
from collections import defaultdict
import yaml
from tqdm import tqdm
import numpy as np

In [3]:
def _class_label(class_names, class_id):
    """Return the display name of ``class_id``, or ``'?'`` when data.yaml has no name for it."""
    try:
        return class_names[class_id]
    except (IndexError, KeyError, TypeError):
        return '?'


def _read_label_classes(label_path):
    """Return the set of class ids found in a label file (first field of every non-blank line)."""
    with open(label_path, 'r') as f:
        return {int(line.split()[0]) for line in f if line.strip()}


def _classify_images(image_files, labels_dir, desc):
    """Split images into ``{class_id: [image paths]}`` and a list of negative images.

    An image is negative when its label file is missing, empty, or contains only
    blank lines. A positive image is listed once under every class it contains.
    """
    positive_by_class = defaultdict(list)
    negative_samples = []

    for img_path in tqdm(image_files, desc=desc):
        label_path = labels_dir / (img_path.stem + '.txt')
        if not label_path.exists() or label_path.stat().st_size == 0:
            negative_samples.append(img_path)
            continue

        classes_in_image = _read_label_classes(label_path)
        if not classes_in_image:
            # Label file holds only whitespace: there are no objects in the image.
            negative_samples.append(img_path)
            continue

        # Sorted so the per-class lists are built in a deterministic order.
        for class_id in sorted(classes_in_image):
            positive_by_class[class_id].append(img_path)

    return positive_by_class, negative_samples


def _draw_per_class(positive_by_class, num_classes, samples_per_class):
    """Draw up to ``samples_per_class`` random images for each class.

    Returns ``(selected, drawn)``: ``selected`` is the ordered list of unique
    images picked, ``drawn`` holds one ``(picked, available)`` pair per class.
    An image picked for several classes is kept only once.
    """
    selected = []
    seen = set()
    drawn = []

    for class_id in range(num_classes):
        candidates = list(positive_by_class.get(class_id, ()))
        random.shuffle(candidates)
        picked = candidates[:samples_per_class]
        drawn.append((len(picked), len(candidates)))

        for img_path in picked:
            if img_path not in seen:
                seen.add(img_path)
                selected.append(img_path)

    return selected, drawn


def _draw_top_up(selected, positive_by_class, num_classes, needed):
    """Return up to ``needed`` random positive images that are not in ``selected``.

    The candidate pool lists every image once, even when it belongs to several
    classes, so the result never contains duplicates.
    """
    taken = set(selected)
    pool = []
    for class_id in range(num_classes):
        for img_path in positive_by_class.get(class_id, ()):
            if img_path not in taken:
                taken.add(img_path)
                pool.append(img_path)

    random.shuffle(pool)
    return pool[:needed]


def create_balanced_subset(
    dataset_root,
    output_root,
    train_samples=4000,
    val_samples=1000,
    negative_ratio=0.25,
    seed=42,
    image_pattern='*.tif'
):
    """
    Build a class-balanced subset of a dataset laid out as
    ``<root>/<split>/images`` + ``<root>/<split>/labels`` with a ``data.yaml``.

    Args:
        dataset_root: Path to the source dataset (must contain ``data.yaml``).
        output_root: Path where the subset is written. Directories are created
            as needed; files already present there are NOT removed.
        train_samples: Target number of images for the ``train`` split.
        val_samples: Target number of images for the ``val`` split.
        negative_ratio: Fraction of each split reserved for negative images
            (images without labels), between 0 and 1.
        seed: Seed applied to the global ``random`` and ``numpy.random`` states.
        image_pattern: Glob pattern used to find images in each ``images`` dir.

    Raises:
        ValueError: If ``negative_ratio`` is outside [0, 1] or ``data.yaml``
            declares no classes.

    Strategy:
    - Each class id in ``range(nc)`` contributes up to the same number of images.
    - If that yields fewer positives than targeted (a class is short, or one
      image was drawn for several classes), the gap is filled with random
      positives that were not selected yet.
    - Negatives are drawn at random from all negative images.
    - Labels whose class id is outside ``range(nc)`` are never sampled.
    """
    if not 0 <= negative_ratio <= 1:
        raise ValueError(f"negative_ratio must be within [0, 1], got {negative_ratio!r}")

    random.seed(seed)
    np.random.seed(seed)

    dataset_root = Path(dataset_root)
    output_root = Path(output_root)

    # Load class names; an empty data.yaml parses to None.
    with open(dataset_root / 'data.yaml', 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f) or {}
    class_names = data.get('names') or []
    num_classes = data.get('nc')
    if num_classes is None:
        num_classes = len(class_names)
    if num_classes < 1:
        raise ValueError(f"{dataset_root / 'data.yaml'} does not declare any classes")

    # Per-split targets. Negatives take whatever the truncated positive count leaves.
    train_positive = int(train_samples * (1 - negative_ratio))
    val_positive = int(val_samples * (1 - negative_ratio))
    train_negative = train_samples - train_positive
    val_negative = val_samples - val_positive

    samples_per_class_train = train_positive // num_classes
    samples_per_class_val = val_positive // num_classes

    print("=" * 70)
    print("📊 TẠO BALANCED SUBSET")
    print("=" * 70)
    print(f"Classes: {class_names}")
    print(f"Train samples: {train_samples} ({negative_ratio:.0%} negative = {train_negative})")
    print(f"Val samples: {val_samples} ({negative_ratio:.0%} negative = {val_negative})")
    print("=" * 70)

    print(f"\n📈 Phân bổ:")
    print(f"Train - Positive: {train_positive} ({samples_per_class_train}/class)")
    print(f"Train - Negative: {train_negative}")
    print(f"Val - Positive: {val_positive} ({samples_per_class_val}/class)")
    print(f"Val - Negative: {val_negative}")

    targets = {
        'train': (train_samples, train_positive, train_negative, samples_per_class_train),
        'val': (val_samples, val_positive, val_negative, samples_per_class_val),
    }

    # Create the output directories up front.
    for split in targets:
        (output_root / split / 'images').mkdir(parents=True, exist_ok=True)
        (output_root / split / 'labels').mkdir(parents=True, exist_ok=True)

    for split, (target_samples, target_positive, target_negative, samples_per_class) in targets.items():
        print(f"\n{'='*70}")
        print(f"🔄 Xử lý {split.upper()}")
        print(f"{'='*70}")

        images_dir = dataset_root / split / 'images'
        labels_dir = dataset_root / split / 'labels'

        # Sorted: glob order is filesystem dependent, which would defeat the seed.
        image_files = sorted(images_dir.glob(image_pattern))
        print(f"Tổng số images: {len(image_files)}")

        positive_by_class, negative_samples = _classify_images(
            image_files, labels_dir, f"Phân loại {split}"
        )

        print(f"\n📊 Thống kê {split}:")
        print(f"Negative samples: {len(negative_samples)}")
        for class_id in range(num_classes):
            available = len(positive_by_class.get(class_id, ()))
            print(f"Class {class_id} ({_class_label(class_names, class_id)}): {available} samples")

        # 1. Balanced draw: the same quota for every class.
        print(f"\n🎯 Sampling từ mỗi class ({samples_per_class}/class)...")
        selected_positive, drawn = _draw_per_class(positive_by_class, num_classes, samples_per_class)
        for class_id, (picked, available) in enumerate(drawn):
            print(f"  Class {class_id} ({_class_label(class_names, class_id)}): {picked}/{available}")

        # 2. Fill any gap with positives that have not been selected yet.
        missing = target_positive - len(selected_positive)
        if missing > 0:
            print(f"\n⚠️  Thiếu {missing} positive samples, phân bổ lại...")
            additional = _draw_top_up(selected_positive, positive_by_class, num_classes, missing)
            selected_positive.extend(additional)
            print(f"  Đã thêm {len(additional)} samples từ các class còn dư")

        # 3. Random negatives.
        print(f"\n🎯 Sampling negative samples ({target_negative})...")
        random.shuffle(negative_samples)
        selected_negative = negative_samples[:target_negative]
        selected_images = selected_positive + selected_negative

        print(f"\n✅ Đã chọn {len(selected_images)} samples:")
        print(f"  - Positive: {len(selected_positive)}")
        print(f"  - Negative: {len(selected_negative)}")
        print(f"  - Phân bổ theo class:")
        for class_id in range(num_classes):
            members = set(positive_by_class.get(class_id, ()))
            count = sum(1 for img_path in selected_positive if img_path in members)
            print(f"    Class {class_id} ({_class_label(class_names, class_id)}): {count}")

        if len(selected_images) < target_samples:
            print(f"\n⚠️  Chỉ chọn được {len(selected_images)}/{target_samples} samples (dataset không đủ)")

        # 4. Copy the images and their non-empty label files.
        print(f"\n📁 Copying files...")
        for img_path in tqdm(selected_images, desc=f"Copy {split}"):
            shutil.copy2(img_path, output_root / split / 'images' / img_path.name)

            label_path = labels_dir / (img_path.stem + '.txt')
            if label_path.exists() and label_path.stat().st_size > 0:
                shutil.copy2(label_path, output_root / split / 'labels' / label_path.name)

    # The subset uses the same class list as the source dataset.
    shutil.copy2(dataset_root / 'data.yaml', output_root / 'data.yaml')

    print(f"\n{'='*70}")
    print("✅ HOÀN THÀNH!")
    print(f"{'='*70}")
    print(f"Output: {output_root}")
    # Counted on disk, so files left over from earlier runs are included.
    print(f"Train: {len(list((output_root / 'train' / 'images').glob(image_pattern)))} samples")
    print(f"Val: {len(list((output_root / 'val' / 'images').glob(image_pattern)))} samples")
    print(f"{'='*70}")

In [None]:


# Build the balanced subset: 4000 train / 1000 val images, 25% of each negative.
# NOTE(review): the saved output below reports 3500 / 500 samples, so it appears
# to come from an earlier run with different arguments - re-run to refresh it.
create_balanced_subset(
    '/home/t1tc01-hoangphan/code/t1tc01-personal/ultralytics_sia-aero-eyes-Zalo-2025/dataset_output',
    '/home/t1tc01-hoangphan/code/t1tc01-personal/ultralytics_sia-aero-eyes-Zalo-2025/dataset_output_balanced',
    train_samples=4000, val_samples=1000,
    negative_ratio=0.25, seed=42,
)

üìä T·∫†O BALANCED SUBSET
Classes: ['Backpack', 'Jacket', 'Laptop', 'Lifering', 'Mobilephone', 'Person1', 'WaterBottle']
Train samples: 3500 (25% negative = 875)
Val samples: 500 (25% negative = 125)

üìà Ph√¢n b·ªï:
Train - Positive: 2625 (375/class)
Train - Negative: 875
Val - Positive: 375 (53/class)
Val - Negative: 125

üîÑ X·ª≠ l√Ω TRAIN
T·ªïng s·ªë images: 18646


Ph√¢n lo·∫°i train: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18646/18646 [00:06<00:00, 2985.14it/s]



üìä Th·ªëng k√™ train:
Negative samples: 3726
Class 0 (Backpack): 4638 samples
Class 1 (Jacket): 1852 samples
Class 2 (Laptop): 1871 samples
Class 3 (Lifering): 2645 samples
Class 4 (Mobilephone): 1857 samples
Class 5 (Person1): 2057 samples
Class 6 (WaterBottle): 0 samples

üéØ Sampling t·ª´ m·ªói class (375/class)...
  Class 0 (Backpack): 375/4638
  Class 1 (Jacket): 375/1852
  Class 2 (Laptop): 375/1871
  Class 3 (Lifering): 375/2645
  Class 4 (Mobilephone): 375/1857
  Class 5 (Person1): 375/2057
  Class 6 (WaterBottle): 0/0

‚ö†Ô∏è  Thi·∫øu 375 positive samples, ph√¢n b·ªï l·∫°i...
  ƒê√£ th√™m 375 samples t·ª´ c√°c class c√≤n d∆∞

üéØ Sampling negative samples (875)...

‚úÖ ƒê√£ ch·ªçn 3500 samples:
  - Positive: 2625
  - Negative: 875
  - Ph√¢n b·ªï theo class:
    Class 0 (Backpack): 506
    Class 1 (Jacket): 421
    Class 2 (Laptop): 422
    Class 3 (Lifering): 432
    Class 4 (Mobilephone): 422
    Class 5 (Person1): 422
    Class 6 (WaterBottle): 0

üìÅ Copying files...


Copy train:  33%|‚ñà‚ñà‚ñà‚ñé      | 1153/3500 [00:18<00:36, 63.54it/s]


KeyboardInterrupt: 