In [1]:
# !pip install -qqq tifffile

In [2]:
# Destination directory for the converted dataset (split folders and data.yaml).
OUTPUT_ROOT = '/home/t1tc01-hoangphan/code/t1tc01-personal/ultralytics_sia-aero-eyes-Zalo-2025/dataset_output'

# Root of the raw training data; the code below reads annotations/ and samples/ under it.
DATASET_ROOT = '/home/t1tc01-hoangphan/code/t1tc01-personal/Zalo-AI-2025-Challenger/aero-eyes-data/train'

# Class names written to data.yaml, in list order.
# NOTE(review): spelled 'Mobilephone' here but 'MobilePhone' in get_class_id's
# mapping - confirm which spelling matches the video_id prefixes.
class_names = [
    'Backpack', 'Jacket', 'Laptop', 'Lifering',
    'Mobilephone', 'Person1', 'WaterBottle',
]

## Analyze data

In [3]:
import json
import cv2
import numpy as np
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm
from tifffile import imwrite

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def analyze_dataset_statistics(
    dataset_root,
    train_split=0.8,
    frame_stride=1,
    negative_sample_ratio=0.3
):
    """
    Analyze the dataset and estimate the number of samples after conversion.

    Reads ``<dataset_root>/annotations/annotations.json``, inspects each
    ``<dataset_root>/samples/<video_id>/drone_video.mp4`` for its frame count
    and FPS, prints a per-video and an overall report, and returns the
    aggregated numbers.

    Videos that are missing on disk are reported and counted with zero frames;
    every ratio in the report falls back to 0 instead of dividing by zero.

    Args:
        dataset_root: Path to dataset root
        train_split: Train/val split ratio; the first
            ``int(total_videos * train_split)`` videos (in annotation order)
            are labelled 'train', the rest 'val'
        frame_stride: Frame sampling stride used when counting negative candidates
        negative_sample_ratio: Ratio of negative samples relative to the
            number of annotated frames of each video

    Returns:
        A dict with the totals ('total_videos', 'total_frames',
        'total_annotated_frames', 'total_bboxes', 'train_positive',
        'train_negative', 'val_positive', 'val_negative', 'total_samples',
        'estimated_storage_gb') and the per-video list 'video_stats',
        or None when the annotations file does not exist.
    """
    dataset_root = Path(dataset_root)

    # Load annotations
    annotations_path = dataset_root / 'annotations' / 'annotations.json'
    if not annotations_path.exists():
        print(f"‚ùå Kh√¥ng t√¨m th·∫•y file: {annotations_path}")
        return None

    with open(annotations_path, 'r') as f:
        annotations = json.load(f)

    print("=" * 70)
    print("üìä PH√ÇN T√çCH DATASET STATISTICS")
    print("=" * 70)

    # Statistics
    total_videos = len(annotations)
    num_train_videos = int(total_videos * train_split)
    num_val_videos = total_videos - num_train_videos

    video_stats = []
    total_frames = 0
    total_annotated_frames = 0
    total_bboxes = 0
    class_counts = defaultdict(int)

    print(f"\nüìÅ T·ªïng s·ªë videos: {total_videos}")
    print(f"   - Train videos: {num_train_videos} ({train_split*100:.0f}%)")
    print(f"   - Val videos: {num_val_videos} ({(1-train_split)*100:.0f}%)")
    print("\n" + "-" * 70)
    print("üìπ Chi ti·∫øt t·ª´ng video:")
    print("-" * 70)

    for idx, video_data in enumerate(annotations):
        video_id = video_data['video_id']
        video_path = dataset_root / 'samples' / video_id / 'drone_video.mp4'

        # The class is the video id without its trailing "_<n>" suffix
        class_name = video_id.rsplit('_', 1)[0]

        # Count annotations: distinct annotated frames and total boxes
        annotated_frames = set()
        bbox_count = 0
        for ann in video_data['annotations']:
            for bbox in ann['bboxes']:
                annotated_frames.add(bbox['frame'])
                bbox_count += 1
        class_counts[class_name] += bbox_count

        num_annotated_frames = len(annotated_frames)

        # Get video info; everything stays at zero when the file is missing
        total_video_frames = 0
        fps = 30  # default
        duration = 0
        if video_path.exists():
            cap = cv2.VideoCapture(str(video_path))
            try:
                total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                fps = cap.get(cv2.CAP_PROP_FPS) or 30
            finally:
                cap.release()
            duration = total_video_frames / fps
        else:
            print(f"‚ö†Ô∏è  Video kh√¥ng t·ªìn t·∫°i: {video_path}")

        # Negative candidates: strided frames that carry no annotation
        num_negative_candidates = sum(
            1
            for frame_num in range(0, total_video_frames, frame_stride)
            if frame_num not in annotated_frames
        )
        num_negative = min(
            int(num_annotated_frames * negative_sample_ratio),
            num_negative_candidates
        )

        split = 'train' if idx < num_train_videos else 'val'

        video_stats.append({
            'video_id': video_id,
            'class': class_name,
            'split': split,
            'total_frames': total_video_frames,
            'annotated_frames': num_annotated_frames,
            'bboxes': bbox_count,
            'negative_samples': num_negative,
            'duration': duration,
            'fps': fps
        })

        total_frames += total_video_frames
        total_annotated_frames += num_annotated_frames
        total_bboxes += bbox_count

        # Guarded: total_video_frames is 0 for a missing or unreadable video
        annotated_pct = _safe_ratio(num_annotated_frames, total_video_frames) * 100

        # Print video info
        print(f"\n[{idx+1}/{total_videos}] {video_id} ({split})")
        print(f"   Class: {class_name}")
        print(f"   Duration: {duration:.1f}s ({duration/60:.1f} min)")
        print(f"   Total frames: {total_video_frames:,} @ {fps:.1f} fps")
        print(f"   Annotated frames: {num_annotated_frames:,} ({annotated_pct:.1f}%)")
        print(f"   Bboxes: {bbox_count:,}")
        print(f"   Negative samples: {num_negative:,}")
        print(f"   Total samples: {num_annotated_frames + num_negative:,}")

    # Calculate totals by split
    def split_total(split_name, key):
        return sum(s[key] for s in video_stats if s['split'] == split_name)

    train_positive = split_total('train', 'annotated_frames')
    train_negative = split_total('train', 'negative_samples')
    val_positive = split_total('val', 'annotated_frames')
    val_negative = split_total('val', 'negative_samples')
    total_samples = train_positive + train_negative + val_positive + val_negative

    # Guarded: both denominators are 0 for an empty or fully missing dataset
    overall_annotated_pct = _safe_ratio(total_annotated_frames, total_frames) * 100
    avg_bboxes_per_frame = _safe_ratio(total_bboxes, total_annotated_frames)

    print("\n" + "=" * 70)
    print("üìà T·ªîNG K·∫æT")
    print("=" * 70)

    print(f"\nüé¨ Videos:")
    print(f"   - T·ªïng videos: {total_videos}")
    print(f"   - Train: {num_train_videos} videos")
    print(f"   - Val: {num_val_videos} videos")

    print(f"\nüéûÔ∏è  Frames:")
    print(f"   - T·ªïng frames: {total_frames:,}")
    print(f"   - Frames c√≥ annotation: {total_annotated_frames:,} ({overall_annotated_pct:.1f}%)")
    print(f"   - T·ªïng bboxes: {total_bboxes:,}")
    print(f"   - Trung b√¨nh bboxes/frame: {avg_bboxes_per_frame:.2f}")

    print(f"\nüì¶ Samples sau conversion:")
    print(f"   Train:")
    print(f"      - Positive: {train_positive:,} samples")
    print(f"      - Negative: {train_negative:,} samples")
    print(f"      - T·ªïng: {train_positive + train_negative:,} samples")
    print(f"   Val:")
    print(f"      - Positive: {val_positive:,} samples")
    print(f"      - Negative: {val_negative:,} samples")
    print(f"      - T·ªïng: {val_positive + val_negative:,} samples")
    print(f"   T·ªîNG C·ªòNG: {total_samples:,} samples")

    print(f"\nüìä Ph√¢n b·ªë theo class:")
    for class_name, count in sorted(class_counts.items()):
        print(f"   - {class_name}: {count:,} bboxes")

    # Storage estimation
    avg_size_per_sample = 8  # MB (6-channel TIFF)
    total_storage = total_samples * avg_size_per_sample / 1024

    print(f"\nüíæ Storage ∆∞·ªõc t√≠nh:")
    print(f"   - M·ªói sample: ~{avg_size_per_sample} MB")
    print(f"   - T·ªïng storage: ~{total_storage:.1f} GB")

    # Frame stride impact
    print(f"\n‚öôÔ∏è  Tham s·ªë conversion:")
    print(f"   - train_split: {train_split}")
    print(f"   - frame_stride: {frame_stride}")
    print(f"   - negative_sample_ratio: {negative_sample_ratio}")

    # Recommendations
    print(f"\nüí° Khuy·∫øn ngh·ªã:")
    if total_annotated_frames < 10000:
        print(f"   ‚ö†Ô∏è  Dataset nh·ªè (<10K samples) - n√™n d√πng frame_stride=1 ƒë·ªÉ t·ªëi ƒëa d·ªØ li·ªáu")
    elif total_annotated_frames > 50000:
        print(f"   üí° Dataset l·ªõn (>50K samples) - c√≥ th·ªÉ d√πng frame_stride=2-3 ƒë·ªÉ gi·∫£m storage")
    else:
        print(f"   ‚úÖ Dataset v·ª´a ph·∫£i - frame_stride=1 l√† ph√π h·ª£p")

    if total_storage > 200:
        print(f"   ‚ö†Ô∏è  Storage l·ªõn (>200GB) - c√¢n nh·∫Øc tƒÉng frame_stride ho·∫∑c gi·∫£m resolution")

    print("\n" + "=" * 70)

    return {
        'total_videos': total_videos,
        'total_frames': total_frames,
        'total_annotated_frames': total_annotated_frames,
        'total_bboxes': total_bboxes,
        'train_positive': train_positive,
        'train_negative': train_negative,
        'val_positive': val_positive,
        'val_negative': val_negative,
        'total_samples': total_samples,
        'estimated_storage_gb': total_storage,
        'video_stats': video_stats
    }

# Run the analysis on the training data with the parameters below and keep
# the returned summary in `stats`.
stats = analyze_dataset_statistics(
    dataset_root=DATASET_ROOT,
    train_split=0.8,
    frame_stride=1,
    negative_sample_ratio=0.3
)

üìä PH√ÇN T√çCH DATASET STATISTICS

üìÅ T·ªïng s·ªë videos: 14
   - Train videos: 11 (80%)
   - Val videos: 3 (20%)

----------------------------------------------------------------------
üìπ Chi ti·∫øt t·ª´ng video:
----------------------------------------------------------------------

[1/14] Backpack_0 (train)
   Class: Backpack
   Duration: 418.6s (7.0 min)
   Total frames: 10,466 @ 25.0 fps
   Annotated frames: 3,184 (30.4%)
   Bboxes: 3,189
   Negative samples: 955
   Total samples: 4,139

[2/14] Backpack_1 (train)
   Class: Backpack
   Duration: 180.0s (3.0 min)
   Total frames: 4,500 @ 25.0 fps
   Annotated frames: 1,454 (32.3%)
   Bboxes: 1,456
   Negative samples: 436
   Total samples: 1,890

[3/14] Jacket_0 (train)
   Class: Jacket
   Duration: 203.4s (3.4 min)
   Total frames: 5,085 @ 25.0 fps
   Annotated frames: 1,162 (22.9%)
   Bboxes: 1,165
   Negative samples: 348
   Total samples: 1,510

[4/14] Jacket_1 (train)
   Class: Jacket
   Duration: 208.8s (3.5 min)
   Tota

## Convert data

In [4]:
import json
import cv2
import numpy as np
from pathlib import Path
from tifffile import imwrite
from tqdm import tqdm


In [5]:
def get_class_id(class_name):
    """Map a class name to its integer class ID.

    The lookup ignores letter case and surrounding whitespace, so both
    'MobilePhone' and 'Mobilephone' resolve to the same ID.

    Args:
        class_name: Class name, e.g. the video id without its numeric suffix.

    Returns:
        The class ID (0-6). Unknown names still fall back to 0, as before,
        but a warning is emitted so the mislabelling does not go unnoticed.
    """
    import warnings  # local import: only needed on the fallback path

    class_mapping = {
        'Backpack': 0,
        'Jacket': 1,
        'Laptop': 2,
        'Lifering': 3,
        'MobilePhone': 4,
        'Person1': 5,
        'WaterBottle': 6,
    }
    # Case-insensitive view of the mapping (the names do not collide when lowered)
    normalized = {name.lower(): class_id for name, class_id in class_mapping.items()}

    key = str(class_name).strip().lower()
    if key in normalized:
        return normalized[key]

    warnings.warn(
        f"Unknown class name {class_name!r}; falling back to class ID 0",
        stacklevel=2,
    )
    return 0

def create_data_yaml(output_root, class_names):
    """Write ``data.yaml`` into ``output_root``.

    The file lists the class names (one ``- name`` entry per class), their
    count as ``nc``, and the relative image folders for the train, val and
    test splits.

    Args:
        output_root: Directory that receives ``data.yaml``.
        class_names: Class names, written in the given order.
    """
    name_entries = "\n".join(f"- {label}" for label in class_names)
    yaml_lines = [
        "names:",
        name_entries,
        f"nc: {len(class_names)}",
        "",
        "train: train/images",
        "val: val/images",
        "test: test/images",
        "",  # produces the trailing newline
    ]

    yaml_path = Path(output_root) / 'data.yaml'
    with yaml_path.open('w') as handle:
        handle.write("\n".join(yaml_lines))

In [6]:
def convert_video_to_siamese_format(
    dataset_root,
    output_root,
    reference_frame_offset=10,  # Use frame N frames before as reference
    frame_stride=1  # Sample every Nth frame
):
    """
    Convert video dataset to siamese YOLO format.

    Strategy: each emitted sample stacks a reference frame (the frame exactly
    ``reference_frame_offset`` frames earlier in the same video) with the
    current frame into one 6-channel TIFF, and writes the current frame's
    boxes as a YOLO label file (class, normalized x-center, y-center, width,
    height).

    Samples are named after the true index of the current frame in the video,
    and that same index is used to look up the annotations.
    NOTE(review): this assumes ``bbox['frame']`` is a 0-based index in decode
    order - confirm against the annotation format.

    All samples are written to the 'train' split; the 'val' folders are
    created but left empty by this function.

    Args:
        dataset_root: Dataset root containing ``annotations/annotations.json``
            and ``samples/<video_id>/drone_video.mp4``.
        output_root: Destination root for ``train``/``val`` folders and ``data.yaml``.
        reference_frame_offset: Distance, in frames, between reference and
            current frame. Must be >= 1.
        frame_stride: Emit a sample only for frame indices divisible by this
            value. Must be >= 1.

    Raises:
        ValueError: If ``reference_frame_offset`` or ``frame_stride`` is < 1.
    """
    from collections import deque  # local import: not imported at notebook level

    if reference_frame_offset < 1:
        raise ValueError("reference_frame_offset must be >= 1")
    if frame_stride < 1:
        raise ValueError("frame_stride must be >= 1")

    dataset_root = Path(dataset_root)
    output_root = Path(output_root)

    # Load annotations
    with open(dataset_root / 'annotations' / 'annotations.json', 'r') as f:
        annotations = json.load(f)

    # Create output directories
    for split in ['train', 'val']:
        (output_root / split / 'images').mkdir(parents=True, exist_ok=True)
        (output_root / split / 'labels').mkdir(parents=True, exist_ok=True)

    images_dir = output_root / 'train' / 'images'
    labels_dir = output_root / 'train' / 'labels'

    # Process each video
    for video_data in tqdm(annotations, desc="Processing videos"):
        video_id = video_data['video_id']
        video_path = dataset_root / 'samples' / video_id / 'drone_video.mp4'

        if not video_path.exists():
            continue

        # Extract class from video_id (e.g., "Backpack_0" -> "Backpack")
        class_name = video_id.rsplit('_', 1)[0]
        class_id = get_class_id(class_name)

        # Create frame-to-bbox mapping
        bbox_map = {}
        for ann in video_data['annotations']:
            for bbox in ann['bboxes']:
                bbox_map.setdefault(bbox['frame'], []).append({
                    'x1': bbox['x1'],
                    'y1': bbox['y1'],
                    'x2': bbox['x2'],
                    'y2': bbox['y2']
                })

        # Sliding window of the last (offset + 1) frames: once it is full,
        # window[0] is exactly `reference_frame_offset` frames before window[-1].
        frame_window = deque(maxlen=reference_frame_offset + 1)
        frame_index = -1  # index of the most recently decoded frame

        cap = cv2.VideoCapture(str(video_path))
        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                frame_index += 1
                frame_window.append(frame)

                # Not enough history yet for a reference frame
                if len(frame_window) <= reference_frame_offset:
                    continue
                # Frame skipped by the sampling stride (it still feeds the window)
                if frame_index % frame_stride != 0:
                    continue

                ref_frame = frame_window[0]
                curr_frame = frame

                # Create 6-channel image: [ref_R, ref_G, ref_B, curr_R, curr_G, curr_B]
                ref_rgb = cv2.cvtColor(ref_frame, cv2.COLOR_BGR2RGB)
                curr_rgb = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2RGB)
                six_channel = np.concatenate([ref_rgb, curr_rgb], axis=2)

                # Get image dimensions
                h, w = curr_frame.shape[:2]

                # Convert each xyxy box of the current frame to normalized xywh
                label_lines = []
                for bbox in bbox_map.get(frame_index, []):
                    x1, y1, x2, y2 = bbox['x1'], bbox['y1'], bbox['x2'], bbox['y2']

                    x_center = ((x1 + x2) / 2) / w
                    y_center = ((y1 + y2) / 2) / h
                    width = (x2 - x1) / w
                    height = (y2 - y1) / h

                    label_lines.append(f"{class_id} {x_center} {y_center} {width} {height}")

                # Save 6-channel TIFF
                image_name = f"{video_id}_frame_{frame_index:06d}"
                imwrite(str(images_dir / f"{image_name}.tif"), six_channel)

                # Save label (empty file when the frame has no boxes)
                with open(labels_dir / f"{image_name}.txt", 'w') as f:
                    f.write('\n'.join(label_lines))
        finally:
            # Release the decoder even if writing a sample fails
            cap.release()

    # Create data.yaml
    create_data_yaml(output_root, class_names)

In [7]:
def _load_reference_rgb(object_images_dir, max_templates=3):
    """Build one enhanced RGB reference image from the first template JPEGs, or None if none load."""
    template_paths = sorted(object_images_dir.glob('*.jpg'))[:max_templates]
    templates = []
    for template_path in template_paths:
        img = cv2.imread(str(template_path))
        if img is not None:
            templates.append(img)
    if not templates:
        return None

    # Bring every template to the size of the first one so they can be stacked
    ref_h, ref_w = templates[0].shape[:2]
    aligned = [
        img if img.shape[:2] == (ref_h, ref_w)
        else cv2.resize(img, (ref_w, ref_h), interpolation=cv2.INTER_LINEAR)
        for img in templates
    ]

    # 1. Per-pixel median instead of mean (less blur, colours preserved better)
    ref_image = np.median(np.array(aligned), axis=0).astype(np.uint8)

    # 2. Boost saturation (+15%) and brightness (+5%) to make object colours clearer
    hsv = cv2.cvtColor(ref_image, cv2.COLOR_BGR2HSV).astype(np.float32)
    hsv[:, :, 1] = np.clip(hsv[:, :, 1] * 1.15, 0, 255)
    hsv[:, :, 2] = np.clip(hsv[:, :, 2] * 1.05, 0, 255)
    ref_image = cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2BGR)

    # 3. Slight contrast increase
    ref_image = cv2.convertScaleAbs(ref_image, alpha=1.1, beta=5)

    return cv2.cvtColor(ref_image, cv2.COLOR_BGR2RGB)


def _collect_bboxes_by_frame(video_data):
    """Group a video's boxes by frame number: {frame: [{'x1', 'y1', 'x2', 'y2'}, ...]}."""
    bbox_map = {}
    for ann in video_data['annotations']:
        for bbox in ann['bboxes']:
            bbox_map.setdefault(bbox['frame'], []).append({
                'x1': bbox['x1'], 'y1': bbox['y1'],
                'x2': bbox['x2'], 'y2': bbox['y2']
            })
    return bbox_map


def _iter_video_frames(cap):
    """Yield (frame_index, frame) for every frame the capture can still decode."""
    frame_index = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            return
        yield frame_index, frame
        frame_index += 1


def _yolo_label_lines(bboxes, class_id, frame_w, frame_h):
    """Turn pixel xyxy boxes into YOLO 'class xc yc w h' lines, dropping degenerate boxes."""
    lines = []
    for bbox in bboxes:
        # Clamp to the frame bounds
        x1 = max(0, min(bbox['x1'], frame_w))
        y1 = max(0, min(bbox['y1'], frame_h))
        x2 = max(0, min(bbox['x2'], frame_w))
        y2 = max(0, min(bbox['y2'], frame_h))

        if x2 <= x1 or y2 <= y1:
            continue

        # Normalized coordinates do not change when the image is resized, so
        # they are computed against the original frame size.
        x_center = ((x1 + x2) / 2) / frame_w
        y_center = ((y1 + y2) / 2) / frame_h
        width = (x2 - x1) / frame_w
        height = (y2 - y1) / frame_h

        # Keep every value inside the valid range
        x_center = max(0.0, min(1.0, x_center))
        y_center = max(0.0, min(1.0, y_center))
        width = max(0.001, min(1.0, width))
        height = max(0.001, min(1.0, height))

        lines.append(f"{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}")
    return lines


def convert_with_object_templates(
    dataset_root,
    output_root,
    train_split=0.8,
    frame_stride=2,  # stride used when picking negative candidates
    negative_sample_ratio=0.25,  # negatives per video = ratio * annotated frames
    target_size=(640, 640),  # (width, height) of the saved samples; falsy keeps the frame size
    compress_level=6  # 0 = uncompressed, 1-3 = LZW, anything else > 0 = zlib
):
    """
    Use object_images as reference, video frames as query.

    For every video, a single reference image is built from up to three
    template JPEGs in ``samples/<video_id>/object_images``. Each selected
    video frame is then stacked with that reference into a 6-channel TIFF
    ([ref_R, ref_G, ref_B, frame_R, frame_G, frame_B]) with a matching YOLO
    label file (empty for negative samples).

    Frame selection per video:
    - every annotated frame is converted (``frame_stride`` does NOT thin these);
    - negatives are drawn at random, without replacement, from the
      un-annotated frames at indices 0, frame_stride, 2*frame_stride, ...;
      their number is ``int(annotated_frames * negative_sample_ratio)``,
      capped by the number of candidates. The draw uses NumPy's global
      random state, so seed it beforehand for a reproducible dataset.

    Train/val split: the first ``int(num_videos * train_split)`` videos in
    annotation order go to 'train', the rest to 'val'.
    NOTE(review): if the annotation file is grouped by class, the val split
    only contains the last classes - confirm this is intended.

    Videos without a video file or without readable templates are skipped.

    Args:
        dataset_root: Dataset root containing ``annotations/annotations.json``
            and ``samples/<video_id>/``.
        output_root: Destination root for the split folders and ``data.yaml``.
        train_split: Fraction of videos assigned to the train split.
        frame_stride: Stride between negative candidate frames.
        negative_sample_ratio: Negatives per video relative to its annotated frames.
        target_size: ``(width, height)`` to resize samples to, or a falsy
            value to keep the original frame size.
        compress_level: Selects the TIFF codec only (see signature comment);
            the numeric level itself is not passed on to the writer.
    """
    dataset_root = Path(dataset_root)
    output_root = Path(output_root)

    # Load annotations
    with open(dataset_root / 'annotations' / 'annotations.json', 'r') as f:
        annotations = json.load(f)

    # Create output directories
    for split in ['train', 'val']:
        (output_root / split / 'images').mkdir(parents=True, exist_ok=True)
        (output_root / split / 'labels').mkdir(parents=True, exist_ok=True)

    # Determine train/val split by video
    num_videos = len(annotations)
    train_count = int(num_videos * train_split)

    # Codec choice for tifffile's `compression` parameter
    if compress_level > 0:
        compression_method = 'lzw' if compress_level in (1, 2, 3) else 'zlib'
    else:
        compression_method = None

    if target_size:
        out_w, out_h = target_size

    # Process each video
    for idx, video_data in enumerate(tqdm(annotations, desc="Processing videos")):
        video_id = video_data['video_id']
        sample_dir = dataset_root / 'samples' / video_id
        video_path = sample_dir / 'drone_video.mp4'

        if not video_path.exists():
            continue

        split = 'train' if idx < train_count else 'val'
        class_id = get_class_id(video_id.rsplit('_', 1)[0])

        ref_rgb = _load_reference_rgb(sample_dir / 'object_images')
        if ref_rgb is None:
            continue

        bbox_map = _collect_bboxes_by_frame(video_data)

        images_dir = output_root / split / 'images'
        labels_dir = output_root / split / 'labels'

        cap = cv2.VideoCapture(str(video_path))
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

            # Pick the negative frames for this video
            negative_candidates = [
                frame_num
                for frame_num in range(0, total_frames, frame_stride)
                if frame_num not in bbox_map
            ]
            num_negative = min(
                int(len(bbox_map) * negative_sample_ratio),
                len(negative_candidates)
            )
            selected_negatives = np.random.choice(
                negative_candidates,
                num_negative,
                replace=False
            )
            negative_frames = {int(frame_num) for frame_num in selected_negatives}

            # The reference only depends on the frame size, so it is resized
            # once per size instead of once per frame.
            ref_by_frame_size = {}

            for frame_count, frame in _iter_video_frames(cap):
                is_positive = frame_count in bbox_map
                if not (is_positive or frame_count in negative_frames):
                    continue

                original_h, original_w = frame.shape[:2]
                size_key = (original_w, original_h)

                if size_key not in ref_by_frame_size:
                    # Same two-step resize as before (frame size, then target
                    # size) so the stored reference pixels stay unchanged.
                    ref_ready = cv2.resize(ref_rgb, size_key, interpolation=cv2.INTER_LINEAR)
                    if target_size:
                        ref_ready = cv2.resize(ref_ready, (out_w, out_h), interpolation=cv2.INTER_LINEAR)
                    ref_by_frame_size[size_key] = ref_ready

                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                if target_size:
                    # One 3-channel resize instead of a per-channel loop over
                    # the stacked array; each channel is interpolated on its own.
                    frame_rgb = cv2.resize(frame_rgb, (out_w, out_h), interpolation=cv2.INTER_LINEAR)

                # Create 6-channel image: reference channels first, then the frame
                six_channel = np.concatenate([ref_by_frame_size[size_key], frame_rgb], axis=2)

                # Negative samples get an empty label file
                label_lines = _yolo_label_lines(
                    bbox_map.get(frame_count, []), class_id, original_w, original_h
                )

                # Save 6-channel TIFF (compression=None writes it uncompressed)
                image_name = f"{video_id}_frame_{frame_count:06d}"
                imwrite(
                    str(images_dir / f"{image_name}.tif"),
                    six_channel,
                    compression=compression_method
                )

                # Save label
                with open(labels_dir / f"{image_name}.txt", 'w') as f:
                    f.write('\n'.join(label_lines))
        finally:
            # Release the decoder even if a frame fails to convert
            cap.release()

    # Create data.yaml
    create_data_yaml(output_root, class_names)

    # Count what is on disk (includes samples left over from earlier runs)
    train_images = len(list((output_root / 'train' / 'images').glob('*.tif')))
    val_images = len(list((output_root / 'val' / 'images').glob('*.tif')))
    total_samples = train_images + val_images

    # Estimate storage: raw size of a 6-channel 8-bit sample, with an assumed
    # ~40% saving when compression is enabled
    if target_size:
        w, h = target_size
        size_per_sample_mb = (w * h * 6) / (1024 * 1024)  # MB
        if compress_level > 0:
            size_per_sample_mb *= 0.6  # Compression reduces by ~40%
    else:
        size_per_sample_mb = 8  # Default estimate

    total_storage_gb = (total_samples * size_per_sample_mb) / 1024

    print(f"\n{'='*70}")
    print(f"‚úÖ CONVERSION COMPLETE!")
    print(f"{'='*70}")
    print(f"\nüìÅ Output directory: {output_root}")
    print(f"üìä Statistics:")
    print(f"   - Train videos: {train_count}/{num_videos}")
    print(f"   - Val videos: {num_videos - train_count}/{num_videos}")
    print(f"   - Train samples: {train_images:,}")
    print(f"   - Val samples: {val_images:,}")
    print(f"   - Total samples: {total_samples:,}")
    print(f"\n‚öôÔ∏è  Settings used:")
    print(f"   - Resolution: {target_size if target_size else 'Original'}")
    print(f"   - Frame stride: {frame_stride}")
    print(f"   - Negative ratio: {negative_sample_ratio*100:.0f}%")
    print(f"   - Compression: Level {compress_level if compress_level > 0 else 'None'}")
    print(f"\nüíæ Storage estimate:")
    print(f"   - Size per sample: ~{size_per_sample_mb:.2f} MB")
    print(f"   - Total storage: ~{total_storage_gb:.1f} GB")
    print(f"\n‚úÖ Ready for training! Use: model.train(data='{output_root}/data.yaml', ...)")
    print(f"{'='*70}")

In [8]:
# Run the conversion with the function's default settings:
# - Resolution: every sample is resized to 640x640
# - Frame stride: 2 (only thins the negative candidate frames;
#   annotated frames are all converted)
# - Negative ratio: 25% of each video's annotated frames
# - Compression: level 6, which selects zlib-compressed TIFFs
# Storage: the function's own estimate for the recorded run below was ~34.5 GB
# (it assumes ~40% saving from compression; actual size on disk not measured here).

convert_with_object_templates(
    dataset_root=DATASET_ROOT,
    output_root=OUTPUT_ROOT,
    train_split=0.8,  # 80% videos for train, 20% for val
    # All other parameters are left at their defaults:
    # - frame_stride=2
    # - negative_sample_ratio=0.25
    # - target_size=(640, 640)
    # - compress_level=6
)

Processing videos: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14/14 [1:18:05<00:00, 334.69s/it]



‚úÖ CONVERSION COMPLETE!

üìÅ Output directory: /home/t1tc01-hoangphan/code/t1tc01-personal/ultralytics_sia-aero-eyes-Zalo-2025/dataset_output
üìä Statistics:
   - Train videos: 11/14
   - Val videos: 3/14
   - Train samples: 18,646
   - Val samples: 6,481
   - Total samples: 25,127

‚öôÔ∏è  Settings used:
   - Resolution: (640, 640)
   - Frame stride: 2
   - Negative ratio: 25%
   - Compression: Level 6

üíæ Storage estimate:
   - Size per sample: ~1.41 MB
   - Total storage: ~34.5 GB

‚úÖ Ready for training! Use: model.train(data='/home/t1tc01-hoangphan/code/t1tc01-personal/ultralytics_sia-aero-eyes-Zalo-2025/dataset_output/data.yaml', ...)
