In [5]:
import numpy as np
import os
from pathlib import Path
from tqdm import tqdm

def compute_mask_percentages(mask):
    """
    Compute the fractional extent of the non-zero region along each axis.

    The mask is indexed as (z, row, col). Every bound is expressed as a
    fraction of the mask's own length along that axis, so the function is not
    tied to 256-voxel volumes; for an axis of length 256 the result is exactly
    ``index / 256``.

    Args:
        mask: array-like with at least three dimensions. Elements > 0 count as
            foreground. Axes beyond the first three are collapsed (a position
            is occupied if any element along the extra axes is > 0).

    Returns:
        dict with 'z', 'row', 'col' keys, each holding ``[min_pct, max_pct]``
        where ``min_pct`` is ``first_occupied_index / axis_length`` and
        ``max_pct`` is ``(last_occupied_index + 1) / axis_length``.
        Returns None when no element is > 0.

    Raises:
        ValueError: if ``mask`` has fewer than three dimensions.
    """
    mask = np.asarray(mask)
    if mask.ndim < 3:
        raise ValueError(
            f"mask must have at least 3 dimensions (z, row, col), got {mask.ndim}"
        )

    occupied = mask > 0
    if not occupied.any():
        return None

    percentages = {}
    for axis, key in enumerate(('z', 'row', 'col')):
        # Project the foreground onto this axis instead of materialising the
        # coordinates of every foreground voxel; only the first and last
        # occupied index along the axis are needed.
        other_axes = tuple(i for i in range(occupied.ndim) if i != axis)
        hits = np.flatnonzero(occupied.any(axis=other_axes))
        size = float(mask.shape[axis])
        # +1 on the upper bound so the range covers the last occupied voxel.
        percentages[key] = [hits[0] / size, (hits[-1] + 1) / size]

    return percentages

def mask_pcts_to_bbox(mask_pcts):
    """
    Flatten a per-axis percentage dict into a single bbox vector.

    mask_pcts: dict with 'z', 'row', 'col' keys, each [min_pct, max_pct]
    Returns float32 vector [z_min, z_max, row_min, row_max, col_min, col_max]
    """
    bounds = []
    for axis in ('z', 'row', 'col'):
        axis_range = mask_pcts[axis]
        # Only the first two entries (min, max) of each range are used.
        bounds.append(axis_range[0])
        bounds.append(axis_range[1])
    return np.array(bounds, dtype=np.float32)

def extend_bbox_with_targets(mask_pcts, labels):
    """
    Grow the mask bounding box until it contains every target point.

    mask_pcts: dict with 'z', 'row', 'col' keys, each [min_pct, max_pct]
    labels: iterable of targets, each unpacked as (col, row, z)
    Returns float32 vector [z_min, z_max, row_min, row_max, col_min, col_max]
    """
    axes = ('z', 'row', 'col')

    # Seed the running bounds with the mask's own extent.
    lows = {axis: mask_pcts[axis][0] for axis in axes}
    highs = {axis: mask_pcts[axis][1] for axis in axes}

    for col_pct, row_pct, z_pct in labels:
        for axis, value in (('z', z_pct), ('row', row_pct), ('col', col_pct)):
            # Widen a bound only when the target lies strictly beyond it.
            if value < lows[axis]:
                lows[axis] = value
            if value > highs[axis]:
                highs[axis] = value

    return np.array(
        [
            lows['z'], highs['z'],
            lows['row'], highs['row'],
            lows['col'], highs['col'],
        ],
        dtype=np.float32,
    )

def check_target_in_range(target_pct, mask_pct):
    """
    Tell whether a target percentage lies inside a mask range (inclusive).

    target_pct: single value
    mask_pct: [min_pct, max_pct]
    """
    lower = mask_pct[0]
    # The upper bound is only looked up when the lower bound check passes.
    return lower <= target_pct and target_pct <= mask_pct[1]

def _save_bbox(bbox_output_dir, series_uid, bbox):
    """Save one bbox vector as <bbox_output_dir>/<series_uid>.npy."""
    np.save(Path(bbox_output_dir) / f"{series_uid}.npy", bbox)


def _print_analysis_summary(num_masks, missing_labels, bboxes_saved,
                            results, targets_outside, max_details=20):
    """Print the summary table and details of the first out-of-range targets."""
    print(f"\n{'='*80}")
    print("ANALYSIS SUMMARY")
    print(f"{'='*80}")
    print(f"Total masks processed: {num_masks}")
    print(f"Missing label files: {len(missing_labels)}")
    print(f"Bounding boxes saved: {bboxes_saved}")
    print(f"Total targets analyzed: {len(results)}")
    print(f"Targets outside mask range: {len(targets_outside)}")
    if results:
        print(f"Percentage outside: {len(targets_outside)/len(results)*100:.2f}%")
    print(f"{'='*80}\n")

    if not targets_outside:
        return

    print(f"TARGETS OUTSIDE MASK RANGE ({len(targets_outside)} cases):")
    print(f"{'='*80}")
    for i, result in enumerate(targets_outside[:max_details]):
        ranges = result['mask_ranges']
        print(f"\n{i+1}. Series: {result['series_uid']}, Target: {result['target_idx']}")
        print(f"   Target location [col, row, z]: {result['target']}")
        print("   Mask ranges:")
        print(f"     Col: {ranges['col']} - In range: {result['col_in_range']}")
        print(f"     Row: {ranges['row']} - In range: {result['row_in_range']}")
        print(f"     Z:   {ranges['z']} - In range: {result['z_in_range']}")

    if len(targets_outside) > max_details:
        print(f"\n... and {len(targets_outside) - max_details} more")


def analyze_masks_and_labels(mask_dir, label_dir, bbox_output_dir):
    """
    Analyze all masks and compare with labels, then save extended bounding boxes.

    For every ``*.npy`` mask in ``mask_dir`` the mask's fractional extent is
    computed. If a label file with the same stem exists in ``label_dir`` and
    holds at least one target, the box is extended to contain all targets;
    otherwise the plain mask box is used. One bbox file per non-empty mask is
    written to ``bbox_output_dir``. Masks without any non-zero value are
    reported with a warning and skipped (no bbox is written for them).

    Args:
        mask_dir: directory containing mask ``.npy`` files.
        label_dir: directory containing label ``.npy`` files named after the
            mask stem; each is loaded as targets of [col, row, z].
        bbox_output_dir: directory the bbox ``.npy`` files are written to;
            created if it does not exist.

    Returns:
        (results, targets_outside, missing_labels) where ``results`` has one
        dict per analyzed target, ``targets_outside`` is the subset of those
        falling outside the mask range on at least one axis, and
        ``missing_labels`` lists the series whose label file was not found.
    """
    os.makedirs(bbox_output_dir, exist_ok=True)

    # Sorted so the processing order, the returned lists and the "first 20"
    # report are reproducible; glob order is otherwise arbitrary.
    mask_files = sorted(Path(mask_dir).glob("*.npy"))

    print(f"Found {len(mask_files)} mask files\n")

    results = []
    missing_labels = []
    targets_outside = []
    bboxes_saved = 0

    for mask_file in tqdm(mask_files, desc="Processing masks"):
        series_uid = mask_file.stem

        mask_pcts = compute_mask_percentages(np.load(mask_file))
        if mask_pcts is None:
            print(f"Warning: No non-zero values in mask {series_uid}")
            continue

        # Labels are expected as (num_targets, 3) - [col, row, z]
        label_file = Path(label_dir) / f"{series_uid}.npy"
        labels = np.load(label_file) if label_file.exists() else None

        if labels is None:
            missing_labels.append(series_uid)
        elif len(labels) == 0:
            print(f"Warning: No targets for {series_uid}")

        has_targets = labels is not None and len(labels) > 0

        # Without targets the mask bbox is used as-is; otherwise it is grown
        # to include every target. Either way exactly one file is written.
        if has_targets:
            bbox = extend_bbox_with_targets(mask_pcts, labels)
        else:
            bbox = mask_pcts_to_bbox(mask_pcts)
        _save_bbox(bbox_output_dir, series_uid, bbox)
        bboxes_saved += 1

        if not has_targets:
            continue

        # Record, per target, whether it already fell inside the mask range.
        for target_idx, target in enumerate(labels):
            col_pct, row_pct, z_pct = target

            col_in_range = check_target_in_range(col_pct, mask_pcts['col'])
            row_in_range = check_target_in_range(row_pct, mask_pcts['row'])
            z_in_range = check_target_in_range(z_pct, mask_pcts['z'])
            all_in_range = col_in_range and row_in_range and z_in_range

            result = {
                'series_uid': series_uid,
                'target_idx': target_idx,
                'target': target,
                'mask_ranges': mask_pcts,
                'col_in_range': col_in_range,
                'row_in_range': row_in_range,
                'z_in_range': z_in_range,
                'all_in_range': all_in_range
            }
            results.append(result)

            if not all_in_range:
                targets_outside.append(result)

    _print_analysis_summary(
        len(mask_files), missing_labels, bboxes_saved, results, targets_outside
    )

    return results, targets_outside, missing_labels

if __name__ == "__main__":
    # Inputs: mask volumes and per-series target labels.
    mask_dir = "./mask_256"
    label_dir = r"E:\kaggle-rsna-data_processing3\label_percentage"
    # Output: one bbox .npy per processed mask.
    bbox_output_dir = "./bbox"

    results, targets_outside, missing_labels = analyze_masks_and_labels(
        mask_dir=mask_dir,
        label_dir=label_dir,
        bbox_output_dir=bbox_output_dir,
    )

Found 170 mask files



Processing masks: 100%|██████████| 170/170 [00:07<00:00, 21.45it/s]


ANALYSIS SUMMARY
Total masks processed: 170
Missing label files: 34
Bounding boxes saved: 170
Total targets analyzed: 174
Targets outside mask range: 6
Percentage outside: 3.45%

TARGETS OUTSIDE MASK RANGE (6 cases):

1. Series: 1.2.826.0.1.3680043.8.498.12896910506681881306246412668919668702, Target: 2
   Target location [col, row, z]: [0.48444894 0.47671568 0.71428573]
   Mask ranges:
     Col: [0.25390625, 0.71875] - In range: True
     Row: [0.265625, 0.6171875] - In range: True
     Z:   [0.05859375, 0.57421875] - In range: False

2. Series: 1.2.826.0.1.3680043.8.498.12914952223659958493995413641114579279, Target: 0
   Target location [col, row, z]: [0.5020027  0.66533864 0.16666667]
   Mask ranges:
     Col: [0.3125, 0.68359375] - In range: True
     Row: [0.31640625, 0.59375] - In range: False
     Z:   [0.01953125, 0.5703125] - In range: True

3. Series: 1.2.826.0.1.3680043.8.498.16386250344855221757144432829845114733, Target: 1
   Target location [col, row, z]: [0.46875    0.




In [6]:
import numpy as np
from pathlib import Path
from tqdm import tqdm

def check_bbox_coverage(bbox_dir, label_dir):
    """
    Verify that all bounding boxes cover their corresponding targets.

    Each ``*.npy`` file in ``bbox_dir`` is loaded as
    [z_min, z_max, row_min, row_max, col_min, col_max] and compared against
    the targets ([col, row, z]) in the label file of the same stem in
    ``label_dir``. Bboxes without a label file are skipped.

    Targets are converted to the bbox's dtype before comparing. A bbox stored
    at lower precision than the labels (e.g. float32 vs float64) can otherwise
    report a target as outside a bound that the target itself defined, purely
    because of rounding when the bbox was saved.

    Args:
        bbox_dir: directory containing bbox ``.npy`` files.
        label_dir: directory containing label ``.npy`` files.

    Returns:
        (all_covered, issues): ``all_covered`` is True when no target falls
        outside its bbox; ``issues`` holds one dict per uncovered target.
    """
    # Sorted so the order of reported issues is reproducible.
    bbox_files = sorted(Path(bbox_dir).glob("*.npy"))

    print(f"Checking {len(bbox_files)} bounding boxes...\n")

    issues = []

    for bbox_file in tqdm(bbox_files, desc="Verifying coverage"):
        series_uid = bbox_file.stem

        # Load bbox: [z_min, z_max, row_min, row_max, col_min, col_max]
        bbox = np.load(bbox_file)
        z_min, z_max, row_min, row_max, col_min, col_max = bbox

        # Load corresponding labels: (num_targets, 3) - [col, row, z]
        label_file = Path(label_dir) / f"{series_uid}.npy"
        if not label_file.exists():
            continue

        labels = np.load(label_file)

        for target_idx, target in enumerate(labels):
            # Compare at the bbox's precision (see docstring); the original
            # target row is still what gets reported.
            col_pct, row_pct, z_pct = np.asarray(target, dtype=bbox.dtype)

            z_ok = z_min <= z_pct <= z_max
            row_ok = row_min <= row_pct <= row_max
            col_ok = col_min <= col_pct <= col_max

            if not (z_ok and row_ok and col_ok):
                issues.append({
                    'series_uid': series_uid,
                    'target_idx': target_idx,
                    'target': target,
                    'bbox': bbox,
                    'z_ok': z_ok,
                    'row_ok': row_ok,
                    'col_ok': col_ok
                })

    all_covered = not issues

    # Print results
    print(f"\n{'='*80}")
    if all_covered:
        print("✓ SUCCESS: All targets are covered by their bounding boxes!")
    else:
        print(f"✗ FAILED: Found {len(issues)} targets not covered by bounding boxes")
        print("\nDetails of uncovered targets:")
        for issue in issues[:10]:  # Show first 10
            box = issue['bbox']
            print(f"\nSeries: {issue['series_uid']}, Target: {issue['target_idx']}")
            print(f"  Target [col, row, z]: {issue['target']}")
            print(f"  BBox: z[{box[0]:.4f}, {box[1]:.4f}], "
                  f"row[{box[2]:.4f}, {box[3]:.4f}], "
                  f"col[{box[4]:.4f}, {box[5]:.4f}]")
            print(f"  Coverage: z={issue['z_ok']}, row={issue['row_ok']}, col={issue['col_ok']}")
    print(f"{'='*80}")

    return all_covered, issues

if __name__ == "__main__":
    # Bboxes to verify and the label files they must cover.
    bbox_dir = "./bbox"
    label_dir = r"E:\kaggle-rsna-data_processing3\label_percentage"

    all_covered, issues = check_bbox_coverage(
        bbox_dir=bbox_dir,
        label_dir=label_dir,
    )

Checking 170 bounding boxes...



Verifying coverage: 100%|██████████| 170/170 [00:00<00:00, 221.15it/s]


✓ SUCCESS: All targets are covered by their bounding boxes!





In [7]:
import os
import shutil
from pathlib import Path

# Move every volume that has a matching bbox file (same filename stem) from
# the source folder into the destination folder.

# Define paths
source_folder = Path(r"E:\kaggle-rsna-data_processing3\volume_uint8_256")
reference_folder = Path(r"E:\kaggle-rsna-data_processing3\bbox")
destination_folder = Path(r"E:\kaggle-rsna-data_processing3\volume_uint8_256_mask")

# Create destination folder if it doesn't exist
destination_folder.mkdir(parents=True, exist_ok=True)

# Get list of filenames (without extensions) in the reference folder
reference_files = {f.stem for f in reference_folder.iterdir() if f.is_file()}

# Counter for moved files
moved_count = 0

# Snapshot (and sort) the directory listing before moving anything: files are
# removed from source_folder inside the loop, and what a live directory
# iterator yields after such changes is unspecified.
for file_path in sorted(source_folder.iterdir()):
    # Only regular files whose stem also exists in the reference folder.
    if file_path.is_file() and file_path.stem in reference_files:
        destination_path = destination_folder / file_path.name
        shutil.move(str(file_path), str(destination_path))
        moved_count += 1
        print(f"Moved: {file_path.name}")

print(f"\nTotal files moved: {moved_count}")

Moved: 1.2.826.0.1.3680043.8.498.10035643165968342618460849823699311381.npy
Moved: 1.2.826.0.1.3680043.8.498.10076056930521523789588901704956188485.npy
Moved: 1.2.826.0.1.3680043.8.498.10188636688783982623025997809119805350.npy
Moved: 1.2.826.0.1.3680043.8.498.10410600166004340343973545138447283460.npy
Moved: 1.2.826.0.1.3680043.8.498.10540586847553109495238524904638776495.npy
Moved: 1.2.826.0.1.3680043.8.498.10557880026294057874761753231388788828.npy
Moved: 1.2.826.0.1.3680043.8.498.10759842474698331813589731619457567641.npy
Moved: 1.2.826.0.1.3680043.8.498.10838261583340080792086755879475952843.npy
Moved: 1.2.826.0.1.3680043.8.498.10865391592895615633871689438787039175.npy
Moved: 1.2.826.0.1.3680043.8.498.10929608782694347957516071062422315982.npy
Moved: 1.2.826.0.1.3680043.8.498.10935907012185032169927418164924236382.npy
Moved: 1.2.826.0.1.3680043.8.498.11140496970152788589837488009637704168.npy
Moved: 1.2.826.0.1.3680043.8.498.11163718560814217911019576488539324434.npy
Moved: 1.2.8