In [1]:
# Download data set from Kaggle and unzip into directory
!pip install kaggle
!mkdir -p ~/.kaggle
!kaggle datasets download -d abdohamdg/fracatlas-dataset
!unzip -q fracatlas-dataset.zip -d fracatlas_directory/

Dataset URL: https://www.kaggle.com/datasets/abdohamdg/fracatlas-dataset
License(s): ODC Public Domain Dedication and Licence (PDDL)
Downloading fracatlas-dataset.zip to /content
 74% 65.0M/87.6M [00:00<00:00, 147MB/s]
100% 87.6M/87.6M [00:00<00:00, 154MB/s]


In [2]:
import os
import re

In [3]:
# Dataset root and the names of the subset folders read from beneath it
base_directory = "fracatlas_directory/Fracatlas/"
subsets = [
    "train",
    "test",
    "validation",
]

In [4]:
# Report how many image entries and label entries each subset contains

for subset in subsets:
    # The validation labels folder is spelled 'lables' (apparent typo in the dataset)
    label_folder = 'lables' if subset == 'validation' else 'labels'
    image_directory = os.path.join(base_directory, subset, 'images')
    label_directory = os.path.join(base_directory, subset, label_folder)

    # Every directory entry is counted; nothing is filtered by extension
    image_files = os.listdir(image_directory)
    label_files = os.listdir(label_directory)
    num_images = len(image_files)
    num_labels = len(label_files)

    # One call, three lines of output (the last one carries a trailing blank line)
    print(
        f"{subset.capitalize()} subset:",
        f"  Number of images: {num_images}",
        f"  Number of labels: {num_labels}\n",
        sep="\n",
    )

Train subset:
  Number of images: 574
  Number of labels: 574

Test subset:
  Number of images: 61
  Number of labels: 61

Validation subset:
  Number of images: 82
  Number of labels: 82



In [5]:
# Check if image and label names match

def name_match(image_dir, label_dir):
    """Report images in ``image_dir`` that have no label file in ``label_dir``.

    An image ``<stem>.<ext>`` counts as labelled when ``label_dir`` contains an
    entry named ``<stem>.txt``. Only this direction is checked: label files
    without a matching image are not reported.

    Args:
        image_dir: Path of the directory holding the image files.
        label_dir: Path of the directory holding the ``.txt`` label files.

    Returns:
        None. The outcome is printed: either the sorted list of image file
        names lacking a label, or a confirmation that none are missing.

    Raises:
        OSError: If either directory cannot be listed.
    """
    image_files = os.listdir(image_dir)
    # A set gives constant-time membership tests instead of rescanning a list
    # of label names once per image.
    label_names = set(os.listdir(label_dir))

    # Sorted so the report does not depend on os.listdir()'s arbitrary order.
    missing_labels = sorted(
        image_file
        for image_file in image_files
        if os.path.splitext(image_file)[0] + '.txt' not in label_names
    )

    if missing_labels:
        print(f"Missing labels for images: {missing_labels}")
    else:
        print("All images have corresponding label files.")

# Run the image/label name check on every subset
for subset in subsets:
    # The validation labels folder is spelled 'lables' (apparent typo in the dataset)
    label_folder = 'lables' if subset == 'validation' else 'labels'
    image_directory = os.path.join(base_directory, subset, 'images')
    label_directory = os.path.join(base_directory, subset, label_folder)

    # Header line, the check's own report, then a blank separator line
    print(f"Checking for mismatched labels in {subset} subset:")
    name_match(image_directory, label_directory)
    print()

Checking for mismatched labels in train subset:
All images have corresponding label files.

Checking for mismatched labels in test subset:
All images have corresponding label files.

Checking for mismatched labels in validation subset:
All images have corresponding label files.



In [6]:
# Check if all label files have same format

# Function to check number of entries in label file and return total count
def check_label_file(label_file_path):
    """Return how many whitespace-separated entries a label file contains.

    Entries from every line are counted together, so an empty or
    whitespace-only file yields 0.
    """
    with open(label_file_path, 'r') as handle:
        # split() with no argument ignores leading/trailing whitespace and
        # newlines, so each line contributes exactly its token count.
        return sum(len(line.split()) for line in handle)

# Histogram: number of entries in a label file -> how many files have that many
count_dict = {}
total_empty_files = 0

# Tally the label files of every subset
for subset in subsets:
    # The validation labels folder is spelled 'lables' (apparent typo in the dataset)
    label_folder = 'lables' if subset == 'validation' else 'labels'
    label_directory = os.path.join(base_directory, subset, label_folder)
    label_files = os.listdir(label_directory)

    for label_file in label_files:
        label_file_path = os.path.join(label_directory, label_file)
        num_count = check_label_file(label_file_path)

        # Files with no entries are tracked separately from the histogram
        if num_count == 0:
            total_empty_files += 1
            continue
        count_dict[num_count] = count_dict.get(num_count, 0) + 1

# Empty-file total first, then the histogram in ascending order of entry count
print(f"Total empty files: {total_empty_files}")
for number in sorted(count_dict):
    print(f"Total files with {number} numbers: {count_dict[number]}")

Total empty files: 0
Total files with 5 numbers: 546
Total files with 10 numbers: 146
Total files with 15 numbers: 17
Total files with 20 numbers: 7
Total files with 25 numbers: 1
