In [None]:
from google.colab import drive
# Mount Google Drive under /content/gdrive; the dataset paths used further
# down in this notebook point into this mount.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
import numpy as np
from PIL import Image
from pathlib import Path
import os

"""
Parameters to change depending on your dataset
"""

# Root of the dataset images on the mounted Google Drive.
data_path = Path("/content/gdrive/MyDrive/FoodSeg103/Images/")

# Annotation directory inside the dataset root
# (presumably the segmentation masks -- confirm against the dataset layout).
ann_path = data_path / "ann_dir"

# Per-split annotation directories; these are the directories scanned by the
# counting functions in the usage section.
train_path = ann_path / "train/"
test_path = ann_path / "test/"


# Unique pixel counter for each image array
def count_pixel_values(path):
  """Count, for every pixel value, how many image files in *path* contain it.

  The directory is scanned non-recursively in file-name order. Entries that
  are not regular files (e.g. sub-directories) are skipped and are NOT
  included in the returned image count.

  Args:
    path: Directory containing the image files to inspect.

  Returns:
    A tuple ``(counts, counter)``: ``counts`` maps each distinct pixel value
    (as a plain Python scalar) to the number of files in which it occurs at
    least once, and ``counter`` is the number of files that were read.
  """
  counter = 0
  counts = {}

  # Close the scandir iterator deterministically once the listing is sorted.
  with os.scandir(path) as scan:
    entries = sorted(scan, key=lambda e: e.name)

  for entry in entries:
    if not entry.is_file():
      continue

    # The context manager releases the file handle as soon as the pixel data
    # has been copied into the array.
    with Image.open(entry.path) as img_open:
      img_array = np.array(img_open)

    # Each distinct value is counted once per image, however many pixels
    # carry it. tolist() converts NumPy scalars to plain Python scalars so
    # the keys print cleanly.
    for value in np.unique(img_array).tolist():
      counts[value] = counts.get(value, 0) + 1

    # Only files that were actually processed count as images.
    counter += 1

  return counts, counter


# Average percentage of each image covered by every class label
def count_percentage_pixels_per_class(path):
  """Compute the average share of an image covered by each class label.

  For every image file in *path* (non-recursive, file-name order) the
  fraction of array elements equal to each label is computed. The fractions
  are summed per label, divided by the number of files read and expressed as
  a percentage; an image that lacks a label contributes 0 for that label.

  Entries that are not regular files (e.g. sub-directories) are skipped and
  are NOT included in the returned image count.

  Args:
    path: Directory containing the label images to inspect.

  Returns:
    A tuple ``(counts, counter)``: ``counts`` maps each label (as a plain
    Python scalar) to its average percentage over all files read, and
    ``counter`` is the number of files that were read.
  """
  counter = 0
  fraction_sums = {}

  # Close the scandir iterator deterministically once the listing is sorted.
  with os.scandir(path) as scan:
    entries = sorted(scan, key=lambda e: e.name)

  for entry in entries:
    if not entry.is_file():
      continue

    # The context manager releases the file handle as soon as the pixel data
    # has been copied into the array.
    with Image.open(entry.path) as img_open:
      img_array = np.array(img_open)

    # One pass yields every label together with its element count, instead
    # of rescanning the whole array once per label.
    labels, pixel_counts = np.unique(img_array, return_counts=True)

    # img_array.size equals height * width for single-channel images and
    # keeps the fractions within [0, 1] for multi-channel arrays as well.
    fractions = pixel_counts / img_array.size

    for label, fraction in zip(labels.tolist(), fractions.tolist()):
      fraction_sums[label] = fraction_sums.get(label, 0) + fraction

    # Only files that were actually processed count as images.
    counter += 1

  # fraction_sums is empty whenever counter is 0, so no division by zero can
  # occur here.
  counts = {
      label: (total / counter) * 100
      for label, total in fraction_sums.items()
  }

  return counts, counter





"""
Print counts more elegantly
"""
def printCounts(sorted_pixel_counts):
  """Print a blank separator, then one ``key: value`` line per mapping entry.

  Entries are written in the mapping's own iteration order.
  """
  print("\n")
  for label in sorted_pixel_counts:
    amount = sorted_pixel_counts[label]
    print(f"{label}: {amount}")

"""
Print counts side by side (key sorted, and value sorted)
"""
def printC(data):
  """Print *data* as two side-by-side columns of ``key: value`` pairs.

  The left column lists the entries sorted by key (ascending); the right
  column lists the same entries sorted by value (descending). Keys and
  values are left-aligned and padded to the widest key / value so that the
  columns line up.

  Args:
    data: Mapping of keys to values. Nothing is printed when it is empty.
  """
  # max() below would raise ValueError on an empty mapping.
  if not data:
    return

  # Sort the entries by value (second element), largest first
  value_sorted_data = sorted(data.items(), key=lambda x: x[1], reverse=True)

  # Sort the entries by key (first element), smallest first
  key_sorted_data = sorted(data.items(), key=lambda x: x[0])

  # Widest key and value, used as the padding widths
  max_key_length = max(len(str(key)) for key in data)
  max_value_length = max(len(str(value)) for value in data.values())

  # Emit the key-sorted and value-sorted entries side by side
  for (key_sorted_key, key_sorted_value), (value_sorted_key, value_sorted_value) in zip(key_sorted_data, value_sorted_data):
    key_sorted_format = f"{key_sorted_key: <{max_key_length}}: {key_sorted_value: <{max_value_length}}"
    value_sorted_format = f"{value_sorted_key: <{max_key_length}}: {value_sorted_value: <{max_value_length}}"
    print(f"{key_sorted_format}     {value_sorted_format}")


"""
Check counts percentage
"""

def check_counts_percentage(counts, number_masks, percentage):
  """Return the keys of *counts* whose count is below a share of the masks.

  The cut-off is ``percentage`` percent of ``number_masks``; keys whose
  count is strictly smaller than it are returned in the iteration order of
  *counts*.
  """
  cutoff = number_masks * percentage / 100
  return [label for label, occurrences in counts.items() if occurrences < cutoff]


# Usage

# --- Per-class presence counts for both splits (one pass over each folder) ---
unique_pixel_counts_train, number_masks_train = count_pixel_values(train_path)
sorted_unique_pixel_counts_train = {
    label: unique_pixel_counts_train[label]
    for label in sorted(unique_pixel_counts_train)
}

unique_pixel_counts_test, number_masks_test = count_pixel_values(test_path)
sorted_unique_pixel_counts_test = {
    label: unique_pixel_counts_test[label]
    for label in sorted(unique_pixel_counts_test)
}

# --- Average pixel share of every class over the train split ---
print("Average percentage of pixels per class / category \n")

pixel_counts_train, total_number_masks_train = count_percentage_pixels_per_class(train_path)
printC(pixel_counts_train)

# --- Train split report ---
print(
    "---------------------------------------------------------------------",
    "---------- Train set : MASK -------------- " +  str(number_masks_train) + " img -----------------------",
    "---------------------------------------------------------------------",
    sep="\n",
)

print("Number of times the class / category appears accross all images(labels)\n")
printC(unique_pixel_counts_train)

# Total number of (image, class) occurrences in the train split
sum_sorted_pixel_counts_train = sum(unique_pixel_counts_train.values())
print(sum_sorted_pixel_counts_train)

# Classes present in fewer than 50 / 30 / 15 percent of the train masks
for _pct, _heading in (
    (50, "\nClasses appearing less than 50% of the time in train set"),
    (30, "\nClasses appearing less than 30% of the time in train set"),
    (15, "\nClasses appearing less than 15% of the time in train set"),
):
  print(_heading)
  print(check_counts_percentage(sorted_unique_pixel_counts_train, number_masks_train, _pct))

# --- Test split report ---
print(
    "---------------------------------------------------------------------",
    "---------- Test set : MASK -------------- " +  str(number_masks_test) + " img -----------------------",
    "---------------------------------------------------------------------",
    sep="\n",
)

printC(unique_pixel_counts_test)

# Total number of (image, class) occurrences in the test split
sum_sorted_pixel_counts_test = sum(sorted_unique_pixel_counts_test.values())
print(sum_sorted_pixel_counts_test)

# Classes present in fewer than 50 / 30 / 15 percent of the test masks
for _pct, _heading in (
    (50, "\nClasses appearing less than 50% of the time in test set"),
    (30, "\nClasses appearing less than 30% of the time in test set"),
    (15, "\nClasses appearing less than 15% of the time in test set"),
):
  print(_heading)
  print(check_counts_percentage(sorted_unique_pixel_counts_test, number_masks_test, _pct))




Average percentage of pixels per class / category 

0  : 47.52340094346105         0  : 47.52340094346105    
1  : 0.03029396877690047       58 : 4.168726076788935    
2  : 0.01630457717143645       48 : 3.852416978646833    
3  : 0.6354181825868515        46 : 2.9598895549286297   
4  : 0.11723304391003136       70 : 2.3473082513452006   
5  : 0.8932332782613723        47 : 2.0973388878838217   
6  : 0.05579409427492126       67 : 1.9660779174941034   
7  : 0.01593216820547152       84 : 1.8562395236934086   
8  : 1.6619126339454309        10 : 1.8362147148513646   
9  : 0.4781026850305686        87 : 1.8094642686847535   
10 : 1.8362147148513646        52 : 1.7058094745421204   
11 : 0.11281105919959021       8  : 1.6619126339454309   
12 : 0.3833381542052304        66 : 1.4315301643384895   
13 : 0.1286066378870847        73 : 1.3218847394342383   
14 : 0.3958635834686095        54 : 1.210889185156124    
15 : 0.1316467069827316        30 : 1.0761959254631228   
16 : 0.0332641907597