In [124]:
import os
import shutil
import json

In [103]:
# Training split: the image folder, and the annotation json that is stored inside that same folder.
TRAIN_DIR = "data/rflow_data/train/"
TRAIN_ANNOTS = "data/rflow_data/train/_annotations.coco.json"

In [104]:
# Number of entries in the training folder. TRAIN_ANNOTS lives in this folder too,
# so this counts the annotation json along with the images.
len(os.listdir(TRAIN_DIR))

138

In [13]:
def read_json(file) -> dict:
    """
    Reads json file and returns python dictionary.
        Args:
            file(str): path to json file.
        Returns:
            The parsed content of the file (a dict when the top-level JSON value is an object).
        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if the file does not contain valid JSON.
    """
    # Pin the encoding: without it open() falls back to the platform's locale
    # encoding, which garbles non-ASCII text in UTF-8 JSON on some systems.
    with open(file, 'r', encoding='utf-8') as f:
        return json.load(f)

In [14]:
# Load the training annotations and show which top-level sections the file contains.
rflow_annots = read_json(TRAIN_ANNOTS)
rflow_annots.keys()

dict_keys(['info', 'licenses', 'categories', 'images', 'annotations'])

In [30]:
# Dataset size at a glance, displayed as (categories, images, annotations).
num_cats = len(rflow_annots['categories'])
num_imgs = len(rflow_annots['images'])
num_annots = len(rflow_annots['annotations'])
num_cats, num_imgs, num_annots

(50, 137, 1049)

In [16]:
def sep_annots_by_category(annots) -> dict:
    """
    Groups the entries of annots['annotations'] by their 'category_id'.
        Args:
            annots(dict): Annotation data; must hold an 'annotations' list whose entries each carry a 'category_id'.
        Returns:
            dict: category_id -> list of the annotation entries with that id, in their original order.
    """
    grouped = {}
    for entry in annots['annotations']:
        # setdefault opens an empty bucket the first time a category id is seen.
        grouped.setdefault(entry['category_id'], []).append(entry)
    return grouped

In [17]:
# Group the annotations per category; the keys are the category ids that occur in the annotations.
data_by_cat = sep_annots_by_category(rflow_annots)
data_by_cat.keys()

dict_keys([33, 19, 28, 27, 23, 12, 31, 21, 45, 39, 25, 4, 46, 29, 41, 18, 48, 14, 40, 38, 3, 13, 22, 30, 17, 15, 34, 8, 24, 35, 7, 37, 44, 10, 42, 20, 36, 2, 43, 26, 6, 32, 5, 11, 47, 9, 16, 1, 49])

In [21]:
# Peek at the first three annotations stored under category id 33.
data_by_cat[33][0:3]

[{'id': 0,
  'image_id': 0,
  'category_id': 33,
  'bbox': [8, 22, 624, 606],
  'area': 378144,
  'segmentation': [],
  'iscrowd': 0},
 {'id': 10,
  'image_id': 1,
  'category_id': 33,
  'bbox': [121, 133, 511, 426.5],
  'area': 217941.5,
  'segmentation': [],
  'iscrowd': 0},
 {'id': 36,
  'image_id': 4,
  'category_id': 33,
  'bbox': [163, 130, 353, 325],
  'area': 114725,
  'segmentation': [],
  'iscrowd': 0}]

In [48]:
def create_id2cat(annots) -> dict:
    """
    Builds a lookup table from category id to category name.
    Args:
        annots(dict): Annotation data; must hold a 'categories' list whose entries each carry 'id' and 'name'.
    Returns:
        dict: category id -> category name.
    """
    return {entry['id']: entry['name'] for entry in annots['categories']}

In [49]:
# Lookup table: category id -> category name.
rflow_id2cat = create_id2cat(rflow_annots)
rflow_id2cat

{0: 'Cattle',
 1: 'BCS-1',
 2: 'BCS-1.25',
 3: 'BCS-1.5',
 4: 'BCS-1.75',
 5: 'BCS-2',
 6: 'BCS-2.25',
 7: 'BCS-2.5',
 8: 'BCS-2.75',
 9: 'BCS-3',
 10: 'BCS-3.25',
 11: 'BCS-3.5',
 12: 'BCS-3.75',
 13: 'BCS-4',
 14: 'BCS-4.25',
 15: 'BCS-4.5',
 16: 'BCS-4.75',
 17: 'Breed-Grade-A2',
 18: 'Breed-Grade-A3',
 19: 'Breed-Grade-B',
 20: 'Breed-Grade-C',
 21: 'Compact-Udder',
 22: 'Deep-Clefted',
 23: 'Dry-Wound',
 24: 'Gir-Cow',
 25: 'HF-Crossbreed-Cow',
 26: 'Hallikar-Cow',
 27: 'Horn Present',
 28: 'Jersey-Crossbreed-Cow',
 29: 'Moderate-Worm-Load',
 30: 'Moderately pendulous Udder',
 31: 'Moderately-Clefted',
 32: 'Moderately-Pendulous-Udder',
 33: 'Moderately-Rough-Skin-Coat',
 34: 'No-Worms',
 35: 'No-Wound',
 36: 'Non-Descript-Breed-Cow',
 37: 'Pendulous-Udder',
 38: 'Polled',
 39: 'Pot-Belly-Present',
 40: 'Rough-Skin-Coat',
 41: 'Shiny-Skin-Coat',
 42: 'Slightly-Clefted',
 43: 'Small-Tight-Udder',
 44: 'Teat-Score-3',
 45: 'Teat-Score-5',
 46: 'Teat-Score-7',
 47: 'Teat-Score-9',
 4

In [50]:
def create_id_2_imgname(annots) -> dict:
    """
    Builds a lookup table from image id to image file name.
    Args:
        annots(dict): Annotation data; must hold an 'images' list whose entries each carry 'id' and 'file_name'.
    Returns:
        dict: image id -> file name.
    """
    return {img['id']: img['file_name'] for img in annots['images']}

In [52]:
# Lookup table: image id -> file name; its length is the number of distinct image ids.
rflow_id2name = create_id_2_imgname(rflow_annots)
len(rflow_id2name)

137

In [190]:
def create_categorized_dset(og_img_dir, data_by_cat, id2cat, id2name,
                            out_dir="data/RF_sorted_data/", keep_all_annots=False) -> None:
    """
    Segregates categorized annotation data into folders, each with its corresponding images and json annotation file.

    One sub-folder per category name in id2cat is created under out_dir. Categories
    without any annotation still get a folder holding an empty ({}) json file. Every
    image referenced by a category's annotations is copied into that category's
    folder, and a file named "annotations.<category name>.json" is written next to it.

    Args:
        og_img_dir(str): Folder containing the original image data to be split into categories.
        data_by_cat(dict): Python dictionary containing category-wise annotations for the dataset.
        id2cat(dict): Python dictionary mapping category id to corresponding category name.
        id2name(dict): Python dictionary mapping image_id to corresponding image name.
        out_dir(str): Root folder that receives the per-category sub-folders.
        keep_all_annots(bool): Selects the layout of the written json.
            False (default, original layout): image name -> one annotation. When an image has
            several annotations of the same category, only the last one is kept.
            True: image name -> list of all annotations of that category for the image.

    Returns:
        None

    Raises:
        KeyError: If an annotation references an image_id that is missing from id2name.
        OSError: If an image cannot be copied or a folder/json file cannot be written.
    """
    for cat_id, cat_name in id2cat.items():
        cat_dir = os.path.join(out_dir, str(cat_name))
        # exist_ok makes re-runs safe without a separate existence check.
        os.makedirs(cat_dir, exist_ok=True)

        cat_annots = {}
        # Categories that never occur in the annotations simply yield an empty json.
        for img_annot in data_by_cat.get(cat_id, []):
            img_name = id2name[img_annot['image_id']]

            # Copy each image once per category, not once per annotation.
            if img_name not in cat_annots:
                shutil.copy(os.path.join(og_img_dir, img_name),
                            os.path.join(cat_dir, img_name))

            if keep_all_annots:
                cat_annots.setdefault(img_name, []).append(img_annot)
            else:
                # Original layout: a later annotation of the same image replaces the earlier one.
                cat_annots[img_name] = img_annot

        json_path = os.path.join(cat_dir, f"annotations.{cat_name}.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(cat_annots, f)

In [191]:
# Build the per-category folders: copies the images and writes one annotation json per category.
create_categorized_dset(TRAIN_DIR, data_by_cat, rflow_id2cat, rflow_id2name)

In [192]:
def get_dir_filecount(directory):
    """
    Prints the number of entries present in each subdirectory of a given main directory.

    Every entry of a subdirectory is counted, whatever its kind (images, json
    files, nested folders). Entries of the main directory that are not
    directories themselves are skipped.

    Args:
        directory(str): Path to the main directory whose subdirectories are counted.

    Returns:
        None
    """
    for subdir in os.listdir(directory):
        subdir_path = os.path.join(directory, subdir)
        # A stray regular file in the main directory would make os.listdir
        # raise NotADirectoryError, so only real subdirectories are reported.
        if not os.path.isdir(subdir_path):
            continue
        print(f"Sub-Dir: {subdir} | No. of files: {len(os.listdir(subdir_path))}")
# Report how many entries each category folder ended up with (the annotation json is counted too).
get_dir_filecount("data/RF_sorted_data/")

Sub-Dir: Breed-Grade-A3 | No. of files: 41
Sub-Dir: Moderately-Clefted | No. of files: 83
Sub-Dir: Teat-Score-5 | No. of files: 53
Sub-Dir: Polled | No. of files: 19
Sub-Dir: BCS-1.25 | No. of files: 2
Sub-Dir: Breed-Grade-B | No. of files: 38
Sub-Dir: Shiny-Skin-Coat | No. of files: 37
Sub-Dir: No-Worms | No. of files: 25
Sub-Dir: Cattle | No. of files: 1
Sub-Dir: Breed-Grade-C | No. of files: 15
Sub-Dir: Breed-Grade-A2 | No. of files: 11
Sub-Dir: Hallikar-Cow | No. of files: 6
Sub-Dir: Teat-Score-7 | No. of files: 58
Sub-Dir: Moderate-Worm-Load | No. of files: 56
Sub-Dir: BCS-1 | No. of files: 2
Sub-Dir: Rough-Skin-Coat | No. of files: 14
Sub-Dir: Horn Present | No. of files: 87
Sub-Dir: Dry-Wound | No. of files: 63
Sub-Dir: No-Wound | No. of files: 21
Sub-Dir: BCS-4.75 | No. of files: 3
Sub-Dir: BCS-4 | No. of files: 12
Sub-Dir: HF-Crossbreed-Cow | No. of files: 71
Sub-Dir: Non-Descript-Breed-Cow | No. of files: 10
Sub-Dir: BCS-1.5 | No. of files: 11
Sub-Dir: Moderately-Pendulous-Ud