This notebook contains the data preprocessing and cleanup codes for the ImageNet-1k dataset. Its main purpose is to select the object of interest (OOI) out of the original dataset and create a new dataset out of it.

Its input is the original ImageNet-1k dataset (downloaded from https://huggingface.co/datasets/imagenet-1k/blob/main/data/train_images_0.tar.gz), and its output is a new dataset categorized by either having the OOI (positive class) or not (negative class).

#### Import libraries

In [5]:
import os
import shutil
from tqdm import tqdm

import numpy as np
from PIL import Image
np.random.seed(0)

#### Settings

In [6]:
# dir of the extracted image files
images_dir = './dataset/images_train/'
# path to the annotations file "LOC_synset_mapping.txt"
annotations_file = './dataset/LOC_synset_mapping.txt'
# object of interest specified by a list of keywords
ooi_keywords = [
    'english springer',
    'welsh springer',
    'irish water spaniel',
    'brittany spaniel',
    'clumber',
    'cocker spaniel',
    'sussex spaniel',
    'german short-haired pointer',
    'vizsla',
    'chesapeake bay retriever',
    'curly-coated retriever',
    'flat-coated retriever',
    'golden retriever',
    'labrador retriever',
    'english setter',
    'gordon setter',
    'irish setter',
    'sealyham terrier',
    'lakeland terrier',
    'american staffordshire terrier',
    'staffordshire bullterrier',
    'wire-haired fox terrier',
    'giant schnauzer',
    'miniature schnauzer',
    'standard schnauzer',
    'airedale',
    'australian terrier',
    'bedlington terrier',
    'border terrier',
    'boston bull',
    'cairn',
    'dandie dinmont',
    'irish terrier',
    'kerry blue terrier',
    'lhasa',
    'norfolk terrier',
    'norwich terrier',
    'scotch terrier',
    'silky terrier',
    'soft-coated wheaten terrier',
    'tibentan terrier',
    'west highland white terrier',
    'yorkshire terrier',
    'black-and-tan coonhound',
    'english foxhound',
    'walker hound',
    'italian greyhound',
    'whippet',
    'borzoi',
    'irish wolfhound',
    'afghan hound',
    'basset',
    'beagle',
    'bloodhound',
    'bluetick',
    'ibizan hound',
    'norwegian elkhound',
    'otterhound',
    'redbone',
    'saluki',
    'scottish deerhound',
    'weimaraner',
    'rhodesian ridgeback',
    'blenheim spaniel',
    'papillon',
    'chihuahua',
    'japanese spaniel',
    'maltese dog',
    'pekinese',
    'shih-tzu',
    'toy terrier',
    'groenendael',
    'malinois',
    'boarder collie',
    'bouvier des flandres',
    'briard',
    'collie',
    'german shepherd',
    'keipie',
    'komondor',
    'old english sheepdog',
    'rottweiler',
    'shetland sheepdog',
    'affenpinscher',
    'doberman',
    'miniature pinscher',
    'kuvasz',
    'french bulldog',
    'tibetan mastiff',
    'appenzeller',
    'bernese mountain dog',
    'entlebucher',
    'great swiss mountain dog',
    'malamute',
    'siberian husky',
    'boxer',
    'bull mastiff',
    'eskimo dog',
    'great dane',
    'saint bernard',
    'cardigan',
    'pembroke',
    'brabancon griffon',
    'miniature poodle',
    'standard poodle',
    'toy poodle',
    'chow',
    'keeshond',
    'pomeranian',
    'samoyed',
    'basenji',
    'daimatian',
    'great pyrenees',
    'leonberg',
    'mexciain hairless',
    'newfoundland',
    'pug',
]
# limit on the max number of samples
max_samples = 10000
# output dir
output_dir = './dataset/preprocessed'

neg_dir = os.path.join(output_dir, 'neg')
pos_dir = os.path.join(output_dir, 'pos')

#### Codes

In [7]:
annotations = {}
with open(annotations_file, 'r') as f:
    for line in f.readlines():
        line = line.strip().lower()
        id = line.split(' ')[0]
        annotations[id] = ' '.join(line.split(' ')[1:]).split(', ')
        # print(annotations[id])

ooi_id = set()
for keyword in ooi_keywords:
    for key, value in annotations.items():
        if keyword in value:
            ooi_id.add(key)
if not ooi_id:
    raise Warning(f'No object of interest given by {ooi_keywords} found in the annotations file.')

In [8]:
images_path = []
with os.scandir(images_dir) as it:
    for entry in it:
        if entry.is_file():
            images_path.append(entry.path)

print('Collecting positive samples...')
pos_samples = []
for i, path in tqdm(enumerate(images_path)):
    filename = os.path.basename(path)
    if filename.split('.')[0].split('_')[0] in ooi_id:
        pos_samples.append(path)
    if len(pos_samples) >= max_samples:
        break
print(f'Done. {len(pos_samples)} positive samples collected.')
print('')

print('Randomly collecting negative samples...')
neg_samples = []
np.random.shuffle(images_path)
i = 0
with tqdm(total=len(pos_samples)) as pbar:
    while len(neg_samples) < len(pos_samples):
        filename = os.path.basename(images_path[i])
        if filename.split('.')[0].split('_')[0] not in ooi_id:
            neg_samples.append(images_path[i])
            pbar.update(1)
        i += 1
print(f'Done. {len(neg_samples)} negative samples collected.')

Collecting positive samples...


49111it [00:00, 334661.45it/s]


Done. 10000 positive samples collected.

Randomly collecting negative samples...


100%|██████████| 10000/10000 [00:00<00:00, 265706.12it/s]

Done. 10000 negative samples collected.





In [9]:
os.makedirs(neg_dir, exist_ok=True)
os.makedirs(pos_dir, exist_ok=True)

print('Copying pos samples to output dir...')
for path in tqdm(pos_samples):
    image = Image.open(path)
    if image.mode != 'RGB':
        image = image.convert('RGB')
    image.save(os.path.join(pos_dir, os.path.basename(path)))
print('Copying neg samples to output dir...')
for path in tqdm(neg_samples):
    image = Image.open(path)
    if image.mode != 'RGB':
        image = image.convert('RGB')
    image.save(os.path.join(neg_dir, os.path.basename(path)))
print('Done.')

Copying neg samples to output dir...


100%|██████████| 10000/10000 [01:12<00:00, 138.22it/s]


Copying pos samples to output dir...


100%|██████████| 10000/10000 [00:30<00:00, 327.95it/s]

Done.



