In [11]:
'''
DESCRIPTION:
A program to concurrently download subsets of ImageNet using the ImageNet API.
To run the notebook correctly, modify the arguments in this cell and then run all cells.

ARGUMENT LIST:
scrape_only_flickr:        Set to True to download only images whose URL contains 'flickr'.
number_of_classes:         The number of classes to be randomly picked for downloading.
                           Only used when use_class_list is False.
images_per_class:          How many images to download for each class.
data_root:                 The directory for storing the downloaded images. It must already
                           exist; an 'imagenet_images' folder is created inside it.
use_class_list:            Whether to use the customized class list below instead of random
                           picking.
use_subdir:                If True, the images of each class are stored in a sub-folder named
                           after the class; if False, all images are stored directly in the
                           'imagenet_images' folder.
class_list:                A list of classes to be downloaded. Put the labels of the
                           classes (e.g. n12345678...) rather than the class names (e.g.
                           person, dog...) in the list, as str. For the full list of
                           label-name pairs, refer to the ./imagenet_class_info.json
                           file in this directory. To get the keywords of each
                           class, refer to the ./words.txt file in this directory.
multiprocessing_workers:   How many worker processes handle download requests simultaneously.
'''
scrape_only_flickr = False
number_of_classes = 156
images_per_class = 5
data_root = 'data'
use_class_list = False
use_subdir = True
class_list = ['n00007846']
multiprocessing_workers = 8

In [12]:
# import packages and components
import os, requests, json, time
import numpy as np
from multiprocessing import Pool, Process, Value, Lock
from requests.exceptions import ConnectionError, ReadTimeout, TooManyRedirects, MissingSchema, InvalidURL

In [13]:
# class for adding arguments
class Args:
    """Container for the downloader settings, type-checked on construction.

    Attributes mirror the constructor parameters one-to-one:
        scrape_only_flickr (bool), number_of_classes (int),
        images_per_class (int), data_root (str), use_class_list (bool),
        class_list (list), use_subdir (bool), multiprocessing_workers (int).

    Raises:
        TypeError: if any argument is not exactly of the expected type.
    """

    def __init__(self, 
                 scrape_only_flickr, 
                 number_of_classes, 
                 images_per_class, 
                 data_root, 
                 use_class_list, 
                 class_list, 
                 use_subdir,
                 multiprocessing_workers):
        self.scrape_only_flickr = scrape_only_flickr
        self.number_of_classes = number_of_classes
        self.images_per_class = images_per_class
        self.data_root = data_root
        self.use_class_list = use_class_list
        self.class_list = class_list
        self.multiprocessing_workers = multiprocessing_workers
        self.use_subdir = use_subdir
        self.checkArgs()
    
    def checkArgs(self):
        """Validate the type of every stored setting.

        The checks are exact (`type(x) is T`), so a bool is not accepted where
        an int is required. The first failing check raises TypeError.
        """
        # (attribute name, required type, error message), checked in this order.
        expected_types = (
            ('scrape_only_flickr', bool, 'Use boolean value for scrape_only_flickr'),
            ('number_of_classes', int, 'Use integers for number_of_classes'),
            ('images_per_class', int, 'Use integers for images_per_class'),
            ('data_root', str, 'Use str for data_root'),
            ('use_class_list', bool, 'Use boolean value for use_class_list'),
            ('class_list', list, 'Use list for class_list'),
            ('use_subdir', bool, 'Use boolean value for use_subdir'),
            # Check the instance attribute, not a same-named module global.
            ('multiprocessing_workers', int, 'Use int for multiprocessing workers'),
        )
        for attr_name, required_type, message in expected_types:
            if type(getattr(self, attr_name)) is not required_type:
                raise TypeError(message)


In [14]:
# Bundle the configuration variables into a validated Args instance.
args = Args(
    scrape_only_flickr=scrape_only_flickr,
    number_of_classes=number_of_classes,
    images_per_class=images_per_class,
    data_root=data_root,
    use_class_list=use_class_list,
    class_list=class_list,
    use_subdir=use_subdir,
    multiprocessing_workers=multiprocessing_workers,
)

In [15]:
# data_root must be a non-empty path to a directory that already exists
if args.data_root == '':
    raise Exception("-data_root is required to run downloader!")
elif not os.path.isdir(args.data_root):
    raise Exception(
        f'folder {args.data_root} does not exist! please provide existing folder in -data_root arg!'
    )

In [16]:
# get imagenet class info and the names of the classes to download
current_folder = os.path.realpath(os.path.abspath(''))
print(current_folder)
class_info_json_filename = 'imagenet_class_info.json'
class_info_json_filepath = os.path.join(current_folder, class_info_json_filename)

# class_info_dict is keyed by class label (wnid); each value holds at least
# 'class_name', 'img_url_count' and 'flickr_img_url_count' (the keys read below).
with open(class_info_json_filepath) as class_info_json_f:
    class_info_dict = json.load(class_info_json_f)

classes_to_scrape = []

if args.use_class_list:
    # Validate each requested label before accepting it, so an unknown label
    # is reported without first being added to the download list.
    for item in args.class_list:
        if item not in class_info_dict:
            raise Exception(f'Class {item} not found in ImageNet')
        classes_to_scrape.append(item)

else:
    # Keep only classes that list more URLs than requested images. The 0.9 / 0.8
    # factors discount the listed URL count -- presumably as headroom for dead
    # links; TODO confirm the intended margin.
    potential_class_pool = []
    for key, val in class_info_dict.items():
        if args.scrape_only_flickr:
            if int(val['flickr_img_url_count']) * 0.9 > args.images_per_class:
                potential_class_pool.append(key)
        else:
            if int(val['img_url_count']) * 0.8 > args.images_per_class:
                potential_class_pool.append(key)

    if len(potential_class_pool) < args.number_of_classes:
        raise Exception(
            f'With {args.images_per_class} images per class there are '
            f'{len(potential_class_pool)} classes to choose from. '
            'Decrease number of classes or decrease images per class.'
        )

    # Sample distinct classes uniformly at random (unseeded, so each run differs).
    picked_classes_idxes = np.random.choice(len(potential_class_pool), args.number_of_classes, replace = False)

    for idx in picked_classes_idxes:
        classes_to_scrape.append(potential_class_pool[idx])


print("Picked the following classes:")
print([ class_info_dict[class_wnid]['class_name'] for class_wnid in classes_to_scrape ])

/Users/xinye/Library/CloudStorage/OneDrive-Personal/education/M23_CMU/01_dataset/01_imageNet/ImageNet-Datasets-Downloader/00_ImageNet
Picked the following clases:
['tachymeter', 'acquaintance', 'borage', 'ring-necked parakeet', 'veal cordon bleu', 'votary', 'New World sparrow', 'kidney fern', 'bearberry', 'cape forget-me-not', 'junior middleweight', 'yam', 'trogon', 'dinner jacket', 'tube', 'bunghole', 'concept album', 'ashcake', 'licenser', 'thermopile', 'jig', 'insurgent', 'radiobiologist', 'air search radar', 'sitar player', 'confectionery', 'clingfish', 'corn', 'rib', 'Moreton Bay tulipwood', 'negative magnetic pole', 'teak', 'giant chinkapin', 'structural member', 'red shrubby penstemon', 'Turkish towel', 'constructivist', 'bowline', 'white yam', 'queen', 'portrait lens', 'gorgonian', 'gift wrapping', 'pond-scum parasite', 'common horehound', 'extern', 'pyrograph', 'daybook', 'wire cloth', 'tattoo', 'head', 'Bernese mountain dog', 'arbovirus', 'basketball', 'wayfaring tree', 'devi

In [None]:
# create dir for storing images (no error if it already exists)
imagenet_images_folder = os.path.join(args.data_root, 'imagenet_images')
os.makedirs(imagenet_images_folder, exist_ok=True)

In [None]:
# a class for storing downloading stats
class MultiStats():
    """Lock-protected download counters shared through multiprocessing Values.

    `stats[group][counter]` is a `Value('d', 0)` for every group in
    ('all', 'is_flickr', 'not_flickr') and every counter in
    ('tried', 'success', 'time_spent').
    """

    def __init__(self):
        self.lock = Lock()

        groups = ('all', 'is_flickr', 'not_flickr')
        counters = ('tried', 'success', 'time_spent')
        self.stats = {
            group: {counter: Value('d', 0) for counter in counters}
            for group in groups
        }

    def inc(self, cls, stat, val):
        """Add `val` to counter `stat` of group `cls` while holding the lock."""
        with self.lock:
            counter = self.stats[cls][stat]
            counter.value += val

    def get(self, cls, stat):
        """Return the current value of counter `stat` of group `cls`."""
        with self.lock:
            return self.stats[cls][stat].value


# single shared instance used by the download and reporting functions
multi_stats = MultiStats()

In [None]:
# Shared state for the download workers.
lock = Lock()                                  # guards url_tries and class_images
url_tries = Value('d', 0)                      # URLs attempted so far
scraping_t_start = Value('d', time.time())     # timestamp taken when this cell runs
class_folder = ''                              # destination folder of the class being scraped
class_images = Value('d', 0)                   # images saved for the class being scraped


def IMAGENET_API_WNID_TO_URLS(wnid):
    """Return the ImageNet API URL that lists the image URLs for class label `wnid`."""
    return f'http://www.image-net.org/api/imagenet.synset.geturls?wnid={wnid}'

In [None]:
def print_stats(cls, print_func):
    """Report the download statistics of one stats group.

    Args:
        cls: stats group to report ('is_flickr', 'not_flickr' or 'all').
        print_func: callable taking one string; receives each report line.

    The per-image time is the group's accumulated download time scaled by the
    ratio of wall-clock time since `scraping_t_start` to the accumulated time
    of the 'all' group (ratio 1.0 when nothing has been accumulated yet).
    """
    wall_clock_elapsed = time.time() - scraping_t_start.value
    summed_worker_time = multi_stats.get('all', 'time_spent')

    wall_to_worker_ratio = (
        1.0 if summed_worker_time == 0
        else wall_clock_elapsed / summed_worker_time
    )

    print_func(f'STATS For class {cls}:')

    tried_count = multi_stats.get(cls, "tried")
    success_count = multi_stats.get(cls, "success")
    print_func(f' tried {tried_count} urls with'
               f' {success_count} successes')

    # Counters are re-read for each line, as they may advance concurrently.
    if multi_stats.get(cls, "tried") > 0:
        successes = multi_stats.get(cls, "success")
        attempts = multi_stats.get(cls, "tried")
        print_func(f'{100.0 * successes/attempts}% success rate for {cls} urls ')

    if multi_stats.get(cls, "success") > 0:
        group_time = multi_stats.get(cls,"time_spent")
        successes = multi_stats.get(cls,"success")
        print_func(f'{group_time * wall_to_worker_ratio / successes} seconds spent per {cls} succesful image download')

In [None]:
def get_image(img_url):
    """Download one image URL into `class_folder` and record the outcome.

    The URL is skipped without being counted when it is empty, when the
    current class already has `args.images_per_class` images, or when it is a
    non-Flickr URL and `args.scrape_only_flickr` is set. Otherwise the URL is
    fetched; a response counts as a success only if it has an image
    content-type, at least 1000 bytes of content and a usable file name.

    Args:
        img_url: the image URL as a str.

    Returns:
        None. Results are reported through `multi_stats`, `url_tries`,
        `class_images` and the file written to `class_folder`.
    """
    print(f'Processing {img_url}')

    # lock, url_tries, class_folder and class_images are module globals; they
    # are only read or mutated through `.value`, never rebound, so no `global`
    # declaration is needed.

    if len(img_url) <= 1:
        return

    with lock:
        quota_reached = class_images.value >= args.images_per_class
    if quota_reached:
        return

    if 'flickr' in img_url:
        cls = 'is_flickr'
    else:
        cls = 'not_flickr'
        if args.scrape_only_flickr:
            return

    t_start = time.time()

    def finish(status):
        """Record elapsed time and the attempt (plus success) for this URL."""
        t_spent = time.time() - t_start
        multi_stats.inc(cls, 'time_spent', t_spent)
        multi_stats.inc('all', 'time_spent', t_spent)

        multi_stats.inc(cls, 'tried', 1)
        multi_stats.inc('all', 'tried', 1)

        if status == 'success':
            multi_stats.inc(cls, 'success', 1)
            multi_stats.inc('all', 'success', 1)
        elif status != 'failure':
            raise Exception(f'No such status {status}!!')

    with lock:
        url_tries.value += 1
        # Print a progress report every 250 attempted URLs.
        if url_tries.value % 250 == 0:
            print(f'\nScraping stats:')
            print_stats('is_flickr', print)
            print_stats('not_flickr', print)
            print_stats('all', print)

    try:
        img_resp = requests.get(img_url, timeout = 1)
    except requests.exceptions.RequestException:
        # RequestException is the base class of ConnectionError, ReadTimeout,
        # TooManyRedirects, MissingSchema and InvalidURL, and also covers the
        # remaining requests errors (e.g. chunked-encoding failures) that
        # would otherwise propagate and abort the whole pool.map call.
        return finish('failure')

    if 'image' not in img_resp.headers.get('content-type', ''):
        return finish('failure')

    # Very small payloads are rejected (presumably placeholder or error
    # images -- TODO confirm the 1000-byte threshold).
    if len(img_resp.content) < 1000:
        return finish('failure')

    # File name = last path segment of the URL without its query string.
    img_name = img_url.split('/')[-1]
    img_name = img_name.split("?")[0]

    if len(img_name) <= 1:
        return finish('failure')

    img_file_path = os.path.join(class_folder, img_name)

    with lock:
        # Re-check the quota now that the download is done: other workers may
        # have filled it in the meantime. Checking, writing and counting under
        # one lock keeps the class from exceeding images_per_class.
        if class_images.value >= args.images_per_class:
            # Dropped like the pre-download quota skip: nothing is recorded.
            return
        with open(img_file_path, 'wb') as img_f:
            img_f.write(img_resp.content)
        class_images.value += 1

    return finish('success')

In [None]:
# Rebuild `args` from the current values of the configuration variables.
args = Args(
    scrape_only_flickr=scrape_only_flickr,
    number_of_classes=number_of_classes,
    images_per_class=images_per_class,
    data_root=data_root,
    use_class_list=use_class_list,
    class_list=class_list,
    use_subdir=use_subdir,
    multiprocessing_workers=multiprocessing_workers,
)

In [None]:
# normal multiprocessing.Pool will not work with interactive python tools
# but this multiprocess works
# !pip install multiprocess
import multiprocess as mp

# Seconds to wait for a class's URL list before skipping that class.
URL_LIST_TIMEOUT = 30

for class_wnid in classes_to_scrape:

    class_name = class_info_dict[class_wnid]["class_name"]
    print(f'Scraping images for class \"{class_name}\"')
    url_urls = IMAGENET_API_WNID_TO_URLS(class_wnid)

    # Short pause between API requests (presumably to avoid hammering the
    # server -- TODO confirm).
    time.sleep(0.05)
    try:
        # Without a timeout a stalled server would block the notebook forever;
        # raise_for_status keeps an HTTP error page from being parsed as URLs.
        resp = requests.get(url_urls, timeout=URL_LIST_TIMEOUT)
        resp.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f'Skipping class \"{class_name}\": could not fetch its URL list ({err})')
        continue

    # class_folder is the module-level variable that get_image writes into.
    if args.use_subdir:
        class_folder = os.path.join(imagenet_images_folder, class_name)
    else:
        class_folder = imagenet_images_folder

    os.makedirs(class_folder, exist_ok=True)

    # Restart the per-class image counter checked by get_image.
    class_images.value = 0

    # One URL per line; undecodable bytes are replaced rather than aborting
    # the run (such a URL will simply fail to download).
    urls = [raw_url.decode('utf-8', errors='replace') for raw_url in resp.content.splitlines()]

    # For debugging, the pool below can be replaced by a sequential loop:
    #     for url in urls: get_image(url)

    print(f"Multiprocessing workers: {args.multiprocessing_workers}")

    with mp.Pool(processes=args.multiprocessing_workers) as p:
        p.map(get_image, urls)

In [None]:
# Preview a few entries only; displaying the whole class-info dictionary
# would flood the cell output.
{wnid: class_info_dict[wnid] for wnid in list(class_info_dict)[:5]}