In [1]:
import os
import shutil
import urllib.request
from pycocotools.coco import COCO


In [None]:
COUNT_IMAGES_TO_DOWNLOAD = 4000

In [2]:
PATH_TO_STORE_DOWNLOADED_PHOTOS = "./drive/MyDrive/Project/Photos/"
PATH_TO_COCO_ANNOTATIONS_ROOT_FOLDER = "./drive/MyDrive/Project/Photos"
DATA_TYPE='train2017'
ANNOTATION_FILE = '{}/annotations/instances_{}.json'.format(PATH_TO_COCO_ANNOTATIONS_ROOT_FOLDER, DATA_TYPE)
PHOTO_ZIPFILE_NAME = './drive/MyDrive/Project/coco'

In [4]:
def init_counter(path_to_count_content):
    if not os.path.exists(path_to_count_content):
        return 0
    else:
        already_existing_no_of_files = len(os.listdir(path_to_count_content))
        print("Already downloaded files: ", already_existing_no_of_files)
        return already_existing_no_of_files

def download_non_already_existing_images(filename_with_path, url):
    if not os.path.exists(filename_with_path):
        return download_image(url, filename_with_path)
    else:
        print("Skipping file, already exists: ", filename_with_path)
        return False

def download_image(url, filename):
    try:
        with urllib.request.urlopen(url) as response, open(filename, 'wb') as saving_file:
            shutil.copyfileobj(response, saving_file)
            return True
    except urllib.error.HTTPError as e:
        print(e)
        return False

def zip_images(zipfile_name, path_to_data_to_be_zipped):
    shutil.make_archive(zipfile_name, 'zip', path_to_data_to_be_zipped)


In [None]:
def main():
    if not os.path.exists(PATH_TO_STORE_DOWNLOADED_PHOTOS):
        os.makedirs(PATH_TO_STORE_DOWNLOADED_PHOTOS)

    coco = COCO(ANNOTATION_FILE)

    personCategory = coco.getCatIds(catNms=['person'])
    personImageIds = coco.getImgIds(catIds=personCategory)
    personImageData = coco.loadImgs(personImageIds)
    personImageDataIterator = iter(personImageData)

    counter = init_counter(PATH_TO_STORE_DOWNLOADED_PHOTOS)
    while counter <= COUNT_IMAGES_TO_DOWNLOAD:
        try:
            filename_with_path, url = get_filename_and_url(personImageDataIterator)
            success = download_non_already_existing_images(filename_with_path, url)
            if success:
                counter += 1
                print("Download #", counter)
        except StopIteration:
            print('Tried to download more photos than available, stopping.')
            counter = COUNT_IMAGES_TO_DOWNLOAD + 1
            pass
    zip_images(PHOTO_ZIPFILE_NAME, PATH_TO_STORE_DOWNLOADED_PHOTOS)

def get_filename_and_url(personImageDataIterator):
    photo_data = next(personImageDataIterator)
    url = photo_data['coco_url']
    filename = photo_data['file_name']
    filename_with_path = PATH_TO_STORE_DOWNLOADED_PHOTOS + filename

    return filename_with_path, url


In [None]:
if __name__ == '__main__':
    main()

loading annotations into memory...
Done (t=23.20s)
creating index...
index created!
Already downloaded files:  714
Skipping file, already exists:  ./drive/MyDrive/Project/Photos/000000262145.jpg
Skipping file, already exists:  ./drive/MyDrive/Project/Photos/000000262146.jpg
Skipping file, already exists:  ./drive/MyDrive/Project/Photos/000000524291.jpg
Skipping file, already exists:  ./drive/MyDrive/Project/Photos/000000262148.jpg
Skipping file, already exists:  ./drive/MyDrive/Project/Photos/000000393223.jpg
Skipping file, already exists:  ./drive/MyDrive/Project/Photos/000000393224.jpg
Skipping file, already exists:  ./drive/MyDrive/Project/Photos/000000524297.jpg
Skipping file, already exists:  ./drive/MyDrive/Project/Photos/000000393227.jpg
Skipping file, already exists:  ./drive/MyDrive/Project/Photos/000000131084.jpg
Skipping file, already exists:  ./drive/MyDrive/Project/Photos/000000393230.jpg
Skipping file, already exists:  ./drive/MyDrive/Project/Photos/000000262161.jpg
Skipp

RuntimeError: ignored

In [5]:
zip_images(PHOTO_ZIPFILE_NAME, PATH_TO_STORE_DOWNLOADED_PHOTOS)