In [None]:
# default_exp imgtools

# imgtools

> Functions for handling images

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from suptools import *
from multiprocessing.pool import ThreadPool
from functools import partial
import pathlib
import urllib
import tqdm
import PIL
import warnings
import uuid
import requests

## References  
The `download_images` and `verify_images` functions are based on the sources below. Please visit the linked pages for further details.
- fastai library by Jeremy Howard and contributors: [Building your own dataset documentation](https://docs.fast.ai/vision.data.html#Building-your-own-dataset)
- Blog post by Mark Needham: [Parallel download files using requests](https://markhneedham.com/blog/2018/07/15/python-parallel-download-files-requests/)

## Download

In [None]:
#export
def download_image(dest, url, timeout=5):
    """
    Downloads an image from url into the destination directory.

    The original suffix is taken from the url path and defaults to '.jpg' if
    the path has none. The file gets a randomly generated (uuid4) name to
    avoid name clashes.

    Args:
        dest: destination directory (pathlib.Path or str), expected to exist.
        url: address of the image to download.
        timeout: seconds passed to `requests.get` as its timeout.

    Returns:
        A status string starting with "Success: " or "Failed: ". Download
        errors are reported through this string rather than raised, so one
        bad url does not abort a batch download.
    """
    # Imported explicitly: a bare `import urllib` does not guarantee that the
    # `urllib.parse` submodule has been loaded.
    from urllib.parse import urlparse

    suffix = pathlib.Path(urlparse(url).path).suffix or '.jpg'
    local_file = pathlib.Path(dest)/f'{uuid.uuid4()}{suffix}'
    # Browser-like User-Agent; see the HTTP 406 links that follow this cell.
    user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0'
    try:
        # `with` closes the streamed response on every exit path.
        with requests.get(url, stream=True, timeout=timeout,
                          headers={"User-Agent": user_agent}) as r:
            if r.status_code != 200:
                return f"Failed: Can't download {url}"
            with open(local_file, 'wb') as f:
                for chunk in r:
                    f.write(chunk)
        return f"Success: {local_file}. Downloaded {url}"
    except Exception as e:
        # Do not leave a truncated file behind when the transfer breaks midway.
        try:
            local_file.unlink()
        except OSError:
            pass
        return f"Failed: Can't download {url}. Exception: {str(e)}"

Additional links on user agents to handle HTTP 406:
- [Python Requests HTTP Response 406](https://stackoverflow.com/questions/56101612/python-requests-http-response-406)
- [Detect your user agent by whatismybrowser.com](https://www.whatismybrowser.com/detect/what-is-my-user-agent)

In [None]:
#export
def download_images(url_file_path, dest, n_threads=4, timeout=5):
    """
    Downloads images from a text file that lists one url per line.

    Downloads run concurrently on a pool of `n_threads` worker threads
    (`multiprocessing.pool.ThreadPool`) and progress is shown with tqdm.

    Args:
        url_file_path: path of the text file containing the urls.
        dest: destination directory; created (with parents) if missing.
        n_threads: number of worker threads.
        timeout: per-request timeout forwarded to `download_image`.

    Returns:
        List with one status string per url, in completion order.
    """
    # Blank lines are skipped so that an empty file yields no downloads
    # instead of one bogus request for ''.
    lines = pathlib.Path(url_file_path).read_text().splitlines()
    urls = [line.strip() for line in lines if line.strip()]
    dest = pathlib.Path(dest)
    dest.mkdir(exist_ok=True, parents=True)
    fetch = partial(download_image, dest, timeout=timeout)
    # The context manager shuts the worker threads down once all results
    # have been collected.
    with ThreadPool(n_threads) as pool:
        return list(tqdm.tqdm_notebook(pool.imap_unordered(fetch, urls), total=len(urls)))

## Verify

In [None]:
#export
def verify_image_tf(img_file):
    """
    Verifies if an image can be opened by tf.image module
    tf.image currently supports only BMP, JPEG, GIF and PNG.
    Function attempts to convert non-supported formats to JPEG
    (saved next to the original as '<name>.jpg'; the original is removed).
    If the image cannot be opened or converted, it is deleted.

    Args:
        img_file: path of the image (pathlib.Path or str).

    Raises:
        ImportError: if tensorflow or Pillow is not installed. This is raised
            on purpose instead of being handled like a broken image, so a
            missing dependency never deletes files.
    """
    # Imported before the `try` so that a missing dependency surfaces as an
    # ImportError rather than as "image is broken -> delete it".
    import tensorflow as tf
    # `import PIL` alone does not guarantee that the Image submodule is loaded.
    from PIL import Image

    img_file = pathlib.Path(img_file)
    converted = None
    try:
        # NOTE: this changes the process-wide warning filters.
        warnings.simplefilter("ignore")
        suffix = img_file.suffix
        if suffix.lower() not in ['.bmp', '.jpeg', '.jpg', '.gif', '.png']:
            converted = img_file.parent/f'{img_file.name+".jpg"}'
            # JPEG cannot store alpha/palette modes, so normalise to RGB first;
            # `with` closes the source file handle.
            with Image.open(img_file) as src:
                src.convert("RGB").save(converted, "JPEG")
            img_file.unlink()
            print(f"File converted: {converted}")
            img_file = converted
        img = tf.io.read_file(str(img_file))
        img = tf.image.decode_image(img, channels=3, expand_animations=False)
    except Exception as e:
        print(f'Failed: Deleting {str(img_file)}: {e}')
        # Remove the image and any partly written conversion target; either
        # may already be gone.
        for leftover in (img_file, converted):
            if leftover is None:
                continue
            try:
                leftover.unlink()
            except FileNotFoundError:
                pass

In [None]:
#export
def verify_images(file_path, n_threads=4, recurse=False):
    """
    Verifies images from file path.

    Every file returned by `get_all_files(file_path, recurse=recurse)` is
    passed to `verify_image_tf`. Verification runs concurrently on a pool of
    `n_threads` worker threads (`multiprocessing.pool.ThreadPool`) and
    progress is shown with tqdm.

    Args:
        file_path: directory to scan for images.
        n_threads: number of worker threads.
        recurse: forwarded to `get_all_files`.
    """
    # list() so that len() works even if the helper returns a lazy iterable.
    files = list(get_all_files(file_path, recurse=recurse))
    # The context manager shuts the worker threads down when done.
    with ThreadPool(n_threads) as pool:
        # The loop only drains the iterator: this drives the progress bar and
        # re-raises any exception from a worker.
        for _ in tqdm.tqdm_notebook(pool.imap_unordered(verify_image_tf, files), total=len(files)):
            pass

In [None]:
#hide
# Runs nbdev's notebook2script(); the recorded output of this cell lists the
# notebooks that were converted.
from nbdev.export import *
notebook2script()

Converted 00_core.ipynb.
Converted 01_imgtools.ipynb.
Converted 02_tftools.ipynb.
Converted index.ipynb.
