## Import statements

In [56]:
import concurrent.futures
import glob
import hashlib
import json
import multiprocessing
import os
import posixpath
import re
import shutil
import time
import urllib
import urllib.parse
import urllib.request
from pprint import pprint

import cv2
import flickrapi
import requests
import wget
from skimage import io
from tqdm.notebook import tqdm

## Helper functions

In [5]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    offsets = range(0, len(lst), n)
    yield from (lst[offset:offset + n] for offset in offsets)


def thread_it(thread_function, my_list, tq=True, WORKERS=None):
    """Call ``thread_function(item)`` for every item of ``my_list`` on a thread pool.

    Args:
        thread_function: Callable taking a single item.
        my_list: Items to process (must support ``len()`` when ``tq`` is truthy).
        tq: When truthy, show a tqdm progress bar that advances as tasks finish.
        WORKERS: Maximum number of threads; falls back to the CPU count when falsy.

    Returns:
        None. Return values of ``thread_function`` are discarded, and exceptions
        raised inside it are deliberately not propagated (best effort), so one
        failing item never aborts the rest of the batch.
    """
    # Set worker number to CPU count
    if not WORKERS:
        WORKERS = multiprocessing.cpu_count()

    # Keep the bar in its own name so ``tq=False`` never ends up being ``.close()``d.
    progress = tqdm(total=len(my_list)) if tq else None

    try:
        # One pool for the whole list: max_workers already caps concurrency, so
        # there is no need to rebuild an executor for every chunk of items.
        with concurrent.futures.ThreadPoolExecutor(max_workers=WORKERS) as executor:
            futures = [executor.submit(thread_function, item) for item in my_list]
            # Advance the bar when a task actually completes, not when it is queued.
            for _ in concurrent.futures.as_completed(futures):
                if progress is not None:
                    progress.update(1)
    finally:
        if progress is not None:
            progress.close()


def thread_it_return(thread_function, my_list, tq=True, WORKERS=None):
    """Run ``thread_function(item)`` concurrently and collect the non-None results.

    Args:
        thread_function: Callable taking a single item.
        my_list: Items to process (must support ``len()`` when ``tq`` is truthy).
        tq: When truthy, show a tqdm progress bar.
        WORKERS: Maximum number of threads; falls back to the CPU count when falsy.

    Returns:
        A list with every return value that is not ``None``, in the order of
        ``my_list``.

    Raises:
        Whatever ``thread_function`` raises; the exception surfaces when the
        corresponding result is collected.
    """
    # Set worker number to CPU count
    if not WORKERS:
        WORKERS = multiprocessing.cpu_count()

    # Keep the bar in its own name so ``tq=False`` never ends up being ``.close()``d.
    progress = tqdm(total=len(my_list)) if tq else None

    results = []
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=WORKERS) as executor:
            # Queue everything first; calling result() right after each submit()
            # would block on that one task and make the whole run sequential.
            futures = [executor.submit(thread_function, item) for item in my_list]
            # Collect in submission order so the output order matches my_list.
            for future in futures:
                return_value = future.result()
                if return_value is not None:
                    results.append(return_value)
                if progress is not None:
                    progress.update(1)
    finally:
        if progress is not None:
            progress.close()

    return results


def create_folder(path):
    """Create the directory ``path`` (including parents) if it does not exist yet.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(path, exist_ok=True)

## Link scraping (DuckDuckGo)

In [6]:
def extract_json(objs, exts):
    """Pick the usable image URLs out of a list of image-search result dicts.

    A result is kept when ``width * height`` exceeds 307200 pixels (640x480)
    and the text after the last "." of its "image" URL, lower-cased, is in
    ``exts``.

    Each result is read through the keys "width", "height" and "image". An
    example entry (which also carries "thumbnail", "url" and "title"):

        Width 3840, Height 2560
        Thumbnail https://tse1.mm.bing.net/th?id=OIF.BrhofaJg5Fx2yl9jrBBQLQ&pid=Api
        Url https://www.airantares.ro/cazare/in-Paris/Franta/beaugrenelle-eiffel-tour-3-stars-paris-franta/
        Title b'Beaugrenelle Tour Eiffel, Paris, Franta'
        Image https://i.travelapi.com/hotels/2000000/1070000/1063000/1062936/c5a49732.jpg

    Returns:
        The "image" URLs of the kept results, in input order.
    """
    return [
        entry["image"]
        for entry in objs
        if entry["width"] * entry["height"] > 307200
        and entry["image"].split(".")[-1].lower() in exts
    ]

def links_from_ddg(topic, max_images=None, exts=("jpg", "png", "bmp", "jpeg")):
    """Scrape image links for ``topic`` from DuckDuckGo image search.

    Args:
        topic: Search phrase.
        max_images: Maximum number of links to return; ``None`` means no limit
            (keep going until the result pages run out).
        exts: Accepted file extensions (lower-case, without the dot).

    Returns:
        A list of image URLs, or ``-1`` when the search token ("vqd") could not
        be parsed from the DuckDuckGo landing page. Callers must check for
        ``-1`` before iterating.
    """
    # How often one result page is retried before giving up on the scrape.
    max_retries = 5

    def limit_reached():
        return max_images is not None and len(link_list) >= max_images

    link_list = []

    url = 'https://duckduckgo.com/'
    params = {'q': topic}

    #   First make a request to above URL, and parse out the 'vqd'
    #   This is a special token, which should be used in the subsequent request
    res = requests.post(url, data=params)
    searchObj = re.search(r'vqd=([\d-]+)\&', res.text, re.M|re.I)

    if not searchObj:
        # Token parsing failed
        return -1

    headers = {
        'authority': 'duckduckgo.com',
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'sec-fetch-dest': 'empty',
        'x-requested-with': 'XMLHttpRequest',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'referer': 'https://duckduckgo.com/',
        'accept-language': 'en-US,en;q=0.9',
    }

    params = (
        ('l', 'us-en'),
        ('o', 'json'),
        ('q', topic),
        ('vqd', searchObj.group(1)),
        ('f', ',,,'),
        ('p', '1'),
        ('v7exp', 'a'),
    )

    requestUrl = url + "i.js"

    print("Scraping links from DuckDuckGo")
    tq = tqdm(total=max_images)
    try:
        # Stop before requesting another page once the cap has been reached.
        while not limit_reached():
            data = None
            for _ in range(max_retries):
                try:
                    res = requests.get(requestUrl, headers=headers, params=params)
                    data = json.loads(res.text)
                    break
                except ValueError:
                    # Response was not JSON - sleep and retry
                    time.sleep(5)

            if data is None:
                # Still no JSON after all retries: return what was collected
                # instead of looping forever.
                return link_list

            for link in extract_json(data.get("results", []), exts):
                if limit_reached():
                    return link_list
                link_list.append(link)
                tq.update(1)

            if "next" not in data:
                # No next page
                return link_list

            requestUrl = url + data["next"]

        return link_list
    finally:
        tq.close()

## Link scraping (Flickr)

In [7]:
def links_from_flickr(topic, max_images):
    """Scrape up to ``max_images`` image links for ``topic`` from Flickr.

    For every photo the first available size from ``SIZES`` is used; photos
    offering none of those sizes are skipped.

    Flickr size keys, for reference:
    - url_o: Original (4520 × 3229)
    - url_k: Large 2048 (2048 × 1463)
    - url_h: Large 1600 (1600 × 1143)
    - url_l: Large 1024 (1024 × 732)
    - url_c: Medium 800 (800 × 572)
    - url_z: Medium 640 (640 × 457)
    - url_m: Medium 500 (500 × 357)
    - url_n: Small 320 (320 × 229)
    - url_s: Small 240 (240 × 171)
    - url_t: Thumbnail (100 × 71)
    - url_q: Square 150 (150 × 150)
    - url_sq: Square 75 (75 × 75)

    Args:
        topic: Search text passed to ``flickr.walk``.
        max_images: Maximum number of links to return.

    Returns:
        A list of image URLs in the order the search yielded them.
    """
    # NOTE(review): API credentials are hard-coded in source; consider loading
    # them from the environment or a config file instead.
    KEY = '88a8660edd2e770b1b00e878af174879'
    SECRET = 'f3063c276e3ad859'

    SIZES = ["url_o", "url_k", "url_h", "url_l", "url_c"]  # in order of preference

    extras = ','.join(SIZES)
    flickr = flickrapi.FlickrAPI(KEY, SECRET)
    photos = flickr.walk(text=topic,  # it will search by image title and image tags
                            extras=extras,  # get the urls for each size we want
                            privacy_filter=1,  # search only for public photos
                            per_page=50,
                            sort='relevance')  # we want what we are looking for to appear first
    urls = []

    print("Scraping links from Flickr")
    tq = tqdm(total = max_images)
    try:
        if max_images > 0:
            for photo in photos:
                for size in SIZES:  # SIZES is already in order of preference
                    url = photo.get(size)
                    if url:  # if url is None try with the next size
                        urls.append(url)
                        tq.update(1)
                        break
                # Stop right after the last wanted link so the iterator is not
                # advanced (and no further results requested) needlessly.
                if len(urls) >= max_images:
                    break
    finally:
        tq.close()

    return urls

## Link scraping (Bing)

In [69]:
def _clean_bing_link(link, exts):
    """Return ``link`` without query/fragment if its file extension is in ``exts``, else None."""
    try:
        parts = urllib.parse.urlsplit(link)
    except ValueError:
        # Malformed URL - skip it (links are collected on a best-effort basis)
        return None
    filename = posixpath.basename(parts.path)
    if "." not in filename:
        return None
    if filename.rsplit(".", 1)[-1].lower() not in exts:
        return None
    # Rebuild from the parsed parts instead of cutting at the first ".ext"
    # substring, which could truncate inside the host name.
    return urllib.parse.urlunsplit((parts.scheme, parts.netloc, parts.path, "", ""))


def links_from_bing(topic, max_images, exts=("jpg", "png", "bmp", "jpeg"), adult="off", bing_filter="filterui:imagesize-custom_640_480"):
    """Scrape up to ``max_images`` unique image links for ``topic`` from Bing.

    Args:
        topic: Search phrase.
        max_images: Maximum number of links to return.
        exts: Accepted file extensions (lower-case, without the dot).
        adult: Value sent as the ``adlt`` query parameter.
        bing_filter: Value sent as the ``qft`` query parameter.

    Returns:
        A list of distinct image URLs with query string and fragment removed.
        It can be shorter than ``max_images`` when Bing stops returning links
        that have not been seen before.
    """
    links = []
    accepted = set()  # cleaned links already in ``links``
    seen_raw = set()  # every raw link Bing has returned so far
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0'}
    page_counter = 0

    print("Scraping links from Bing")
    tq = tqdm(total = max_images)
    try:
        while len(links) < max_images:
            # Parse the page source and collect the image links
            request_url = 'https://www.bing.com/images/async?q=' + urllib.parse.quote_plus(topic) \
                            + '&first=' + str(page_counter) + '&count=' + str(max_images) \
                            + '&adlt=' + adult + '&qft=' + bing_filter
            request = urllib.request.Request(request_url, None, headers=headers)
            with urllib.request.urlopen(request) as response:
                html = response.read().decode('utf8')
            found_links = re.findall('murl&quot;:&quot;(.*?)&quot;', html)

            # A page with nothing new means the results are exhausted; without
            # this check the loop would keep requesting pages forever.
            fresh = [link for link in found_links if link not in seen_raw]
            if not fresh:
                break
            seen_raw.update(fresh)

            for link in fresh:
                if len(links) >= max_images:
                    break
                cleaned = _clean_bing_link(link, exts)
                if cleaned is None or cleaned in accepted:
                    continue
                accepted.add(cleaned)
                links.append(cleaned)
                tq.update(1)

            # NOTE(review): ``first`` is treated as a result offset, so skip past
            # everything this page returned - confirm against Bing's behaviour.
            page_counter += len(found_links)
    finally:
        tq.close()
    return links


## Download Function

In [62]:
def thread_download(item):
    """Download one image into its target folder unless it is already there.

    Args:
        item: Dict with the keys "link" (image URL), "folder" (destination
            directory) and "service" ("ddg", "flickr" or "bing").

    The file is named ``image_<md5 of link>.<ext>``, where ``ext`` is the
    lower-cased text after the last "." of the link. An existing file is left
    untouched; an unknown service downloads nothing.

    Raises:
        requests.RequestException: for "ddg"/"flickr" on a connection error,
            timeout or HTTP error status, so that error pages are never
            written to disk as images.
    """
    link = item["link"]
    folder = item["folder"]
    service = item["service"]
    link_hash = str(hashlib.md5(link.encode("utf-8")).hexdigest())
    ext = link.split(".")[-1].lower()
    fname = "image_{}.{}".format(link_hash, ext)

    path = os.path.join(folder, fname)

    if os.path.isfile(path):
        return

    if service in ("ddg", "flickr"):
        # The timeout keeps a stalled host from blocking a worker thread forever.
        response = requests.get(link, allow_redirects=True, timeout=30)
        response.raise_for_status()
        # Context manager guarantees the handle is flushed and closed.
        with open(path, 'wb') as handle:
            handle.write(response.content)
    elif service == "bing":
        wget.download(link, path)

def download(links, folder, service="flickr"):
    """Download every URL in ``links`` into ``folder`` using worker threads.

    One job dict per link is built and handed to ``thread_download`` through
    ``thread_it``; ``service`` is forwarded unchanged in each job.
    """
    jobs = [
        {"link": url, "folder": folder, "service": service}
        for url in links
    ]
    thread_it(thread_download, jobs, WORKERS=None)

## Download definition

In [66]:
# Define task
# Output folders. The trailing "/" matters: later cells build glob patterns by
# plain string concatenation (e.g. compare_dir + "*").
compare_dir = "compare_set/"
data_dir = "images/"

# Make sure both folders exist before anything is downloaded into them.
dirs = [compare_dir, data_dir]
for path in dirs:
    create_folder(path)

# Logical CPU count; not referenced again in the visible cells.
CPUs = multiprocessing.cpu_count()
# Links requested per source for the data set. The data set can hold up to
# double this, since it is downloaded from both Bing and Flickr.
data_num = 300
# Links requested for the comparison set.
compare_num = 100
# Search phrase sent to every image source.
topic = "eiffel tower"

## Download the sets
#### Download the comparison set and count number of downloaded files 

In [None]:
# Download comparison set
exts = ["jpg", "jpeg"]
links = links_from_ddg(topic, max_images = compare_num, exts = exts)
if links == -1:
    # links_from_ddg returns -1 when it cannot parse the DuckDuckGo search
    # token; iterating over that in download() would raise a TypeError.
    print("Could not obtain a DuckDuckGo search token; no links scraped")
    links = []
download(links, compare_dir, service="ddg")
# Count everything directly inside the comparison folder (the pattern has no
# "**", so a recursive glob would make no difference).
file_num = len(glob.glob(compare_dir+"*"))
print("Downloaded {} images for the comparison set".format(file_num))

#### Download the data set and count number of downloaded files 

In [64]:
# Download data set from Flickr
links = links_from_flickr(topic, max_images=data_num)
download(links, data_dir, service="flickr")
# file_num counts every entry currently in data_dir, not only the files added
# by this cell; the Bing cell below reads it as its baseline.
file_num = len(glob.glob(data_dir+"*", recursive=True))
print("Downloaded {} images for the data set".format(file_num))

Downloaded 492 images for the data set


In [73]:
# Download data set from Bing
links = links_from_bing(topic, max_images=data_num)
download(links, data_dir, service="bing")
file_num_new = len(glob.glob(data_dir+"*", recursive=True))
# Report only the files added by this cell: file_num is the count left behind
# by the Flickr cell, so that cell has to be run first.
print("Downloaded {} images for the data set".format(file_num_new-file_num))

Scraping links from Flickr


HBox(children=(FloatProgress(value=0.0, max=15000.0), HTML(value='')))


[&#39;http://justinekibler.files.wordpress.com/2014/06/eiffel-tower-night.jpg&#39;, &#39;https://www.pixelstalk.net/wp-content/uploads/2015/05/eiffel-tower-night.jpg&#39;, &#39;http://creativetravelguide.com/wp-content/uploads/2016/11/eiffel-tower-header.jpg&#39;, &#39;http://www.reidsfrance.com/site/assets/files/5978/eiffel-tower-at-night.jpg&#39;, &#39;http://2.bp.blogspot.com/-8XPuDaEb4uY/UqsegNDSxfI/AAAAAAAB5NI/mv1On3rGARw/s1600/Eiffel+Tower+Landscape+Wallpaper.jpg&#39;, &#39;https://new7wonders.com/app/uploads/sites/3/2016/09/ET_night_ls_central2_V-1920x1440.jpg&#39;, &#39;http://upload.wikimedia.org/wikipedia/commons/e/e2/Eiffel_tower_at_night_WLM.JPG&#39;, &#39;https://travelwithsandra.files.wordpress.com/2013/03/14915042-the-eiffel-tower-and-trocadero-fountain-in-paris-france.jpg&#39;, &#39;http://3.bp.blogspot.com/-DDBsWZ1hOE0/UGqxiQ9m6DI/AAAAAAAAGpk/PJwiefRjc-s/s1600/Paris___Eiffel_Tower_by_tariyoko.jpg&#39;, &#39;http://momvoyage.hilton.com/wp-content/uploads/2014/06/DSCN19

## Post-Download Cleaning
#### Rename ".jpeg" files to ".jpg" and remove every file that is not a JPEG.

In [None]:
def img_rename(image):
    """Normalise one downloaded file to the ".jpg" extension.

    Files ending in ".jpeg" or ".jpg" (any letter case) are renamed to
    ``<name>.jpg`` when they are not already named that way; every other file
    is deleted.

    Args:
        image: Path of the file to process.
    """
    # splitext only splits at the final extension, so dots in directory names
    # (e.g. "./images/x.jpeg" or "v1.0/x.jpeg") no longer corrupt the path.
    stem, ext = os.path.splitext(image)
    if ext.lower() in (".jpg", ".jpeg"):
        target = stem + ".jpg"
        if target != image:
            shutil.move(image, target)
    else:
        os.remove(image)
        
def images_rename(path):
    """Apply ``img_rename`` to every file in ``path`` whose name contains a dot.

    ``path`` is used as a plain string prefix of the glob pattern, so it is
    expected to end with a path separator.
    """
    pattern = path + "*.*"
    thread_it(img_rename, glob.glob(pattern, recursive=True))

# Normalise file extensions in both folders; the verification and resolution
# cells below only glob for "*.jpg".
images_rename(compare_dir)
images_rename(data_dir)

#### Detect file corruption

In [None]:
def verify_thread(image):
    """Try to load ``image`` and report a failure.

    Returns:
        ``None`` when ``io.imread`` succeeds, otherwise the tuple
        ``(image, exception)`` describing what went wrong.
    """
    try:
        io.imread(image)
    except Exception as error:
        return (image, error)
    return None

def verify_images(path):
    """Check every ``*.jpg`` file under ``path`` and return the failures.

    Returns:
        The non-None results of ``verify_thread``, i.e. one
        ``(image, exception)`` tuple per file that could not be read.
    """
    jpg_files = glob.glob(path + "*.jpg", recursive=True)
    return thread_it_return(verify_thread, jpg_files)

# Each list holds (path, exception) tuples for the files that failed to load;
# the "Remove corrupt" cell below deletes them.
corrupt_compare = verify_images(compare_dir)
print("Compare directory:")
pprint(corrupt_compare)

corrupt_data = verify_images(data_dir)
print("Data directory:")
pprint(corrupt_data)

#### Remove corrupt

In [None]:
def img_remove_thread(item):
    """Delete the file referenced by ``item``.

    Args:
        item: Either a path (``str``, ``bytes`` or ``os.PathLike``) or a
            sequence whose first element is the path, such as the
            ``(path, exception)`` tuples produced by ``verify_thread``.
    """
    # Indexing a plain path string with [0] would yield its first character,
    # so paths must be used as they are.
    if isinstance(item, (str, bytes, os.PathLike)):
        target = item
    else:
        target = item[0]
    os.remove(target)

def remove_items(items):
    """Delete every entry of ``items`` by running ``img_remove_thread`` on worker threads."""
    thread_it(thread_function=img_remove_thread, my_list=items)
    
# Delete the files that the corruption check above reported as unreadable.
remove_items(corrupt_compare)
remove_items(corrupt_data)

#### Checking minimum image size

In [None]:
def res_check_thread(image, min_pixels=307200):
    """Report ``image`` when it is smaller than the minimum resolution.

    Args:
        image: Path of the image file.
        min_pixels: Minimum accepted ``width * height``; the default of
            307200 corresponds to 640*480.

    Returns:
        ``image`` when the file has fewer than ``min_pixels`` pixels or cannot
        be decoded at all, otherwise ``None``.
    """
    # Grayscale read yields a 2-D array, so shape unpacks to (height, width).
    img = cv2.imread(image, cv2.IMREAD_GRAYSCALE)

    # cv2.imread signals an unreadable file by returning None instead of
    # raising; flag such files rather than crashing on ``img.shape``.
    if img is None:
        return image

    # Get image height and width
    height, width = img.shape

    if width * height < min_pixels:
        return image
    return None


def get_under_res(path):
    """Return the ``*.jpg`` files directly under ``path`` that ``res_check_thread`` reports.

    ``path`` is used as a plain string prefix of the glob pattern.
    """
    candidates = glob.glob(path + "*.jpg")
    return thread_it_return(res_check_thread, candidates)

# Collect the images below the minimum resolution in each folder.
print("Checking resolution for images (Compare):")
compare_under_res = get_under_res(compare_dir)
pprint(compare_under_res)
print("Checking resolution for images (Data):")
data_under_res = get_under_res(data_dir)
pprint(data_under_res)

# NOTE(review): these lists hold plain path strings (res_check_thread returns
# the path itself), whereas the corrupt-file lists hold (path, exception)
# tuples - confirm that img_remove_thread handles both shapes.
print("Deleting found items:")
remove_items(compare_under_res)
remove_items(data_under_res)


# Ensure the images in the "compare" directory contain the subject and aren't blurred.

## Then proceed to the "Honours" notebook.