## Import statements

In [7]:
import re
import os
import cv2
import wget
import json
import glob
import pickle
import shutil
import urllib
import hashlib
import requests
import flickrapi
import imagehash
import posixpath
from PIL import Image
from skimage import io
import multiprocessing
from pprint import pprint
import concurrent.futures
from tqdm.notebook import tqdm

## Helper functions

In [8]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


def thread_it(thread_function, my_list, tq=True, WORKERS=None):
    # Set worker number to CPU count
    if not WORKERS:
        WORKERS = multiprocessing.cpu_count()
    
    if tq:
        tq = tqdm(total=len(my_list))
    
    # Separate into chunks and execute threaded
    thread_list = chunks(my_list, WORKERS)
    for chunk in thread_list:
        with concurrent.futures.ThreadPoolExecutor(max_workers=WORKERS) as executor:
            for item in chunk:
                executor.submit(thread_function, item)
                if tq:
                    tq.update(1)
    tq.close()


def thread_it_return(thread_function, my_list, tq=True, WORKERS=None):
    # Set worker number to CPU count
    if not WORKERS:
        WORKERS = multiprocessing.cpu_count()
    
    if tq:
        tq = tqdm(total=len(my_list))
        
    results = []
    # Separate into chunks and execute threaded
    thread_list = chunks(my_list, WORKERS)
    for chunk in thread_list:
        with concurrent.futures.ThreadPoolExecutor(max_workers=WORKERS) as executor:
            for item in chunk:
                future = executor.submit(thread_function, item)
                
                return_value = future.result()
                if return_value != None:
                    results.append(return_value)
                    
                if tq:
                    tq.update(1)
    
    tq.close()
    
    return results


def create_folder(path):
    if not os.path.isdir(path):
        os.makedirs(path)


def move_to(file_list, dest):
    tq = tqdm(total=len(file_list))
    exception_flag = False
    for item in file_list:
        try:
            shutil.move(item, dest)
        except Exception as e:
            print(e)
            exception_flag = True
        tq.update(1)
    tq.close()
    return exception_flag

## Link scraping (DuckDuckGo)

In [9]:
def extract_json(objs, exts):
    links = []
    for obj in objs:
        
        """
        print("Width {0}, Height {1}".format(obj["width"], obj["height"]))
        print("Thumbnail {0}".format(obj["thumbnail"]))
        print("Url {0}".format(obj["url"]))
        print("Title {0}".format(obj["title"].encode('utf-8')))
        print("Image {0}".format(obj["image"]))
        
        -- EXAMPLE OUTPUT --
        Width 3840, Height 2560
        Thumbnail https://tse1.mm.bing.net/th?id=OIF.BrhofaJg5Fx2yl9jrBBQLQ&pid=Api
        Url https://www.airantares.ro/cazare/in-Paris/Franta/beaugrenelle-eiffel-tour-3-stars-paris-franta/
        Title b'Beaugrenelle Tour Eiffel, Paris, Franta'
        Image https://i.travelapi.com/hotels/2000000/1070000/1063000/1062936/c5a49732.jpg
        """

        if (obj["width"] * obj["height"]) > 307200 and obj["image"].split(".")[-1].lower() in exts:
            links.append(obj["image"])

    return links


def links_from_ddg(topic, max_images=None, exts=["jpg", "png", "jpeg"]):
    link_list = []

    url = 'https://duckduckgo.com/' 
    params = {'q': topic} 

    #   First make a request to above URL, and parse out the 'vqd'
    #   This is a special token, which should be used in the subsequent request
    res = requests.post(url, data=params)
    searchObj = re.search(r'vqd=([\d-]+)\&', res.text, re.M|re.I) 

    if not searchObj:
        # Token parsing failed
        return -1 

    headers = {
        'authority': 'duckduckgo.com',
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'sec-fetch-dest': 'empty',
        'x-requested-with': 'XMLHttpRequest',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'referer': 'https://duckduckgo.com/',
        'accept-language': 'en-US,en;q=0.9',
    }

    params = (
        ('l', 'us-en'),
        ('o', 'json'),
        ('q', topic),
        ('vqd', searchObj.group(1)),
        ('f', ',,,'),
        ('p', '1'),
        ('v7exp', 'a'),
    )

    requestUrl = url + "i.js"

    print("Scraping links from DuckDuckGo")
    tq = tqdm(total=max_images)
    link_count = 0
    while True:
        while True:
            try:
                res = requests.get(requestUrl, headers=headers, params=params)
                data = json.loads(res.text)
                break
            except ValueError:
                # Hitting Url Failure - Sleep and Retry
                time.sleep(5)
                continue

        links = extract_json(data["results"], exts)
        for link in links:
            if max_images and link_count != max_images:
                link_list += [link]
                link_count += 1
                tq.update(1)
            else:
                return link_list
            

        if "next" not in data:
            # No next page
            return link_list

        requestUrl = url + data["next"]

## Link scraping (Flickr)

In [10]:
def links_from_flickr(topic, max_images):
    KEY = '88a8660edd2e770b1b00e878af174879'
    SECRET = 'f3063c276e3ad859'

    SIZES = ["url_o", "url_k", "url_h", "url_l", "url_c"]  # in order of preference

    """
    - url_o: Original (4520 × 3229)
    - url_k: Large 2048 (2048 × 1463)
    - url_h: Large 1600 (1600 × 1143)
    - url_l=: Large 1024 (1024 × 732)
    - url_c: Medium 800 (800 × 572)
    - url_z: Medium 640 (640 × 457)
    - url_m: Medium 500 (500 × 357)
    - url_n: Small 320 (320 × 229)
    - url_s: Small 240 (240 × 171)
    - url_t: Thumbnail (100 × 71)
    - url_q: Square 150 (150 × 150)
    - url_sq: Square 75 (75 × 75)
    """
    
    extras = ','.join(SIZES)
    flickr = flickrapi.FlickrAPI(KEY, SECRET)
    photos = flickr.walk(text=topic,  # it will search by image title and image tags
                            extras=extras,  # get the urls for each size we want
                            privacy_filter=1,  # search only for public photos
                            per_page=50,
                            sort='relevance')  # we want what we are looking for to appear first
    counter, urls = 0, []

    print("Scraping links from Flickr")
    tq = tqdm(total = max_images)
    for photo in photos:
        if counter < max_images:
            for i in range(len(SIZES)):  # makes sure the loop is done in the order we want
                url = photo.get(SIZES[i])
                if url:  # if url is None try with the next size
                    urls.append(url)
                    counter += 1
                    tq.update(1)
                    break
        else:
            break

    return urls

## Link scraping (Bing)

In [11]:
def links_from_bing(topic, max_images, exts=["jpg", "png", "jpeg"], adult="off", bing_filter="filterui:imagesize-custom_640_480"):
    links = []
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0'}
    image_counter = 0
    page_counter = 0

    tq = tqdm(total = max_images)
    while image_counter < max_images:
        # Parse the page source and download pics
        request_url = 'https://www.bing.com/images/async?q=' + urllib.parse.quote_plus(topic) \
                        + '&first=' + str(page_counter) + '&count=' + str(max_images) \
                        + '&adlt=' + adult + '&qft=' + bing_filter
        request = urllib.request.Request(request_url, None, headers=headers)
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf8')
        found_links = re.findall('murl&quot;:&quot;(.*?)&quot;', html)

        for link in found_links:
            if image_counter < max_images:
                try:
                    path = urllib.parse.urlsplit(link).path
                    filename = posixpath.basename(path).split('?')[0]
                    file_type = filename.split(".")[-1]
                    link = link[:link.index("."+file_type)]+"."+file_type
                    if file_type.lower() in exts:
                        links.append(link)
                        image_counter += 1
                        tq.update(1)
                except:
                    pass
            else:
                break

        page_counter += 1
    tq.close()
    return links


## Download Function

In [12]:
def thread_download(item):
    link = item["link"]
    folder = item["folder"]
    service = item["service"]
    link_hash = str(hashlib.md5(link.encode("utf-8")).hexdigest())
    ext = link.split(".")[-1].lower()
    fname = "image-{}.{}".format(link_hash, ext)
    
    path = os.path.join(folder, fname)

    if not os.path.isfile(path):
        myfile = None
        if service == "ddg":
            myfile = requests.get(link, allow_redirects=True, timeout=0.5)
            open(path, 'wb').write(myfile.content)
        elif service == "flickr":
            myfile = requests.get(link, stream=True, timeout=0.5)
            open(path, 'wb').write(myfile.content)
        elif service == "bing":
            myfile = requests.get(link, timeout=0.5)
            open(path, 'wb').write(myfile.content)
            #wget.download(link, path)

def download(links, folder, service="flickr"):
    items = []
    for link in links:
        items.append({"link": link, "folder": folder, "service": service})
    print("Downloading links from {} to {}".format(service, folder))
    thread_it(thread_download, items, WORKERS=None)

## Download definition

In [14]:
# Define task
data_dir = "images/"
blurry = "blurry/"
pickles = "pickles/"
duplicates = "duplicates/"
histogram_dir = "histogram_check/"
too_small = "too_small/"

dirs = [data_dir, blurry, pickles, duplicates, histogram_dir, too_small]
for path in dirs:
    create_folder(path)

CPUs = multiprocessing.cpu_count()
# It will be double, since downloading from both Bing and Flickr.
bing_data_num = 10000
flickr_data_num = 10000
ddg_data_num = 10000
topic = "notre dame cathedral aerial view"

## Download the sets
#### Download the data set and count number of downloaded files 

In [16]:
# Download data set from DuckDuckGo
links = links_from_ddg(topic, max_images = ddg_data_num)
download(links, data_dir, service="ddg")
file_num = len(glob.glob(data_dir+"*", recursive=True))
print("Currently have {} images for the data set".format(file_num))

Scraping links from DuckDuckGo


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))

Downloading links from ddg to images/


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=410.0), HTML(value='')))



Currently have 585 images for the data set


In [17]:
# Download data set from Flickr
links = links_from_flickr(topic, max_images=flickr_data_num)
download(links, data_dir, service="flickr")
file_num = len(glob.glob(data_dir+"*", recursive=True))
print("Currently have {} images for the data set".format(file_num))

Scraping links from Flickr


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))

Downloading links from flickr to images/


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=379.0), HTML(value='')))


Currently have 954 images for the data set


In [18]:
# Download data set from Bing
links = links_from_bing(topic, max_images=bing_data_num)
download(links, data_dir, service="bing")
file_num_new = len(glob.glob(data_dir+"*", recursive=True))
print("Currently have {} images for the data set".format(file_num))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))


Downloading links from bing to images/


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))


Currently have 954 images for the data set


## Post-Download Cleaning
#### Rename all ".jpeg" to ".jpg" file format.

In [22]:
def img_rename(image):
    fname = image.split(".")[-2]
    ext = image.split(".")[-1]
    if ext == "jpeg":
        shutil.move(image, fname+".jpg")
    elif ext == "jpg":
        return
    else:
        return
                
def images_rename(path):
    files = glob.glob(path+"*.*", recursive=True)
    thread_it(img_rename, files)

images_rename(data_dir)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2871.0), HTML(value='')))




#### Detect file corruption

In [23]:
# Check the JPEG is valid, if not remove.
!jpeginfo -cd $data_dir/*.jpg;
# Check the PNG is valid, if not remove.
!pngcheck -fq $data_dir/*.png | cut -d " " -f2 | xargs rm;

#Convert all formats to JPEG.
!mogrify -format jpg $data_dir/*.png;

# Remove converted files.
!rm $data_dir/*.png

images//10190279@N06_1027171096.jpg 1024 x 685  24bit JFIF  N  424665  [OK]
images//10190279@N06_1027171096.rd.jpg 1024 x 685  24bit JFIF  N  405541  [OK]
images//10699036@N08_2183236005.jpg 2688 x 2016 24bit Exif  N  807254  [OK]
images//10699036@N08_2183236005.rd.jpg 2688 x 2016 24bit JFIF  N 2208442  [OK]
images//10699036@N08_2183236021.jpg 2016 x 2688 24bit Exif  N  799842  [OK]
images//10699036@N08_2183236021.rd.jpg 2016 x 2688 24bit JFIF  N 3100724  [OK]
images//11186165@N07_2297563319.jpg  681 x 1024 24bit JFIF  N  445203  [OK]
images//11186165@N07_2297563319.rd.jpg  681 x 1024 24bit JFIF  N  454698  [OK]
images//12407269@N07_1291377840.jpg 2592 x 1728 24bit Exif  N 2018684  [OK]
images//12407269@N07_1291377840.rd.jpg 2592 x 1728 24bit JFIF  N 2070450  [OK]
images//13012027@N04_2073398235.jpg 1024 x 768  24bit JFIF  N  411905  [OK]
images//13012027@N04_2073398235.rd.jpg 1024 x 768  24bit JFIF  N  442764  [OK]
images//13012027@N04_2073400087.jpg  768 x 1024 24bit JFIF  N  413663 

#### Checking minimum image size

In [24]:
def res_check_thread(image):
    img = cv2.imread(image, cv2.IMREAD_GRAYSCALE)

    # Get image height and width
    #height, width, channels = img.shape
    height, width = img.shape

    # Count maximum resolution
    val = width * height
    
    # Show warning if problem
    # 640*480
    if val < 307200:
        return image


def get_under_res(path):
    files = glob.glob(path+"*.jpg")
    under_res = thread_it_return(res_check_thread, files)
    
    return under_res

print("Checking resolution for images (Data):")
data_under_res = get_under_res(data_dir)
pprint(data_under_res)

print("Moving found items")
if not move_to(data_under_res, too_small):
    del(data_under_res)

Checking resolution for images (Data):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2850.0), HTML(value='')))


['images/image-669701a81a6da604006960a67116f37a.jpg',
 'images/allez_asnl_2087428562.rd.jpg',
 'images/27101262@N00_7536391.rd.jpg',
 'images/paris_notredame_000733.jpg',
 'images/allez_asnl_2087428562.jpg',
 'images/omaromar_11726909.rd.jpg',
 'images/omaromar_11726909.jpg',
 'images/jswieringa_125983387.jpg',
 'images/allez_asnl_2087428412.jpg',
 'images/dottieday_508652908.rd.jpg',
 'images/image-b0ac5172136c093b79037423c9968406.jpg',
 'images/omaromar_11727516.jpg',
 'images/omaromar_11726904.jpg',
 'images/nikkole_200429059.rd.jpg',
 'images/acme_11938410.rd.jpg',
 'images/acme_11938410.jpg',
 'images/dottieday_508652908.jpg',
 'images/omaromar_11727407.rd.jpg',
 'images/joaomak_98830675.rd.jpg',
 'images/image-19cfc8c7a97d4669f62f53b54fe47a2a.jpg',
 'images/image-94d1b23d682982691efca1aca137e05e.jpg',
 'images/omaromar_11727516.rd.jpg',
 'images/acme_225281450.rd.jpg',
 'images/unorthodoxy_2308962348.jpg',
 'images/omaromar_11726904.rd.jpg',
 'images/image-7d0abde0dcfb54e39caf33

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=42.0), HTML(value='')))




#### Get the images in the sets that are more blurry than a given threshold

In [25]:
def too_blurry_thread(item):
    image = item["image"]
    threshold = item["threshold"]
    img = cv2.imread(image, cv2.IMREAD_GRAYSCALE)
    val = cv2.Laplacian(img, cv2.CV_64F).var()
    if val < threshold:
        return image
                
def get_too_blurry(path, threshold):
    files = glob.glob(path+"*.jpg")
    items = []
    for image in files:
        items.append({"image": image, "threshold": threshold})
    too_blurry = thread_it_return(too_blurry_thread, items)
    
    print("{} out of {} images are blurry".format(len(too_blurry), len(files)))
    return too_blurry

#https://www.pyimagesearch.com/2015/09/07/blur-detection-with-opencv/
# A constant of 200, it does pretty good!
too_blurry_data = get_too_blurry(data_dir, 125)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2808.0), HTML(value='')))


396 out of 2808 images are blurry


#### Move those images to blurry folder

In [26]:
if not move_to(too_blurry_data, blurry):
    del(too_blurry_data)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=396.0), HTML(value='')))




#### Hash images to check for duplicates

In [27]:
def hash_thread(item):
    fname = item["image"]
    hashes = item["hashes"]
    if fname not in hashes.keys():
        hashes[fname] = imagehash.dhash(Image.open(fname))

def compute_img_hashes(path):
    items = []
    files = glob.glob(path+"*.jpg", recursive=True)
    hash_file = pickles+path[:-1]+"_hashes.pickle"
    if os.path.isfile(hash_file):
        hashes = pickle.load(open(hash_file, "rb"))
    else:
        hashes = {}
    for image in files:
        items.append({"image": image, "hashes": hashes})
    
    thread_it(hash_thread, items)
    pickle.dump(hashes, open(hash_file, "wb"))

compute_img_hashes(data_dir)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2412.0), HTML(value='')))




#### Compute hash distances for each image to each image

In [28]:
def hash_distance_thread(item):
    image = item["image"]
    hashes = item["hashes"]
    distances = item["distances"]
    hash1 = hashes[image]

    for hash2name in hashes.keys():
        # If not same image, and not already done for that image
        if hash2name != image and hash2name not in distances[image].keys():
            hash2 = hashes[hash2name]
            distances[image][hash2name] = hash1 - hash2
            
def compute_hash_distance(path):
    hashes_path = pickles+path[:-1]+"_hashes.pickle"
    distances_path = pickles+path[:-1]+"_distances.pickle"
    files = glob.glob(path+"*.jpg", recursive=True)
    if os.path.isfile(hashes_path):
        hashes = pickle.load(open(hashes_path, "rb"))
    else:
        print("No hash file detected, cannot continue.")
        return

    if os.path.isfile(distances_path):
        distances = pickle.load(open(distances_path, "rb"))
    else:
        distances = {}
    
    for image in files:
        distances[image] = {}

    items = []
    for image in files:
        items.append({"image": image, "hashes": hashes, "distances": distances})

    thread_it(hash_distance_thread, items)
    pickle.dump(distances, open(distances_path, "wb"))

compute_hash_distance(data_dir)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2412.0), HTML(value='')))




#### Check for duplicates

In [11]:
def get_duplicate_images(path, threshold=10):
    distances_path = pickles+path[:-1]+"_distances.pickle"
    if os.path.isfile(distances_path):
        distances = pickle.load(open(distances_path, "rb"))
    else:
        print("No distances file detected, cannot continue.")
        return

    files = glob.glob(path+"*.jpg", recursive=True)
    dup, close = [], []
    
    for path in files:
        distances_item = distances[path]
        img = path.split(".")[-2]+".jpg"
        
        for key, val in distances_item.items():
            if val == 0:
                if key not in dup and path not in dup:
                    dup.append(key)
            elif val < threshold:
                if key not in close and path not in close:
                    close.append(key)
                
    return dup, close

data_dup, data_close = get_duplicate_images(data_dir, threshold=5)

print("Data duplicates:")
pprint(data_dup)
print("Data close:")
pprint(data_close)

print("Length of data duplicates: ", len(data_dup))
print("Length of data close: ", len(data_close))

show = False
if show:
    print("DUPLICATES IN DATA:")
    for item in data_dup:
        show_img_by_path(item)
    print("CLOSE IN DATA:")
    for item in data_close:
        show_img_by_path(item)

Data duplicates:
['images/jaumemeneses_1808511833.rd.jpg',
 'images/alex1961_2465547309.jpg',
 'images/celesteh_102619569.jpg',
 'images/davebowman_407843450.jpg',
 'images/paris_notredame_000696.jpg',
 'images/derickleony_2196179420.rd.jpg',
 'images/hotels-paris-rive-gauche_272241909.jpg',
 'images/blahman_447239432.rd.jpg',
 'images/cfuga_1434885957.jpg',
 'images/eugeniayjulian_21023507.rd.jpg',
 'images/paigerphotography_1882636700.rd.jpg',
 'images/malias_1215792909.jpg',
 'images/eugeniayjulian_21019693.rd.jpg',
 'images/mpd01605_2536747839.jpg',
 'images/10190279@N06_1027171096.jpg',
 'images/haydn_173471993.jpg',
 'images/malias_55125824.jpg',
 'images/aoifecahill_379216157.rd.jpg',
 'images/eugeniayjulian_21023151.rd.jpg',
 'images/achtundsiebzig_196444302.rd.jpg',
 'images/mpd01605_2535288114.rd.jpg',
 'images/79109252@N00_2337647384.jpg',
 'images/8250661@N08_514024275.jpg',
 'images/alex1961_2471587281.rd.jpg',
 'images/85428086@N00_231121544.rd.jpg',
 'images/jjvaca_24216

#### Move duplicates

In [12]:
if not move_to(data_dup, duplicates):
    del(data_dup)
if not move_to(data_close, duplicates):
    del(data_dup)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=179.0), HTML(value='')))

[Errno 2] No such file or directory: 'images/jaumemeneses_1808511833.rd.jpg'
[Errno 2] No such file or directory: 'images/alex1961_2465547309.jpg'
[Errno 2] No such file or directory: 'images/celesteh_102619569.jpg'
[Errno 2] No such file or directory: 'images/davebowman_407843450.jpg'
[Errno 2] No such file or directory: 'images/paris_notredame_000696.jpg'
[Errno 2] No such file or directory: 'images/derickleony_2196179420.rd.jpg'
[Errno 2] No such file or directory: 'images/hotels-paris-rive-gauche_272241909.jpg'
[Errno 2] No such file or directory: 'images/blahman_447239432.rd.jpg'
[Errno 2] No such file or directory: 'images/cfuga_1434885957.jpg'
[Errno 2] No such file or directory: 'images/eugeniayjulian_21023507.rd.jpg'
[Errno 2] No such file or directory: 'images/paigerphotography_1882636700.rd.jpg'
[Errno 2] No such file or directory: 'images/malias_1215792909.jpg'
[Errno 2] No such file or directory: 'images/eugeniayjulian_21019693.rd.jpg'
[Errno 2] No such file or directory: 

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=320.0), HTML(value='')))

[Errno 2] No such file or directory: 'images/kenner116_877622220.jpg'
[Errno 2] No such file or directory: 'images/nickstenning_2127919630.jpg'
[Errno 2] No such file or directory: 'images/gcourbis_404724391.rd.jpg'
[Errno 2] No such file or directory: 'images/paris_notredame_000053.jpg'
[Errno 2] No such file or directory: 'images/mpd01605_2534315485.jpg'
[Errno 2] No such file or directory: 'images/julioenriquez_464095482.rd.jpg'
[Errno 2] No such file or directory: 'images/alex1961_2465544291.jpg'
[Errno 2] No such file or directory: 'images/77163391@N00_428510490.rd.jpg'
[Errno 2] No such file or directory: 'images/dankamminga_5084254.jpg'
[Errno 2] No such file or directory: 'images/bliksemtobey_1491802280.rd.jpg'
[Errno 2] No such file or directory: 'images/maveric2003_2131647620.jpg'
[Errno 2] No such file or directory: 'images/hotels-paris-rive-gauche_272241909.rd.jpg'
[Errno 2] No such file or directory: 'images/hotels-paris-rive-gauche_313842413.rd.jpg'
[Errno 2] No such file

### Histogram check for unrealistic colors

In [31]:
histSize = 256
histRange = (0, 256) # the upper boundary i
accumulate = False

def histogram_thread(fi):
    try:
        image = cv2.imread(fi)
        bgr_planes = cv2.split(image)
        b_hist = cv2.calcHist(bgr_planes, [0], None, [histSize], histRange, accumulate=accumulate)
        g_hist = cv2.calcHist(bgr_planes, [1], None, [histSize], histRange, accumulate=accumulate)
        r_hist = cv2.calcHist(bgr_planes, [2], None, [histSize], histRange, accumulate=accumulate)
        new = []
        for b, g, r in zip(b_hist, g_hist, r_hist):
            total = b + g + r     
            new.append(total/3)

        num_differing = 0

        for b, g, r, total in zip(b_hist, g_hist, r_hist, new):
            b_per = b / total
            g_per = g / total
            r_per = r / total
            diff = False
            if b_per > 1.15 or b_per < 0.85:
                diff = True
            if g_per > 1.15 or g_per < 0.85:
                diff = True
            if r_per > 1.15 or r_per < 0.85:
                diff = True
            if diff:
                num_differing += 1
        
        return (fi, num_differing)


    except Exception as e:
        print(fi, e)

In [None]:
data_files = glob.glob(data_dir+"*.jpg")

num_differing_data = thread_it_return(histogram_thread, data_files)

# 200 seems to be a good constant
data_to_move = []
data_moved = 0
for fi, num_differing in num_differing_data:
    if num_differing < 200:
        data_to_move.append(fi)
        data_moved += 1
        print(fi, num_differing, "moved")



In [None]:
if not move_to(compare_to_move, histogram_dir):
    del(compare_to_move)
if not move_to(data_to_move, histogram_dir):
    del(data_to_move)

print("Moved {} out of {} in compare".format(compare_moved, len(compare_files)))
print("Moved {} out of {} in data".format(data_moved, len(data_files)))

### Compute image clusters using deep learning means

In [15]:
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
from sklearn.cluster import KMeans, Birch
from tqdm import tqdm
import numpy as np
import tensorflow as tf
import glob
import os
from shutil import copyfile, rmtree
from concurrent.futures import ThreadPoolExecutor

n_clusters = 50

gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
	tf.config.experimental.set_memory_growth(gpu, True)

model = VGG16(weights='imagenet', include_top=False)
#model.summary()

rmtree("output")
os.mkdir("output")
for i in range(n_clusters):
	os.mkdir("output/"+str(i))

vgg16_feature_list, features = [], {}

if os.path.isfile("features.pkl"):
        features = pickle.load(open("features.pkl", "rb"))

NPROC = 4
images = glob.glob(data_dir+"/*")
for img_path in tqdm(images):
        if img_path not in features:
                img = image.load_img(img_path, target_size=(224, 224))
                img_data = image.img_to_array(img)
                img_data = np.expand_dims(img_data, axis=0)
                img_data = preprocess_input(img_data)
                vgg16_feature = model.predict(img_data)
                vgg16_feature_np = np.array(vgg16_feature)
                data = vgg16_feature_np.flatten()
                features[img_path] = data
                vgg16_feature_list.append(data)
        else:
                vgg16_feature_list.append(features[img_path])

pickle.dump(features, open("features.pkl", "wb"))

vgg16_feature_list_np = np.array(vgg16_feature_list)
#kmeans = KMeans(n_clusters=None, random_state=0, verbose=1)
#kmeans = kmeans.fit(vgg16_feature_list_np)
#threshold; float, default=0.5
#birch = Birch(n_clusters=None, threshold=1.7)
birch = Birch(n_clusters=n_clusters)
birch = birch.fit(vgg16_feature_list_np)

del(vgg16_feature_list_np)

for item in tqdm(zip(images, vgg16_feature_list), total=len(images)):
	img_path, feature = item
	pred = birch.predict(feature.reshape(1, -1))[0]
	fname = img_path.split("/")[-1]
	if not os.path.exists("output/"+str(pred)):
		os.mkdir("output/"+str(pred))
	copyfile(img_path, "output/"+str(pred)+"/"+fname)

del(vgg16_feature_list)

100%|██████████| 1913/1913 [00:00<00:00, 312802.76it/s]
 62%|██████▏   | 1190/1913 [01:40<01:01, 11.82it/s]


KeyboardInterrupt: 

In [38]:
import matplotlib.pyplot as plt

directory = os.listdir(data_dir)
for each in directory:
    plt.figure()
    currentFolder = data_dir + each
    for i, file in enumerate(os.listdir(currentFolder)[0:5]):
        fullpath = main_folder+ "/" + file
        print(fullpath)
        img=mpimg.imread(fullpath)
        plt.subplot(2, 3, i)
        plt.imshow(img)

SyntaxError: invalid syntax (<ipython-input-38-f170ef7bce6c>, line 1)

# Build dataset from the clusters

## Then proceed, to "Honours" notebook.