<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [64]:
import requests
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import hashlib
from PIL import Image
import io, os
from tqdm.notebook import tqdm
from pathlib import Path
import numpy as np
from collections import Counter
from shutil import copy

# TARGET_PATH = './images'
TRAIN_DIR = 'data/train/'
PARSE_DIR = './images'

def check_duplicates(url, search_term, target_path=PARSE_DIR):
    # try:
    image_content = requests.get(url).content
    target_folder = os.path.join(target_path, '_'.join(search_term.lower().split(' ')))
    file_path = os.path.join(target_folder, hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')

    if os.path.exists(file_path):
        print(f"{file_path} already exist, continue")
        return False
    return True

def fetch_img_urls(query:str,
                   max_links_to_fetch:int,
                   wd:webdriver,
                   sleep_beetween_interactions:int=1,
                   random_images=False,
                  ):
    
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_beetween_interactions)

        # except Exception:
        #     print(f"Something wrong, url - {url}")
        #     return False
        
    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))
    
    image_urls = set()
    # last_n_images = 0
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)
        
        # get all image thumbnail results
        thumbnail_results = wd.find_elements(By.CSS_SELECTOR, "img.Q4LuWd")
        number_results = len(thumbnail_results)
        
        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")
        
        
        
        if random_images:
            # thumbnail_results = np.random.choice(thumbnail_results[results_start:number_results], 10)
            thumbnail_results = np.random.choice(thumbnail_results[results_start:number_results], number_results)
        t = tqdm(total=max_links_to_fetch)    
        for img in thumbnail_results:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_beetween_interactions)
            except Exception:
                continue
            
            # extract image urls    
            actual_images = wd.find_elements(By.CSS_SELECTOR, 'img.n3VNCb')
            for actual_image in actual_images:
                src = actual_image.get_attribute('src')

                # print('http' in src, check_duplicates(src))
                # if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):
                if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):
                    # print('valid link')
                    if check_duplicates(src, query):
                        prev_urls_len = len(image_urls)
                        image_urls.add(actual_image.get_attribute('src'))
                        t.update(len(image_urls) - prev_urls_len)
            image_count = len(image_urls)
            
            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                
                # t.update()
                t.close()
                break
                
            else:
                print(f"Found: {len(image_urls)} image links of \"{query}\", looking for more")
                
                # time.sleep(np.random.uniform(3.2,12.4))
                time.sleep(30)
                # return
                load_more_button = wd.find_element(By.CSS_SELECTOR, ".mye4qd")
                if load_more_button:
                    wd.execute_script("document.querySelector('.mye4qd').click();")
            
            
            results_start = len(thumbnail_results)
        return image_urls
    
    
def persist_image(folder_path:str, urls):

    for url in urls:
        try:
            image_content = requests.get(url).content
        except Exception as e:
            print(f"ERROR - Could not download {url} - {e}")

        try:
            image_file = io.BytesIO(image_content)
            image = Image.open(image_file).convert('RGB')
            file_path = os.path.join(folder_path, hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')

            # if os.path.exists(file_path):
            #     print(f"{file_path} already exist")
            #     continue

            with open(file_path, 'wb') as f:
                image.save(f, "JPEG", quality=85)
            print(f"SUCCESS -saved {url} - as {file_path}")
        except Exception as e:
            print(f"ERROR - Could not save {url} - {e}")

def search_and_download(search_term:str, 
                        driver_path='chromedriver',
                        target_path=PARSE_DIR,
                        number_images=10,
                        click_random_img=False
                       ):
    
    target_folder = os.path.join(target_path, '_'.join(search_term.lower().split(' ')))
    
    if not os.path.exists(target_folder):
        os.makedirs(target_folder)
                           
    
    with webdriver.Chrome(executable_path=driver_path) as wd:

        res = fetch_img_urls(search_term,
                             number_images,
                             wd=wd,
                             sleep_beetween_interactions=0.5,
                             random_images=click_random_img
                            )

    # persist_image(res)
    
    persist_image(target_folder, res)
    # for elem in res:
    #     persist_image(target_folder, elem)

In [65]:
n_img_threshold = 50 # minimum of needed images in dataset

if os.path.exists(TRAIN_DIR) and os.path.exists(PARSE_DIR):
    print('train and parse dirs exists.')

img_counter = Counter([p.parent.name for p in Path(TRAIN_DIR).glob('**/*.jpg')])
download_query_dct = {p.replace('_',' ') + ' simpsons':n_img_threshold - c for p, c in img_counter.items() if c < n_img_threshold}

download_query_dct

train and parse dirs exists.


{'lionel hutz simpsons': 8,
 'gil simpsons': 17,
 'troy mcclure simpsons': 12,
 'miss hoover simpsons': 3,
 'disco stu simpsons': 1}

In [66]:
for person, n_images in download_query_dct.items():
    # print(f"loop n_images = {n_images}")
    search_and_download(person, number_images=n_images, click_random_img=True)

Found: 100 search results. Extracting links from 0:100


  0%|          | 0/8 [00:00<?, ?it/s]

./images/lionel_hutz_simpsons/3a8692a565.jpg already exist, continue
Found: 0 image links of "lionel hutz simpsons", looking for more
./images/lionel_hutz_simpsons/722386c56d.jpg already exist, continue
Found: 0 image links of "lionel hutz simpsons", looking for more
Found: 3 image links of "lionel hutz simpsons", looking for more
Found: 5 image links of "lionel hutz simpsons", looking for more
./images/lionel_hutz_simpsons/ee2d71a5a4.jpg already exist, continue
Found: 7 image links of "lionel hutz simpsons", looking for more
Found: 7 image links of "lionel hutz simpsons", looking for more
Found: 10 image links, done!
SUCCESS -saved https://www.animationconnection.com/assets/artwork/1491918113-884-4424-lionel-hutz.jpg - as ./images/lionel_hutz_simpsons/0c4b47d038.jpg
SUCCESS -saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSq_cxnkotScmliSRi8yk68j_OWo6guZzixpw&usqp=CAU - as ./images/lionel_hutz_simpsons/3e8fde27eb.jpg
SUCCESS -saved https://ih1.redbubble.net/image.1630158977

  0%|          | 0/17 [00:00<?, ?it/s]

Found: 3 image links of "gil simpsons", looking for more
Found: 6 image links of "gil simpsons", looking for more
Found: 9 image links of "gil simpsons", looking for more
Found: 12 image links of "gil simpsons", looking for more
Found: 15 image links of "gil simpsons", looking for more
Found: 15 image links of "gil simpsons", looking for more
Found: 17 image links, done!
SUCCESS -saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSR92M-tm6c9GjV7MsURDfP2k8Z8pgcgE9qAg&usqp=CAU - as ./images/gil_simpsons/5c3e193321.jpg
SUCCESS -saved https://assets.mycast.io/actor_images/actor-gil-gunderson-318853_large.jpg?1638941848 - as ./images/gil_simpsons/dca3d7db99.jpg
SUCCESS -saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRoHJNFHfSiv3AyLQfrkY2vIO7-8Jvg7MwQ2w&usqp=CAU - as ./images/gil_simpsons/dd41e31edf.jpg
SUCCESS -saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRi3yuIiN1L3LRZbFkNIWDLpVHDszarq_bH4Q&usqp=CAU - as ./images/gil_simpsons/8abae90831.jpg
SUCCESS -sav

  0%|          | 0/12 [00:00<?, ?it/s]

./images/troy_mcclure_simpsons/447777b385.jpg already exist, continue
Found: 0 image links of "troy mcclure simpsons", looking for more
./images/troy_mcclure_simpsons/3ae2a942a1.jpg already exist, continue
Found: 1 image links of "troy mcclure simpsons", looking for more
Found: 4 image links of "troy mcclure simpsons", looking for more
Found: 7 image links of "troy mcclure simpsons", looking for more
Found: 10 image links of "troy mcclure simpsons", looking for more
Found: 10 image links of "troy mcclure simpsons", looking for more
Found: 13 image links, done!
SUCCESS -saved https://i1.sndcdn.com/artworks-000477818898-kjdaix-t500x500.jpg - as ./images/troy_mcclure_simpsons/d7ef2885ab.jpg
SUCCESS -saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSSTb68B1OMpouKCiyZ7qvMdOgTWcZm-Jkqvw&usqp=CAU - as ./images/troy_mcclure_simpsons/6173d1a0bc.jpg
SUCCESS -saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSSGAOCaKe7CwPAXfWyin5vQzlw-cpBFdFSYQ&usqp=CAU - as ./images/troy_mcc

  0%|          | 0/3 [00:00<?, ?it/s]

Found: 1 image links of "miss hoover simpsons", looking for more
./images/miss_hoover_simpsons/89e4b9bb3f.jpg already exist, continue
Found: 2 image links of "miss hoover simpsons", looking for more
Found: 3 image links, done!
SUCCESS -saved https://cdn.drawception.com/drawings/634633/9Z4kW9PKrJ.png - as ./images/miss_hoover_simpsons/cb175b8f61.jpg
SUCCESS -saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTbFqA9n4_J6dPey5PzbtN6OXtQVGPbF6Zuqw&usqp=CAU - as ./images/miss_hoover_simpsons/d30596a8aa.jpg
SUCCESS -saved https://i.pinimg.com/originals/7b/9d/24/7b9d2484fee311558d55a50841f75a92.jpg - as ./images/miss_hoover_simpsons/e1446cd6b2.jpg
Found: 100 search results. Extracting links from 0:100


  0%|          | 0/1 [00:00<?, ?it/s]

./images/disco_stu_simpsons/0fc90e7633.jpg already exist, continue
Found: 0 image links of "disco stu simpsons", looking for more
./images/disco_stu_simpsons/e93d16285e.jpg already exist, continue
Found: 0 image links of "disco stu simpsons", looking for more
Found: 0 image links of "disco stu simpsons", looking for more


In [67]:
parsed_img_list = list(Path(PARSE_DIR).glob('**/*.jpg'))

for n, img in enumerate(parsed_img_list):
    path_from = img
    target_folder = '_'.join(str(img.parent.name).split('_')[:-1])
#     last_folder = '_'.join(str(parsed_img_list[n - 1].parent.name).split('_')[:-1])
    path_to = Path(TRAIN_DIR).joinpath(target_folder,)
#     print(path_from.exists(), path_to.exists())
    try:
        print(copy(path_from, path_to))
#         if n != 0 and target_folder != last_folder:
#             print(f"Sucsessfull copied {last_folder}")
    except Exception:
        print(f"Something wrong with path - {path_to}")


data/train/sideshow_mel/1198533200.jpg
data/train/sideshow_mel/c119a7880f.jpg
data/train/sideshow_mel/170b992821.jpg
data/train/sideshow_mel/dd10e6e569.jpg
data/train/sideshow_mel/39649c9e04.jpg
data/train/sideshow_mel/2e2b85e9b6.jpg
data/train/sideshow_mel/c1a0e44e60.jpg
data/train/sideshow_mel/ed2abc4424.jpg
data/train/sideshow_mel/7fdf586dd7.jpg
data/train/sideshow_mel/3a8d86bfcd.jpg
data/train/sideshow_mel/c3fbd3e3db.jpg
data/train/miss_hoover/dab537e39a.jpg
data/train/miss_hoover/89e4b9bb3f.jpg
data/train/miss_hoover/1ab94cb9eb.jpg
data/train/miss_hoover/8158595db0.jpg
data/train/miss_hoover/e1446cd6b2.jpg
data/train/miss_hoover/9cf48249d9.jpg
data/train/miss_hoover/3980104836.jpg
data/train/miss_hoover/11b6f810c2.jpg
data/train/miss_hoover/9cb6b56332.jpg
data/train/miss_hoover/82fe44f1aa.jpg
data/train/miss_hoover/37c7ffd537.jpg
data/train/miss_hoover/1e52cf5054.jpg
data/train/miss_hoover/fbff75fbbc.jpg
data/train/miss_hoover/20ca1c5390.jpg
data/train/miss_hoover/312d0c4117.jpg
d

In [48]:
copy(Path('copy_parsed.sh'), Path('data/'))

'data/copy_parsed.sh'

In [57]:
tmp = ['data/train/sideshow_mel/1198533200.jpg',
'data/train/sideshow_mel/c119a7880f.jpg',
'data/train/sideshow_mel/170b992821.jpg',
'data/train/sideshow_mel/dd10e6e569.jpg',
'data/train/sideshow_mel/39649c9e04.jpg',
'data/train/sideshow_mel/2e2b85e9b6.jpg',
'data/train/sideshow_mel/c1a0e44e60.jpg',
'data/train/sideshow_mel/ed2abc4424.jpg',
'data/train/sideshow_mel/7fdf586dd7.jpg',
'data/train/sideshow_mel/3a8d86bfcd.jpg']

In [58]:
for i in tmp:
    print(Path(i).exists())

True
True
True
True
True
True
True
True
True
True
