**Code inspired by [Fabian Bosler](https://towardsdatascience.com/image-scraping-with-python-a96feda8af2d)**

In [29]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from PIL import Image
import io
import os
import requests
import time
import Augmentor

# Label for the scraped subject: used as the image sub-folder name and as the
# filename prefix of every downloaded image.
SUBJECT = 'airchair'

# Google Images search URL. It is assembled from adjacent string literals so
# that wrapping it over several source lines does not inject newlines or
# indentation into the URL itself (a triple-quoted string would).
GSEARCH_URL = (
    'https://www.google.com/search?q=air+chair+breakdance&tbm=isch&ved=2ahUKEwilvrvUmqf6A'
    'hU0YzUKHQeMCWQQ2-cCegQIABAA&oq=air+chair+breakdance&gs_lcp=CgNpbWcQAzIFCAAQgAQ6BAgjE'
    'CdQtQRYwgZgzwloAHAAeACAAbQBiAGhA5IBAzEuMpgBAKABAaoBC2d3cy13aXotaW1nwAEB&sclient=img&'
    'ei=_7QrY-XCG7TG1QGHmKagBg&bih=616&biw=1263&hl=en'
)

# Folder that receives the downloaded images for SUBJECT.
IMG_PATH = os.path.join('images', 'downloadedimages', SUBJECT)

# CSS class of the thumbnail elements on the results page.
CLASS_NAME0 = 'Q4LuWd'
# CSS class of the element whose 'src' attribute holds the image url.
CLASS_NAME1 = 'KAlRDb'

# Folder the labelImg repository is cloned into.
LABELING_PATH = os.path.join('labelimg')

In [30]:
def get_google_images(search_url, CN_0, CN_1, max_images, lag):
    '''scrapes google_images and stores source image urls in a set
        Args:
            search_url (str): google image search url
            CN_0 (str): class name of image thumbnails
            CN_1 (str): class name of html block containing 'src' tag
            max_images (int): number of thumbnails to try (must be <= 30)
            lag (int or float): seconds to wait after clicking a thumbnail
                before reading the image url

        Returns:
            image_url_set (set): set of all image urls (str); may hold fewer
                than max_images entries when thumbnails are skipped or
                several thumbnails resolve to the same url

        Raises:
            AssertionError: if max_images is greater than 30
    '''
    # Validate before a browser is started so a bad call costs nothing.
    assert max_images <= 30, 'Too many images (only <=30)!'

    img_url_set = set()
    # Number of thumbnails that could not be turned into a url
    skip = 0

    # Initializing selenium webdriver
    s = Service(ChromeDriverManager().install())
    wd = webdriver.Chrome(service=s)

    try:
        wd.get(search_url)

        # Finding all images using thumbnail element (CONSTANT class)
        img_elements = wd.find_elements(By.CLASS_NAME, CN_0)

        # Scrolling to the bottom of the page to trigger loading of more images.
        # NOTE(review): thumbnails were already collected above, so anything
        # loaded by this scroll is not part of img_elements - confirm intent.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Only the first max_images thumbnails are tried; a skipped thumbnail
        # is not replaced by a later one.
        for img_element in img_elements[:max_images]:
            try:
                # Clicks on each thumbnail image
                img_element.click()

                # Waits till full res image loads
                time.sleep(lag)

                # Finds src_url
                img = wd.find_element(By.CLASS_NAME, CN_1)
                src_url = img.get_attribute('src')
            except Exception:
                # Exception (not a bare except) so Ctrl-C / kernel interrupt
                # still stops the scrape instead of being counted as a skip.
                skip += 1
                continue

            # Element had no 'src' attribute at all
            if src_url is None:
                skip += 1
                continue

            # Checks if src_url is a real website
            if 'http' in src_url:
                img_url_set.add(src_url)
    finally:
        # Always close the browser and stop the driver process, even when the
        # scrape fails or is interrupted.
        wd.quit()

    # Success rate
    print(f'''Images collected: {len(img_url_set)} out of {max_images}, Images Skipped: {skip} out of {max_images}''')

    return img_url_set
    

In [31]:
def download_image(dl_path, dl_url, filename, timeout=10):
    '''downloads an image from a source url and stores it as a JPEG file
        Args:
            dl_path (str): path to download folder (created if missing)
            dl_url (str): image url
            filename (str): filename of downloaded image
            timeout (int or float): seconds to wait for the server before
                giving up on the request

        Returns:
            None

        Any failure (network, HTTP error status, unreadable image, disk
        error) is reported with a 'Download Failed:' message instead of being
        raised, and no file is written in that case.
    '''
    try:
        # Requesting image content from src
        response = requests.get(dl_url, timeout=timeout)
        # Turn 4xx/5xx answers into an error instead of parsing an error page
        response.raise_for_status()
        img = Image.open(io.BytesIO(response.content))

        # JPEG cannot store alpha or palette data; convert such modes to RGB
        # so PNG/GIF sources do not fail on save.
        if img.mode not in ('L', 'RGB', 'CMYK'):
            img = img.convert('RGB')

        # Encode in memory first: the target file is only created once the
        # JPEG data exists, so a failed conversion leaves no empty file behind.
        encoded = io.BytesIO()
        img.save(encoded, "JPEG")

        if dl_path:
            os.makedirs(dl_path, exist_ok=True)
        file_path = os.path.join(dl_path, filename)

        # Writing image to filepath
        with open(file_path, 'wb') as downloader:
            downloader.write(encoded.getvalue())

    except Exception as e:
        print('Download Failed:', e)

In [36]:
def image_augmentation(imgs_path, sample_size):
    '''generates randomly augmented copies of the images in a folder
        Args:
            imgs_path (str): path to folder with images to be augmented
            sample_size (int): desired number of augmented images

        Returns:
            None

        Each operation below is registered with its own probability; the
        samples are produced by the final Augmentor sample() call.
    '''
    pipeline = Augmentor.Pipeline(imgs_path)

    # Geometry: horizontal mirror, slight rotation, slight skew
    pipeline.flip_left_right(probability=0.5)
    pipeline.rotate(
        probability=0.7,
        max_left_rotation=5,
        max_right_rotation=5,
    )
    pipeline.skew_top_bottom(probability=0.3, magnitude=0.1)

    # Photometry: brightness, then saturation & contrast
    pipeline.random_brightness(
        probability=0.4,
        min_factor=0.6,
        max_factor=1.2,
    )
    pipeline.random_color(probability=0.2, min_factor=0.5, max_factor=2)
    pipeline.random_contrast(probability=0.2, min_factor=0.5, max_factor=2)

    # Draw the requested number of augmented samples
    pipeline.sample(sample_size)

In [34]:
# Scrape settings: how many thumbnails to try (get_google_images rejects
# values above 30) and how many seconds to wait after each thumbnail click.
MAX_IMAGES = 15
CLICK_LAG = 3

img_url_set = get_google_images(GSEARCH_URL, CLASS_NAME0, CLASS_NAME1, MAX_IMAGES, CLICK_LAG)

# Make sure the target folder exists before anything is written into it.
os.makedirs(IMG_PATH, exist_ok=True)

# download_image reports failures by printing rather than raising, so a
# download only counts as successful when a non-empty file is on disk.
# NOTE(review): a file left over from an earlier run under the same name is
# also counted - clear IMG_PATH first if exact numbers matter.
n_saved = 0
for idx, url in enumerate(img_url_set):
    filename = SUBJECT + str(idx) + '.jpg'
    download_image(IMG_PATH, url, filename)

    saved_file = os.path.join(IMG_PATH, filename)
    if os.path.isfile(saved_file) and os.path.getsize(saved_file) > 0:
        n_saved += 1

print(f'{n_saved} of {len(img_url_set)} downloads successful.')

Images collected: 14 out of 15, Images Skipped: 1 out of 15
14 downloads successful.


In [37]:
# Number of augmented samples to generate from the downloaded images
AUG_SAMPLE_SIZE = 30

image_augmentation(IMG_PATH, AUG_SAMPLE_SIZE)
print('Augmentation successful.')

Initialised with 12 image(s) found.
Output directory set to images\downloadedimages\airchair\output.

Processing <PIL.Image.Image image mode=RGB size=1280x720 at 0x26201018E20>: 100%|█| 30/30 [00:00<00:00, 35.42 Samples/s

Augmentation successful.





In [23]:
# Installing labelImg software by heartexlabs
!mkdir {LABELING_PATH}
!git clone https://github.com/heartexlabs/labelImg.git {LABELING_PATH}
!cd {LABELING_PATH} && pyrcc5 -o libs/resources.py resources.qrc

Cloning into 'labelimg'...


In [38]:
# Launching labelImg: runs labelImg.py from inside its checkout folder
# (LABELING_PATH). Whatever the tool prints to the console shows up as this
# cell's output.
!cd {LABELING_PATH} && python labelImg.py

Cancel creation.
Image:D:\HDD Files\Desktop\Power Detection\Web Scraped Training Set\images\downloadedimages\airchair\output\airchair_original_airchair1.jpg_b899b443-9fcf-454e-82f1-574764e31c4f.jpg -> Annotation:D:/HDD Files/Desktop/Power Detection/Web Scraped Training Set/images/downloadedimages/airchair/output\airchair_original_airchair1.jpg_b899b443-9fcf-454e-82f1-574764e31c4f.xml
Image:D:\HDD Files\Desktop\Power Detection\Web Scraped Training Set\images\downloadedimages\airchair\output\airchair_original_airchair1.jpg_eefe184e-1809-4962-aa24-9017f00dc637.jpg -> Annotation:D:/HDD Files/Desktop/Power Detection/Web Scraped Training Set/images/downloadedimages/airchair/output\airchair_original_airchair1.jpg_eefe184e-1809-4962-aa24-9017f00dc637.xml
Image:D:\HDD Files\Desktop\Power Detection\Web Scraped Training Set\images\downloadedimages\airchair\output\airchair_original_airchair1.jpg_f1bffa3c-c0cd-407a-a4ad-a74caa69e9c6.jpg -> Annotation:D:/HDD Files/Desktop/Power Detection/Web Scraped