In [9]:
# https://www.google.com/search?q=what&tbm=isch

import requests
from bs4 import BeautifulSoup
import csv
import cv2
import numpy as np
import re
import pandas as pd
from multiprocessing.pool import ThreadPool
from multiprocessing import cpu_count
import os
from PIL import Image

import time


In [10]:
class google_webscraping:
    """Image scraper for Google Images search result pages.

    The scraper requests result pages for a query (``tbm=isch``), collects
    the absolute ``<img src=...>`` URLs found on each page, and writes every
    successfully decoded image to disk as a PNG file.

    NOTE: constructing an instance changes the process-wide working directory
    to ``path``; call :meth:`close` (or use the instance as a context
    manager) to return to the directory that was current at construction.
    """

    # Seconds to wait on any single HTTP request before giving up, so one
    # stalled connection cannot block a worker thread forever.
    REQUEST_TIMEOUT: int = 10

    def __init__(self, path: str, query: str, total_img: int) -> None:
        """Create the output directory (if needed) and switch into it.

        Args:
            path (str): Directory the images are downloaded into.
            query (str): Search query.
            total_img (int): Total number of images to download.
        """
        self.GOOGLE_BASE: str = "https://www.google.com"
        self.query: str = query
        self.total_img: int = total_img
        self.IMG_PER_PAGE: int = 20
        # Remembered so close() can undo the chdir below.
        self.BASE_PATH = os.getcwd()
        os.makedirs(path, exist_ok=True)
        self.path: str = path
        os.chdir(os.path.join(os.getcwd(), self.path))

    def __enter__(self) -> "google_webscraping":
        """Allow ``with google_webscraping(...) as web:`` usage."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Restore the original working directory when the block exits."""
        self.close()

    def close(self) -> None:
        """Close the scraper and change back to the original working directory."""
        os.chdir(self.BASE_PATH)

    def start(self) -> None:
        """Scrape all result pages concurrently with a thread pool.

        One task is submitted per result page; any exception raised by a
        task is re-raised here when its result is collected.
        """
        with ThreadPool(processes=cpu_count()) as pool:
            pending = [
                pool.apply_async(self.run, args=(start_index,))
                for start_index in range(0, self.total_img, self.IMG_PER_PAGE)
            ]
            # Results must be collected inside the with-block: leaving it
            # terminates the pool and would discard unfinished tasks.
            for result in pending:
                result.get()

    def run(self, start_index: int) -> None:
        """Scrape one result page and download its images.

        Args:
            start_index (int): Index of the first result on the page.
        """
        # Never take more than one page's worth, and on the last page only
        # what is still missing, so the overall total respects total_img.
        page_limit = min(self.IMG_PER_PAGE, self.total_img - start_index)
        if page_limit <= 0:
            return
        search_url = self.get_search_url(self.query, start_index)
        soup = self.get_soup(search_url)
        image_urls = self.get_image_urls(soup)[:page_limit]
        self.download_images(image_urls)

    def download_images(self, image_urls: list) -> None:
        """Download every URL and store the decodable ones as PNG files.

        URLs that cannot be fetched, do not answer with HTTP 200, or do not
        contain image data OpenCV can decode are reported with ``FAILED``
        and skipped; they never abort the remaining downloads.

        Args:
            image_urls (list): Image URLs to download.
        """
        for url in image_urls:
            try:
                response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException:
                print("FAILED", url)
                continue
            if response.status_code != 200 or not response.content:
                print("FAILED", url)
                continue
            img = cv2.imdecode(
                np.frombuffer(response.content, np.uint8), cv2.IMREAD_UNCHANGED)
            # imdecode signals an undecodable payload (e.g. an HTML error
            # page) by returning None, which imwrite cannot handle.
            if img is None:
                print("FAILED", url)
                continue
            cv2.imwrite(filename=self._filename_for(url), img=img)

    def _filename_for(self, url: str) -> str:
        """Build the PNG file name used to store the image found at ``url``.

        The name is the query, an underscore and the part of the URL after
        its last colon; characters that are not allowed in file names are
        replaced by underscores.
        """
        stem = self.query + '_' + url.split(':')[-1]
        return re.sub(r'[\\/:*?"<>|\x00-\x1f]+', '_', stem) + '.png'

    def get_soup(self, url: str) -> "BeautifulSoup":
        """Fetch ``url`` and parse the response body as HTML.

        Args:
            url (str): URL to fetch.

        Returns:
            BeautifulSoup: Parsed document for the given URL.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return BeautifulSoup(response.text, "html.parser")

    def get_image_urls(self, soup: "BeautifulSoup") -> list:
        """Collect the absolute image URLs from a parsed result page.

        Args:
            soup (BeautifulSoup): Parsed page as returned by get_soup.

        Returns:
            list: ``src`` values of all ``<img>`` tags whose ``src`` starts
            with ``http``; tags without ``src`` or with other values
            (relative paths, ``data:`` URIs) are ignored.
        """
        return [
            img["src"]
            for img in soup.find_all("img")
            if img.has_attr("src") and img["src"].startswith("http")
        ]

    def get_search_url(self, query: str, start_index: int) -> str:
        """Build the Google Images search URL for a query and result offset.

        Args:
            query (str): Search query.
            start_index (int): Index of the first result to request.

        Returns:
            str: Search URL with the query and start index URL-encoded.
        """
        # Imported here so the block does not depend on a file-level import.
        from urllib.parse import urlencode

        # urlencode escapes spaces, '&', '#', ... which would otherwise
        # truncate or corrupt the query string.
        params = urlencode({"q": query, "tbm": "isch", "start": start_index})
        return f"{self.GOOGLE_BASE}/search?{params}"


In [8]:
# Benchmark: download the same number of images page by page on the calling
# thread, then again through the scraper's thread pool, and compare timings.
total_img = 2000

print('single-threading')
print("Starting...")
start = time.time()
web = google_webscraping(path="pirate_ship_images",
                         query="pirate ship", total_img=total_img)
try:
    # Step by the scraper's own page size instead of a hard-coded 20.
    for start_index in range(0, web.total_img, web.IMG_PER_PAGE):
        web.run(start_index)
    old = time.time()-start
finally:
    # close() must actually be called: it restores the working directory.
    # Otherwise the second scraper below would create a nested
    # pirate_ship_images/pirate_ship_images folder.
    web.close()
print("Time taken [s]:", old)

print('multi-threading')
print("Starting...")
start = time.time()
web = google_webscraping(path="pirate_ship_images",
                         query="pirate ship", total_img=total_img)
try:
    web.start()
    new = time.time()-start
finally:
    web.close()
print("Time taken [s]:", new)

print("Speedup [เท่าตัว]:", old/new)
print("Average Time per image old [s]:", old/total_img)
print("Average Time per image new [s]:", new/total_img)
print("Faster time per image [s]:", (old-new)/total_img)


UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x0000024AFF7AB3D0>