# Web photo scraper app
This project uses the selenium library for sending and retrieving information from a website using a web driver,
and then display the result in a GUI generated by the tkinter library.
## Some basic principles:
* The app uses google image search as it's primary and only website. You can of course change this in the code and use other websites.
* This app does not download the full photo size but rather takes a screenshot of the image displayed in the web result page. 
Thus, reducing the download time and space taken by the photos in the folder.
* For the app to work, you first must install a web driver. Go to [Seleniumâ€™s website](https://selenium-python.readthedocs.io/) 
and follow the instruction for downloading your browser's web driver and place it in your **C:\** root directory.
* Download the app icon image file from the project repository and place it in the same folder as the python file. 
* When you click the search button(Run scraper), you won't see anything happening. 
I chose to disable the display the website opening for search, to make it more 'cleaner' in appearance.
Again, you can change this parameter in the code and watch how all the magic is happening!

**Important** - for the App to function correctly, run all the code from top to bottom. Then the GUI will appear...

In [1]:
# Import all nesscery librarys
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import tkinter as tk
from tkinter import filedialog, Text
from tkinter import messagebox
from PIL import Image, ImageTk
import time
import re
import os
import threading

* This next line of code is where selenium web driver is used to send text input, keystrokes and communicate with the website.

In [2]:
# Function to retrive images from google images website using selenium chrome web driver
def web_scraper(img, num, path):
    Options = webdriver.ChromeOptions()
    Options.add_argument("headless")
    driver = webdriver.Chrome(options= Options, executable_path='C:\chromedriver.exe') # Chrome driver downloaded and placed in c:\
    driver.get("https://images.google.com/")
    assert "Google" in driver.title
    elem = driver.find_element_by_name("q")
    elem.clear()
    elem.send_keys(img)
    elem.send_keys(Keys.RETURN)
    elem = driver.find_element_by_tag_name('html')

    SCROLL_PAUSE_TIME = 0.2
    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        elem.send_keys(Keys.HOME)
        time.sleep(SCROLL_PAUSE_TIME)

        for i in range(1,num+1):
            try:
                driver.find_element_by_xpath('//*[@id="islrg"]/div[1]/div['+str(i)+']/a[1]/div[1]/img').screenshot(f'{path}\{img}-'+str(i)+'.png')
            except:
                pass
    driver.close()

* This next line of code returns information on the folders and number of files, where you saved the images.

In [3]:
# Functionalty for counting total number of directories and files in web scraper base directory
global file_lst
def path_list(input_path, path):
    APP_FOLDER = input_path
    totalDir = 0
    print(f'Searching in {input_path}')
    for base, dirs, files in os.walk(APP_FOLDER):
        for Directory in dirs:
            totalDir += 1
            totalFiles = (len(os.listdir(f'{input_path}/{Directory}')))
            print(f'Total number of files in - {Directory}: ', totalFiles)    
    print('Total number of directories: ', totalDir)
    file_lst = os.listdir(f'{path}')
    return 

* Next are all the functions used by the GUI

In [4]:
# Run scraper button widgets
def run_scraper():
    getResult = image_name_ent.get()
    getImageNum = img_num.get()
    new_dir = getResult
    parent_dir = input_path
    path = os.path.join(parent_dir, new_dir)
    while os.path.exists(path):
        messagebox.showinfo(title="Error", message=('Folder "{}" already exsist. Please change search image and browse again.'.format(getResult)))
        break
    os.mkdir(path)
    web_scraper(getResult, getImageNum, path)
    path_list(input_path, path)
    pic_lst = []
    for i in range(1,4):
        pic_lst.append('{}\{}-{}.png'.format(path, getResult, str(i)))    
    summery_lbl = tk.Label(root, text=('Done!  {} files in location: {}'.format(getImageNum, path)), font= "Helvetica 12", bg='lightgrey')
    summery_lbl.grid(row=9, column=1, columnspan=4, sticky="w")    
    threading.Thread(image_preview(pic_lst, frame)).start()
    return 

# Generate a three images preview on app
def image_preview(pic_lst, frame):
# fix double sleshes in pic_lst
    pic_lst = [re.sub('\\\\', '{}'.format(r'/'), item) for item in pic_lst]  

    pic1 = Image.open(pic_lst[0])
    pic1_resized = pic1.resize((177,117), Image.ANTIALIAS)
    New_pic1 = ImageTk.PhotoImage(pic1_resized)
    img1 = tk.Label(frame, padx=2, pady=2, image= New_pic1)
    img1.grid(row=8, column=1, sticky='w')

    pic2 = Image.open(pic_lst[1])
    pic2_resized = pic2.resize((177,117), Image.ANTIALIAS)
    New_pic2 = ImageTk.PhotoImage(pic2_resized)
    img2 = tk.Label(frame, padx=2, pady=2, image=New_pic2)
    img2.grid(row=8, column=2, sticky='n')

    pic3 = Image.open(pic_lst[2])
    pic3_resized = pic3.resize((177,117), Image.ANTIALIAS)
    New_pic3 = ImageTk.PhotoImage(pic3_resized)
    img3 = tk.Label(frame, padx=2, pady=2, image=New_pic3)
    img3.grid(row=8, column=3, sticky='e')
    return New_pic1, New_pic2, New_pic3 

# on change dropdown value
def change_dropdown(*args):
    getImageNum = img_num.get()
    return getImageNum

# Clear form button widget
def clear_form():
    image_name_ent.delete(0, 'end')
    img_num.set(5)
    return 

# Browse button widget
def browse():
    global input_path
    input_path = filedialog.askdirectory()
    return


* This is the GUI and where all your interaction will take place

In [5]:
# Creat main app window frame
root = tk.Tk()
root.title("Web photo scraper")
root.config(bg="lightgray", highlightbackground='darkblue', highlightthickness=3)

# Center app on middle of the screen
app_width = 575
app_height = 570
screen_width = root.winfo_screenwidth()
screen_height = root.winfo_screenheight()
x = (screen_width/2) - (app_width/2)
y = (screen_height/2) - (app_height/2)
root.geometry(f'{app_width}x{app_height}+{int(x)}+{int(y)}')

# App icon
dir_path = os.path.dirname(os.path.realpath('Python Project - Web photos scraper.ipynb'))
img_path = os.path.join(dir_path, 'scraper_app_icon.png')
app_img = Image.open(img_path)
app_img_resized = app_img.resize((300,50), Image.ANTIALIAS)
New_app_img = ImageTk.PhotoImage(app_img_resized)

# Image number select dropbox
img_num = tk.IntVar(root)
choices = [5, 10, 20, 50, 100, 1000]
img_num.set(5) # set the default option

# link function to change dropdown
img_num.trace('w',change_dropdown) 

# Widgets creation
app_title = tk.Label(root, padx=2, pady=2, image= New_app_img)
image_name_lbl = tk.Label(root, text="Image name :",font = "Helvetica 12", bg="lightgrey")
image_name_ent = tk.Entry(root, font = "Helvetica 10", bd=2)
img_num_lbl = tk.Label(root, text="Number of images :", font = "Helvetica 12", bg="lightgrey")
popupMenu = tk.OptionMenu(root, img_num, *choices).grid(row=2, column=2, padx=5, pady=5)
browse_lbl = tk.Label(root, text="Browse for location :", font = "Helvetica 12", justify='left', bg="lightgrey")
browse_btn = tk.Button(root, text="Browse", font = "Helvetica 12", bg='lightgrey', command=browse)
scraper_btn = tk.Button(root, text="Run scraper", font = "Helvetica 12", bg='lightgrey', command= run_scraper)
labspace1 = tk.Label(root, text=" ",bg='lightgrey').grid(row=7, column=0, pady=30)
labspace2 = tk.Label(root, text="",bg='lightgrey').grid(row=7, column=1)
frame = tk.LabelFrame(root, borderwidth=2, height=120, width=550, pady=3, padx=3, bg="lightgrey")
frame.grid(row=8, column=1, ipady=1, ipadx=1, sticky='NSEW',columnspan=3)
clear_btn = tk.Button(root, text="Clear", font = "Helvetica 12", bg='lightgrey', command = clear_form)

# Widgets placement on grid
app_title.grid(row=0, column=1, columnspan=5, padx=5, pady=5)
image_name_lbl.grid(row=1, column=1, padx=5, pady=5, sticky="e")
image_name_ent.grid(row=1, column=2, padx=5, pady=5, sticky="w")
img_num_lbl.grid(row=2, column=1, padx=5, pady=5, sticky="e")
browse_lbl.grid(row=3, column=1, padx=5, sticky="e", pady=5)
browse_btn.grid(row=3, column=2, pady=5)
scraper_btn.grid(row=4, column=2, pady=5)
clear_btn.grid(row=10, column=3, pady=25)

image_name_ent.focus()

root.mainloop()

Searching in C:/Web scraper
Total number of files in - ant:  5
Total number of files in - ants:  10
Total number of files in - bee:  5
Total number of files in - bells:  5
Total number of files in - cow:  5
Total number of files in - f-15:  10
Total number of files in - mig-15:  5
Total number of files in - mig-25:  5
Total number of files in - snakes:  5
Total number of directories:  9


Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Users\Tald\miniconda3\lib\tkinter\__init__.py", line 1883, in __call__
    return self.func(*args)
  File "<ipython-input-4-42ee464ecfe3>", line 19, in run_scraper
    threading.Thread(image_preview(pic_lst, frame)).start()
  File "C:\Users\Tald\miniconda3\lib\threading.py", line 784, in __init__
    assert group is None, "group argument must be None for now"
AssertionError: group argument must be None for now
