In [1]:
import os
import re
import shutil
import time
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
#import imutils

In [2]:
def format_query_into_url(query, orientation_value, orientation_flag=False):
    """
    Build the Unsplash search URL for a free-text query.

    The query is split on whitespace and its words are joined with hyphens.
    Each word is percent-encoded so that characters such as ``/``, ``?``, ``#``
    or ``&`` in the search phrase cannot change the structure of the URL.

    :param str query: The search phrase, e.g. "new york".
    :param str orientation_value: Can be either "landscape", "portrait" or "square".
        The value is passed through without validation and is only used when
        ``orientation_flag`` is truthy.
    :param bool orientation_flag: When truthy, the ``orientation`` query parameter
        is appended to narrow down the results.
    :return: The search URL as a string.
    """
    slug = '-'.join(quote(word, safe='') for word in query.split())
    url = 'https://unsplash.com/s/photos/' + slug

    # To increase the relevancy of the images found, the orientation filter is added on request
    if orientation_flag:
        url += '/?orientation={}'.format(quote(str(orientation_value), safe=''))
    return url

In [3]:
def get_link_from_html(html):
    """
    Extract the Unsplash photo links contained in a page's HTML source.

    Every ``href="/photos/..."`` attribute is located with a regular expression,
    e.g. ``href="/photos/mEZ3PoFGs_k"``. The captured path is turned into:

    * a view link, e.g. ``https://unsplash.com/photos/mEZ3PoFGs_k``
    * a download link, e.g. ``https://unsplash.com/photos/mEZ3PoFGs_k/download``

    A path that occurs more than once in the HTML is reported only once (at the
    position of its first occurrence), so the number of links returned equals the
    number of distinct photo paths found.

    :param str html: The page source to search.
    :return: A tuple ``(download_links, view_links)`` of two equally long lists;
        entries at the same index refer to the same photo.
    """
    url_base = r'https://unsplash.com'

    # The capture group keeps only the path, so no quote/attribute clean-up is needed afterwards
    photo_paths = []
    seen_paths = set()
    for path in re.findall(r'href="(/photos/.*?)"', html):
        if path in seen_paths:
            continue
        seen_paths.add(path)
        photo_paths.append(path)

    view_links = [url_base + path for path in photo_paths]
    download_links = [link + r'/download' for link in view_links]

    return download_links, view_links

In [4]:
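# NOTE: Disabled variant of get_link_from_html that returns only the download links
# (the active definition also returns the view links).
# def get_link_from_html(html):
#     """
#     Parse the html code and retrieve all the download links within it
#     Return a list containing all the download links
#     """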
    
#     # Method 3: Using regex, search for 'href="/photos/.*?"' pattern
#     match_result = re.findall('href="/photos/.*?"', html)
#     # The output after the filter results in a list of string containing the ending part of the
#     # hypertext REFerence, href, that leads to the image webpage, e.g. 'href="/photos/mEZ3PoFGs_k"'
#     # Rearrange the href to obtain appropriate download link, e.g. https://unsplash.com/photos/BGI9MVXdOCA
#     url_base = r'https://unsplash.com'
#     for index in range(len(match_result)):
#         match_result[index] = re.sub(r'href="', '', match_result[index])
#         match_result[index] = re.sub(r'"', '', match_result[index])
#         match_result[index] = url_base + match_result[index] + r'/download'
        
#     return match_result

In [6]:
# Set up a driver browser for browsing website
#driver = webdriver.Chrome(executable_path=r'D:\Chrome Driver/chromedriver.exe')

def Unsplash_download(driverpath, query, target_height, path_download, num_img, orientation_value, orientation_flag=False, scrollSpeed=15, delay=2):
    """
    Search Unsplash for ``query`` with a Chrome webdriver and download the first ``num_img`` images found.

    The search page is scrolled down step by step, because the number of photo links present
    in the page source grows while scrolling. The links are then extracted with
    ``get_link_from_html`` and each download link is fetched with ``requests``.
    Do note that even if 100 images appear in the webpage, not all of them are necessarily
    encoded in the html page source.

    The browser is always closed before the downloads start (also when an error occurs while
    browsing). If the page stops yielding new links before ``num_img`` links were collected,
    the function gives up scrolling and downloads the images found so far.

    :param driverpath: Specify the path of the webdriver, should be having extension of .exe
    :param str query: The query for searching.
    :param int target_height: The first scrolling pass stops once the page's
        ``document.body.scrollHeight`` exceeds this value.
        Guideline from the author: target_height = 150 * Number of images returned.
    :param str path_download: The directory to store the files. It is created if it does not exist.
    :param int num_img: Number of images to download.
    :param str orientation_value: Orientation filter forwarded to ``format_query_into_url``.
    :param bool orientation_flag: Whether the orientation filter is applied.
    :param int scrollSpeed: Number of pixels scrolled per step.
        A lower scrollSpeed results in a more steady execution of code, just in case the webpage
        scrolls to the bottom without loading.
        (Unsplash loads more pictures as it reaches the bottom of the page)
    :param int delay: How many seconds to wait between downloads, to reduce the chances of
        being flagged as a bot.
    :return: A tuple ``(download_links, view_links)``, each truncated to at most ``num_img`` entries.
    """
    # Maximum number of scroll steps in the first pass, in case the page never grows past target_height
    max_scrolls = 800000
    # Heuristic: consecutive scroll steps without any new link before the search is considered exhausted
    max_stalled_scrolls = 2000

    options = webdriver.ChromeOptions()
    # Change driver settings so doesn't get flagged
    options.add_argument('--disable-blink-features=AutomationControlled')
    # Change the browser resolution
    options.add_argument("window-size=1536,824")
    # NOTE(review): newer Selenium releases expect the driver path via a Service object
    # instead of executable_path -- confirm against the installed Selenium version.
    driver = webdriver.Chrome(executable_path=r'{}'.format(driverpath), options=options)

    try:
        # Hide the navigator.webdriver flag, which indicates whether a browser is controlled by an automation tool
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

        # Access the homepage first before searching so that this doesn't raise any suspicions
        home_page = 'https://www.unsplash.com'
        driver.get(home_page)
        time.sleep(3)

        url = format_query_into_url(query, orientation_value, orientation_flag)
        driver.get(url)

        # First pass: scroll until the page has grown beyond target_height
        scroll_script = 'window.scrollTo(0, window.scrollY+{})'.format(scrollSpeed)
        scroll_count = 0
        while True:
            driver.execute_script(scroll_script)
            new_height = driver.execute_script('return document.body.scrollHeight')
            scroll_count += 1
            if scroll_count > max_scrolls:
                print('Exceeded scrolling limit')
                break
            if new_height > target_height:
                break

        # Retrieve the download links from the website page source
        dl_links, view_links = get_link_from_html(driver.page_source)
        print("{} images are required.".format(num_img))
        print("There is a total of {} images found in the our first iteration.".format(len(dl_links)))

        # Second pass: keep scrolling while links are missing and the page still yields new ones
        if len(dl_links) < num_img:
            print("There is not enough images. Will continue scrolling until we have sufficient image.")
            stalled_scrolls = 0
            while len(dl_links) < num_img and stalled_scrolls < max_stalled_scrolls:
                driver.execute_script(scroll_script)
                found_before = len(dl_links)
                dl_links, view_links = get_link_from_html(driver.page_source)
                if len(dl_links) > found_before:
                    stalled_scrolls = 0
                else:
                    stalled_scrolls += 1

            if len(dl_links) < num_img:
                print("Only {} of the {} required images could be found. Will proceed with the images found.".format(len(dl_links), num_img))
            else:
                print("We have now gotten sufficient number of images. Will proceed to the next step of downloading the images.")
        else:
            print("There is enough images. Will proceed to the next step of downloading the images.")
    finally:
        # The browser is only needed to collect the links; always release it
        driver.quit()

    # Only select the required images
    dl_links = dl_links[0:num_img]
    view_links = view_links[0:num_img]

    # Parse the query into a suitable format for the file names
    query = '_'.join(query.split())

    # Make a new folder for storing the downloaded images
    os.makedirs(path_download, exist_ok=True)
    print("The downloaded images will be saved to '{}'".format(path_download))

    # Download the images using the download links in the list
    for index, link in enumerate(dl_links):
        file_name = '{query}_{suffix:003d}.jpeg'.format(query=query, suffix=index)
        # stream=True so the body is copied to disk in chunks instead of being loaded into memory;
        # the with-block releases the connection once the copy is finished
        with requests.get(link, stream=True) as dl_response:
            downloaded = dl_response.ok
            status_code = dl_response.status_code
            if downloaded:
                with open('{foldername}/{file_name}'.format(foldername=path_download, file_name=file_name), 'wb') as output_file:
                    shutil.copyfileobj(dl_response.raw, output_file)

        time.sleep(delay)
        if downloaded:
            print("Downloaded '{}'.\n".format(file_name))
        else:
            # Do not store an error page under an image file name
            print("Skipped '{}': HTTP status {} for {}\n".format(file_name, status_code, link))

    return dl_links, view_links

In [7]:
# Search terms to scrape; every term gets its own sub-folder below Background_Image
search = ['People', 'Fashion']

for indivi_query in search:
    # download_links / image_links are rebound on each pass, so after the loop
    # they hold the results of the last search term only
    download_links, image_links = Unsplash_download(
        driverpath='ChromeDriver/chromedriver_win32/chromedriver.exe',
        query=indivi_query,
        orientation_value="landscape",
        orientation_flag=True,
        num_img=1000,
        target_height=100000,
        scrollSpeed=15,
        delay=0,
        path_download="Image/CollectedImage/Background_Image/{}".format(indivi_query),
    )

1000 images are required.
There is a total of 820 images found in the our first iteration.
There is not enough images. Will continue scrolling until we have sufficient image.
We have now gotten sufficient number of images. Will proceed to the next step of downloading the images.
The downloaded images will be saved to 'Image/CollectedImage/Background_Image/People'
Downloaded 'People_000.jpeg'.

Downloaded 'People_001.jpeg'.

Downloaded 'People_002.jpeg'.

Downloaded 'People_003.jpeg'.

Downloaded 'People_004.jpeg'.

Downloaded 'People_005.jpeg'.

Downloaded 'People_006.jpeg'.

Downloaded 'People_007.jpeg'.

Downloaded 'People_008.jpeg'.

Downloaded 'People_009.jpeg'.

Downloaded 'People_010.jpeg'.

Downloaded 'People_011.jpeg'.

Downloaded 'People_012.jpeg'.

Downloaded 'People_013.jpeg'.

Downloaded 'People_014.jpeg'.

Downloaded 'People_015.jpeg'.

Downloaded 'People_016.jpeg'.

Downloaded 'People_017.jpeg'.

Downloaded 'People_018.jpeg'.

Downloaded 'People_019.jpeg'.

Downloaded 'Pe

In [82]:
# Display the photo-page links; `image_links` is rebound on every pass of the download loop,
# so this is the list returned for the last search term that was processed.
image_links

['https://unsplash.com/photos/ABGaVhJxwDQ',
 'https://unsplash.com/photos/8YG31Xn4dSw',
 'https://unsplash.com/photos/_7LbC5J-jw4',
 'https://unsplash.com/photos/nF8xhLMmg0c',
 'https://unsplash.com/photos/31-pOduwZGE',
 'https://unsplash.com/photos/pzMP-RGJ7mY',
 'https://unsplash.com/photos/IgUR1iX0mqM',
 'https://unsplash.com/photos/-uHVRvDr7pg',
 'https://unsplash.com/photos/DsAjH9B24G8',
 'https://unsplash.com/photos/MYbhN8KaaEc',
 'https://unsplash.com/photos/2EGNqazbAMk',
 'https://unsplash.com/photos/2TlAsvhqiL0',
 'https://unsplash.com/photos/98Elr-LIvD8',
 'https://unsplash.com/photos/TamMbr4okv4',
 'https://unsplash.com/photos/6dW3xyQvcYE',
 'https://unsplash.com/photos/-87JyMb9ZfU',
 'https://unsplash.com/photos/0K7GgiA8lVE',
 'https://unsplash.com/photos/6V19Uy-tUhs',
 'https://unsplash.com/photos/YxJ5AfKFgFE',
 'https://unsplash.com/photos/oqnVnI5ixHg',
 'https://unsplash.com/photos/rBx6r5OUt7s',
 'https://unsplash.com/photos/-wjk_SSqCE4',
 'https://unsplash.com/photos/wK

In [84]:
# Print the same list of photo-page links as plain text
print(image_links)

['https://unsplash.com/photos/ABGaVhJxwDQ', 'https://unsplash.com/photos/8YG31Xn4dSw', 'https://unsplash.com/photos/_7LbC5J-jw4', 'https://unsplash.com/photos/nF8xhLMmg0c', 'https://unsplash.com/photos/31-pOduwZGE', 'https://unsplash.com/photos/pzMP-RGJ7mY', 'https://unsplash.com/photos/IgUR1iX0mqM', 'https://unsplash.com/photos/-uHVRvDr7pg', 'https://unsplash.com/photos/DsAjH9B24G8', 'https://unsplash.com/photos/MYbhN8KaaEc', 'https://unsplash.com/photos/2EGNqazbAMk', 'https://unsplash.com/photos/2TlAsvhqiL0', 'https://unsplash.com/photos/98Elr-LIvD8', 'https://unsplash.com/photos/TamMbr4okv4', 'https://unsplash.com/photos/6dW3xyQvcYE', 'https://unsplash.com/photos/-87JyMb9ZfU', 'https://unsplash.com/photos/0K7GgiA8lVE', 'https://unsplash.com/photos/6V19Uy-tUhs', 'https://unsplash.com/photos/YxJ5AfKFgFE', 'https://unsplash.com/photos/oqnVnI5ixHg', 'https://unsplash.com/photos/rBx6r5OUt7s', 'https://unsplash.com/photos/-wjk_SSqCE4', 'https://unsplash.com/photos/wKOKidNT14w', 'https://u