## Gather images with Unsplash API
Goal: Use the Unsplash API to send basic search queries as HTTP requests to the API server and fetch the requested image/photo data as JSON.
<br><br>To do this: 
<ol>
    <li> Initialization: Set up the environment with the necessary API key(s) and endpoint.</li>
    <li> Query function: Define a function for basic querying.</li>
    <li> Filtering function: Define a function that filters on an attribute of the JSON response. In this case it is <code>created_at</code>.</li>
    <li> Composite functions: Use the basic functions as subroutines to get the result I need.</li>
</ol>


In [1]:
# Relevant imports
import requests
import json
import os
from datetime import datetime

In [2]:
# Define variables required for using Unsplash API.
# Credentials are taken from the UNSPLASH_ACCESS_KEY / UNSPLASH_SECRET_KEY
# environment variables when they are set, so keys do not have to live in the
# notebook.
# NOTE(review): the literal fallbacks below are kept only so existing runs keep
# working; they are committed secrets and should be rotated and removed.
ACCESS_KEY = os.environ.get("UNSPLASH_ACCESS_KEY", "JojQg5MlI4bH3scg1sQN4Am9-ytvq0Xw-eezzWx5tvE")
SECRET_KEY = os.environ.get("UNSPLASH_SECRET_KEY", "trgN82fiIFdISXYIhnQscbgtI4jUuYhdDciwvmHV8-c")
END_POINT = "https://api.unsplash.com/search/photos"

In [3]:
# Basic querying function 
def search_unsplash(query, page=1, per_page=10, timeout=30):
    """
    Search for photos on Unsplash based on the given query.

    Sends a single GET request to END_POINT, authenticated with ACCESS_KEY
    through a 'Client-ID' Authorization header.

    Parameters:
    - query (str): The search term to use for the query.
    - page (int, optional): The page number to fetch. Default is 1.
    - per_page (int, optional): Number of items per page. Default is 10.
    - timeout (float or tuple, optional): Seconds to wait for the server
      before giving up, passed straight to requests. Default is 30.

    Returns:
    - dict: JSON response from the Unsplash API containing details about
            the photos that match the query, as well as metadata about
            the search results.

    Raises:
    - requests.HTTPError: If the server answers with an error status.
    - requests.Timeout: If the server does not answer within `timeout`.
    """
    headers = {
        'Authorization': f'Client-ID {ACCESS_KEY}'
    }

    params = {
        'query': query,
        'page': page,
        'per_page': per_page
    }

    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would hang the whole notebook cell.
    response = requests.get(END_POINT, headers=headers, params=params, timeout=timeout)
    response.raise_for_status()  # Check if the request was successful
    return response.json()


In [4]:

# Example usage, fetch the first 100 images when querying "black lives matter"
# result = search_unsplash("black lives matter", page=1, per_page=100)
# print(result)


In [5]:
# Filter JSON response by date. 
def filter_by_date(json_response, start, end):
    """
    Filters images from the JSON response based on their 'created_at' date.

    Only the date part of 'created_at' (the text before the 'T') is compared,
    and both bounds are inclusive. Images whose 'created_at' is missing, not a
    string, or not parseable as 'YYYY-MM-DD' are skipped rather than aborting
    the whole filter.

    Parameters:
    - json_response (dict): The JSON response from the Unsplash API.
    - start (str): The start date in 'YYYY-MM-DD' format.
    - end (str): The end date in 'YYYY-MM-DD' format.

    Returns:
    - list: A list of images filtered by the date criteria.

    Raises:
    - ValueError: If `start` or `end` is not in 'YYYY-MM-DD' format.
    """
    # Convert start and end strings to date objects
    start_date = datetime.strptime(start, "%Y-%m-%d").date()
    end_date = datetime.strptime(end, "%Y-%m-%d").date()

    filtered_images = []

    for image in json_response.get("results", []):
        created_at = image.get("created_at")
        if not isinstance(created_at, str):
            # No usable timestamp: the image cannot be placed in the range.
            continue

        image_date_str = created_at.split("T")[0]  # Extract the date part
        try:
            image_date = datetime.strptime(image_date_str, "%Y-%m-%d").date()
        except ValueError:
            # Malformed timestamp: skip this record instead of failing.
            continue

        if start_date <= image_date <= end_date:
            filtered_images.append(image)

    return filtered_images

# Example usage:
# result = search_unsplash("black lives matter", page=1, per_page=100)
# filtered_images = filter_by_date(result, "2020-05-01", "2020-12-31")

In [6]:
# Get n images given a query and date range
def query_in_date_range(query, start, end, count):
    """
    Queries the Unsplash API for images based on the given search term and filters them by date.

    Pages through the search results with search_unsplash, keeps the images
    that filter_by_date accepts, and stops once `count` images have been
    collected or the search results are exhausted. A page that happens to
    contain no image inside the date range does NOT end the search; only an
    empty page or reaching the reported last page does.

    Parameters:
    - query (str): The search term.
    - start (str): The start date in 'YYYY-MM-DD' format.
    - end (str): The end date in 'YYYY-MM-DD' format.
    - count (int): The number of images to retrieve.

    Returns:
    - str: A JSON string with two keys: 'results' (at most `count` images
           filtered by the date criteria) and 'total' (the length of
           'results').
    """

    collected_images = []
    page = 1
    # Ask for large pages to keep the number of requests down.
    # NOTE(review): the server may cap the page size below this value - confirm
    # against the Unsplash API documentation.
    per_page = 100

    while len(collected_images) < count:
        response = search_unsplash(query, page=page, per_page=per_page)

        # An empty page means we ran past the last page of search results.
        if not response.get("results"):
            break

        collected_images.extend(filter_by_date(response, start, end))

        # Stop at the last page when the response reports the page count,
        # which avoids one extra request just to see an empty page.
        total_pages = response.get("total_pages")
        if total_pages is not None and page >= total_pages:
            break

        page += 1

    # Trim any overshoot from the final page and build the response structure
    selected_images = collected_images[:count]
    response_data = {
        'results': selected_images,
        'total': len(selected_images)
    }

    # Convert the dictionary to a JSON string
    return json.dumps(response_data)

# Example usage:
# json_response = query_in_date_range("black lives matter", "2020-05-01", "2020-12-31", 100)
# print(json_response)

In [7]:
# Helper (for sanity checking) that returns the creation dates of the images in a JSON response.
def list_of_dates_of_images(json_response) -> list:
    """
    Collects the 'created_at' value of every image in a JSON response string.

    Parameters:
    - json_response (str): JSON text whose 'results' entry lists the images.

    Returns:
    - list: The 'created_at' values, in the order the images appear. Empty
            when the response has no 'results' entry.
    """
    parsed = json.loads(json_response)

    created = []
    for entry in parsed.get('results', []):
        created.append(entry['created_at'])

    return created

# Example usage:
# json_response = query_in_date_range("black lives matter", "2020-05-01", "2020-12-31", 100)
# dates = list_of_dates_of_images(json_response)
# print(dates)

In [8]:
# Given a json response, download images into 'downloads' folder. 
def download_images_from_json(json_response, download_folder='downloads', timeout=30):
    """
    Downloads images from the given JSON response.

    Each image in 'results' is fetched from its urls['full'] address and
    written to `download_folder` as image1.jpg, image2.jpg, ... following the
    order of the response.

    Parameters:
    - json_response (str): The JSON response containing image URLs.
    - download_folder (str, optional): The folder where images will be saved. Default is 'downloads'.
    - timeout (float or tuple, optional): Seconds to wait for the server
      before giving up, passed straight to requests. Default is 30.

    Returns:
    - list: A list of file paths where images were saved.

    Raises:
    - KeyError: If an image has no urls['full'] entry.
    - requests.HTTPError: If a download answers with an error status.
    """

    # Parse the JSON string to get a dictionary
    data = json.loads(json_response)

    # Create the download folder if it doesn't exist; exist_ok avoids the
    # check-then-create race of a separate os.path.exists test.
    os.makedirs(download_folder, exist_ok=True)

    saved_files = []

    # Add a loop counter for auto-indexing
    for index, image in enumerate(data.get('results', []), start=1):
        # Create a filename using the loop counter
        filename = os.path.join(download_folder, f'image{index}.jpg')

        # Assuming the 'urls' field contains a 'full' subfield with the image URL
        image_url = image['urls']['full']

        # A streamed response holds its connection until it is closed, so use
        # it as a context manager to release it even when an error is raised.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an error for bad responses

            # Save the image to the file
            with open(filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

        saved_files.append(filename)

    return saved_files

# Example usage:
# json_response = query_in_date_range("black lives matter", "2020-05-01", "2020-12-31", 100)
# downloaded_files = download_images_from_json(json_response)
# print(downloaded_files)

In [9]:
# Query up to 100 images of "black lives matter" created between 2020-05-01 and 2020-12-31
json_response = query_in_date_range("black lives matter", "2020-05-01", "2020-12-31", 100)
# Sanity check on the dates of the returned images (uncomment to print them)
#print(list_of_dates_of_images(json_response))
# Download every image in the response above into the default 'downloads' folder
downloaded_files = download_images_from_json(json_response)

['2020-06-03T13:12:16Z', '2020-06-16T11:50:43Z', '2020-06-13T20:46:58Z', '2020-11-26T01:02:09Z', '2020-06-06T23:28:20Z', '2020-10-31T16:14:23Z', '2020-06-03T13:06:08Z', '2020-06-02T02:49:45Z', '2020-06-10T19:35:16Z', '2020-06-13T08:18:15Z', '2020-06-11T15:01:16Z', '2020-06-01T16:57:29Z', '2020-06-08T13:16:05Z', '2020-10-11T17:21:12Z', '2020-06-03T13:06:08Z', '2020-06-19T01:34:38Z', '2020-06-08T14:21:41Z', '2020-06-25T03:26:47Z', '2020-06-24T02:50:38Z', '2020-06-12T15:05:36Z', '2020-06-11T04:16:33Z', '2020-06-18T19:14:43Z', '2020-08-08T07:36:25Z', '2020-06-03T13:10:11Z', '2020-07-06T16:19:56Z', '2020-08-21T18:59:01Z', '2020-12-04T09:31:29Z', '2020-05-31T13:26:11Z', '2020-06-10T03:19:33Z', '2020-06-10T21:57:47Z', '2020-06-05T13:15:40Z', '2020-06-12T15:05:35Z', '2020-06-08T13:16:05Z', '2020-06-08T14:21:41Z', '2020-07-30T15:21:38Z', '2020-06-08T13:23:41Z', '2020-06-10T03:17:21Z', '2020-06-06T20:38:45Z', '2020-06-11T19:01:28Z', '2020-05-31T13:36:35Z', '2020-06-08T13:16:05Z', '2020-06-08T14: