# Digital image archiving, APIs, and webscraping

In [None]:
![gallery](gallery.jpeg)

In [None]:
# --- HTTP, filesystem and general-purpose utilities ---
import requests
import os
import re
import requests  # NOTE(review): duplicate of the import above; harmless but redundant
import pandas as pd
import getpass
from urllib.parse import urlparse
# --- Text processing (NLTK) ---
# NOTE(review): only the 'stopwords' resource is downloaded in this cell; the
# lemmatizer and tokenizer imported here presumably need their own NLTK
# resources (e.g. 'wordnet', 'punkt') -- confirm they are installed.
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk import word_tokenize
nltk.download('stopwords')  # fetch the stopword corpus read two lines below
from nltk.corpus import stopwords
stops = set(stopwords.words('english'))  # English stopwords, as a set
import string
punct = list(string.punctuation)  # ASCII punctuation characters, as a list
# --- Plotting ---
import seaborn as sns
sns.set()  # switch plots to seaborn's default theme

## How can we programmatically access images in a way that facilitates research?

An API (Application Programming Interface) is a set of rules and protocols that allows different software applications to communicate with each other. It defines how requests and responses should be structured, enabling developers to access and use the functionality of another service, library, or platform without needing to understand its internal workings.

APIs should be your go-to resource when gathering large quantities of data: they generally provide the data in a structured form, which makes it easy to manipulate.

Microsoft makes Bing image search available as an API; so do other search providers. The Bing API is useful because it gives good metadata on the images it finds. But first, let's look at a more intuitive API.

### The Project Gutenberg API

[Project Gutenberg](https://www.gutenberg.org/) provides electronic copies of a large variety of out-of-copyright texts. It can be queried through the [Gutendex API](https://gutendex.com/). The `requests` library in Python can be used to query this API via the relevant parameters (see the Gutendex documentation for the full list).

In [None]:
# Define the API root url:
# (base endpoint handed to `requests.get` when querying Gutendex for books)

gut = 'https://gutendex.com/books/'

In [None]:
# Query by topic (here, 'death')

params = {'topic': 'death'}
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url=gut, params=params, timeout=30)
# Fail loudly on an HTTP error instead of with a confusing JSON decode error.
response.raise_for_status()
death = response.json()  # returns the results as a python dictionary

In [None]:
# Show the first summary of the third book returned (index 2).
# Assumes the query returned at least three results and that this book has a
# non-empty 'summaries' list; otherwise this line raises IndexError/KeyError.
print(death['results'][2]['summaries'][0])

## Unsplash is a free high-quality image API. How can we access it?

In [None]:
# Unsplash access key. Credentials must not be hard-coded in a shared notebook:
# read the key from the UNSPLASH_ACCESS_KEY environment variable and, if it is
# not set, prompt for it without echoing the input to the screen.
# NOTE(review): any key that was previously committed in this cell should be
# rotated in the Unsplash developer dashboard.
api_key = os.environ.get("UNSPLASH_ACCESS_KEY") or getpass.getpass("Unsplash access key: ")

def search_unsplash(query, per_page=10, timeout=30):
    """Search Unsplash for photos matching *query*.

    The request is authenticated with the module-level ``api_key``.

    Parameters
    ----------
    query : str
        Search terms sent as the ``query`` parameter.
    per_page : int, default 10
        Number of results requested, sent as the ``per_page`` parameter.
    timeout : float, default 30
        Seconds to wait for the server before ``requests`` raises a timeout
        error (without it the call could block indefinitely).

    Returns
    -------
    pandas.DataFrame or None
        One row per photo with the columns ``description``, ``url``,
        ``download_url``, ``photographer`` and ``photographer_profile``.
        The columns are present even when the search returns no photos.
        ``None`` is returned (after printing the status code and response
        body) when the API does not answer with HTTP 200.
    """
    url = "https://api.unsplash.com/search/photos"
    headers = {"Authorization": f"Client-ID {api_key}"}
    params = {"query": query, "per_page": per_page}
    columns = [
        "description",
        "url",
        "download_url",
        "photographer",
        "photographer_profile",
    ]

    # Make the API request
    response = requests.get(url, headers=headers, params=params, timeout=timeout)

    # Guard clause: report the failure and bail out early
    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return None

    # Extract relevant image details
    images = [
        {
            # The API can send an explicit null description; a `.get()`
            # default only covers a *missing* key, so fall back with `or`.
            "description": result.get("description") or "No description",
            "url": result["urls"]["regular"],  # "regular" = medium-sized image
            "download_url": result["links"]["download"],  # URL to download the image
            "photographer": result["user"]["name"],
            "photographer_profile": result["user"]["links"]["html"],
        }
        for result in response.json()["results"]
    ]
    # Passing `columns` keeps the schema stable when `images` is empty.
    return pd.DataFrame(images, columns=columns)




def download_images(image_urls, save_dir="unsplash_images", timeout=30):
    """Download every URL in *image_urls* into *save_dir*.

    Files are named ``image_1.png``, ``image_2.png``, ... in the order the
    URLs are given. A failed download is reported with ``print`` and does not
    stop the remaining downloads.

    Parameters
    ----------
    image_urls : iterable of str
        URLs to fetch.
    save_dir : str, default "unsplash_images"
        Target directory; created (including parents) if it does not exist.
    timeout : float, default 30
        Seconds to wait on the server before the download is counted as failed.

    Returns
    -------
    None
    """
    # exist_ok avoids the check-then-create race of `exists()` + `makedirs()`
    os.makedirs(save_dir, exist_ok=True)

    for i, url in enumerate(image_urls):
        # NOTE(review): the name is purely positional and always ends in
        # ".png", whatever format the server actually returns.
        file_name = f"image_{i + 1}.png"
        file_path = os.path.join(save_dir, file_name)
        try:
            # The context manager releases the streamed connection even when
            # the download fails half-way through.
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()  # Raise an error for bad status codes

                # Write the body to disk chunk by chunk
                with open(file_path, "wb") as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)
            print(f"Downloaded: {file_path}")
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {url}: {e}")

### Now, let's measure the emotional variation of any text using the VAD (valence, arousal, dominance) norms

In [None]:
# VAD norms: load the table (first CSV column becomes the index), keep only the
# three mean-score columns and give them short, readable names.
vad = (
    pd.read_csv('vad.csv', index_col=0)[["V.Mean.Sum", "A.Mean.Sum", "D.Mean.Sum"]]
    .rename(columns={
        "V.Mean.Sum": "valence",
        "A.Mean.Sum": "arousal",
        "D.Mean.Sum": "dominance",
    })
)

def vad_data(word_list):
    """Return the mean VAD scores of the words in *word_list*.

    Every word is lower-cased and looked up in the index of the module-level
    ``vad`` table; words that are not in the index are ignored. A word that
    occurs several times in *word_list* counts that many times in the mean.

    Parameters
    ----------
    word_list : iterable of str
        Tokens to score.

    Returns
    -------
    pandas.Series
        Column-wise mean of the matching ``vad`` rows, indexed by the columns
        of ``vad``. Every entry is NaN when no word is found in the table.
    """
    known_words = vad.index
    lowered = (word.lower() for word in word_list)
    matched = [word for word in lowered if word in known_words]
    # One label-list lookup replaces a per-word `.loc` plus a DataFrame
    # rebuild; it also keeps the column labels when `matched` is empty.
    return vad.loc[matched].mean()