# Scrape Guardian Article URLs

In [None]:
import os
from time import sleep
from random import randint

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv, find_dotenv

In [None]:
# Read environment variables (such as the API key used below) from the `.env`
# file located two directory levels above the current working directory
_dotenv_path = os.path.join(os.pardir, os.pardir, ".env")
load_dotenv(dotenv_path=_dotenv_path)

In [None]:
# Project root, expressed relative to the current working directory: it is
# the parent directory ("..")
PROJ_ROOT = os.pardir

## About

Retrieve URLs for Guardian newspaper articles on coral bleaching events published between January 1, 2019 and October 31, 2024, inclusive (a span of nearly six years), by querying the [`/search` endpoint of the Guardian API](https://open-platform.theguardian.com/documentation/search).

## User Inputs

In [None]:
# API endpoint that is queried for search results
url = "https://content.guardianapis.com/search"

# Pagination: first page to retrieve and number of pages in the batch
# (a value of -1 for the number of pages selects the "all available" mode)
guardian_start_page_num, guardian_num_pages_wanted = 1, 10

# Lower and upper bounds (in seconds) of the random pause between page requests
guardian_query_min_delay, guardian_query_max_delay = 8, 14

# API Query inputs, keyed by news source
query_params = {}
query_params["guardian"] = {
    "section": "environment",
    "from-date": "2019-01-01",
    "to-date": "2024-10-31",
    "order-by": "oldest",
    "page-size": 10,
    "q": "coral bleaching event",
    # API key is read from the environment (None if the variable is not set)
    "api-key": os.environ.get("GUARDIAN_API_KEY"),
}

In [None]:
# Data directories, relative to the project root:
#   <PROJ_ROOT>/data                    - top-level data directory
#   <PROJ_ROOT>/data/raw/guardian/urls  - destination for the exported URL files
data_dir = os.path.join(PROJ_ROOT, "data")
raw_data_dir = os.path.join(data_dir, *("raw", "guardian", "urls"))

## Retrieve Guardian Newspaper Metadata from API

Perform the following workflow:

1. step 1. Calculate the maximum number of pages of results available for the search term
2. step 2. Set the maximum page number to be queried

Loop over each page to be queried and retrieve article details for all articles on that page:
1. step 3. Send a GET request to the API endpoint and retrieve the response
2. step 4. Get the results from the `response` key of the JSON-decoded response
3. step 5. Extract the metadata attributes of each result and store them in a dictionary
4. step 6. Convert the dictionary of metadata to a `DataFrame`
5. step 7. Add the page number as a column to the `DataFrame` from step 6
6. step 8. Concatenate the `DataFrame`s across all pages of search results
7. step 9. Filter the `DataFrame` to retain published news articles and remove blog posts
8. step 10. Export the `DataFrame` of metadata to a `*.csv` file

In [None]:
%%time
# Guardian urls to file
#
# Query the Guardian search endpoint one page at a time, collect the metadata
# of every article returned into a single DataFrame, keep only non-blog
# articles and export the result to a CSV file in `raw_data_dir`.
#
# Reads:  url, query_params, guardian_start_page_num,
#         guardian_num_pages_wanted, guardian_query_min_delay,
#         guardian_query_max_delay, raw_data_dir
# Writes: dfs_guardian_details, df_guardian_details, fname_urls, fpath_urls
# Raises: RuntimeError if the API answers with a non-200 status or with a
#         payload that has no results

# Seconds to wait for the API before giving up, so that a stalled connection
# cannot hang the cell indefinitely
guardian_request_timeout = 30
# Error message returned by the API when the page number is too large
guardian_beyond_last_page_msg = (
    "requested page is beyond the number of available pages"
)
# Attributes (metadata) extracted for every article; attributes missing from
# an article are stored as None
guardian_metadata_keys = [
    "webUrl",
    "id",
    "webPublicationDate",
    "apiUrl",
    "webTitle",
    "document_type",
    "sectionId",
    "sectionName",
    "type",
    "isHosted",
    "pillarId",
    "pillarName",
]
# Work on a copy of the query parameters so that the shared `query_params`
# dict is never mutated: a left-over "page" entry would otherwise be sent with
# the page-count request below when this cell is re-run
guardian_base_params = {
    k: v for k, v in query_params["guardian"].items() if k != "page"
}

dfs_guardian_details = []
# 1. Find maximum number of pages of results available
r_first = requests.get(
    url, params=guardian_base_params, timeout=guardian_request_timeout
)
if r_first.status_code != 200:
    # The request URL is deliberately left out of the message: it contains
    # the API key
    raise RuntimeError(
        "Guardian API page-count request failed with HTTP status "
        f"{r_first.status_code}"
    )
guardian_max_pages_returned = r_first.json()["response"]["pages"]
print(f"Found {guardian_max_pages_returned} pages of results")
# 2. Set the maximum page number to be queried
# `guardian_max_page_num` is the exclusive upper bound passed to `range`, so
# the last available page corresponds to a bound of one more than the number
# of pages available
if guardian_num_pages_wanted == -1:
    guardian_max_page_num = guardian_max_pages_returned + 1
    guardian_pages_to_use = "all available"
else:
    # Never ask for pages beyond the last available one
    guardian_max_page_num = min(
        guardian_start_page_num + guardian_num_pages_wanted,
        guardian_max_pages_returned + 1,
    )
    guardian_pages_to_use = "requested"
print(
    f"Retrieving articles from {guardian_pages_to_use} pages, "
    f"number of requested pages = {guardian_num_pages_wanted}\n"
)
# Loop over all pages to be queried and retrieve article details
for page in range(guardian_start_page_num, guardian_max_page_num):
    # 3. Send GET request to API and retrieve response
    r = requests.get(
        url,
        params={**guardian_base_params, "page": page},
        timeout=guardian_request_timeout,
    )
    # 4. Get results from response attribute of jsonified response
    try:
        r_body = r.json().get("response", {})
    except ValueError:
        # Body is not valid JSON; handled by the status/payload checks below
        r_body = {}
    # Check for the "beyond last page" error before checking the status code,
    # so that this case stops the loop cleanly whatever status accompanies it
    if r_body.get("message") == guardian_beyond_last_page_msg:
        print(f"Page {page} exceeded number of available pages. Stopping.")
        break
    if r.status_code != 200:
        raise RuntimeError(
            f"Guardian API request for page {page} failed with HTTP status "
            f"{r.status_code}"
        )
    if "results" not in r_body:
        raise RuntimeError(
            f"Guardian API response for page {page} contains no results: "
            f"{r.text[:200]}"
        )
    rdocs = r_body["results"]
    print(f"Page: {page}, Found: {len(rdocs)} articles")
    # 5. Extract various attributes (metadata) of response json and store in dict
    d = {key: [rr.get(key) for rr in rdocs] for key in guardian_metadata_keys}
    print(f"Retrieved {len(rdocs)} article details from page {page}")
    # 6. Convert dict of urls to DataFrame of urls
    df_guardian_article = pd.DataFrame(d, columns=guardian_metadata_keys)
    # 7. Append page number of DataFrame
    df_guardian_article["page"] = page
    dfs_guardian_details.append(df_guardian_article)
    # Pause between pages (no pause is needed after the last page queried)
    if page != guardian_max_page_num - 1:
        random_sleep_time = randint(
            guardian_query_min_delay, guardian_query_max_delay
        )
        print(
            f"Pausing for {random_sleep_time} seconds before retrieving "
            f"from page {page+1}\n"
        )
        sleep(random_sleep_time)
# 8. Concatenate DataFrames across all pages
if dfs_guardian_details:
    df_guardian_details = pd.concat(
        dfs_guardian_details, axis=0, ignore_index=True
    ).drop_duplicates()
else:
    # pd.concat raises a ValueError on an empty list, so fall back to an
    # empty DataFrame with the expected columns when no page was retrieved
    df_guardian_details = pd.DataFrame(
        columns=[*guardian_metadata_keys, "page"]
    )
# 9. Filter DataFrame to retain articles and remove blogs
# (na=False treats a missing URL as "not a blog" instead of breaking the mask)
df_guardian_details = df_guardian_details.loc[
    (df_guardian_details["type"] == "article")
    & (~df_guardian_details["webUrl"].str.contains("blog", na=False))
]
print(f"\nGot {df_guardian_details.shape[0]:,} articles after filtering")
with pd.option_context('display.max_colwidth', None):
    display(df_guardian_details)
# 10. Export DataFrame of metadata to *.csv file
fname_urls = (
    f"urls_pgs_{guardian_start_page_num}_{guardian_max_page_num-1}.csv"
)
fpath_urls = os.path.join(raw_data_dir, fname_urls)
# Create the output directory on first use so that the export cannot fail on
# a fresh checkout
os.makedirs(raw_data_dir, exist_ok=True)
df_guardian_details.to_csv(fpath_urls, index=False)
print(f"Exported {len(df_guardian_details):,} rows to {fname_urls}")

## Conclusion

This notebook has retrieved URLs for news articles matching the search term *coral bleaching event* in the Guardian's **Environment** section, published between January 1, 2019 and October 31, 2024 (a span of nearly six years).

These search results were returned as a single batch of pages from the Guardian API, in response to the search term *coral bleaching event*.

The batch size is specified by `guardian_num_pages_wanted`. The batch starts at the page number specified by `guardian_start_page_num` and ends at page number `guardian_start_page_num + guardian_num_pages_wanted - 1`.