# Scrape Guardian Newspaper Articles

In [1]:
import os
from urllib3.util.retry import Retry

import pandas as pd
import requests
from bs4 import BeautifulSoup
from contexttimer import Timer
from requests.adapters import HTTPAdapter

In [2]:
PROJ_ROOT = os.path.join(os.pardir)

## About

Scrape Guardian newspaper articles.

## User Inputs

In [3]:
urls_guardian = [
    'https://www.theguardian.com/environment/2024/nov/14/worlds-largest-known-coral-discovered-in-solomon-islands',
    'https://www.theguardian.com/environment/article/2024/jun/26/most-of-it-was-dead-scientists-discovers-one-of-great-barrier-reefs-worst-coral-bleaching-events',
    'https://www.theguardian.com/environment/article/2024/jul/30/as-record-heat-risks-bleaching-73-of-the-worlds-coral-reefs-scientists-ask-what-do-we-do-now',
    'https://www.theguardian.com/environment/2024/nov/19/graveyard-of-dead-coral-great-barrier-reef-bleaching-damage',
    'https://www.theguardian.com/environment/2024/apr/15/great-barrier-reef-coral-bleaching-global-heating',
    'https://www.theguardian.com/environment/article/2024/sep/09/sharks-deserting-coral-reefs-climate-crisis-heating-oceans-study',
    'https://www.theguardian.com/environment/2024/apr/17/great-barrier-reef-extreme-coral-bleaching',
    'https://www.theguardian.com/environment/2024/feb/28/aerial-surveys-of-great-barrier-reef-ordered-after-flights-confirm-mass-coral-bleaching',
    'https://www.theguardian.com/environment/2024/may/01/great-barrier-reef-coral-bleaching-crisis',
    'https://www.theguardian.com/environment/2023/dec/07/unprecedented-mass-coral-bleaching-expected-2024-professor-ove-hoegh-guldberg'
]

fname_processed = 'guardian_articles_cleaned.parquet'

In [4]:
data_dir = os.path.join(PROJ_ROOT, 'data')
raw_data_dir = os.path.join(data_dir, 'raw')
processed_data_dir = os.path.join(data_dir, 'processed')

fpath_processed = os.path.join(processed_data_dir, fname_processed)

In [5]:
def get_guardian_text_from_soup(soup):
    """Get Guardian text from soup object"""
    mydiv = soup.find("div", {"class": "article-body-commercial-selector"})
    # print(mydiv)
    if not mydiv:
        mydiv = soup.find("div", {"class": "content__article-body"})
    unwanted_tweets = mydiv.findAll(
        "figure", {"class": "element element-tweet"}
    )
    for unwanted in unwanted_tweets:
        unwanted.extract()
    unwanted_images = mydiv.findAll(
        "figure", {"class": "element element-embed"}
    )
    for unwanted in unwanted_images:
        unwanted.extract()
    unwanted_images2 = mydiv.findAll(
        "figure",
        {
            "class": (
                "element element-image "
                "img--landscape fig--narrow-caption fig--has-shares"
            )
        },
    )
    for unwanted in unwanted_images2:
        unwanted.extract()
    all_text = str(mydiv.text).replace("\n", "")
    art_text = all_text.split("Topics")[0]
    # print(art_text)
    return art_text

## Scrape Article Text

In [6]:
%%time
l_texts = {}
for k, link in enumerate(urls_guardian):
    print(f"Scraping article number {k+1}, Link: {link}")
    # print(site, link)
    with Timer() as t:
        r_session = requests.Session()
        retries = Retry(
            total=2,
            backoff_factor=0.1,
            status_forcelist=[500, 502, 503, 504],
        )
        r_session.mount("http://", HTTPAdapter(max_retries=retries))
        try:
            page_response = r_session.get(link, timeout=5)
        except Exception as ex:
            print(f"{ex} Error connecting to {link}")
        else:
            try:
                soup = BeautifulSoup(page_response.content, "lxml")
                # print(soup.prettify())
            except Exception as e:
                print(f"Experienced error {str(e)} when scraping {link}")
                text = np.nan
            else:
                text = get_guardian_text_from_soup(soup)
    print(f"Scraping time: {t.elapsed:.2f} seconds")
    l_texts[link] = [text]

Scraping article number 1, Link: https://www.theguardian.com/environment/2024/nov/14/worlds-largest-known-coral-discovered-in-solomon-islands
Scraping time: 0.44 seconds
Scraping article number 2, Link: https://www.theguardian.com/environment/article/2024/jun/26/most-of-it-was-dead-scientists-discovers-one-of-great-barrier-reefs-worst-coral-bleaching-events
Scraping time: 0.32 seconds
Scraping article number 3, Link: https://www.theguardian.com/environment/article/2024/jul/30/as-record-heat-risks-bleaching-73-of-the-worlds-coral-reefs-scientists-ask-what-do-we-do-now
Scraping time: 0.50 seconds
Scraping article number 4, Link: https://www.theguardian.com/environment/2024/nov/19/graveyard-of-dead-coral-great-barrier-reef-bleaching-damage
Scraping time: 0.46 seconds
Scraping article number 5, Link: https://www.theguardian.com/environment/2024/apr/15/great-barrier-reef-coral-bleaching-global-heating
Scraping time: 0.44 seconds
Scraping article number 6, Link: https://www.theguardian.com/e

In [7]:
df = (
    pd.DataFrame.from_dict(l_texts, orient="index")
    .reset_index()
    .rename(columns={"index": "url", 0: "text"})
    .assign(
        char_count=lambda df: df['text'].str.len(),
        sentence_count_raw=lambda df: df['text'].str.split(". ").str.len(),
        token_count=lambda df: (df['text'].str.len()/4).round().astype(int),
    )
    .convert_dtypes()
)
display(df)

Unnamed: 0,url,text,char_count,sentence_count_raw,token_count
0,https://www.theguardian.com/environment/2024/n...,"The world’s largest known coral, visible from ...",3087,481,772
1,https://www.theguardian.com/environment/articl...,At least 97% of corals on a reef in the Great ...,7025,1115,1756
2,https://www.theguardian.com/environment/articl...,After 18 months of record-breaking ocean tempe...,9264,1440,2316
3,https://www.theguardian.com/environment/2024/n...,Reefs across the north of the Great Barrier Re...,4147,685,1037
4,https://www.theguardian.com/environment/2024/a...,Global heating has pushed the world’s coral re...,6656,1042,1664
5,https://www.theguardian.com/environment/articl...,Sharks are deserting their coral reef homes as...,4531,717,1133
6,https://www.theguardian.com/environment/2024/a...,The Great Barrier Reef is in the midst of what...,6543,1041,1636
7,https://www.theguardian.com/environment/2024/f...,The Great Barrier Reef’s management authority ...,4484,696,1121
8,https://www.theguardian.com/environment/2024/m...,Beneath the turquoise waters off Heron Island ...,9597,1561,2399
9,https://www.theguardian.com/environment/2023/d...,"Record-breaking land and sea temperatures, dri...",4749,766,1187


## Export to Disk

In [8]:
%%time
df.to_parquet(fpath_processed, index=False)

CPU times: user 6.68 ms, sys: 1.99 ms, total: 8.67 ms
Wall time: 8.31 ms
