# Scrape & Clean Guardian Newspaper Articles

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import re
import sys
from glob import glob
from random import choice, randint
from time import sleep
from urllib3.util.retry import Retry

import pandas as pd
import requests
from bs4 import BeautifulSoup
from contexttimer import Timer
from requests.adapters import HTTPAdapter

In [None]:
# The project root is addressed relative to the working directory (one level
# up). Its "src" folder is appended to the import path so that modules stored
# there can be imported by later cells.
PROJ_ROOT = os.pardir
src_dir = os.path.join(PROJ_ROOT, "src")
sys.path += [src_dir]

In [None]:
%aimport webscraping_utils
from webscraping_utils import get_custom_headers_list

## About

### Objective

Scrape and clean Guardian newspaper articles.

### Output

The output is a `DataFrame` containing the following columns:

1. `url_num`
   - (news article) url number
2. `url_name`
   - placeholder column, always `NULL`
3. `url`
   - web url of news article
4. `text`
   - raw text
5. `char_count`
   - approximate number of characters in raw text
6. `sentence_count_raw`
   - approximate number of sentences in raw text
7. `token_count`
   - approximate number of tokens in raw text (estimated as character count / 4)
8. `text_cleaned`
   - cleaned text of the news article
9. `type`
   - type of data source (`news_article`)

### Notes About Data Privacy

1. All news article texts were retrieved by web-scraping and storing the scraped texts in a `.parquet` file. All `.parquet` files are stored locally and will be deleted on November 30, 2024.
2. Raw or processed text outputs are not shown here.

## User Inputs

In [None]:
# data cleaning
# Boilerplate snippets that are stripped from the scraped article text during
# cleaning. Each is the text of a newsletter promotion exactly as it appears
# once the page's newlines have been removed (hence the missing spaces
# between what were separate page elements).
# Promotion for the "Afternoon Update" newsletter
aft_newsletter_advert = "skip past newsletter promotionSign up to Afternoon UpdateFree daily newsletterOur Australian afternoon update breaks down the key stories of the day, telling you what's happening and why it mattersEnter your email address Sign upPrivacy Notice: Newsletters may contain info about charities, online ads, and content funded by outside parties. For more information see our Privacy Policy. We use Google reCaptcha to protect our website and the Google Privacy Policy and Terms of Service apply.after newsletter promotion"
# Promotion for the "Down to Earth" newsletter
down_to_earth_newsletter_advert = "skip past newsletter promotionSign up to Down to EarthFree weekly newsletterThe planet's most important stories. Get all the week's environment news - the good, the bad and the essentialEnter your email address Sign upPrivacy Notice: Newsletters may contain info about charities, online ads, and content funded by outside parties. For more information see our Privacy Policy. We use Google reCaptcha to protect our website and the Google Privacy Policy and Terms of Service apply.after newsletter promotion"
# Promotion for the "Morning Mail" newsletter
morn_newsletter_advert = "skip past newsletter promotionSign up to Morning MailFree daily newsletterOur Australian morning briefing breaks down the key stories of the day, telling you what's happening and why it mattersEnter your email address Sign upPrivacy Notice: Newsletters may contain info about charities, online ads, and content funded by outside parties. For more information see our Privacy Policy. We use Google reCaptcha to protect our website and the Google Privacy Policy and Terms of Service apply.after newsletter promotion"
# Generic newsletter sign-up line; note that this literal contains a
# typographic apostrophe (’), unlike the three snippets above
newsletter_signup_advert = "Sign up for Guardian Australia’s free morning and afternoon email newsletters for your daily news roundup"

# Pages (of the stored URL listings) whose articles are scraped: 65 to 70
# inclusive
urls_pages = range(65, 71)
# Lower and upper bound, in seconds, of the random pause between two
# consecutive article requests
min_wait, max_wait = 9, 18

# Stem of the exported file name
fname_processed = "guardian_articles_cleaned"

# Columns, in order, of the exported DataFrame
output_columns = [
    "url_num", "url_name", "url",
    "text", "char_count", "sentence_count_raw", "token_count",
    "text_cleaned", "type",
]

In [None]:
# Folder layout: <project root>/data/{raw,processed}
data_dir = os.path.join(PROJ_ROOT, 'data')
raw_data_dir = os.path.join(data_dir, 'raw')
processed_data_dir = os.path.join(data_dir, 'processed')

# All CSV files holding article URLs, in lexicographic order
fpaths_urls = glob(
    os.path.join(raw_data_dir, 'guardian', 'urls', "urls_*.csv")
)
fpaths_urls.sort()

# Output file; its name records the first and last page that were scraped
fpath_processed = os.path.join(
    processed_data_dir,
    f"{fname_processed}_pgs_{min(urls_pages)}_{max(urls_pages)}.parquet",
)

# Request headers to pick from (at random) for each article request
headers_list = get_custom_headers_list()

In [None]:
def get_guardian_text_from_soup(soup):
    """Get Guardian text from soup object.

    Args:
        soup: Parsed article page (an object offering BeautifulSoup's
            ``find`` API).

    Returns:
        The text of the article-body container with newlines removed,
        tweet/embed/image figures stripped and everything from the first
        occurrence of "Topics" onwards dropped. An empty string is returned
        when the page has neither of the known article-body containers.
    """
    # Two page layouts are supported; try the first, then fall back
    body = soup.find("div", {"class": "article-body-commercial-selector"})
    if body is None:
        body = soup.find("div", {"class": "content__article-body"})
    if body is None:
        # Not a recognised article page: nothing to extract
        return ""

    # <figure> elements whose text must not end up in the article text
    unwanted_figure_classes = (
        "element element-tweet",
        "element element-embed",
        (
            "element element-image "
            "img--landscape fig--narrow-caption fig--has-shares"
        ),
    )
    for figure_class in unwanted_figure_classes:
        for unwanted in body.find_all("figure", {"class": figure_class}):
            # Detach the figure from the tree so .text below skips it
            unwanted.extract()

    all_text = str(body.text).replace("\n", "")
    # Keep only what precedes the first "Topics" marker
    return all_text.split("Topics")[0]

In [None]:
def numeric_sort(test_string):
    """Return the first run of digits in the string as an int (sort key).

    Raises IndexError when the string contains no digits.
    """
    digit_runs = re.findall(r'\d+', test_string)
    leading_run = digit_runs[0]
    return int(leading_run)

## Load Article URLs

Sort the list of URL files numerically by the first number found in each file path

In [None]:
# Re-order the URL files in place by the first integer found in each path
# (see numeric_sort) rather than lexicographically
fpaths_urls.sort(key=numeric_sort)

Load URLs from the required pages

In [None]:
# urls_guardian = [
#     'https://www.theguardian.com/environment/2024/nov/14/worlds-largest-known-coral-discovered-in-solomon-islands',
#     'https://www.theguardian.com/environment/article/2024/jun/26/most-of-it-was-dead-scientists-discovers-one-of-great-barrier-reefs-worst-coral-bleaching-events',
#     'https://www.theguardian.com/environment/article/2024/jul/30/as-record-heat-risks-bleaching-73-of-the-worlds-coral-reefs-scientists-ask-what-do-we-do-now',
#     'https://www.theguardian.com/environment/2024/nov/19/graveyard-of-dead-coral-great-barrier-reef-bleaching-damage',
#     'https://www.theguardian.com/environment/2024/apr/15/great-barrier-reef-coral-bleaching-global-heating',
#     'https://www.theguardian.com/environment/article/2024/sep/09/sharks-deserting-coral-reefs-climate-crisis-heating-oceans-study',
#     'https://www.theguardian.com/environment/2024/apr/17/great-barrier-reef-extreme-coral-bleaching',
#     'https://www.theguardian.com/environment/2024/feb/28/aerial-surveys-of-great-barrier-reef-ordered-after-flights-confirm-mass-coral-bleaching',
#     'https://www.theguardian.com/environment/2024/may/01/great-barrier-reef-coral-bleaching-crisis',
#     'https://www.theguardian.com/environment/2023/dec/07/unprecedented-mass-coral-bleaching-expected-2024-professor-ove-hoegh-guldberg'
# ]
# An article is considered relevant when its URL contains any of these
# keywords (case-sensitive substring match)
url_keywords = [
    'coral',
    'reef',
    'algae',
    'algal',
    'ocean',
    'marine',
    'zooxanthellae',
    'trophic',
    'symbiosis',
    'symbionts',
    'anthropogenic',
    'eutrophication',
]

# Fail with a clear message instead of pandas' "No objects to concatenate"
if not fpaths_urls:
    raise FileNotFoundError(
        "No 'urls_*.csv' files were found to load article URLs from"
    )

df_urls = pd.concat(
    [pd.read_csv(f, usecols=['page', 'webUrl']) for f in fpaths_urls]
)
# A single regex alternation replaces one str.contains call per keyword.
# Keywords are escaped so they are always matched literally, and na=False
# treats rows without a URL as non-matching.
is_relevant = df_urls['webUrl'].str.contains(
    "|".join(re.escape(keyword) for keyword in url_keywords), na=False
)
df_urls = df_urls[is_relevant]

# Relevant URLs restricted to the requested pages
urls_guardian = (
    df_urls
    .loc[df_urls['page'].isin(list(urls_pages)), 'webUrl']
    .tolist()
)
# Explicit check (an assert would be skipped when Python runs with -O)
if not urls_guardian:
    raise ValueError(
        "No relevant article URLs found on pages "
        f"{min(urls_pages)}-{max(urls_pages)}"
    )
print(
    f"Found {len(urls_guardian)} relevant article(s) out of {len(df_urls):,} "
    "total articles to be scraped on pages "
    f"{min(urls_pages)}-{max(urls_pages)}"
)

## Scrape Article Text

### Extract

In [None]:
%%time
l_texts = {}
for k, link in enumerate(urls_guardian, 1):
    print(f"Scraping article number {k}/{len(urls_guardian):,}, Link: {link}")
    # print(site, link)
    with Timer() as t:
        r_session = requests.Session()
        retries = Retry(
            total=2,
            backoff_factor=0.1,
            status_forcelist=[500, 502, 503, 504],
        )
        r_session.mount("http://", HTTPAdapter(max_retries=retries))
        try:
            headers = choice(headers_list)
            page_response = r_session.get(link, timeout=5, headers=headers)
        except Exception as ex:
            print(f"{ex} Error connecting to {link}")
        else:
            try:
                soup = BeautifulSoup(page_response.content, "lxml")
                # print(soup.prettify())
            except Exception as e:
                print(f"Experienced error {str(e)} when scraping {link}")
                text = np.nan
            else:
                text = get_guardian_text_from_soup(soup)
    num_chars = len(text) if text else 0
    print(
        f"Scraped {num_chars:,} characters of text from article {k} in "
        f"{t.elapsed:.2f} seconds"
    )
    l_texts[link] = [text]
    if k != len(urls_guardian):
        random_sleep_time = randint(min_wait, max_wait)
        print(
            f"Pausing for {random_sleep_time} seconds after retrieving "
            f"article {k} from pages {min(urls_pages)}-{max(urls_pages)}...",
            end="",
        )
        sleep(random_sleep_time)
        print("done.")
        print('\n')

Store article metadata and text in tabular format

In [None]:
# One row per scraped article: url, raw text and simple size statistics
df = (
    pd.DataFrame.from_dict(l_texts, orient="index")
    .reset_index()
    .rename(columns={"index": "url", 0: "text"})
    .assign(
        # 1-based running number of the article
        url_num=lambda d: pd.Series(range(1, len(d) + 1), index=d.index),
        url_name=None,
        char_count=lambda d: d['text'].str.len(),
        # Count literal ". " separators. A bare ". " passed to str.split is
        # interpreted as a regex in which "." matches any character, which
        # would count nearly every space as a sentence break.
        sentence_count_raw=lambda d: d['text'].str.count(r"\. ") + 1,
        # Rough estimate of 4 characters per token. The nullable integer
        # dtype keeps rows with missing text from raising during the cast.
        token_count=lambda d: (
            (d['text'].str.len() / 4).round().astype('Int64')
        ),
        type='news_article',
    )
    .convert_dtypes()
)
df

### Transform

Clean the article text (in the `text` column)

In [None]:
# (old, new) pairs that normalise typographic punctuation; applied in order
punctuation_replacements = [
    ("– video", " "),
    ("–", ""),
    ("“", '"'),
    ("”", '"'),
    ("‘", "'"),
    ("’", "'"),
    ("|", ""),
]


def _normalise_punctuation(snippet):
    """Apply the punctuation replacements to a plain Python string."""
    for old, new in punctuation_replacements:
        snippet = snippet.replace(old, new)
    return snippet


# Boilerplate removed after the punctuation has been normalised. Each snippet
# is normalised in the same way as the text, so a snippet that contains
# typographic punctuation (such as ’) still matches the normalised text.
boilerplate_replacements = [
    (_normalise_punctuation(old), new)
    for old, new in [
        (aft_newsletter_advert, " "),
        (down_to_earth_newsletter_advert, " "),
        (morn_newsletter_advert, " "),
        (newsletter_signup_advert, ""),
        ("QuickGuideWhat is coral bleaching?Show", " What is coral bleaching? "),
        ("Quick GuideWhat is coral bleaching?Show", " What is coral bleaching? "),
        ("Was this helpful?Thank you for your feedback.", " "),
        ("Read more", " "),
        ("©", ""),
    ]
]


def _clean_article_text(texts):
    """Return the Series of article texts with all replacements applied."""
    cleaned = texts
    for old, new in punctuation_replacements + boilerplate_replacements:
        # regex=False: every pattern is a literal string, so characters such
        # as "?", "." and "|" are matched literally regardless of the
        # installed pandas version's default
        cleaned = cleaned.str.replace(old, new, regex=False)
    return cleaned


df = (
    df
    .assign(text_cleaned=lambda d: _clean_article_text(d['text']))
    [output_columns]
)
df

### Load

Export to disk

In [None]:
%%time
df.to_parquet(fpath_processed, index=False)
print(
    f"Exported {len(df):,} row(s) of article text from pages "
    f"{min(urls_pages)}-{max(urls_pages)} to "
    f"{os.path.basename(fpath_processed)}"
)