# Scrape & Clean Guardian Newspaper Articles

In [3]:
import os
from urllib3.util.retry import Retry

import pandas as pd
import requests
from bs4 import BeautifulSoup
from contexttimer import Timer
from requests.adapters import HTTPAdapter

In [4]:
PROJ_ROOT = os.path.join(os.pardir)

## About

### Objective

Scrape Guardian newspaper articles.

### Output

The output is a `DataFrame` containing the following columns

1. `url_num`
   - (news article) url number
2. `url_name`
   - `NULL`
3. `url`
   - web url of news article
4. `text`
   - raw text
5. `char_count`
   - approximate number of characters in raw text
6. `sentence_count_raw`
   - approximate number of sentences in raw text
7. `token_count`
   - approximate number of tokens in raw text
8. `text_cleaned`
   - cleaned text from paper
9. `type`
   - type of data source (`news_article`)

### Notes About Data Privacy

1. All news article texts were retrieved by web-scraping and storing the scraped texts in a `.parquet` file. All `.parquet` files are stored locally and will be deleted on November 30, 2024.
2. Raw or processed text outputs are not shown here.

## User Inputs

In [17]:
urls_guardian = [
    'https://www.theguardian.com/environment/2024/nov/14/worlds-largest-known-coral-discovered-in-solomon-islands',
    'https://www.theguardian.com/environment/article/2024/jun/26/most-of-it-was-dead-scientists-discovers-one-of-great-barrier-reefs-worst-coral-bleaching-events',
    'https://www.theguardian.com/environment/article/2024/jul/30/as-record-heat-risks-bleaching-73-of-the-worlds-coral-reefs-scientists-ask-what-do-we-do-now',
    'https://www.theguardian.com/environment/2024/nov/19/graveyard-of-dead-coral-great-barrier-reef-bleaching-damage',
    'https://www.theguardian.com/environment/2024/apr/15/great-barrier-reef-coral-bleaching-global-heating',
    'https://www.theguardian.com/environment/article/2024/sep/09/sharks-deserting-coral-reefs-climate-crisis-heating-oceans-study',
    'https://www.theguardian.com/environment/2024/apr/17/great-barrier-reef-extreme-coral-bleaching',
    'https://www.theguardian.com/environment/2024/feb/28/aerial-surveys-of-great-barrier-reef-ordered-after-flights-confirm-mass-coral-bleaching',
    'https://www.theguardian.com/environment/2024/may/01/great-barrier-reef-coral-bleaching-crisis',
    'https://www.theguardian.com/environment/2023/dec/07/unprecedented-mass-coral-bleaching-expected-2024-professor-ove-hoegh-guldberg'
]

# data cleaning
aft_newsletter_advert = "skip past newsletter promotionSign up to Afternoon UpdateFree daily newsletterOur Australian afternoon update breaks down the key stories of the day, telling you what's happening and why it mattersEnter your email address Sign upPrivacy Notice: Newsletters may contain info about charities, online ads, and content funded by outside parties. For more information see our Privacy Policy. We use Google reCaptcha to protect our website and the Google Privacy Policy and Terms of Service apply.after newsletter promotion"
down_to_earth_newsletter_advert = "skip past newsletter promotionSign up to Down to EarthFree weekly newsletterThe planet's most important stories. Get all the week's environment news - the good, the bad and the essentialEnter your email address Sign upPrivacy Notice: Newsletters may contain info about charities, online ads, and content funded by outside parties. For more information see our Privacy Policy. We use Google reCaptcha to protect our website and the Google Privacy Policy and Terms of Service apply.after newsletter promotion"
morn_newsletter_advert = "skip past newsletter promotionSign up to Morning MailFree daily newsletterOur Australian morning briefing breaks down the key stories of the day, telling you what's happening and why it mattersEnter your email address Sign upPrivacy Notice: Newsletters may contain info about charities, online ads, and content funded by outside parties. For more information see our Privacy Policy. We use Google reCaptcha to protect our website and the Google Privacy Policy and Terms of Service apply.after newsletter promotion"
newsletter_signup_advert = "Sign up for Guardian Australia’s free morning and afternoon email newsletters for your daily news roundup"

fname_processed = 'guardian_articles_cleaned.parquet'

output_columns = [
    'url_num',
    'url_name',
    'url',
    'text',
    'char_count',
    'sentence_count_raw',
    'token_count',
    'text_cleaned',
    'type',
]

In [6]:
data_dir = os.path.join(PROJ_ROOT, 'data')
raw_data_dir = os.path.join(data_dir, 'raw')
processed_data_dir = os.path.join(data_dir, 'processed')

fpath_processed = os.path.join(processed_data_dir, fname_processed)

In [7]:
def get_guardian_text_from_soup(soup):
    """Get Guardian text from soup object"""
    mydiv = soup.find("div", {"class": "article-body-commercial-selector"})
    # print(mydiv)
    if not mydiv:
        mydiv = soup.find("div", {"class": "content__article-body"})
    unwanted_tweets = mydiv.findAll(
        "figure", {"class": "element element-tweet"}
    )
    for unwanted in unwanted_tweets:
        unwanted.extract()
    unwanted_images = mydiv.findAll(
        "figure", {"class": "element element-embed"}
    )
    for unwanted in unwanted_images:
        unwanted.extract()
    unwanted_images2 = mydiv.findAll(
        "figure",
        {
            "class": (
                "element element-image "
                "img--landscape fig--narrow-caption fig--has-shares"
            )
        },
    )
    for unwanted in unwanted_images2:
        unwanted.extract()
    all_text = str(mydiv.text).replace("\n", "")
    art_text = all_text.split("Topics")[0]
    # print(art_text)
    return art_text

## Scrape Article Text

### Extract

In [8]:
%%time
l_texts = {}
for k, link in enumerate(urls_guardian, 1):
    print(f"Scraping article number {k}, Link: {link}")
    # print(site, link)
    with Timer() as t:
        r_session = requests.Session()
        retries = Retry(
            total=2,
            backoff_factor=0.1,
            status_forcelist=[500, 502, 503, 504],
        )
        r_session.mount("http://", HTTPAdapter(max_retries=retries))
        try:
            page_response = r_session.get(link, timeout=5)
        except Exception as ex:
            print(f"{ex} Error connecting to {link}")
        else:
            try:
                soup = BeautifulSoup(page_response.content, "lxml")
                # print(soup.prettify())
            except Exception as e:
                print(f"Experienced error {str(e)} when scraping {link}")
                text = np.nan
            else:
                text = get_guardian_text_from_soup(soup)
    print(f"Scraping time: {t.elapsed:.2f} seconds")
    l_texts[link] = [text]
    if k != len(urls_guardian):
        print('\n')

Scraping article number 1, Link: https://www.theguardian.com/environment/2024/nov/14/worlds-largest-known-coral-discovered-in-solomon-islands
Scraping time: 0.50 seconds


Scraping article number 2, Link: https://www.theguardian.com/environment/article/2024/jun/26/most-of-it-was-dead-scientists-discovers-one-of-great-barrier-reefs-worst-coral-bleaching-events
Scraping time: 0.29 seconds


Scraping article number 3, Link: https://www.theguardian.com/environment/article/2024/jul/30/as-record-heat-risks-bleaching-73-of-the-worlds-coral-reefs-scientists-ask-what-do-we-do-now
Scraping time: 0.32 seconds


Scraping article number 4, Link: https://www.theguardian.com/environment/2024/nov/19/graveyard-of-dead-coral-great-barrier-reef-bleaching-damage
Scraping time: 0.46 seconds


Scraping article number 5, Link: https://www.theguardian.com/environment/2024/apr/15/great-barrier-reef-coral-bleaching-global-heating
Scraping time: 0.31 seconds


Scraping article number 6, Link: https://www.theguar

Store article metadata and text in tabular format

In [18]:
df = (
    pd.DataFrame.from_dict(l_texts, orient="index")
    .reset_index()
    .rename(columns={"index": "url", 0: "text"})
    .assign(
        url_num=lambda df: pd.Series(range(1, len(df)+1)),
        url_name=None,
        char_count=lambda df: df['text'].str.len(),
        sentence_count_raw=lambda df: df['text'].str.split(". ").str.len(),
        token_count=lambda df: (df['text'].str.len()/4).round().astype(int),
        type='news_article',
    )
    .convert_dtypes()
)
df

Unnamed: 0,url,text,url_num,url_name,char_count,sentence_count_raw,token_count,type
0,https://www.theguardian.com/environment/2024/n...,"The world’s largest known coral, visible from ...",1,,3087,481,772,news_article
1,https://www.theguardian.com/environment/articl...,At least 97% of corals on a reef in the Great ...,2,,7025,1115,1756,news_article
2,https://www.theguardian.com/environment/articl...,After 18 months of record-breaking ocean tempe...,3,,9264,1440,2316,news_article
3,https://www.theguardian.com/environment/2024/n...,Reefs across the north of the Great Barrier Re...,4,,4147,685,1037,news_article
4,https://www.theguardian.com/environment/2024/a...,Global heating has pushed the world’s coral re...,5,,6656,1042,1664,news_article
5,https://www.theguardian.com/environment/articl...,Sharks are deserting their coral reef homes as...,6,,4531,717,1133,news_article
6,https://www.theguardian.com/environment/2024/a...,The Great Barrier Reef is in the midst of what...,7,,6543,1041,1636,news_article
7,https://www.theguardian.com/environment/2024/f...,The Great Barrier Reef’s management authority ...,8,,4484,696,1121,news_article
8,https://www.theguardian.com/environment/2024/m...,Beneath the turquoise waters off Heron Island ...,9,,9597,1561,2399,news_article
9,https://www.theguardian.com/environment/2023/d...,"Record-breaking land and sea temperatures, dri...",10,,4749,766,1187,news_article


### Transform

Clean the article text (in the `text` column)

In [19]:
df = (
    df
    .assign(
        text_cleaned=lambda df: (
            df['text']
            .str.replace("– video", " ")
            .str.replace('–', '')
            .str.replace('“', '\"')
            .str.replace('”', '\"')
            .str.replace("‘", "'")
            .str.replace('’', "'")
            .str.replace('|', '')
            .str.replace(aft_newsletter_advert, " ")
            .str.replace(down_to_earth_newsletter_advert, " ")
            .str.replace(morn_newsletter_advert=, " ")
            .str.replace(newsletter_signup_advert, "")
            .str.replace("QuickGuideWhat is coral bleaching?Show", " What is coral bleaching? ")
            .str.replace("Quick GuideWhat is coral bleaching?Show", " What is coral bleaching? ")
            .str.replace("Was this helpful?Thank you for your feedback.", " ")
            .str.replace("Read more", " ")
            .str.replace("©", "")
        )
        
    )
    [output_columns]
)
df

Unnamed: 0,url_num,url_name,url,text,char_count,sentence_count_raw,token_count,text_cleaned,type
0,1,,https://www.theguardian.com/environment/2024/n...,"The world’s largest known coral, visible from ...",3087,481,772,"The world's largest known coral, visible from ...",news_article
1,2,,https://www.theguardian.com/environment/articl...,At least 97% of corals on a reef in the Great ...,7025,1115,1756,At least 97% of corals on a reef in the Great ...,news_article
2,3,,https://www.theguardian.com/environment/articl...,After 18 months of record-breaking ocean tempe...,9264,1440,2316,After 18 months of record-breaking ocean tempe...,news_article
3,4,,https://www.theguardian.com/environment/2024/n...,Reefs across the north of the Great Barrier Re...,4147,685,1037,Reefs across the north of the Great Barrier Re...,news_article
4,5,,https://www.theguardian.com/environment/2024/a...,Global heating has pushed the world’s coral re...,6656,1042,1664,Global heating has pushed the world's coral re...,news_article
5,6,,https://www.theguardian.com/environment/articl...,Sharks are deserting their coral reef homes as...,4531,717,1133,Sharks are deserting their coral reef homes as...,news_article
6,7,,https://www.theguardian.com/environment/2024/a...,The Great Barrier Reef is in the midst of what...,6543,1041,1636,The Great Barrier Reef is in the midst of what...,news_article
7,8,,https://www.theguardian.com/environment/2024/f...,The Great Barrier Reef’s management authority ...,4484,696,1121,The Great Barrier Reef's management authority ...,news_article
8,9,,https://www.theguardian.com/environment/2024/m...,Beneath the turquoise waters off Heron Island ...,9597,1561,2399,Beneath the turquoise waters off Heron Island ...,news_article
9,10,,https://www.theguardian.com/environment/2023/d...,"Record-breaking land and sea temperatures, dri...",4749,766,1187,"Record-breaking land and sea temperatures, dri...",news_article


### Load

Export to disk

In [20]:
%%time
df.to_parquet(fpath_processed, index=False)

CPU times: user 15.5 ms, sys: 2.08 ms, total: 17.6 ms
Wall time: 16.2 ms
