# Scrape Guardian Newspaper Articles

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import re
import sys
from glob import glob
from random import choice, randint
from time import sleep
from urllib3.util.retry import Retry

import pandas as pd
import requests
from bs4 import BeautifulSoup
from contexttimer import Timer
from requests.adapters import HTTPAdapter

In [None]:
# Relative path to the parent of the current working directory
PROJ_ROOT = os.pardir
# Make modules stored under <PROJ_ROOT>/src importable
src_dir = os.path.join(PROJ_ROOT, "src")
sys.path += [src_dir]

In [None]:
%aimport webscraping_utils
from webscraping_utils import get_custom_headers_list

## About

### Objective

Scrape Guardian newspaper articles from URLs that were exported in the previous notebook (`02_get_guardian_article_urls.ipynb`).

### Output

The output is a `DataFrame` containing the following columns

1. `url_num`
   - (news article) url number
2. `url_name`
   - `NULL`
3. `url`
   - web url of news article
4. `text`
   - raw text
5. `char_count`
   - approximate number of characters in raw text
6. `sentence_count_raw`
   - approximate number of sentences in raw text
7. `token_count`
   - approximate number of tokens in raw text
8. `type`
   - type of data source (`news_article`)

## User Inputs

In [5]:
# Terms to look for when selecting which articles to scrape
article_title_search_terms = [
    "coral", "reef", "algae", "algal",
    "ocean", "marine", "zooxanthellae", "trophic",
    "symbiosis", "symbionts", "anthropogenic", "eutrophication",
]

# Pages of previously exported URLs to scrape (upper bound is inclusive)
urls_pages = range(1, 5 + 1)
# Bounds (in seconds) of the random pause between consecutive requests
min_wait, max_wait = 9, 18

# Prefix of the name of the exported file
fname_processed = "guardian_articles"

# Names of the columns in the exported data
output_columns = [
    "url_num", "url_name", "url", "text",
    "char_count", "sentence_count_raw", "token_count", "type",
]

In [6]:
# Data folders
data_dir = os.path.join(PROJ_ROOT, 'data')
raw_data_dir, processed_data_dir = (
    os.path.join(data_dir, sub_dir) for sub_dir in ('raw', 'processed')
)

# Files of article URLs exported earlier, in lexicographic order
fpaths_urls = glob(
    os.path.join(raw_data_dir, 'guardian', 'urls', "urls_*.csv")
)
fpaths_urls.sort()

# Output file, named after the first and last requested page
fpath_processed = os.path.join(
    processed_data_dir,
    f"{fname_processed}_pgs_{min(urls_pages)}_{max(urls_pages)}.parquet"
)

# Define list of request headers to (randomly) choose from
headers_list = get_custom_headers_list()

# Build a single DataFrame.query() expression that OR-s together one
# `webUrl.str.contains(...)` condition per search term
article_title_search_terms_str = ' | '.join(
    f"(webUrl.str.contains('{t}'))" for t in article_title_search_terms
)

In [None]:
def numeric_sort(test_string):
    """Return the first run of digits found in a string as an integer.

    Meant to be used as a sort key, so that strings are ordered by the number
    they contain instead of lexicographically.
    """
    first_number = re.findall(r'\d+', test_string)[0]
    return int(first_number)


def get_guardian_text_from_soup(soup):
    """Get Guardian article text from a beautifulsoup4 soup object.

    The article body is looked up in the ``div`` with class
    ``article-body-commercial-selector`` and, if that is missing, in the
    ``div`` with class ``content__article-body``. Embedded tweet, embed and
    image ``figure`` elements are removed from the body before its text is
    read. Newlines are stripped from the text and everything from the first
    occurrence of ``"Topics"`` onwards is discarded.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of an article page.

    Returns
    -------
    str or None
        The article text, or ``None`` if neither article body ``div`` is
        present in the page.
    """
    mydiv = soup.find("div", {"class": "article-body-commercial-selector"})
    if not mydiv:
        mydiv = soup.find("div", {"class": "content__article-body"})
    if mydiv is None:
        # no article body found, so there is nothing to extract
        return None

    # class attribute of every type of <figure> to be dropped from the body
    unwanted_figure_classes = (
        "element element-tweet",
        "element element-embed",
        (
            "element element-image "
            "img--landscape fig--narrow-caption fig--has-shares"
        ),
    )
    for figure_class in unwanted_figure_classes:
        for unwanted in mydiv.find_all("figure", {"class": figure_class}):
            unwanted.extract()

    all_text = str(mydiv.text).replace("\n", "")
    # keep only the text that comes before the first "Topics" marker
    art_text = all_text.split("Topics")[0]
    return art_text

## Load Article URLs

Sort list of urls by page bounds

In [None]:
# Order the files by the first number found in each file path
fpaths_urls = sorted(fpaths_urls, key=numeric_sort)

Load URLs and keep only those from the required pages whose article title (as it appears in the URL) contains terms related to *coral bleaching*

In [None]:
# Read the page number and URL columns from every file of exported URLs and
# keep the rows whose URL matches at least one of the search terms
df_urls = pd.concat(
    [pd.read_csv(fpath, usecols=['page', 'webUrl']) for fpath in fpaths_urls]
).query(article_title_search_terms_str)

# Of those, only the URLs found on the requested pages are scraped
urls_guardian = df_urls.loc[
    df_urls['page'].isin(urls_pages), 'webUrl'
].tolist()
assert urls_guardian
print(
    f"Found {len(urls_guardian)} relevant article(s) out of {len(df_urls):,} "
    "total articles to be scraped on pages "
    f"{min(urls_pages)}-{max(urls_pages)}"
)

**Notes**

1. `.query(article_title_search_terms_str)` evaluates to
   ```python
   .query(
       "(webUrl.str.contains('coral')) | "
       "(webUrl.str.contains('reef')) | "
       "(webUrl.str.contains('algae')) | "
       "(webUrl.str.contains('algal')) | "
       "(webUrl.str.contains('ocean')) | "
       "(webUrl.str.contains('marine')) | "
       "(webUrl.str.contains('zooxanthellae')) | "
       "(webUrl.str.contains('trophic')) | "
       "(webUrl.str.contains('symbiosis')) | "
       "(webUrl.str.contains('symbionts')) | "
       "(webUrl.str.contains('anthropogenic')) | "
       "(webUrl.str.contains('eutrophication'))"
   )
   ```

## Scrape Article Text

### Extract

In [None]:
%%time
# Scraped text (or None when scraping failed), keyed by article URL
l_texts = {}
# retry policy for transient server errors, shared by every session
retries = Retry(
    total=2,
    backoff_factor=0.1,
    status_forcelist=[500, 502, 503, 504],
)
for k, link in enumerate(urls_guardian, 1):
    print(f"Scraping article number {k}/{len(urls_guardian):,}, Link: {link}")
    # reset for every article, so that a failed request or parse is recorded
    # as missing text instead of re-using the text of the previous article
    text = None
    with Timer() as t:
        # construct session (closed again when this block is exited)
        with requests.Session() as r_session:
            # apply the retry policy to both schemes, so that it is also used
            # for https:// links
            r_session.mount("http://", HTTPAdapter(max_retries=retries))
            r_session.mount("https://", HTTPAdapter(max_retries=retries))
            # scrape
            try:
                # construct headers dictionary
                headers = choice(headers_list)
                page_response = r_session.get(
                    link, timeout=5, headers=headers
                )
            except Exception as ex:
                print(f"{ex} Error connecting to {link}")
            else:
                try:
                    # extract entire soup from raw scraped text
                    soup = BeautifulSoup(page_response.content, "lxml")
                    # parse output text from relevant parts of soup; kept
                    # inside the try so that a page with an unexpected
                    # layout does not abort the whole run
                    text = get_guardian_text_from_soup(soup)
                except Exception as e:
                    print(f"Experienced error {str(e)} when scraping {link}")
    num_chars = len(text) if text else 0
    print(
        f"Scraped {num_chars:,} characters of text from article {k} in "
        f"{t.elapsed:.2f} seconds"
    )
    # append article URL and parsed text from that URL to dictionary
    l_texts[link] = [text]
    # if more articles are to be scraped then print a status message and wait
    if k != len(urls_guardian):
        random_sleep_time = randint(min_wait, max_wait)
        print(
            f"Pausing for {random_sleep_time} seconds after retrieving "
            f"article {k} from pages {min(urls_pages)}-{max(urls_pages)}...",
            end="",
        )
        sleep(random_sleep_time)
        print("done.")
        print('\n')

### Transform

Store article metadata and text in tabular format

In [None]:
df = (
    # construct DataFrame from scraped dictionary (one row per article URL)
    pd.DataFrame.from_dict(l_texts, orient="index")
    .reset_index()
    # rename columns
    .rename(columns={"index": "url", 0: "text"})
    # extract metadata
    .assign(
        url_num=lambda d: pd.Series(range(1, len(d)+1)),
        url_name=None,
        # rough count of text characters
        char_count=lambda d: d['text'].str.len(),
        # rough sentence count; regex=False is required since a pattern
        # longer than one character is otherwise treated as a regular
        # expression, in which "." matches any character
        sentence_count_raw=lambda d: (
            d['text'].str.split(". ", regex=False).str.len()
        ),
        # rough token count (about four characters per token); the nullable
        # integer type keeps articles without text as missing values
        token_count=lambda d: (
            (d['text'].str.len()/4).round().astype("Int64")
        ),
        type='news_article',
    )
    .convert_dtypes()
    # arrange columns in the order listed in the user inputs
    [output_columns]
)
df

### Load

Export to disk

In [None]:
%%time
# create the output folder if needed, so that the scraped text is not lost
# to a missing-folder error at export time
os.makedirs(os.path.dirname(fpath_processed), exist_ok=True)
df.to_parquet(fpath_processed, index=False)
print(
    f"Exported {len(df):,} row(s) of article text from pages "
    f"{min(urls_pages)}-{max(urls_pages)} to "
    f"{os.path.basename(fpath_processed)}"
)

## Conclusion

This notebook has retrieved the text of news articles in the Guardian's **Environment** section. Scraped articles contained at least one of a pre-determined set of terms related to *coral bleaching* events. Text was retrieved from batches of articles at once, where each batch corresponds to a single page of results returned by querying the publication's API `/content` endpoint.

In order to ensure articles discussed coral bleaching, the URL of each scraped article contained at least one of the following terms

1. coral
2. reef
3. algae
4. algal
5. ocean
6. marine
7. zooxanthellae
8. trophic
9. symbiosis
10. symbionts
11. anthropogenic
12. eutrophication

Articles that were returned by the API query but that did not contain at least one of these terms were not scraped.