# Clean Text of Guardian Newspaper Articles

In [None]:
import os
import re
import sys
from glob import glob

import pandas as pd
from contexttimer import Timer

In [None]:
# Project root is the parent of the directory this notebook runs from.
PROJ_ROOT = os.pardir

## About

### Objective

Clean text of all scraped newspaper articles, combine with article metadata (extracted from API results) and produce a single file with all cleaned texts and associated metadata.

### Output

The output is a `DataFrame` containing the same columns as in the previous notebook, plus a cleaned-text column and the article metadata columns marked with a footnote below.

1. `url_num`
   - (news article) url number
2. `url_name`
   - `NULL`
3. `url`
   - web url of news article
4. `text`
   - raw text
5. `char_count`
   - approximate number of characters in raw text
6. `sentence_count_raw`
   - approximate number of sentences in raw text
7. `token_count`
   - approximate number of tokens in raw text
8. `text_cleaned`<sup>[1](#myfootnote1)</sup>
   - cleaned (and lower-cased) text of the news article
9. `type`
   - type of data source (`news_article`)
10. `webPublicationDate`<sup>[1](#myfootnote1)</sup>
    - date on which news article was published
11. `year`<sup>[1](#myfootnote1)</sup>
    - year in which news article was published
12. `webTitle`<sup>[1](#myfootnote1)</sup>
    - title of published news article
13. `page`<sup>[1](#myfootnote1)</sup>
    - page of API results in which the article URL was listed

<a name="myfootnote1">1</a>: added in this notebook

## User Inputs

In [None]:
# Terms used to select relevant articles from the API listings.
article_title_search_terms = [
    'coral', 'reef', 'algae', 'algal', 'ocean', 'marine',
    'zooxanthellae', 'trophic', 'symbiosis', 'symbionts',
    'anthropogenic', 'eutrophication',
]

# ---- Banner / boilerplate text to strip from the scraped articles ----
latest_news = (
    'Quick GuideHow to get the latest news from Guardian AustraliaShow '
    'Emailsign up for ourdaily morning and afternoon email newsletters'
    'Appdownload our free appand never miss the biggest stories'
    'Socialfollow us on YouTube,TikTok,Instagram,FacebookorTwitter'
    'Podcastlisten to our daily episodes onApple Podcasts,Spotifyor search '
    '"Full Story" in your favourite app'
    'Photograph Tim Robberts/Stone RF'
)
more_features = (
    'Find more age of extinction coverage here, and follow biodiversity '
    'reporters Phoebe Weston and Patrick Greenfield on Twitter for all the '
    'latest news and features'
)
twitter_url1 = 'https//t.co/0mTKbg4pYr pic.twitter.com/tV152DUK4h'
twitter_url2 = (
    'pic.twitter.com7eZb2ZXFT0— Tanya Plibersek (tanya_plibersek) July 1, 2022'
)

# The three newsletter promotions share the same opening and closing text;
# only the newsletter-specific middle part differs.
_promo_head = "skip past newsletter promotionSign up to "
_promo_tail = "Enter your email address Sign upPrivacy Notice: Newsletters may contain info about charities, online ads, and content funded by outside parties. For more information see our Privacy Policy. We use Google reCaptcha to protect our website and the Google Privacy Policy and Terms of Service apply.after newsletter promotion"

aft_newsletter_advert = (
    _promo_head
    + "Afternoon UpdateFree daily newsletterOur Australian afternoon update breaks down the key stories of the day, telling you what's happening and why it matters"
    + _promo_tail
)
down_to_earth_newsletter_advert = (
    _promo_head
    + "Down to EarthFree weekly newsletterThe planet's most important stories. Get all the week's environment news - the good, the bad and the essential"
    + _promo_tail
)
morn_newsletter_advert = (
    _promo_head
    + "Morning MailFree daily newsletterOur Australian morning briefing breaks down the key stories of the day, telling you what's happening and why it matters"
    + _promo_tail
)
newsletter_signup_advert = (
    "Sign up for Guardian Australia’s free morning and afternoon email "
    "newsletters for your daily news roundup"
)

# Positional indices of the articles available for a spot-check after cleaning
# (one of them is selected later through an index into this list).
rows = [
    5, 18, 44, 151, 200, 229,
    100, 125, 175, 250, 260,
    40, 50, 70, 130, 190, 240,
]

# Columns read from the scraped article files.
output_columns = [
    'url_num', 'url_name', 'url', 'text',
    'char_count', 'sentence_count_raw', 'token_count', 'type',
]

In [None]:
def numeric_sort(test_string):
    """Return the page number embedded in a file path, for use as a sort key.

    The number that follows the ``pg`` marker in the file name (for example
    ``guardian_articles_pg12.parquet`` -> ``12``) is preferred, so digits that
    appear in a directory name or earlier in the file name cannot hijack the
    ordering. When the file name has no ``pg`` marker, the first run of digits
    found anywhere in ``test_string`` is used instead.

    Parameters
    ----------
    test_string : str
        File path (or any string) that contains a number.

    Returns
    -------
    int
        Numeric sort key extracted from ``test_string``.

    Raises
    ------
    IndexError
        If ``test_string`` contains no digits at all.
    """
    # look for the page marker in the file name only, not in parent folders
    page_marker = re.search(r'pg[_-]?(\d+)', os.path.basename(test_string))
    if page_marker:
        return int(page_marker.group(1))

    # no page marker: fall back to the first number anywhere in the string
    first_number = re.search(r'\d+', test_string)
    if first_number is None:
        raise IndexError(f"no numeric part found in {test_string!r}")
    return int(first_number.group(0))

In [None]:
# directory layout, relative to the project root
data_dir = os.path.join(PROJ_ROOT, 'data')
raw_data_dir, processed_data_dir = (
    os.path.join(data_dir, subdir) for subdir in ('raw', 'processed')
)

# scraped article files (one per page), ordered by the number returned by
# numeric_sort for each path
fpaths_processed = sorted(
    glob(os.path.join(processed_data_dir, "guardian_articles*pg*.parquet")),
    key=numeric_sort,
)

# files holding the URLs retrieved from the API results, in alphabetical order
fpaths_urls = glob(
    os.path.join(raw_data_dir, 'guardian', 'urls', "urls_*.csv")
)
fpaths_urls.sort()

# destination of the cleaned and combined data
fpath_processed_cleaned_and_combined = os.path.join(
    processed_data_dir, "guardian_articles_cleaned_combined.parquet"
)

# one "webUrl contains <term>" clause per search term, OR-ed together into a
# single expression for DataFrame.query
article_title_search_terms_str = ' | '.join(
    f"(webUrl.str.contains('{term}'))" for term in article_title_search_terms
)

## Clean Article Text

### Extract

#### Scraped Text

Load all columns of scraped data

In [None]:
# Fail early with a readable message when the glob matched no files, instead
# of handing an empty list of paths to the parquet reader.
if not fpaths_processed:
    raise FileNotFoundError(
        "No scraped article files matching 'guardian_articles*pg*.parquet' "
        f"were found in {processed_data_dir}"
    )

# read all page files into a single DataFrame, keeping only the listed columns
df = pd.read_parquet(fpaths_processed, columns=output_columns)

There was some overlap between articles returned from different pages of the API. This can be detected based on the presence of duplicates in the `url` column and is shown below

In [None]:
# rows whose URL has not been seen in an earlier row
n_unique_urls = int((~df.duplicated(subset=['url'])).sum())
print(
    f"Number of rows: {len(df)}, "
    f"Number of rows without duplicates: {n_unique_urls}"
)

**Notes**

1. When specifying [API inputs](https://open-platform.theguardian.com/explore/), the `order-by` filter was set so that the order of the results returned by the API would be deterministic across multiple queries. Despite this, some articles were listed on more than one page of the API response results, which introduced the duplication shown above. It is not known why this occurred. Future work should start by checking whether the API returns the same listings across multiple calls to the endpoint.

For this reason, rows with duplicated articles have to be dropped based on the URL column

In [None]:
# keep only the first row seen for each article URL
df = df[~df.duplicated(subset=['url'])]

# both counts are reported again so they can be compared with the cell above
n_unique_urls = int((~df.duplicated(subset=['url'])).sum())
print(
    f"Number of rows: {len(df)}, "
    f"Number of rows without duplicates: {n_unique_urls}"
)

#### Metadata

Load and process metadata from list of URLs retrieved by querying the API

In [None]:
# metadata columns kept from the API listings
metadata_columns = ['webUrl', 'webPublicationDate', 'year', 'webTitle', 'page']

df_urls = (
    # extract: stack the listings from every URL file
    pd.concat([pd.read_csv(fpath) for fpath in fpaths_urls])
    # filter: keep listings whose webUrl contains at least one search term
    .query(article_title_search_terms_str)
    # transform: parse the publication date ...
    .assign(
        webPublicationDate=lambda listings: pd.to_datetime(
            listings['webPublicationDate']
        )
    )
    # ... and derive the publication year from the parsed date
    .assign(year=lambda listings: listings['webPublicationDate'].dt.year)
    # keep only the useful columns
    .loc[:, metadata_columns]
)
df_urls.head(1)

**Notes**

1. `.query(article_title_search_terms_str)` evaluates to
   ```python
   .query(
       "(webUrl.str.contains('coral')) | "
       "(webUrl.str.contains('reef')) | "
       "(webUrl.str.contains('algae')) | "
       "(webUrl.str.contains('algal')) | "
       "(webUrl.str.contains('ocean')) | "
       "(webUrl.str.contains('marine')) | "
       "(webUrl.str.contains('zooxanthellae')) | "
       "(webUrl.str.contains('trophic')) | "
       "(webUrl.str.contains('symbiosis')) | "
       "(webUrl.str.contains('symbionts')) | "
       "(webUrl.str.contains('anthropogenic')) | "
       "(webUrl.str.contains('eutrophication'))"
   )
   ```

### Transform

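Specify which entry of the `rows` list (the spot-check row indices defined under *User Inputs*) selects the `DataFrame` row whose cleaned text is to be inspected after cleaning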

In [None]:
# Position within the `rows` list (not a DataFrame row number): the article
# spot-checked after cleaning is the one at DataFrame position rows[k].
k = 0

Perform cleaning

In [None]:
def _normalise_literal(literal, earlier_steps):
    """Rewrite ``literal`` the same way earlier cleaning steps rewrote the text.

    By the time a literal is searched for, the article text has already been
    modified by every preceding step (e.g. curly quotes straightened, commas
    replaced by spaces, a space inserted after periods). A literal that still
    contains such characters could never match, so it is pushed through the
    same steps first.
    """
    for pattern, replacement, is_regex in earlier_steps:
        if is_regex:
            literal = re.sub(pattern, replacement, literal)
        else:
            literal = literal.replace(pattern, replacement)
    return literal


def clean_article_text(text, steps):
    """Apply ordered replacement steps to a Series of article texts.

    Parameters
    ----------
    text : pd.Series
        Article texts (strings).
    steps : list of tuple
        ``(pattern, replacement, is_regex)`` triples, applied in order.
        Patterns with ``is_regex=False`` are matched literally, after being
        normalised by the steps that precede them.

    Returns
    -------
    pd.Series
        Texts after all replacements.
    """
    applied = []
    for pattern, replacement, is_regex in steps:
        if not is_regex:
            pattern = _normalise_literal(pattern, applied)
            if not pattern:
                # earlier steps already remove this text entirely
                continue
        # regex is always passed explicitly: the default of Series.str.replace
        # differs between pandas versions
        text = text.str.replace(pattern, replacement, regex=is_regex)
        applied.append((pattern, replacement, is_regex))
    return text


# ordered cleaning steps: (pattern, replacement, is_regex)
cleaning_steps = [
    # FIRST PASS
    # new line was dropped after web scraping, so there is no trailing
    # space after '.' This regex adds this trailing space at end of
    # each sentence.
    (r'\.(?!\s|\d|$)', '. ', True),
    # remove text corresponding to video
    ("– video", " ", False),
    # replace typographic dashes and quotes by their plain equivalents
    ('–', "-", False),
    ('“', '"', False),
    ('”', '"', False),
    ("‘", "'", False),
    ('’', "'", False),
    ('|', '', False),
    # remove advertising banners
    (aft_newsletter_advert, " ", False),
    (down_to_earth_newsletter_advert, " ", False),
    (morn_newsletter_advert, " ", False),
    (newsletter_signup_advert, "", False),
    # remove unwanted part of embedded infographic
    ("QuickGuideWhat is coral bleaching?Show", " What is coral bleaching? ", False),
    ("Quick GuideWhat is coral bleaching?Show", " What is coral bleaching? ", False),
    ("Was this helpful?Thank you for your feedback.", " ", False),
    ("Read more", " ", False),
    # remove special characters
    ("©", "", False),
    # SECOND PASS
    # remove special characters
    (":", "", False),
    (",", " ", False),
    # remove double white-space
    ("  ", " ", False),
    # remove special characters
    ("\xa0", "", False),
    # remove advertising banners
    (latest_news, '', False),
    # remove special characters
    ("@", "", False),
    ("'", "", False),
    ('"', "", False),
    # remove social media hashtags
    ("#coralbleaching2020", '', False),
    ("#GreatBarrierReef", '', False),
    # remove urls
    (twitter_url1, ' ', False),
    ("#pic.twitter.com/Tz1vqfI40t", '', False),
    # add hyphen
    ('20122013', '2012-2013', False),
    # remove text corresponding to embedded images
    ("The Guardian", "", False),
    # remove special characters
    ("/", '', False),
    ("…", '', False),
    # remove text corresponding to embedded images
    ('Photograph Manu San FelixNGKAUST', ' ', False),
    # remove text to expand size of embedded image
    ('View image in full', ' ', False),
    # remove advertising banners
    (more_features, '', False),
    # remove text corresponding to embedded images
    ('Photograph Serço Ekşiyan', '', False),
    ('Photograph Sam McNeilAP', '', False),
    # remove urls
    (twitter_url2, '', False),
    ('httpst.coeE5LCrSwtL— Terry Hughes (ProfTerryHughes)', ' ', False),
    # remove text corresponding to embedded images
    ('Photograph Grumpy Turtle Films', ' ', False),
]

df = (
    df
    .assign(
        text_cleaned=lambda articles: (
            clean_article_text(articles['text'], cleaning_steps)
            # remove narrative text: keep only what precedes "As told to"
            .str.split("As told to", n=1, expand=True)[0]
            # change to lowercase
            .str.lower()
        )
    )
    # place text_cleaned just before the last of the original columns
    [output_columns[:-1] + ['text_cleaned'] + [output_columns[-1]]]
)

# (optional) pick one article to assess the cleaning procedure; inspect `url`
# and `text` in a separate cell or print them here
row = df.iloc[rows[k]]
url = row['url']
text = row['text_cleaned']

Show the first row of the combined `DataFrame` with the `text_cleaned` (cleaned text) column

In [None]:
# first article, now including the text_cleaned column
df.head(n=1)

Show the first row of the combined `DataFrame` with the article metadata

In [None]:
# first metadata record retrieved from the API listings
df_urls.head(n=1)

Merge article text and metadata

In [None]:
# The same article URL can be listed on more than one page of the API results,
# so the metadata is reduced to one record per URL (the first listing) before
# merging. Without this, the left merge would repeat an article once per
# duplicated metadata record.
df_urls_unique = (
    df_urls
    .rename(columns={"webUrl": "url"})
    .drop_duplicates(subset=['url'])
)

# attach metadata to every article; articles without metadata are kept
df_merged = df.merge(
    df_urls_unique, on=['url'], how='left', validate='many_to_one',
)
assert len(df_merged) == len(df), "merge changed the number of articles"

with pd.option_context('display.max_columns', None):
    display(df_merged.head(1))

### Load

Export merged article contents to disk

In [None]:
%%time
# write the merged articles (cleaned text + metadata) to a single parquet file
df_merged.to_parquet(fpath_processed_cleaned_and_combined, index=False)
# report the row count of the frame that was actually written
print(
    f"Exported {len(df_merged):,} rows of data with cleaned article texts to "
    f"{os.path.basename(fpath_processed_cleaned_and_combined)}"
)

## Conclusion

This notebook has cleaned the text of the scraped batches of Guardian news articles relating to *coral bleaching*. All cleaned texts were then combined with article metadata retrieved from querying the publication's `/content` API endpoint and then exported to a single file for use in NLP analysis.