# Article Content Scraping Exploration
--------

Load Article Data
---

In [18]:
import pandas as pd

# Location of the interim article table, relative to the notebook's working directory.
csv_path = '../data/interim/articles_sources_removed.csv'

# index_col=None: do not use any CSV column as the index; pandas assigns a
# default integer index instead.
articles = pd.read_csv(
    filepath_or_buffer=csv_path,
    index_col=None,
)

# Preview the first five rows.
articles.head(5)

Unnamed: 0,url,bias,quality,score_count,source
0,https://www.huffpost.com/entry/pete-buttigieg-...,-10.142857,46.915997,7,Huffington Post
1,https://www.huffpost.com/entry/elizabeth-warre...,-8.75,36.75012,8,Huffington Post
2,https://www.huffpost.com/entry/usmca-north-ame...,-11.111111,41.320375,9,Huffington Post
3,https://www.huffpost.com/entry/john-bolton-tru...,-11.375,37.876554,8,Huffington Post
4,https://www.huffpost.com/entry/beyonc%C3%A9-ja...,-10.5,43.661313,8,Huffington Post


View the First URL
---

In [9]:
# Pull the 'url' column first, then take its first entry by position.
a = articles['url'].iloc[0]
a

'https://www.huffpost.com/entry/pete-buttigieg-lgbtq-town-hall-blood-not-welcome-country_n_5d9fec43e4b02c9da047bf31'

Try Requesting the Article
---

In [10]:
import requests
import bs4

# Send a browser-like User-Agent with the request
# (presumably the site treats the default `requests` agent differently -- TODO confirm).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'
}

# Without a timeout, requests waits indefinitely on an unresponsive server and
# the cell never finishes; bound the wait to 10 seconds.
response = requests.get(a, headers=headers, timeout=10)
response.status_code

200

Define a Function to Scrape Article Contents
---

In [47]:
# (old, new) substitutions applied in order to every scraped string: newlines and
# a handful of special characters are dropped, typographic dashes/quotes/ellipses
# are mapped to plain ASCII equivalents.
_TEXT_REPLACEMENTS = (
    ('\n', ''),
    ('\u00a0', ''),
    ('\u00f8', ''),
    ('\u2013', '-'),
    ('\u2014', '-'),
    ('\u2015', '-'),
    ('\u2018', "'"),
    ('\u2019', "'"),
    ('\u201c', '"'),
    ('\u201d', '"'),
    ('\u2026', '...'),
    ('\u267f', ''),
    ('\u270b', ''),
    ('\ufe0f', ''),
    ('\ufeff', ''),
)


def _clean_text(raw: str) -> str:
    """Apply every substitution in ``_TEXT_REPLACEMENTS`` to ``raw``, in order."""
    for old, new in _TEXT_REPLACEMENTS:
        raw = raw.replace(old, new)
    return raw


def scrape_contents(art: str, timeout: float = 10.0):
    """Download an article page and extract its title and body text.

    The body is taken from the ``.content-list-component`` elements found
    inside the first ``.entry__text`` element of the page. Each direct child
    of those components contributes one line of text (after stripping and
    character clean-up); empty lines are dropped.

    Parameters
    ----------
    art : str
        URL of the article to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    tuple of (str, str) or None
        ``(title, text)`` where ``text`` is the body lines joined with
        ``'\\n'``. ``None`` is returned when the response status is not 200
        or when the page has no ``.entry__text`` element. The title is an
        empty string when the page has no ``<title>`` tag.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection failure, timeout, ...).
    """
    h = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'
    }

    # A timeout keeps one unresponsive server from stalling a whole scraping run.
    response = requests.get(art, headers=h, timeout=timeout)

    if response.status_code != 200:
        return None

    # Name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed (and warns), so results can differ between environments.
    soup = bs4.BeautifulSoup(response.content, 'html.parser')

    # select_one returns None instead of raising IndexError on a missing element.
    entry_text = soup.select_one('.entry__text')
    if entry_text is None:
        return None

    title_tag = soup.select_one('title')
    # Clean the title exactly once, regardless of how many body elements exist.
    title = _clean_text(title_tag.text) if title_tag is not None else ''

    full_text = []
    for item in entry_text.select('.content-list-component'):
        for el in item.contents:
            # Bare strings are used as-is; tags contribute their combined text.
            raw = el if isinstance(el, bs4.NavigableString) else el.text
            text = _clean_text(raw.strip())

            if text:
                full_text.append(text)

    return title, '\n'.join(full_text)

Scrape Huffington Post Articles' Contents
---

In [15]:
# Restrict to the rows whose source is the Huffington Post.
hp = articles.loc[articles['source'] == 'Huffington Post']

cntnt = []
for hp_a in hp.itertuples():
    # Read the URL by column name rather than tuple position, so the lookup
    # stays correct no matter which columns precede 'url' in the frame.
    scraped = scrape_contents(hp_a.url)

    # scrape_contents returns None when the page could not be fetched;
    # unpacking that directly would raise TypeError and abort the loop.
    if scraped is None:
        print(f'Skipping (could not scrape): {hp_a.url}')
        continue

    t, c = scraped
    cntnt.append(c)

for cnt in cntnt:
    print(cnt, '\n\n\n')

South Bend, Indiana, Mayor Pete Buttigieg said at Thursday’s LGBTQ town hall that he would overhaul rules that prevent gay men who have been sexually active within the last year from donating blood.
The 2020 Democrat, who is the first openly gay man to run for president, was asked at the town hall hosted by CNN and the Human Rights Campaign what he would do about the Food and Drug Administration’s blood donation policy that prohibits cis men who have had sex with cis men in the past 12 months from giving blood.
Buttigieg responded by speaking about his own experience as a gay man leading South Bend’s annual blood drive.
“I remember the moment when I realized that, unlike most initiatives that I spearhead, I can’t lead by example on this one, because my blood’s not welcome in this country,” he said. “And it’s not based on science; it’s based on prejudice.”
Pete Buttigieg on the prohibition on gay men donating blood: “My blood is not welcome in this country. And it's not based on science

Scrape Huffington Post Articles' Contents and Store
---

In [48]:
import json
import os

huffpost_articles = articles.loc[articles['source'] == 'Huffington Post']

# Create the output directory up front so the first open() cannot fail with
# FileNotFoundError on a fresh checkout.
article_dir = '../data/interim/article_content'
os.makedirs(article_dir, exist_ok=True)

article_contents = []
for article in huffpost_articles.itertuples():
    # Read the URL by column name rather than tuple position, so the lookup
    # stays correct no matter which columns precede 'url' in the frame.
    scraped = scrape_contents(article.url)

    # scrape_contents returns None when the page could not be fetched;
    # skip those articles instead of crashing on tuple unpacking.
    if scraped is None:
        print(f'Skipping (could not scrape): {article.url}')
        continue

    t, c = scraped

    obj = {
        'title': t,
        'source': 'Huffington Post',
        'url': article.url,
        'contents': c
    }

    # One JSON file per article, named after the row's index label in `articles`.
    with open(f'{article_dir}/article_{article.Index}.json', 'w', encoding='utf-8') as article_fp:
        json.dump(obj, article_fp)