# motive: scrap data from curated list of news websites
- curated list of websites:
    - 

In [83]:
import pandas as pd
import numpy as np
from selenium import webdriver
from bs4 import BeautifulSoup
import datetime

## steps for navigating, and scraping data
1. find webpage with links to articles or article previews
2. add all the article links found on the page to a `article_hrefs_this_page` list
3. for each link in `article_hrefs_this_page`:
    * scrap:
        - title  
        - URL  
        - author  
        - raw_text  
        - publish_date  
        - images  
        - tags

In [71]:
def pprint(html):
    bs = BeautifulSoup(html)
    print(bs.prettify())

In [72]:
def make_disallowed(disallow_string, homepage):
    disallowed_hrefs = [homepage + str(href[len("disallow: "): ]) for href in disallow_string.strip().split('\n')]
    return disallowed_hrefs

In [73]:
bbc_disallow_string = """
Disallow: /bitesize/search$
Disallow: /bitesize/search/
Disallow: /bitesize/search?
Disallow: /cbbc/search/
Disallow: /cbbc/search$
Disallow: /cbbc/search?
Disallow: /cbeebies/search/
Disallow: /cbeebies/search$
Disallow: /cbeebies/search?
Disallow: /chwilio/
Disallow: /chwilio$
Disallow: /chwilio?
Disallow: /education/blocks$
Disallow: /education/blocks/
Disallow: /newsround
Disallow: /search/
Disallow: /search$
Disallow: /search?
Disallow: /sport/videos/*
Disallow: /food/favourites
Disallow: /food/search*?*
Disallow: /food/recipes/search*?*
Disallow: /education/my$
Disallow: /education/my/
Disallow: /bitesize/my$
Disallow: /bitesize/my/
Disallow: /food/recipes/*/shopping-list
Disallow: /food/menus/*/shopping-list
Disallow: /news/0
Disallow: /ugc$
Disallow: /ugc/
Disallow: /ugcsupport$
Disallow: /ugcsupport/
Disallow: /userinfo/
Disallow: /userinfo
Disallow: /u5llnop$
Disallow: /u5llnop/
Disallow: /sounds/search$
Disallow: /sounds/search/
Disallow: /sounds/search?
Disallow: /ws/includes
Disallow: /radio/imda
"""
bbc_disallowed = make_disallowed(disallow_string=bbc_disallow_string, homepage="https://www.bbc.com")
print(bbc_disallowed[:3])

['https://www.bbc.com/bitesize/search$', 'https://www.bbc.com/bitesize/search/', 'https://www.bbc.com/bitesize/search?']


In [74]:
path_to_webdriver = "../chromedriver.exe"
driver = webdriver.Chrome(path_to_webdriver)
homepage = "https://www.bbc.com/news"
driver.get(homepage)
print(f"scraping from: {driver.title}")

# 0. find the categories in the navigation bar
nav_bar = driver.find_element_by_xpath("//ul[@class='gs-o-list-ui--top-no-border nw-c-nav__wide-sections']")
preview_hrefs_this_page = nav_bar.find_elements_by_class_name("nw-o-link")
preview_hrefs = [tag.get_attribute("href") for tag in preview_hrefs_this_page]
for href in preview_hrefs:
    if href not in bbc_disallowed:
        print(href)

scraping from: Home - BBC News
https://www.bbc.com/news
https://www.bbc.com/news/coronavirus
https://www.bbc.com/news/av/10462520
https://www.bbc.com/news/world
https://www.bbc.com/news/world/asia
https://www.bbc.com/news/uk
https://www.bbc.com/news/business
https://www.bbc.com/news/technology
https://www.bbc.com/news/science_and_environment
https://www.bbc.com/news/stories
https://www.bbc.com/news/entertainment_and_arts
https://www.bbc.com/news/health


In [75]:
# scraping for one link only but loop this 👇
# 1. find webpage with links to articles or article previews
for preview_href in preview_hrefs[4:6]:
    page_w_hrefs = preview_href
    driver.get(page_w_hrefs)

    href_elements = driver.find_elements_by_xpath("//a[@class='gs-c-promo-heading gs-o-faux-block-link__overlay-link gel-pica-bold nw-o-link-split__anchor']")

    # 2. add all the article links found on the page to a `article_hrefs_this_page` list
    article_hrefs = [tag.get_attribute("href") for tag in href_elements]
    print(article_hrefs[:2])

['https://www.bbc.com/news/world-asia-china-57314397', 'https://www.bbc.com/news/world-australia-57353654']
['https://www.bbc.com/news/health-57358446', 'https://www.bbc.com/news/uk-57353048']


In [90]:
# 3. for each link in `article_hrefs_this_page`:
    # * scrap:
        # - title  
        # - URL  
        # - author  
        # - raw_text  
        # - publish_date  
        # - images  
        # - tags
news_data = pd.DataFrame(columns=["url", "title", "raw_text", "publish_datetime"])
for article_url in article_hrefs[:2]:
    driver.get(article_url)
    title_element = driver.find_element_by_xpath("//h1[@class='ssrcss-1pl2zfy-StyledHeading e1fj1fc10']")
    title = title_element.text
    raw_text_elements = driver.find_elements_by_xpath("//div[@class='ssrcss-18snukc-RichTextContainer e5tfeyi1']")
    raw_text = ""
    for raw_text_element in raw_text_elements:
        raw_text += (" " + raw_text_element.text)
    raw_text = raw_text.strip()
    datetime_element = driver.find_element_by_xpath("//time[@data-testid='timestamp']")
    publish_datetime = str(datetime_element.get_attribute("datetime"))

    this_article_data = pd.DataFrame(data={
        "url": [article_url],
        "title": [title],
        "raw_text": [raw_text],
        "publish_datetime": [publish_datetime], 
    })
    news_data = news_data.append(this_article_data, ignore_index=True)

    print("-" * 12)
    print("url:", article_url)
    print("title:", title)
    print("publish_datetime:", publish_datetime)
    print("raw_text:", raw_text[:40])

------------
url: https://www.bbc.com/news/health-57358446
title: UK approves Pfizer jab for 12 to 15-year-olds
publish_datetime: 2021-06-04T12:09:08.000Z
raw_text: The UK regulator has approved the use of
------------
url: https://www.bbc.com/news/uk-57353048
title: Covid-19: Portugal queries amber status as UK tightens rules
publish_datetime: 2021-06-04T12:13:12.000Z
raw_text: Portugal has questioned the UK's decisio


In [91]:
news_data

Unnamed: 0,url,title,raw_text,publish_datetime
0,https://www.bbc.com/news/health-57358446,UK approves Pfizer jab for 12 to 15-year-olds,The UK regulator has approved the use of the P...,2021-06-04T12:09:08.000Z
1,https://www.bbc.com/news/uk-57353048,Covid-19: Portugal queries amber status as UK ...,Portugal has questioned the UK's decision to r...,2021-06-04T12:13:12.000Z
