# motive: scrap data from curated list of news websites
- curated list of websites:
    - 

In [36]:
import pandas as pd
import numpy as np
from selenium import webdriver
from bs4 import BeautifulSoup

## steps for navigating, and scraping data
1. find webpage with links to articles or article previews
2. add all the article links found on the page to a `article_hrefs_this_page` list
3. for each link in `article_hrefs_this_page`:
    * scrap:
        - title  
        - URL  
        - author  
        - raw_text  
        - publish_date  
        - images  
        - tags

In [37]:
def pprint(html):
    bs = BeautifulSoup(html)
    print(bs.prettify())

In [38]:
def make_disallowed(disallow_string, homepage):
    disallowed_hrefs = [homepage + str(href[len("disallow: "): ]) for href in disallow_string.strip().split('\n')]
    return disallowed_hrefs

In [39]:
bbc_disallow_string = """
Disallow: /bitesize/search$
Disallow: /bitesize/search/
Disallow: /bitesize/search?
Disallow: /cbbc/search/
Disallow: /cbbc/search$
Disallow: /cbbc/search?
Disallow: /cbeebies/search/
Disallow: /cbeebies/search$
Disallow: /cbeebies/search?
Disallow: /chwilio/
Disallow: /chwilio$
Disallow: /chwilio?
Disallow: /education/blocks$
Disallow: /education/blocks/
Disallow: /newsround
Disallow: /search/
Disallow: /search$
Disallow: /search?
Disallow: /sport/videos/*
Disallow: /food/favourites
Disallow: /food/search*?*
Disallow: /food/recipes/search*?*
Disallow: /education/my$
Disallow: /education/my/
Disallow: /bitesize/my$
Disallow: /bitesize/my/
Disallow: /food/recipes/*/shopping-list
Disallow: /food/menus/*/shopping-list
Disallow: /news/0
Disallow: /ugc$
Disallow: /ugc/
Disallow: /ugcsupport$
Disallow: /ugcsupport/
Disallow: /userinfo/
Disallow: /userinfo
Disallow: /u5llnop$
Disallow: /u5llnop/
Disallow: /sounds/search$
Disallow: /sounds/search/
Disallow: /sounds/search?
Disallow: /ws/includes
Disallow: /radio/imda
"""
bbc_disallowed = make_disallowed(disallow_string=bbc_disallow_string, homepage="https://www.bbc.com")
print(bbc_disallowed[:3])

['https://www.bbc.com/bitesize/search$', 'https://www.bbc.com/bitesize/search/', 'https://www.bbc.com/bitesize/search?']


In [40]:
path_to_webdriver = "../chromedriver.exe"
driver = webdriver.Chrome(path_to_webdriver)
homepage = "https://www.bbc.com/news"
driver.get(homepage)
print(f"scraping from: {driver.title}")

# 0. find the categories in the navigation bar
nav_bar = driver.find_element_by_class_name("nw-c-nav__wide-sections")
preview_hrefs_this_page = nav_bar.find_elements_by_class_name("nw-o-link")
preview_hrefs = [tag.get_attribute("href") for tag in preview_hrefs_this_page]
for href in preview_hrefs:
    if href not in bbc_disallowed:
        print(href)

scraping from: Home - BBC News
https://www.bbc.com/news
https://www.bbc.com/news/coronavirus
https://www.bbc.com/news/av/10462520
https://www.bbc.com/news/world
https://www.bbc.com/news/world/asia
https://www.bbc.com/news/uk
https://www.bbc.com/news/business
https://www.bbc.com/news/technology
https://www.bbc.com/news/science_and_environment
https://www.bbc.com/news/stories
https://www.bbc.com/news/entertainment_and_arts
https://www.bbc.com/news/health


In [66]:
# scraping for one link only but loop this 👇
# 1. find webpage with links to articles or article previews
for preview_href in preview_hrefs[4:6]:
    page_w_hrefs = preview_href
    driver.get(page_w_hrefs)

    href_elements = driver.find_elements_by_xpath("//a[@class='gs-c-promo-heading gs-o-faux-block-link__overlay-link gel-pica-bold nw-o-link-split__anchor']")

    # 2. add all the article links found on the page to a `article_hrefs_this_page` list
    article_hrefs = [tag.get_attribute("href") for tag in href_elements]
    print(article_hrefs[:2])

['https://www.bbc.com/news/world-asia-china-57314397', 'https://www.bbc.com/news/world-australia-57353654']
['https://www.bbc.com/news/uk-57353048', 'https://www.bbc.com/news/uk-england-merseyside-57356486']


In [67]:
# 3. for each link in `article_hrefs_this_page`:
    # * scrap:
        # - title  
        # - URL  
        # - author  
        # - raw_text  
        # - publish_date  
        # - images  
        # - tags
for article_url in article_hrefs[:2]:
    driver.get(article_url)
    title_element = driver.find_element_by_xpath("//h1[@class='ssrcss-1pl2zfy-StyledHeading e1fj1fc10']")
    title = title_element.get_attribute("innerHTML")
    raw_text_elements = driver.find_elements_by_xpath("//div[@class='ssrcss-18snukc-RichTextContainer e5tfeyi1']")
    raw_text = ""
    for raw_text_element in raw_text_elements:
        raw_text += (" " + raw_text_element.text)
    print(title)
    print(raw_text)

Covid-19: Portugal queries amber status as UK tightens rules
What are the rules for green, amber and red lists?
More than half of UK adults fully vaccinated The UK government said the decision to move Portugal, including Madeira and the Azores, to the amber list followed increased concern about a mutation of the Delta variant, which was first identified in India. The Department for Transport said 68 cases of the Delta variant had been identified in Portugal, including cases with an additional, potentially detrimental mutation, being referred to as the Nepal mutation. Public Health England (PHE) told the BBC the mutation of the variant was present in multiple countries, including a small number of cases in the UK. It is investigating the mutation to better understand whether it could be more transmissible and less effectively tackled by vaccines. The number of positive Covid cases in Portugal has also nearly doubled since the last review, the department said, adding the situation "has r