In [1]:
import pandas as pd

# Scraping configuration: change only `year`. The date range and every output
# filename below are derived from it, so they cannot drift out of sync.
year = '2022'

# every calendar day of `year`, first and last day included
dates = pd.date_range(start = f"{year}-01-01", end = f"{year}-12-31")

# generate filenames
# the '*_temp_*' files are rewritten while a scrape is running,
# the plain names receive the result once a scrape has finished
metaArticles_temp_filename = f'metaArticles_temp_{year}.csv'
metaArticles_filename = f'metaArticles_{year}.csv'

articles_temp_filename = f'articles_temp_{year}.csv'
articles_filename = f'articles_{year}.csv'

In [2]:
# to get the URL
import requests

# to parse the HTMLDOM
from bs4 import BeautifulSoup

# patience is a virtue
import time

# %pip install selenium
# virtual browser
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

from random import randint

# Scrape article metadata

In [3]:
def scrape_metaInfo(soup, df):
    """
    Collects URL and title of every article linked on one parsed page.

    Parameters:
        - soup: parsed page (BeautifulSoup object); all elements with the
          class 'link-block' are inspected
        - df: dataframe holding the rows collected so far; it is not modified

    Returns:
        - dataframe with the rows of df followed by one row (url, title) per
          article link found; df itself is returned when no article was found
    """
    records = []

    for element in soup.find_all(class_ = 'link-block'):

        # get URL of element; elements without an href are skipped
        url = element.get('href')

        # only URLs starting with '/artikel' lead to an article (instead of e.g. a video)
        if not url or not url.startswith('/artikel'):
            continue

        # get title of element; it stays None when the title div is missing,
        # so that the URL is still recorded
        titleElement = element.find('div', {'class': 'list-time__title link-hover'})
        title = titleElement.text if titleElement is not None else None

        records.append({'url': 'https://nos.nl' + url, 'title': title})

    if not records:
        return df

    # a single concat per page instead of one per article
    newRows = pd.DataFrame(records, columns = ['url', 'title'])
    return pd.concat([df, newRows], ignore_index=True)


In [4]:
from datetime import datetime

def scrape_articlesMeta(daterange):
    """
    Scrapes URL and title from the NOS archive pages
    (https://nos.nl/nieuws/buitenland/archief/<date>), one page per date.

    After every page, all rows collected so far are written to
    metaArticles_temp_filename, so an interrupted run leaves its progress on disk.

    Parameters:
        - daterange: range of dates created using
          daterange = pd.date_range(start = "2018-09-09", end = datetime.today())

    Returns:
        - dataframe that contains the columns url and title
    """

    # initialise driver
    options = webdriver.ChromeOptions()
    options.add_argument("--incognito")
    driver = webdriver.Chrome(options = options)

    # initialise dataframe
    df = pd.DataFrame(data = {'url': [], 'title': []})

    try:
        # the window size is the same for every page, so it is set only once
        driver.set_window_size(838, 900)

        # loop over the dates and scrape the archive page of each one
        for date in daterange:
            url = 'https://nos.nl/nieuws/buitenland/archief/' + str(date.date())
            print('Scraping ' + url + ' ...')

            # open page
            driver.get(url)

            # the parser is named explicitly so the result does not depend on
            # which optional parsers happen to be installed
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            df = scrape_metaInfo(soup, df) # appends url and title of every article on the page

            # save as csv files
            df.to_csv(metaArticles_temp_filename, index=False)
            print('Total number of articles: ' + str(len(df)))

            time.sleep(randint(1, 10)) # to not get blocked by the website
    finally:
        # quit() ends the whole browser session, also when scraping failed
        driver.quit()

    n_articles = len(df)
    print(str(n_articles) + ' articles were scraped.')

    return df

In [12]:
# Scrape url and title for every day in `dates`,
# then write the complete result to its final csv file.
metaArticles = scrape_articlesMeta(dates)
metaArticles.to_csv(path_or_buf = metaArticles_filename, index = False)

Scraping https://nos.nl/nieuws/buitenland/archief/2019-01-01 ...
Total number of articles: 16
Scraping https://nos.nl/nieuws/buitenland/archief/2019-01-02 ...
Total number of articles: 30
Scraping https://nos.nl/nieuws/buitenland/archief/2019-01-03 ...
Total number of articles: 48
Scraping https://nos.nl/nieuws/buitenland/archief/2019-01-04 ...
Total number of articles: 62
Scraping https://nos.nl/nieuws/buitenland/archief/2019-01-05 ...
Total number of articles: 79
79 articles were scraped.


# Scrape articles

In [9]:
# Load the article metadata from the csv file named in the configuration cell.
# Using metaArticles_filename keeps this cell in step with `year` and avoids
# an absolute path that only exists on one machine.
metaArticles = pd.read_csv(metaArticles_filename)

In [10]:
def scrape_paragraphs(soup, df, url, title):
    """
    Extracts text, date, tags, and image URL from one parsed article page
    and appends them to df as a new row.

    Parameters:
        - soup: parsed article page (BeautifulSoup object)
        - df: dataframe holding the rows collected so far; it is not modified
        - url: URL of the article, stored unchanged in the new row
        - title: title of the article, stored unchanged in the new row

    Returns:
        - dataframe with the rows of df followed by one row with the columns
          url, title, article, date, tags, and image; date and image are ''
          when the page does not provide them
    """
    # NOTE(review): the 'sc-...' class names look auto-generated and may change
    # when the website is rebuilt - verify them against a current article page

    # text of all matching paragraphs, joined with single spaces
    paragraphs = soup.find_all('p', class_ = 'sc-5a7f8528-0 ihDdJT')
    article = ' '.join(paragraph.text for paragraph in paragraphs)

    # first 10 characters of the datetime attribute of the first <time> element
    # (presumably YYYY-MM-DD - TODO confirm); '' when the element or attribute is missing
    date = ''
    timeElement = soup.find('time')
    if timeElement is not None:
        date = (timeElement.get('datetime') or '')[:10]

    tags = [tag.text for tag in soup.find_all('p', class_ = 'sc-370380d-7 fmeDxO')]

    # src of the first matching image; '' when there is no such image
    image = ''
    imageElement = soup.find('img', class_ = 'sc-8e313b0a-1 dGrJDt')
    if imageElement is not None:
        image = imageElement.get('src') or ''

    newRow = pd.DataFrame({'url':[url], 'title': [title], 'article': [article], 'date': [date], 'tags': [tags], 'image': [image]})
    return pd.concat([df, newRow], ignore_index=True)

In [11]:
from tqdm.notebook import tqdm

def scrape_articles(metaArticles):
    """
    Downloads every article listed in metaArticles and scrapes its content.

    After every article, all rows collected so far are written to
    articles_temp_filename ('|'-separated), so an interrupted run leaves its
    progress on disk.

    Parameters:
        - metaArticles - dataframe that contains a column called 'url' and one called 'title' with all urls to be scraped

    Returns:
        - dataframe that contains the columns url, title, article, date, tags, and image

    Raises:
        - requests.exceptions.RequestException (e.g. Timeout) when a page
          cannot be downloaded
    """
    # initialise dataframe
    df = pd.DataFrame(data = {'url': [], 'title': [], 'article': [], 'date': [], 'tags': [], 'image': []})

    # loop over the (url, title) pairs of metaArticles in row order
    pairs = zip(metaArticles['url'], metaArticles['title'])
    for url, title in tqdm(pairs, total = len(metaArticles)):

        # get data from the url; without a timeout a stalled connection
        # would block the whole run indefinitely
        res = requests.get(url, timeout = 30)

        # parse to bs4; the parser is named explicitly so the result does not
        # depend on which optional parsers happen to be installed
        soup = BeautifulSoup(res.content, 'html.parser')

        df = scrape_paragraphs(soup, df, url, title) # appends one row with the scraped article

        # save as csv files
        df.to_csv(articles_temp_filename, index=False, sep = '|')

        time.sleep(randint(1, 20)) # to not get blocked by the website

    return df

In [13]:
# Scrape the articles from row position 1377 of metaArticles onwards and
# write the result to the final '|'-separated csv file.
# NOTE(review): the offset presumably resumes an earlier, interrupted run - confirm
articles = scrape_articles(metaArticles.iloc[1377:])
articles.to_csv(articles_filename, sep = '|', index = False)

  0%|          | 0/3605 [00:00<?, ?it/s]