In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import re
import time
from datetime import date, timedelta

In [13]:
# Initialize 
start_date = date(2022, 4, 10)
end_date = date(2023, 4, 10)

date_array = pd.date_range(start_date,end_date-timedelta(days=1),freq='d').date

article_data_list = []

In [14]:
start_time = time.time()

for i in date_array:
    url = 'https://nos.nl/nieuws/archief/{}'.format(i)
    response = requests.get(url)
    
    if response.status_code == 200:
        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')

        # Find all the <time> elements on the page
        time_elements = soup.find_all('time')
        title_elements = soup.find_all(class_='list-time__title link-hover')

        if time_elements and title_elements:
            for i in range(len(time_elements)):

                # Extract the article-id from the href attribute
                href = title_elements[i].find_parent('a')['href']
                article_id_match = re.search(r'/artikel/(\d+)-', href)
                article_id = article_id_match.group(1) if article_id_match else 'N/A'

                # Extract the title 
                title_text = title_elements[i].text.strip()

                # Extract the date and time part from the datetime attribute
                datetime_value = time_elements[i].get('datetime', '')
                date_time_parts = datetime_value.split('T')
                date_part = date_time_parts[0]
                time_part = date_time_parts[1].split('+')[0]

                # Create the dictionary with article data
                article_data = {
                    'Article ID': article_id,
                    'Link': url + href,
                    'Title': title_text,
                    'Date': date_part,
                    'Time': time_part
                }

                # Append dictionary elements to list
                article_data_list.append(article_data)
        else:
            print("No publication date or title found on the webpage")
    else:
        print("Failed to retrieve the webpage")

end_time = time.time()
elapsed_time = end_time - start_time
elapsed_minutes = int(elapsed_time // 60)
elapsed_seconds = int(elapsed_time % 60)


print(f'Elapsed time: {elapsed_minutes} minutes and {elapsed_seconds} seconds')

Elapsed time: 3 minutes and 26 seconds


Train WIFI output

    Elapsed time: 21 minutes and 23 seconds
    Dataframe shape: (14044, 5)
    Unique rows [Article ID]: 13607
    Unique rows [Link]: 13809
    Unique rows [Title]: 13808
    Unique rows [Date]: 365
    
Eduroam WIFI output

    Elapsed time: 3 minutes and 26 seconds
    Dataframe shape: (13809, 5)
    Unique rows [Article ID]: 13607
    Unique rows [Link]: 13809
    Unique rows [Title]: 13808
    Unique rows [Date]: 365

In [15]:
article_df = pd.DataFrame(article_data_list)
#article_df.set_index('Article ID', inplace=True)
#article_df.to_excel('test_NOS_articles_231004-221004.xlsx')

In [16]:
article_df.shape

(13809, 5)

In [17]:
print(article_df['Article ID'].nunique())
print(article_df['Link'].nunique())
print(article_df['Title'].nunique())
print(article_df['Date'].nunique())

13607
13809
13808
365
