In [10]:
from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import re
import time
from datetime import date, timedelta

In [64]:
# Show full cell contents (article links and titles are long).
pd.set_option('display.max_colwidth', None)

# Load the article index and append three empty columns that the scraping
# step fills in later: category, header images and body paragraphs.
article_df = pd.read_excel('NOS_articles_221004-231004.xlsx').assign(
    Category=None,
    Images=None,
    Paragraphs=None,
)
article_df.shape

(13387, 8)

In [169]:
def extract_data_from_url(url, timeout=30):
    """Download one article page and extract its category, header images and text.

    Parameters
    ----------
    url : str
        Address of the article page to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled server
        would block the whole scraping run indefinitely.

    Returns
    -------
    tuple
        ``(article_category, featured_image, target_paragraphs)``: three
        lists holding the category texts, the header image ``src`` values and
        the paragraph texts. If the page cannot be fetched or parsed, a
        message is printed and ``(None, None, None)`` is returned, so callers
        can always unpack three values.
    """
    # Always a 3-tuple: callers unpack three names, so a shorter tuple would
    # raise ValueError on the first failed request.
    failure = (None, None, None)

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # raise_for_status() only covers 4xx/5xx; anything else that is not a
        # plain 200 is treated as a failed fetch as well.
        if response.status_code != 200:
            print(f"Failed to fetch URL: {url}")
            return failure

        soup = BeautifulSoup(response.text, 'lxml')

        # NOTE(review): the class names below look auto-generated and may
        # change when the site is redeployed -- confirm they still match.

        # Find the header image
        featured_image = []
        featured_image_div = soup.find('div', class_='sc-f8368fbb-0 eYpAEN')
        if featured_image_div:
            # src=True skips <img> tags without a src attribute instead of
            # raising KeyError and discarding the whole article.
            featured_image = [
                img['src'] for img in featured_image_div.find_all('img', src=True)
            ]

        # Find the article text and category
        target_paragraphs = []
        article_category = []
        for div in soup.find_all('div', class_='sc-e0c07641-1 eHATPt'):
            target_paragraphs.extend(
                p.get_text() for p in div.find_all('p', class_='sc-6d77a1d1-0 chzewu')
            )
            article_category.extend(
                p.get_text() for p in div.find_all('p', class_='sc-f9df6382-7 cMuisv')
            )

        return article_category, featured_image, target_paragraphs
    except Exception as e:
        # Best-effort scraping: report the problem and let the caller continue
        # with the next URL.
        print(f"Error processing URL: {url}\n{str(e)}")
        return failure

In [170]:
# Make a small test set to try the scraping code on.
# Slice first, then copy: only the ten selected rows are duplicated (instead
# of the full frame), and test_set is an independent frame, so filling its
# columns later never touches article_df.
test_set = article_df.iloc[:10].copy()
test_set.shape

(10, 8)

In [171]:
# Preview the first five rows of the test set before scraping.
test_set.head(n=5)

Unnamed: 0,Article ID,Link,Title,Date,Time,Category,Images,Paragraphs
0,2447107.0,https://nos.nl/artikel/2447107-britse-regering-kondigt-hardere-koers-tegen-illegale-immigranten-aan,Britse regering kondigt hardere koers tegen illegale immigranten aan,2022-10-04,22:58:15,,,
1,2447103.0,https://nos.nl/artikel/2447103-nog-veel-onbekend-over-energieplafond-bedrijven-duidelijkheid-snel-nodig,Nog veel onbekend over energieplafond bedrijven: 'Duidelijkheid snel nodig',2022-10-04,22:43:15,,,
2,2447102.0,https://nos.nl/artikel/2447102-puinspoor-van-10-000-kilometer-volgt-ruimterots-die-botste-met-sonde,Puinspoor van 10.000 kilometer volgt ruimterots die botste met sonde,2022-10-04,22:33:13,,,
3,2447100.0,https://nos.nl/artikel/2447100-begrip-bij-geemigreerde-boeren-voor-acties-wij-gaan-nooit-meer-terug,Begrip bij geëmigreerde boeren voor acties: 'Wij gaan nooit meer terug',2022-10-04,21:56:48,,,
4,2447098.0,https://nos.nl/artikel/2447098-vredesbesprekingen-regering-colombia-en-guerrillabeweging-worden-hervat,Vredesbesprekingen regering Colombia en guerrillabeweging worden hervat,2022-10-04,20:56:01,,,


In [172]:
# Scrape every article in the test set and time the whole run.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by system clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Only the link is needed per article, so iterate over that column directly
# rather than building a full row Series for every article with iterrows().
for index, url in test_set['Link'].items():
    article_category, featured_image, target_paragraphs = extract_data_from_url(url)

    # Store the extracted data in the DataFrame; .at places each list into a
    # single cell of the corresponding column.
    test_set.at[index, 'Category'] = article_category
    test_set.at[index, 'Images'] = featured_image
    test_set.at[index, 'Paragraphs'] = target_paragraphs

end_time = time.perf_counter()
elapsed_time = end_time - start_time
# Split the whole seconds into minutes and remaining seconds.
elapsed_minutes, elapsed_seconds = divmod(int(elapsed_time), 60)


print(f'Elapsed time: {elapsed_minutes} minutes and {elapsed_seconds} seconds')

Elapsed time: 0 minutes and 0 seconds


Elapsed time for #rows

    10 : 0 minutes and 0 seconds
    100 : 0 minutes and 11 seconds
    500 : 1 minutes and 53 seconds
    1000 : 3 minutes and 5 seconds

In [173]:
# Spot-check the scraped category of the first article.
test_set['Category'].head(1)

0    [Buitenland]
Name: Category, dtype: object

In [174]:
# Write the scraped test set to an Excel workbook (the index is written too).
output_path = 'NOS_testset_with articles.xlsx'
test_set.to_excel(output_path)