In [1]:
from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import re
import time
from datetime import date, timedelta

In [2]:
# Show complete cell contents (no truncation) when frames are displayed.
pd.set_option('display.max_colwidth', None)

# Load the article list and append three placeholder columns, all None,
# which the scraping loop fills in afterwards.
article_df = pd.read_excel('NOS_articles_221004-231004.xlsx').assign(
    **dict.fromkeys(['Category', 'Images', 'Paragraphs'])
)
article_df.shape

(13387, 8)

In [8]:
def extract_data_from_url(url, timeout=30):
    """Scrape the category, header image URLs and body text of one article page.

    Parameters
    ----------
    url : str
        Address of the article page to download.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled connection
        would block the calling loop indefinitely.

    Returns
    -------
    tuple
        ``(article_category, featured_image, target_paragraphs)``; each item
        is a list of strings and may be empty when nothing matched. If the
        page cannot be fetched or parsed, a message is printed and
        ``(None, None, None)`` is returned instead of raising.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # raise_for_status() only raises for 4xx/5xx responses, so any other
        # non-200 status still has to be rejected explicitly.
        if response.status_code != 200:
            print(f"Failed to fetch URL: {url}")
            return None, None, None

        soup = BeautifulSoup(response.text, 'lxml')

        # Find the header image
        featured_image = []
        featured_image_div = soup.find('div', class_='sc-f8368fbb-0 eYpAEN')
        if featured_image_div:
            # src=True skips <img> tags that lack a src attribute; indexing
            # such a tag would raise KeyError and discard the whole article.
            featured_image = [
                img['src'] for img in featured_image_div.find_all('img', src=True)
            ]

        # Find the article text and category
        target_paragraphs = []
        article_category = []
        for div in soup.find_all('div', class_='sc-e0c07641-1 eHATPt'):
            target_paragraphs.extend(
                p.get_text() for p in div.find_all('p', class_='sc-6d77a1d1-0 chzewu')
            )
            article_category.extend(
                p.get_text() for p in div.find_all('p', class_='sc-f9df6382-7 cMuisv')
            )

        return article_category, featured_image, target_paragraphs
    except Exception as e:
        # Best effort: one bad page must not abort a long scraping run.
        print(f"Error processing URL: {url}\n{str(e)}")
        return None, None, None

In [12]:
# Monotonic clock: unaffected by system clock adjustments during the long run.
start_time = time.perf_counter()

# Only the 'Link' column is needed, so iterate over it directly instead of
# building a full row Series per article with iterrows().
for index, url in article_df['Link'].items():
    article_category, featured_image, target_paragraphs = extract_data_from_url(url)

    # extract_data_from_url returns None for every field when the page could
    # not be fetched or parsed; store '' instead of None in that case.
    article_df.at[index, 'Category'] = '' if article_category is None else article_category
    article_df.at[index, 'Images'] = '' if featured_image is None else featured_image
    article_df.at[index, 'Paragraphs'] = '' if target_paragraphs is None else target_paragraphs

end_time = time.perf_counter()
elapsed_time = end_time - start_time
elapsed_minutes, elapsed_seconds = (int(part) for part in divmod(elapsed_time, 60))

print(f'Elapsed time: {elapsed_minutes} minutes and {elapsed_seconds} seconds')

Error processing URL: https://nos.nl/artikel/2468439-schip-kantelt-tegen-de-kade-in-schotland-25-gewonden
504 Server Error: Gateway Time-out for url: https://nos.nl/artikel/2468439-schip-kantelt-tegen-de-kade-in-schotland-25-gewonden
Elapsed time: 63 minutes and 21 seconds


Elapsed time by number of rows processed

    10 : 0 minutes and 0 seconds
    100 : 0 minutes and 11 seconds
    500 : 1 minutes and 53 seconds
    1000 : 3 minutes and 5 seconds
    
    13387 : 63 minutes and 21 seconds

In [38]:
# Sanity check: print the frame's dimensions, then display its column names.
print(article_df.shape)
article_df.columns

(13387, 8)


Index(['Article ID', 'Link', 'Title', 'Date', 'Time', 'Category', 'Images',
       'Paragraphs'],
      dtype='object')

In [40]:
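# Left disabled: running this would overwrite the input workbook that
# pd.read_excel loads at the top of the notebook.
# article_df.to_excel('NOS_articles_221004-231004.xlsx')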

### Correct empty categories to 'Liveblog'

In [31]:
# Rows without a category are treated as liveblogs. The scraping loop stores
# '' for a failed fetch but an empty list when a page has no category
# paragraph, and `== ''` never matches a list, so test for any empty value.
has_no_category = article_df['Category'].map(
    lambda value: isinstance(value, (str, list, tuple)) and len(value) == 0
)
article_df.loc[has_no_category, 'Category'] = 'Liveblog'

In [34]:
# Write article_df to its own workbook, separate from the input file.
# NOTE(review): to_excel also writes the index as an extra leading column by
# default — confirm downstream readers expect that.
article_df.to_excel('NOS_articles_including_liveblogs_221004-231004.xlsx')

### Manually add missing data due to server error

In [41]:
# Work on a copy so the manual correction below does not modify article_df.
article_df_ = article_df.copy()

In [68]:
# Locate the article whose download failed with a 504 during scraping.
# NOTE(review): missing_article_category, missing_article_image and
# missing_article_paragraphs are not defined in any visible cell; they must
# be assigned before this cell runs — confirm where they come from.
row_index = article_df_[article_df_['Link'] == 'https://nos.nl/artikel/2468439-schip-kantelt-tegen-de-kade-in-schotland-25-gewonden'].index[0]

# row_index is an index *label*, so write with label-based .at (as the
# scraping loop does) rather than positional .iloc/.iat, and address the
# columns by name instead of by a hard-coded position.
article_df_.at[row_index, 'Category'] = missing_article_category
article_df_.at[row_index, 'Images'] = missing_article_image
article_df_.at[row_index, 'Paragraphs'] = missing_article_paragraphs

### Write the corrected data to an Excel file

In [70]:
# Save the manually corrected copy to its own workbook.
# NOTE(review): to_excel also writes the index as an extra leading column by
# default — confirm downstream readers expect that.
article_df_.to_excel('NOS_articles_221004-231004_corrected.xlsx')