In [58]:
from bs4 import BeautifulSoup
import requests
import os
from PIL import Image
from io import BytesIO
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import re
import time
from datetime import date, timedelta
from urllib.parse import urlparse

### Import dataset

In [112]:
# Show full cell contents (e.g. long URLs and paragraphs) instead of truncating them.
pd.set_option('display.max_colwidth', None)

# Raw scraped NOS articles; note that this is an absolute, machine-specific path.
dataset_path = '/Users/wiesruyters/Documents/WhD/Repositories/NOS_scrape/Datasets/NOS_articles_221004-231004_corrected.xlsx'
df = pd.read_excel(dataset_path)

In [150]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
article_df = df.copy(deep=True)

In [151]:
# Drop the index column that was written along with the Excel file.
# errors='ignore' keeps this cell re-runnable: a second execution would otherwise
# raise KeyError because the column has already been removed.
article_df = article_df.drop('Unnamed: 0', axis=1, errors='ignore')

# 'Images' holds a stringified list; strip the brackets and quotes so that the
# bare URL text remains (an empty list becomes the empty string).
article_df['Images'] = article_df['Images'].str.replace(r"[\[\]']", '', regex = True)

##### Basic statistics

In [152]:
# Parse the publication date once and derive the calendar features from it.
parsed_dates = pd.to_datetime(article_df['Date'], format='%Y-%m-%d')
article_df['Date'] = parsed_dates
article_df['Month'] = parsed_dates.dt.strftime('%B')   # full month name
article_df['Weekday'] = parsed_dates.dt.weekday        # Monday=0 ... Sunday=6

# Gather the pieces of the overview before printing them.
column_names = article_df.columns.to_list()
top_categories = article_df['Category'].value_counts().head(10)
monthly_counts = article_df['Month'].value_counts()

print(f'''
The columns present in the dataframe are respectively: \n{column_names}
\nThe shape of the dataframe is {article_df.shape}
\nThe top-10 categories entail:\n{top_categories}
\nThe publishing rate (Oct '22 to Oct '23) per month was:\n{monthly_counts}
''')


The columns present in the dataframe are respectively: 
['Article ID', 'Link', 'Title', 'Date', 'Time', 'Category', 'Images', 'Paragraphs', 'Month', 'Weekday']

The shape of the dataframe is (13387, 10)

The top-10 categories entail:
['Buitenland']                            3904
['Binnenland']                            2387
['Economie']                               908
['Politiek']                               688
['Binnenland', 'Buitenland']               456
['NH Nieuws', 'Regionaal nieuws']          277
['Cultuur & Media']                        274
['Omroep Brabant', 'Regionaal nieuws']     272
['Binnenland', 'Politiek']                 206
['Omroep West', 'Regionaal nieuws']        174
Name: Category, dtype: int64

The publishing rate (Oct '22 to Oct '23) per month was:
October      1224
March        1215
November     1181
December     1154
May          1130
June         1122
April        1089
January      1080
September    1073
July         1063
August       1029
February   

### Create subset of political articles

##### Establish lists for entity recognition
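political_parties: all parties holding seats in the Dutch House of Representatives (Tweede Kamer), limited to parties with more than one member and/or an established party name <br>
tk_terms: generic terms referring to the House of Representatives <br>
tk_figures: a subjective selection of members of the House of Representatives <br>
cabinet_terms: generic terms referring to the cabinet <br>
cabinet_figures: all members of the Dutch cabinet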

In [178]:
# Party names and abbreviations used for substring matching in article text.
# Abbreviations carry a leading space (and short ones an explicit trailing
# delimiter) so that they presumably do not match inside longer words.
# Fix: the ' CU;' and ' CU-' variants were missing their leading space, unlike
# every other abbreviation pattern in this list.
political_parties = [
    ' VVD', 'Volkspartij voor Vrijheid en Democratie',
    ' D66', 'Democraten 66',
    ' PVV', 'Partij voor de Vrijheid',
    ' CDA', 'Christen Democraten',
    ' SP ', ' SP.', ' SP,', ' SP:', ' SP;', ' SP-', 'Socialistische Partij',
    ' GL ', 'GroenLinks',
    ' PvdA', 'Partij van de Arbeid', 'P van de A',
    ' PvdD', 'Partij voor de Dieren', 'P voor de D',
    ' CU ', ' CU.', ' CU,', ' CU:', ' CU;', ' CU-', 'ChristenUnie', 'Christen Unie',
    ' FVD', 'Forum voor Democratie', 'Forum',
    ' SGP', 'Staatkundig Gereformeerde Partij',
    ' Denk ', ' Denk.', ' Denk,', ' Denk:', ' Denk;', ' Denk-',
    ' BBB', 'BoerBurgerBeweging', 'Boer Burger Beweging',
    ' Volt',
    ' Groep Van Haga',
    ' JA21',
    ' BIJ1',
]

# Generic terms that refer to the House of Representatives (Tweede Kamer).
tk_terms = [
    'Tweede Kamer',
    'Tweede Kamerlid',
    'Volksvertegenwoordiger',
    'Volksvertegenwoordiging',
    'Coalitie',
    'Oppositie',
    'Partijprominent',
    'Partij prominent',
    'Kamervoorzitter',
]

# Selected members of the House of Representatives: full name plus surname,
# both used as substring patterns.
# Fixes: 'Bontebal' -> 'Bontenbal' (the misspelled pattern could never match the
# actual name) and 'Silvana' -> 'Sylvana' Simons.
tk_figures = [
    'Fleur Agema', 'Agema',
    'Farid Azarkan', 'Azarkan',
    'Thierry Baudet', 'Baudet',
    'Vera Bergkamp', 'Bergkamp',
    'Mirjam Bikker', 'Bikker',
    'Henri Bontenbal', 'Bontenbal',
    'Martin Bosma', 'Bosma',
    'Kauthar Bouchallikh', 'Bouchallikh',
    'Laurens Dassen', 'Dassen',
    'Joost Eerdmans', 'Eerdmans',
    'Corinne Ellemeet', 'Ellemeet',
    'Dion Graus', 'Graus',
    'Nilüfer Gündoğan', 'Gündoğan',
    'Wybren van Haga', 'van Haga',
    'Erik Haverkort', 'Haverkort',
    'Pieter Heerma', 'Heerma',
    'Sophie Hermans', 'Hermans',
    'Pepijn van Houwelingen', 'van Houwelingen',
    'Jesse Klaver', 'Klaver',
    'Tunahan Kuzu', 'Kuzu',
    'Attje Kuiken', 'Kuiken',
    'Lilian Marijnissen', 'Marijnissen',
    'Henk Nijboer', 'Nijboer',
    'Pieter Omtzigt', 'Omtzigt',
    'Esther Ouwehand', 'Ouwehand',
    'Jan Paternotte', 'Paternotte',
    'Caroline van der Plas', 'van der Plas',
    'Sylvana Simons', 'Simons',
    'Kees van der Staaij', 'van der Staaij',
    'Sjoerd Sjoerdsma', 'Sjoerdsma',
    'Judith Tielen', 'Tielen',
    'Lisa Westerveld', 'Westerveld',
    'Geert Wilders', 'Wilders'
]

# Generic terms that refer to the cabinet and its members.
cabinet_terms = [
    'Staatssecretaris',
    'Minister',
    'Kabinet',
    'Ministerraad',
    'staats',
    'Regering',
    'Overheid',
    'Bewindspersoon',
    'Bewindspersonen',
    'Minister President',
    'Premier',
]

# Members of the Dutch cabinet: full name plus surname, both used as substring
# patterns. 'Conny Helder' has no surname-only entry in this list.
# Fix: 'Eeric van der Burg' -> 'Eric van der Burg' (typo in the first name).
cabinet_figures = [
    'Mark Rutte', 'Rutte',
    'Sigrid Kaag', 'Kaag',
    'Carola Schouten', 'Schouten',
    'Karien van Gennip', 'van Gennip',
    'Hanke Bruins Slot', 'Bruins Slot',
    'Wopke Hoekstra', 'Hoekstra',
    'Dilan Yeşilgöz-Zegerius', 'Yeşilgöz',
    'Hugo de Jonge', 'de Jonge',
    'Robbert Dijkgraaf', 'Dijkgraaf',
    'Kajsa Ollongren', 'Ollongren',
    'Mark Harbers', 'Harbers',
    'Micky Adriaansens', 'Adriaansens',
    'Piet Adema', 'Adema',
    'Ernst Kuipers', 'Kuipers',
    'Liesje Schreinemacher', 'Schreinemacher',
    'Franc Weerwind', 'Weerwind',
    'Mariëlle Paul', 'Paul',
    'Rob Jetten', 'Jetten',
    'Christianne van der Wal-Zeggelink', 'Christianne van der Wal', 'van der Wal',
    'Conny Helder',
    'Eric van der Burg', 'van der Burg',
    'Alexandra van Huffelen', 'van Huffelen',
    'Gunay Uslu', 'Uslu',
    'Marnix van Rij', 'van Rij',
    'Aukje de Vries', 'de Vries',
    'Christophe van der Maat', 'van der Maat',
    'Vivianne Heijnen', 'Heijnen',
    'Hans Vijlbrief', 'Vijlbrief',
    'Maarten van Ooijen', 'van Ooijen'
]

# Lowercase copies of every entity list, so they can be matched against
# lowercased article text.
lc_political_parties = list(map(str.lower, political_parties))
lc_tk_terms = list(map(str.lower, tk_terms))
lc_tk_figures = list(map(str.lower, tk_figures))
lc_cabinet_terms = list(map(str.lower, cabinet_terms))
lc_cabinet_figures = list(map(str.lower, cabinet_figures))

In [179]:
# Single combined list of all lowercase political terms, in the order:
# parties, parliament terms, parliament figures, cabinet terms, cabinet figures.
lc_political_words = [
    *lc_political_parties,
    *lc_tk_terms,
    *lc_tk_figures,
    *lc_cabinet_terms,
    *lc_cabinet_figures,
]

Lowercase Paragraph and Title text to match entities better

In [180]:
# Lowercased title and body text, used for case-insensitive substring matching.
article_df['lc Title'] = article_df['Title'].map(str.lower)
article_df['lc Paragraphs'] = article_df['Paragraphs'].map(str.lower)

Collect the political terms found in each title and in each article body

In [181]:
# For each lowercased title, collect every political term that occurs in it as a substring.
# Fix: the original referenced `political_words`, which is not defined in the visible
# notebook (NameError on a fresh kernel); the combined lowercase list is `lc_political_words`.
article_df['Political title'] = article_df['lc Title'].apply(lambda text: [word for word in lc_political_words if word in text])

In [182]:
# For each lowercased article body, collect every political term that occurs in it as a substring.
# Fix: the original referenced `political_words`, which is not defined in the visible
# notebook (NameError on a fresh kernel); the combined lowercase list is `lc_political_words`.
article_df['Political paragraphs'] = article_df['lc Paragraphs'].apply(lambda text: [word for word in lc_political_words if word in text])

In [183]:
# Count the articles with at least one matched political term, computing each
# figure once instead of repeating the expression inside the f-string.
total_articles = len(article_df)
title_hits = article_df['Political title'].map(len).gt(0).sum()
paragraph_hits = article_df['Political paragraphs'].map(len).gt(0).sum()

print(f'''
The number of possibly political titles counts {title_hits} out of {total_articles} articles in total - about {int(title_hits/total_articles*100)}%\n
The number of possibly political articles counts {paragraph_hits} out of {total_articles} articles in total - about {int(paragraph_hits/total_articles*100)}%
''')


The number of possibly political titles counts 1099 out of 13387 articles in total - about 8%

The number of possibly political articles counts 6896 out of 13387 articles in total - about 51%



##### Create subset with possibly political titles, only with image urls

In [184]:
# Keep articles whose title matched at least one political term AND whose
# 'Images' value is not the empty string.
has_political_title = article_df['Political title'].map(len).gt(0)
has_image_url = article_df['Images'].ne('')
political_titles = article_df[has_political_title & has_image_url]

In [191]:
# Inspect the (rows, columns) shape of the political-title subset.
political_titles.shape

(1079, 14)

### Set-up image subtitle scraping

In [186]:
def figcaption_scrape(url, timeout=30):
    """Fetch an article page and return the text of its image caption.

    Parameters
    ----------
    url : str
        Address of the article page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30). Without
        a timeout, ``requests.get`` can block indefinitely on a stalled
        connection, which would hang a scrape over many URLs.

    Returns
    -------
    str
        The caption text of the first ``<figcaption>`` whose class attribute is
        ``'sc-8d6b37cc-2 kFTos'``; an empty string when no such element exists;
        or the exception message when the request or parsing fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn HTTP error statuses (4xx/5xx) into exceptions handled below.
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        figcaption_element = soup.find('figcaption', class_='sc-8d6b37cc-2 kFTos')

        if figcaption_element:
            return figcaption_element.get_text()
        else:
            return ''
    except Exception as e:
        # Best effort: keep the batch going and record the failure text in
        # place of a caption (callers may want to filter these values out).
        return str(e)

In [187]:
# Scrape the image caption for every article link and report the wall-clock time.
start_time = time.time()

article_df['Image subtitle'] = article_df['Link'].map(figcaption_scrape)

end_time = time.time()
elapsed_time = end_time - start_time
# Split the elapsed seconds into whole minutes and the remaining whole seconds.
elapsed_minutes, elapsed_seconds = (int(part) for part in divmod(elapsed_time, 60))

print(f'Elapsed time: {elapsed_minutes} minutes and {elapsed_seconds} seconds')

Elapsed time: 60 minutes and 37 seconds


In [189]:
# Persist the frame, including the scraped captions, to the working directory.
output_filename = 'NOS_articles_221004-231004_corrected_img_subscription.xlsx'
article_df.to_excel(output_filename)

### Set-up image scraping for political title articles

In [192]:
# Download the image of every article in the political-title subset and
# report the wall-clock time.
start_time = time.time()

img_directory = '/Users/wiesruyters/Documents/WhD/Repositories/NOS_scrape/Images'

# exist_ok=True replaces the check-then-create pattern and keeps the cell re-runnable.
os.makedirs(img_directory, exist_ok=True)

for index, row in political_titles.iterrows():
    img_url = row['Images']
    art_id = row['Article ID']

    # Skip rows without a usable URL; a missing value (NaN) is a float, on
    # which len() would raise TypeError.
    if not isinstance(img_url, str) or len(img_url) == 0:
        continue

    try:
        # A timeout prevents one stalled connection from hanging the whole loop.
        img_response = requests.get(img_url, timeout=30)
    except requests.RequestException as e:
        # One failed download should not abort the remaining articles.
        print(f'Skipping article {art_id}: {e}')
        continue

    if img_response.status_code == 200:
        img_filename = f"article_{art_id}.jpg"
        img_path = os.path.join(img_directory, img_filename)

        with open(img_path, 'wb') as img_file:
            img_file.write(img_response.content)

end_time = time.time()
elapsed_time = end_time - start_time
elapsed_minutes = int(elapsed_time // 60)
elapsed_seconds = int(elapsed_time % 60)


print(f'Elapsed time: {elapsed_minutes} minutes and {elapsed_seconds} seconds')

Elapsed time: 1 minutes and 53 seconds
