In [1]:
from bs4 import BeautifulSoup
import requests
import lxml
from PIL import Image
from io import BytesIO
from urllib.parse import urljoin

from datetime import datetime, timedelta
import time

import pandas as pd
import numpy as np

import random
from random import uniform, choice

import itertools
import re
import sys
from datetime import datetime, date, timedelta
from tqdm import tqdm

import parquet
import pyarrow
import ast
import json
import html

In [16]:
pd.set_option('max_colwidth', None)

### The Guardian article URL collection with academic API

The Guardian pre-processing API response from query:
https://content.guardianapis.com/search?from-date=2024-04-01&to-date=2024-08-31&api-key=[your_API_key]0&page-size=200

In [11]:
base_url = "https://content.guardianapis.com/search"
params = {
    'from-date': '2024-04-01',
    'to-date': '2024-08-31',
    'api-key': 'ff436465-9a66-4d62-89f9-9a5bd432f7e0',
    'page-size': 200,  # Max
    'show-fields': 'all',
    'page': 1
}

articles_data = []

for page in range(1, 169):  # pages 1 to 168 (based on response)
    params['page'] = page
    
    try:
        response = requests.get(base_url, params=params).json()
        
        for article in response['response']['results']:
            articles_data.append({
                'id': article.get('id'),
                'type': article.get('type'),
                'section': article.get('sectionName'),
                'publication_date': article.get('webPublicationDate'),
                'title': article.get('webTitle'),
                'url': article.get('webUrl')
            })
            
    except Exception as e:
        print(f"Error on page {page}: {str(e)}")
        continue

In [22]:
guardian_api_response = pd.DataFrame(articles_data)

In [None]:
guardian_api_response['publication_date'] = pd.to_datetime(guardian_api_response['publication_date'])
guardian_api_response['date'] = pd.to_datetime(guardian_api_response['publication_date']).dt.date

print(f"Full API response is of shape {guardian_api_response.shape}")
print(f"Total set of article types lists: {guardian_api_response["type"].unique()}\n")

guardian_articles = guardian_api_response[guardian_api_response['type']=="article"]
print(f"The remaining dataset with only articles is of shape {guardian_articles.shape}")

Full API response is of shape (33427, 7)
Total set of article types lists: ['article' 'crossword' 'liveblog' 'interactive']
The remaining dataset with only articles is of shape (31601, 7)


In [None]:
guardian_articles.to_csv("datasets/news/UK input/guardian.csv")

### BBC article collection

In [37]:
BASE_URL = "https://www.bbc.com/pages/archive/{year}/{month}/{day}?page={page}"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.bbc.com/pages/archive",
    "Cookie": "BBC-consent=necessary"
}
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1"
]
MAX_PAGES = 80

# UK election period 
start_date = datetime(2024, 4, 1)
end_date = datetime(2024, 8, 31)
date_range = [start_date + timedelta(days=i) for i in range((end_date - start_date).days + 1)]

# Initialisation
articles_data = []

for date in date_range:
    year = date.year
    month = f"{date.month:02d}"
    day = f"{date.day:02d}"

    for page in range(1, MAX_PAGES + 1):
        url = BASE_URL.format(year=year, month=month, day=day, page=page)

        try:
            # Rotate User-Agent
            HEADERS["User-Agent"] = choice(USER_AGENTS)
            
            # Request with delay
            time.sleep(uniform(1, 3))
            response = requests.get(url, headers=HEADERS, timeout=15)
            
            if response.status_code != 200:
                print(f"Status {response.status_code} at {url}")
                break

            soup = BeautifulSoup(response.text, "html.parser")
            
            # Extract articles
            article_cards = []
            for element in soup.select('div[data-testid="liverpool-card"], [data-testid*="card"]'):
                links = element.select('a[href*="/news/"]')
                if links:
                    article_cards.extend(links)

            print(f"Found {len(article_cards)} valid articles on {url}")

            if not article_cards:
                print(f"No articles with valid links at {url}")

            # Extract images
            image_divs = soup.select('div[data-testid="card-media"], [data-testid*="media"] img[srcset]')

            for idx, card in enumerate(article_cards):
                try:
                    article_url = card.get('href', '')
                    if not article_url.startswith('http'):
                        article_url = "https://www.bbc.com" + article_url
                
                    # Image handling
                    img_data = {
                        "image_url_page_tile": None,
                        "image_alt": ""
                    }
                    if idx < len(image_divs):
                        img_tag = image_divs[idx]
                        srcset = img_tag.get("srcset", "")
                        if srcset:
                            img_data["image_url_page_tile"] = srcset.split(",")[0].split(" ")[0].strip()
                        img_data["image_alt"] = img_tag.get("alt", "")

                    articles_data.append({
                        "date": date.date(),
                        "page": page,
                        "article_url": article_url,
                        "page_tile_img_url": img_data["image_url_page_tile"],
                        "page_tile_img_alt": img_data["image_alt"]
                    })
                    
                except Exception as e:
                    print(f"Skipping malformed card at index {idx}: {str(e)[:100]}")
                    continue

        except Exception as e:
            print(f"Error on {url}: {str(e)[:100]}")
            time.sleep(5)
            continue

Found 7 valid articles on https://www.bbc.com/pages/archive/2024/04/01?page=1
Found 2 valid articles on https://www.bbc.com/pages/archive/2024/04/01?page=2
Found 6 valid articles on https://www.bbc.com/pages/archive/2024/04/01?page=3
Found 1 valid articles on https://www.bbc.com/pages/archive/2024/04/01?page=4
Found 4 valid articles on https://www.bbc.com/pages/archive/2024/04/01?page=5
Found 4 valid articles on https://www.bbc.com/pages/archive/2024/04/01?page=6
Found 6 valid articles on https://www.bbc.com/pages/archive/2024/04/01?page=7
Found 1 valid articles on https://www.bbc.com/pages/archive/2024/04/01?page=8
Found 5 valid articles on https://www.bbc.com/pages/archive/2024/04/01?page=9
Found 5 valid articles on https://www.bbc.com/pages/archive/2024/04/01?page=10
Found 3 valid articles on https://www.bbc.com/pages/archive/2024/04/01?page=11
Found 4 valid articles on https://www.bbc.com/pages/archive/2024/04/01?page=12
Found 3 valid articles on https://www.bbc.com/pages/archive/2

In [39]:
bbc_articles = pd.DataFrame(articles_data)

In [40]:
bbc_articles.head()

Unnamed: 0,date,page,article_url,page_tile_img_url,page_tile_img_alt
0,2024-04-01,1,https://www.bbc.com/news/blogs-the-papers-68710143,,
1,2024-04-01,1,https://www.bbc.com/news/health-68684976,https://ichef.bbci.co.uk/news/240/cpsprodpb/3647/production/_133059831_untitleddesign.jpg.webp,Compilation of the Times and Metro front pages
2,2024-04-01,1,https://www.bbc.com/news/world-us-canada-68710223,,
3,2024-04-01,1,https://www.bbc.com/news/entertainment-arts-68675076,https://ichef.bbci.co.uk/news/240/cpsprodpb/E5DC/production/_128244885_gettyimages-1204982830.jpg.webp,"The technology involves an insulin pump, continuous glucose monitor and an algorithm to calculate the amount of insulin needed"
4,2024-04-01,1,https://www.bbc.com/news/business-68429393,,


In [41]:
bbc_articles.to_csv('datasets/news/UK input/bbc.csv')

## Combine Guardian and BBC article urls as input for paperboy (R studio)

In [21]:
guardian_articles = pd.read_csv('datasets/news/UK input/guardian.csv')
guardian_articles.drop(columns=['Unnamed: 0'], inplace=True)
bbc_articles = pd.read_csv('datasets/news/UK input/bbc.csv')
bbc_articles.drop(columns=['Unnamed: 0'], inplace=True)

print(f'Columns for the Guardian dataset: {guardian_articles.columns}')
print(f'Columns for the BBC dataset: {bbc_articles.columns}')

Columns for the Guardian dataset: Index(['id', 'type', 'section', 'publication_date', 'title', 'url', 'date'], dtype='object')
Columns for the BBC dataset: Index(['date', 'page', 'article_url', 'page_tile_img_url',
       'page_tile_img_alt'],
      dtype='object')


In [18]:
uk_article_urls = pd.DataFrame()
uk_article_urls['url'] = pd.concat([bbc_articles['article_url'], guardian_articles['url']])

In [None]:
uk_article_urls.to_csv('datasets/news/UK input/uk_article_urls.csv')