In [1]:
from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
import re
import sys
import time
from datetime import date, timedelta
from tqdm import tqdm
import parquet
import pyarrow
import ast
import json
import html
import hashlib
import urljoin

Fetch UK input

In [2]:
# Load the UK Paperboy results parquet file into a DataFrame.
uk_articles_df = pd.read_parquet('datasets/news/UK paperboy results/uk_paperboy_results.parquet')

In [3]:
# Work on a copy so the frame loaded from disk is left unmodified.
uk_articles = uk_articles_df.copy()

# Show which columns the loaded frame provides.
print(f'Columns in the UK dataframe: {uk_articles_df.columns}')

Columns in the UK dataframe: Index(['url', 'expanded_url', 'domain', 'status', 'datetime', 'author',
       'headline', 'text', 'misc'],
      dtype='object')


In [4]:
def generate_short_id(url):
    """Return a deterministic 15-character hexadecimal id for ``url``.

    The id is the first 15 hex digits of the SHA-256 digest of the
    encoded URL string.
    """
    digest = hashlib.sha256(url.encode()).hexdigest()
    return digest[:15]

In [5]:
# Outlet label: rows whose expanded URL does not contain 'bbc.com' are
# labelled 'The Guardian'.
uk_articles['outlet'] = uk_articles.apply(
    lambda row: 'The Guardian' if 'bbc.com' not in row.expanded_url else 'BBC',
    axis=1,
)

# Article id: for bbc.com links keep whatever follows the last '/' or '-';
# every other link gets a short hash of the full URL.
uk_articles['id'] = uk_articles['expanded_url'].apply(
    lambda link: generate_short_id(link)
    if 'bbc.com' not in link
    else link.replace('-', '/').split('/')[-1]
)

# Image reference: the second ';'-separated field of `misc`, whitespace-trimmed.
misc_fields = uk_articles['misc'].str.split(';')
uk_articles['images'] = misc_fields.str[1].str.strip()

uk_articles['category'] = ''
uk_articles['text'] = uk_articles['text'].str.replace('\n', ' ')

# Alt text: the value of an alt="..." attribute directly followed by ` src` in `misc`.
uk_articles['alt_txt'] = uk_articles['misc'].str.extract(r'alt="([^"]*)" src')

# Keep only the columns needed downstream, then switch to the target names.
uk_column_rename = {'expanded_url': 'url', 'headline': 'title', 'text': 'paragraphs'}
kept_columns = [
    'outlet', 'id', 'expanded_url', 'images', 'datetime',
    'category', 'headline', 'text', 'alt_txt',
]
uk_articles = uk_articles[kept_columns].rename(columns=uk_column_rename)

In [6]:
# Display the column names of the processed frame.
uk_articles.columns

Index(['outlet', 'id', 'url', 'images', 'datetime', 'category', 'title',
       'paragraphs', 'alt_txt'],
      dtype='object')

Image URLs for BBC articles

In [16]:
def _pick_srcset_url(srcset, preferred_width='480w'):
    """Return one image URL from the value of an HTML ``srcset`` attribute.

    The candidate whose descriptor equals ``preferred_width`` is returned;
    if there is none, the first candidate is used. Returns None when
    ``srcset`` contains no candidates at all.
    """
    candidates = [entry.split() for entry in srcset.split(',')]
    candidates = [parts for parts in candidates if parts]
    if not candidates:
        return None

    for parts in candidates:
        # Compare the descriptor exactly: a substring test would also accept
        # "1480w", or a URL that merely contains the text "480w".
        if preferred_width in parts[1:]:
            return parts[0]
    return candidates[0][0]


def scrape_bbc_image(article_url, delay=1.2, timeout=45):
    """
    Scraper function that fetches the main image url and the figcaption + alt

    Args:
        article_url: URL of the article page to download.
        delay: Seconds to sleep after the request, on every exit path
            (success, missing image block, or error). Pass 0 to disable.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        ``(image_url, text)``. ``image_url`` is the 480w ``srcset`` candidate
        of the first image block (or its first candidate), or None when no
        usable image was found. ``text`` is the image's alt text, followed by
        ``" | <caption>"`` when a figcaption is present. ``(None, None)`` is
        returned when the page has no image block or when any error occurs;
        errors are printed, not raised.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

    try:
        response = requests.get(article_url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        image_block = soup.find('div', {'data-component': 'image-block'})
        if not image_block:
            return None, None

        img_tag = image_block.find('img', {'srcset': True})
        if img_tag:
            image_url = _pick_srcset_url(img_tag['srcset'])
            alt_text = img_tag.get('alt', '')
        else:
            image_url, alt_text = None, ''

        # A missing caption must not discard the image and alt text found
        # above, so the caption is treated as optional.
        caption_block = soup.find('div', {'data-component': 'caption-block'})
        figcaption = caption_block.find('figcaption') if caption_block else None
        caption = figcaption.get_text(strip=True) if figcaption else ''

        # Combine for highest information richness
        combined_text = f"{alt_text} | {caption}" if caption else alt_text
        return image_url, combined_text

    except Exception as e:
        # Deliberately best-effort: one bad page must not abort a long batch.
        print(f"Error scraping {article_url}: {e}")
        return None, None

    finally:
        # Throttle after every request, not only after fully successful ones.
        if delay:
            time.sleep(delay)

In [9]:
# Select the rows that belong to the BBC; only these are scraped.
bbc_mask = uk_articles['outlet'] == 'BBC'
bbc_indices = uk_articles.loc[bbc_mask].index

# URL fragments identifying pages that are not scraped.
skipped_sections = ['/news/videos/', '/news/live/']

for idx in tqdm(bbc_indices, total=len(bbc_indices)):
    url = uk_articles.at[idx, 'url']

    if not any(skip_path in url for skip_path in skipped_sections):
        image_url, alt_caption = scrape_bbc_image(url)

        # Only overwrite the existing values when the scrape found something.
        if image_url:
            uk_articles.at[idx, 'images'] = image_url
        if alt_caption:
            uk_articles.at[idx, 'alt_txt'] = alt_caption

  9%|▉         | 4533/51177 [2:07:13<92:33:31,  7.14s/it]

Error scraping https://www.bbc.com/news/articles/czvjd9rz455o: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20)


 10%|█         | 5169/51177 [2:25:32<97:43:36,  7.65s/it]

Error scraping https://www.bbc.com/news/uk-england-derbyshire-68837439: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20)


 21%|██▏       | 10966/51177 [5:07:48<79:27:02,  7.11s/it]

Error scraping https://www.bbc.com/news/articles/cq5nd99pxj1o: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20)


 26%|██▌       | 13084/51177 [6:06:19<73:46:17,  6.97s/it]

Error scraping https://www.bbc.com/news/articles/c9wz4d881vyo: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20)


 29%|██▉       | 14799/51177 [6:53:07<72:08:21,  7.14s/it]

Error scraping https://www.bbc.com/news/articles/cn0v02wxgrwo: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20)


 35%|███▍      | 17716/51177 [8:13:41<58:03:02,  6.25s/it]

Error scraping https://www.bbc.com/news/articles/cydd7v4d337o: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20)


 38%|███▊      | 19665/51177 [9:07:24<64:22:22,  7.35s/it]

Error scraping https://www.bbc.com/news/articles/c8vv4z6g0ezo: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20)


 39%|███▊      | 19773/51177 [9:10:40<62:09:28,  7.13s/it]

Error scraping https://www.bbc.com/news/articles/crggjkx2zx4o: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20)


 46%|████▋     | 23769/51177 [10:57:46<57:15:41,  7.52s/it]

Error scraping https://www.bbc.com/news/articles/cv227v1n317o: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20)


 59%|█████▉    | 30256/51177 [13:51:14<42:05:51,  7.24s/it]

Error scraping https://www.bbc.com/news/articles/cpv3979gerzo: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20)


 62%|██████▏   | 31924/51177 [14:35:54<37:28:56,  7.01s/it]

Error scraping https://www.bbc.com/news/articles/c047kp3nmevo: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20)


 65%|██████▌   | 33427/51177 [15:16:30<27:22:51,  5.55s/it]

Error scraping https://www.bbc.com/news/articles/c3g08z8m72ro: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20)


 70%|███████   | 35958/51177 [16:26:56<30:06:50,  7.12s/it]

Error scraping https://www.bbc.com/news/articles/cv2gw48q19zo: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20)


 74%|███████▍  | 38042/51177 [17:25:09<28:31:02,  7.82s/it]

Error scraping https://www.bbc.com/news/articles/cg6426wr0e6o: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20)


 80%|███████▉  | 40812/51177 [18:42:58<18:33:41,  6.45s/it]

Error scraping https://www.bbc.com/news/articles/c1rr1qqqny0o: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20)


 82%|████████▏ | 41788/51177 [19:10:51<18:43:25,  7.18s/it]

Error scraping https://www.bbc.com/news/articles/cldyd5413pvo: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20)


100%|█████████▉| 51165/51177 [23:34:50<01:30,  7.50s/it]   

Error scraping https://www.bbc.com/news/articles/c3d9ze22dreo: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20)


100%|██████████| 51177/51177 [23:35:05<00:00,  1.66s/it]


In [17]:
# Article URLs for which the first scraping pass printed a "Read timed out"
# error (copied from that run's output); they are scraped again separately.
bbc_missed_images = [
    'https://www.bbc.com/news/articles/czvjd9rz455o',
    'https://www.bbc.com/news/uk-england-derbyshire-68837439',
    'https://www.bbc.com/news/articles/cq5nd99pxj1o',
    'https://www.bbc.com/news/articles/c9wz4d881vyo',
    'https://www.bbc.com/news/articles/cn0v02wxgrwo',
    'https://www.bbc.com/news/articles/cydd7v4d337o',
    'https://www.bbc.com/news/articles/c8vv4z6g0ezo',
    'https://www.bbc.com/news/articles/crggjkx2zx4o',
    'https://www.bbc.com/news/articles/cv227v1n317o',
    'https://www.bbc.com/news/articles/cpv3979gerzo',
    'https://www.bbc.com/news/articles/c047kp3nmevo',
    'https://www.bbc.com/news/articles/c3g08z8m72ro',
    'https://www.bbc.com/news/articles/cv2gw48q19zo',
    'https://www.bbc.com/news/articles/cg6426wr0e6o',
    'https://www.bbc.com/news/articles/c1rr1qqqny0o',
    'https://www.bbc.com/news/articles/cldyd5413pvo',
    'https://www.bbc.com/news/articles/c3d9ze22dreo'
]

In [18]:
# Rows to retry: BBC articles whose URL is in the list of missed pages.
missed_mask = uk_articles['url'].isin(bbc_missed_images)
combined_mask = missed_mask & (uk_articles['outlet'] == 'BBC')
missed_indices = uk_articles.loc[combined_mask].index

for idx in tqdm(missed_indices, total=len(missed_indices)):
    url = uk_articles.at[idx, 'url']
    image_url, alt_caption = scrape_bbc_image(url)

    # Write back only the values the scrape actually produced.
    for column, value in (('images', image_url), ('alt_txt', alt_caption)):
        if value:
            uk_articles.at[idx, column] = value

100%|██████████| 17/17 [00:28<00:00,  1.66s/it]


In [20]:
# Write the processed articles frame to a parquet file.
uk_articles.to_parquet('datasets/news/uk_articles_preprocessed.parquet')

In [22]:
# Read back the saved processed frame together with the original Paperboy results.
uk_articles_parquet = pd.read_parquet('datasets/news/uk_articles_preprocessed.parquet')
uk_paperboy_parquet = pd.read_parquet('datasets/news/UK paperboy results/uk_paperboy_results.parquet')

In [24]:
# Print shape and column names of both frames for a side-by-side comparison.
print(uk_articles_parquet.shape, uk_articles_parquet.columns)
print(uk_paperboy_parquet.shape, uk_paperboy_parquet.columns)

(82777, 9) Index(['outlet', 'id', 'url', 'images', 'datetime', 'category', 'title',
       'paragraphs', 'alt_txt'],
      dtype='object')
(82789, 9) Index(['url', 'expanded_url', 'domain', 'status', 'datetime', 'author',
       'headline', 'text', 'misc'],
      dtype='object')


In [26]:
# Build a lookup from expanded URL to the datetime stored in the Paperboy results.
paperboy_datetimes = uk_paperboy_parquet.set_index('expanded_url')['datetime']
datetime_mapping = paperboy_datetimes.to_dict()

# Overwrite `datetime` only for rows whose URL is present in the lookup.
mask = uk_articles_parquet['url'].isin(datetime_mapping.keys())
matched_urls = uk_articles_parquet.loc[mask, 'url']
uk_articles_parquet.loc[mask, 'datetime'] = matched_urls.map(datetime_mapping)


In [27]:
# Count the rows selected by `mask` (True entries) and report them against the total.
num_updated = mask.sum()
print(f"Updated datetime for {num_updated} rows out of {len(uk_articles_parquet)}")

Updated datetime for 82777 rows out of 82777
