In [47]:
# For necessary data processing and calculations
import numpy as np
import pandas as pd

# For reading and writing files
import json
import pickle
import sys
import os
import glob
import shutil
import io
from io import BytesIO
from pathlib import Path
import hashlib
import pyarrow
import fastparquet
import uuid
import re

# For image processing
from PIL import Image, UnidentifiedImageError
import dlib
import cv2
import insightface
from insightface.app import FaceAnalysis

# For machine learning
import tensorflow as tf

# For web scraping
import html
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urlparse

# For tracking programming progress
from tqdm import tqdm
import time
import random

In [2]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

In [3]:
pd.options.display.max_colwidth = None

Initalize Insightface

In [4]:
app = FaceAnalysis(
    name='buffalo_l',
    providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
)
app.prepare(ctx_id=0, det_size=(640, 640))



Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /Users/wiesruyters/.insightface/models/buffalo_l/1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /Users/wiesruyters/.insightface/models/buffalo_l/2d106det.onnx landmark_2d_106 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /Users/wiesruyters/.insightface/models/buffalo_l/det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /Users/wiesruyters/.insightface/models/buffalo_l/genderage.onnx genderage ['None', 3, 96, 96] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /Users/wiesruyters/.insightface/models/buffalo_l/w600k_r50.onnx recognition ['None

In [5]:
def clean_image_urls(df):
    """
    Removes articles with invalid/empty image URLs from DataFrame
    Returns cleaned DataFrame and keeps original index
    """
    # Make sure we have the required columns
    if 'images' not in df.columns:
        raise ValueError("DataFrame must contain 'images' column")
    
    # Create mask for valid URLs
    mask = (
        df['images'].notna() & 
        df['images'].astype(str).str.strip().ne('') &
        ~df['images'].astype(str).str.lower().isin(['none', 'nan', 'null']) &
        df['images'].astype(str).str.contains('http')
    )
    
    # Filter and clean
    clean_df = df[mask].copy()
    clean_df.loc[:, 'images'] = clean_df['images'].astype(str).str.strip(" '\"")
    
    return clean_df.reset_index(drop=True)

def to_list_if_not(value):
    if isinstance(value, float) and np.isnan(value):
        return []
    if not isinstance(value, list):
        return [value]
    return value


Initalize NL & UK datasets

In [22]:
nl_articles = pd.read_parquet('datasets/news/nl_articles_preprocessed.parquet')
uk_articles = pd.read_parquet('datasets/news/uk_articles_preprocessed.parquet')

In [23]:
nl_articles['datetime'] = pd.to_datetime(nl_articles['datetime']).dt.tz_localize(None)

uk_articles['country'] = 'UK'
uk_articles['datetime'] = pd.to_datetime(uk_articles['datetime'], format='mixed')
uk_articles = uk_articles[['country', 'outlet', 'id', 'url', 'images', 'datetime', 'category', 'title',
       'paragraphs', 'alt_txt']]
uk_articles['category'] = uk_articles['category'].apply(lambda x: [] if x=='' else x)

In [24]:
# For testing
nl_articles_sample = nl_articles.sample(n=10)
uk_articles_sample = uk_articles.sample(n=10)
all_articles_sample = pd.concat([nl_articles_sample, uk_articles_sample])

img_articles_sample = clean_image_urls(all_articles_sample)

### Necessary dtype conversions in order to concatenate both countries
Create a unique id to store img downloads in order to match them later. <br>
Also, create compatible dtypes for rparquet file writing. <br>

In [25]:
# For final data downloads
all_articles = pd.concat([nl_articles, uk_articles])
all_articles['datetime'] = pd.to_datetime(all_articles['datetime'])

In [26]:
num_digits = len(str(len(all_articles)))
all_articles['id_unique'] = [f"id{i:0{num_digits}d}" for i in range(1, len(all_articles)+1)]

all_articles['id'] = all_articles['id'].astype(str)
all_articles['id_unique'] = all_articles['id_unique'].astype(str)

all_articles['category'] = all_articles['category'].apply(to_list_if_not)

In [27]:
print(f'All UK, NL articled concatenated are of shape {all_articles.shape}')
print(f'and have the following dtypes:')
print(f'{all_articles.dtypes}')

All UK, NL articled concatenated are of shape (94692, 11)
and have the following dtypes:
country               object
outlet                object
id                    object
url                   object
images                object
datetime      datetime64[ns]
category              object
title                 object
paragraphs            object
alt_txt               object
id_unique             object
dtype: object


In [28]:
# Now save to parquet
all_articles.to_parquet('datasets/news/all_articles_with_id.parquet')

Filter only rows that feature an image

In [29]:
# (Debugger)
print(all_articles_sample.shape)
#print(all_articles_sample['images'])
print('*******')
print(img_articles_sample.shape)
#print(img_articles_sample['images'])

(20, 10)
*******
(19, 10)


In [30]:
img_articles = clean_image_urls(all_articles)
print(f'After filtering for articles that have no main image, e.g. because they host a video or liveblog,')
print(f'a dataset of shape {img_articles.shape} remains.')

After filtering for articles that have no main image, e.g. because they host a video or liveblog,
a dataset of shape (86862, 11) remains.


In [31]:
img_articles.to_parquet('datasets/news/img_articles_with_id.parquet')

In [32]:
img_articles[['id', 'id_unique']][:1]

Unnamed: 0,id,id_unique
0,2503389,id00001


Define functions

In [33]:
def generate_filename(outlet, article_id, id_unique, face_index):
    """Generates a standardized filename for a cropped face image."""
    safe_outlet = "".join(c for c in outlet if c.isalnum() or c in ('_')).rstrip()
    return f"{safe_outlet}_{article_id}_{id_unique}_{face_index}.jpg"

def download_image(url, timeout=None):
    """
    Robust image downloader with complete validation
    """
        
    try:
        response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=timeout
        )
        response.raise_for_status()
        
        if 'image' not in response.headers.get('Content-Type', '').lower():
            print(f"URL does not point to an image: {url}")
            return None
            
        image = cv2.imdecode(np.frombuffer(response.content, np.uint8), cv2.IMREAD_COLOR)
        return image if image is not None else None
        
    except Exception as e:
        print(f"Error downloading {url}: {str(e)}")
        return None

def process_and_save_faces(df, output_base, face_app, skip_existing=True, start_index=0, progress_file='progress_index.txt'):
    """
    Processes DataFrame to download images, detect faces, and save them
    """
    successful_saves = []
    failed_articles = []

    # Initialize progress bar
    for i, row in tqdm(df.iterrows(), total=len(df), desc="Processing..."):
        current_global_index = start_index + i

        try:
            # Extract data with validation
            country = str(row.get('country', ''))
            outlet = str(row.get('outlet', ''))
            article_id = str(row.get('id', ''))
            id_unique = str(row.get('id_unique', ''))
            img_url = str(row.get('images', ''))
            article_url = str(row.get('url', ''))
            
            # Validate required fields
            if not all([country, outlet, article_id, img_url]):
                failed_articles.append({
                    'outlet': outlet,
                    'id': article_id,
                    'id_unique': id_unique,
                    'url': article_url,
                    'image': img_url,
                    'reason': 'Missing required field(s)'
                })
                continue

            with open(progress_file, 'w') as f:
                f.write(str(current_global_index + 1))
                
            # Set output directory
            output_dir = os.path.join(output_base, country, outlet)
            os.makedirs(output_dir, exist_ok=True)
            
            # Download image with retries
            img = None
            for attempt in range(3):
                img = download_image(img_url)
                if img is not None:
                    break
                time.sleep(1)
            
            if img is None:
                failed_articles.append({
                    'outlet': outlet,
                    'id': article_id,
                    'id_unique': id_unique,
                    'url': article_url,
                    'image': img_url,
                    'reason': f'Failed to download image'
                })
                continue
                
            # Face detection and processing
            faces = face_app.get(img)
            faces = [face for face in faces if face.det_score > 0.6]
            
            for i, face in enumerate(faces):
                face_filename = generate_filename(outlet, article_id, id_unique, i)
                save_path = os.path.join(output_dir, face_filename)
                
                if skip_existing and os.path.exists(save_path):
                    continue
                    
                # Process and save face crop
                bbox = face.bbox.astype(int)
                x1, y1, x2, y2 = bbox
                pad_w, pad_h = int(0.35 * (x2-x1)), int(0.35 * (y2-y1))
                
                x1 = max(0, x1 - pad_w)
                y1 = max(0, y1 - pad_h)
                x2 = min(img.shape[1], x2 + pad_w)
                y2 = min(img.shape[0], y2 + pad_h)
                
                face_crop = img[y1:y2, x1:x2]
                if face_crop.size == 0:
                    continue
                    
                cv2.imwrite(save_path, face_crop, [int(cv2.IMWRITE_JPEG_QUALITY), 90])
                successful_saves.append(save_path)
                
                time.sleep(0.75)

        except Exception as e:
            failed_articles.append({
                'outlet': outlet if 'outlet' in locals() else 'unknown',
                'id': article_id if 'article_id' in locals() else 'unknown',
                'id_unique': id_unique if 'id_unique' in locals() else 'unknown',
                'url': article_url if 'article_url' in locals() else 'unknown',
                'image': img_url if 'image_url' in locals() else 'unknown',
                'reason': f'Unexpected error: {str(e)}'
            })
            
    return successful_saves, failed_articles

In [34]:
OUTPUT_BASE = 'datasets/news/UK-NL__news_faces'
os.makedirs(OUTPUT_BASE, exist_ok=True)

In [35]:
print(f"Starting to process {len(img_articles)} articles...")
successful_saves, failed_articles = process_and_save_faces(
    df=img_articles,
    output_base=OUTPUT_BASE,
    face_app=app,
    skip_existing=True
)

print("\n--- Processing Complete ---")
print(f"Successfully saved {len(successful_saves)} face images.")
if failed_articles:
    print(f"Encountered {len(failed_articles)} issues.")
    for failure in failed_articles:
        print(f"-Outlet {failure['outlet']} - article id {failure['id']} - unique id {failure['id_unique']}:")
        print(f"    for url {failure['url']} with image {failure['image']}:")
        print(f"    {failure['reason']}")
        print('* * * ' * 3)

Starting to process 86862 articles...


Processing...:   7%|▋         | 5746/86862 [3:27:12<54:16:40,  2.41s/it]  

Error downloading https://media.nu.nl/m/6hyxw7ka42lm_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/6hyxw7ka42lm_wd854


Processing...:   7%|▋         | 5752/86862 [3:28:05<127:55:53,  5.68s/it]

Error downloading https://media.nu.nl/m/k0jx258aztth_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/k0jx258aztth_wd854


Processing...:   7%|▋         | 5761/86862 [3:28:53<59:01:09,  2.62s/it] 

Error downloading https://media.nu.nl/m/4qkxds1aqs4h_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/4qkxds1aqs4h_wd854


Processing...:   7%|▋         | 5827/86862 [3:33:09<82:22:18,  3.66s/it] 

Error downloading https://media.nu.nl/m/1kax8eqaahtn_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/1kax8eqaahtn_wd854


Processing...:   7%|▋         | 5843/86862 [3:34:40<71:12:45,  3.16s/it] 

Error downloading https://media.nu.nl/m/1vmx6d1a0glg_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/1vmx6d1a0glg_wd854


Processing...:   7%|▋         | 5883/86862 [3:37:43<105:57:02,  4.71s/it]

Error downloading https://media.nu.nl/m/rbsx7qiawj7u_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/rbsx7qiawj7u_wd854


Processing...:   7%|▋         | 5885/86862 [3:38:18<219:29:08,  9.76s/it]

Error downloading https://media.nu.nl/m/uy2xoynaandt_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/uy2xoynaandt_wd854
Error downloading https://media.nu.nl/m/uy2xoynaandt_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/uy2xoynaandt_wd854
Error downloading https://media.nu.nl/m/uy2xoynaandt_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/uy2xoynaandt_wd854


Processing...:   7%|▋         | 5912/86862 [3:41:19<57:05:09,  2.54s/it] 

Error downloading https://media.nu.nl/m/t83xev9aokel_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/t83xev9aokel_wd854


Processing...:   7%|▋         | 5946/86862 [3:43:47<72:45:16,  3.24s/it] 

Error downloading https://media.nu.nl/m/jb3x2eaadznc_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/jb3x2eaadznc_wd854


Processing...:   7%|▋         | 5973/86862 [3:45:56<82:41:16,  3.68s/it] 

Error downloading https://media.nu.nl/m/67wxibvam1vq_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/67wxibvam1vq_wd854


Processing...:   7%|▋         | 5997/86862 [3:47:43<72:02:22,  3.21s/it] 

Error downloading https://media.nu.nl/m/qxnx14nad9gb_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/qxnx14nad9gb_wd854


Processing...:   7%|▋         | 6059/86862 [3:51:59<95:09:29,  4.24s/it] 

Error downloading https://media.nu.nl/m/l8dx7ugabzcn_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/l8dx7ugabzcn_wd854


Processing...:   7%|▋         | 6064/86862 [3:52:41<127:01:54,  5.66s/it]

Error downloading https://media.nu.nl/m/hfexuw7af88e_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/hfexuw7af88e_wd854
Error downloading https://media.nu.nl/m/hfexuw7af88e_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/hfexuw7af88e_wd854
Error downloading https://media.nu.nl/m/hfexuw7af88e_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/hfexuw7af88e_wd854


Processing...:   7%|▋         | 6068/86862 [3:54:03<247:02:08, 11.01s/it]

Error downloading https://media.nu.nl/m/isfxzkbagj4j_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/isfxzkbagj4j_wd854


Processing...:   7%|▋         | 6140/86862 [3:59:21<67:31:54,  3.01s/it] 

Error downloading https://media.nu.nl/m/ro6xcmya1z47_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/ro6xcmya1z47_wd854


Processing...:   7%|▋         | 6156/86862 [4:00:46<66:24:12,  2.96s/it] 

Error downloading https://media.nu.nl/m/zc3xdsaatel2_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/zc3xdsaatel2_wd854


Processing...:   7%|▋         | 6196/86862 [4:03:34<56:42:41,  2.53s/it] 

Error downloading https://media.nu.nl/m/zehx18yapc3x_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/zehx18yapc3x_wd854


Processing...:   7%|▋         | 6215/86862 [4:05:01<125:09:54,  5.59s/it]

Error downloading https://media.nu.nl/m/g55xm00ane5z_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/g55xm00ane5z_wd854


Processing...:   7%|▋         | 6226/86862 [4:06:16<88:53:18,  3.97s/it] 

Error downloading https://media.nu.nl/m/qx6xjjcaxslq_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/qx6xjjcaxslq_wd854


Processing...:   7%|▋         | 6239/86862 [4:07:38<97:41:38,  4.36s/it] 

Error downloading https://media.nu.nl/m/dwhx6vwa2z96_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/dwhx6vwa2z96_wd854


Processing...:   7%|▋         | 6276/86862 [4:10:54<73:41:57,  3.29s/it] 

Error downloading https://media.nu.nl/m/8a1xltsa5w4h_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/8a1xltsa5w4h_wd854


Processing...:   7%|▋         | 6296/86862 [4:12:19<48:25:42,  2.16s/it] 

Error downloading https://media.nu.nl/m/pxsxys0a6x4q_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/pxsxys0a6x4q_wd854


Processing...:   7%|▋         | 6317/86862 [4:13:58<52:04:11,  2.33s/it] 

Error downloading https://media.nu.nl/m/fsjxteeadvax_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/fsjxteeadvax_wd854


Processing...:   7%|▋         | 6356/86862 [4:17:11<89:29:38,  4.00s/it] 

Error downloading https://media.nu.nl/m/z70xbdba4cd7_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/z70xbdba4cd7_wd854


Processing...:   7%|▋         | 6363/86862 [4:18:09<130:03:15,  5.82s/it]

Error downloading https://media.nu.nl/m/3o9xz33a42xo_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/3o9xz33a42xo_wd854
Error downloading https://media.nu.nl/m/3o9xz33a42xo_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/3o9xz33a42xo_wd854


Processing...:   7%|▋         | 6416/86862 [4:21:58<78:24:25,  3.51s/it] 

Error downloading https://media.nu.nl/m/j6exfjsaefzr_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/j6exfjsaefzr_wd854


Processing...:   7%|▋         | 6417/86862 [4:22:24<235:16:50, 10.53s/it]

Error downloading https://media.nu.nl/m/ql7xks0aq2rm_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/ql7xks0aq2rm_wd854


Processing...:   7%|▋         | 6440/86862 [4:24:01<79:52:54,  3.58s/it] 

Error downloading https://media.nu.nl/m/fgtxur1ampo9_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/fgtxur1ampo9_wd854


Processing...:   7%|▋         | 6442/86862 [4:24:49<283:30:36, 12.69s/it]

Error downloading https://media.nu.nl/m/hhzxyimaopyk_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/hhzxyimaopyk_wd854


Processing...:   7%|▋         | 6446/86862 [4:25:37<211:37:13,  9.47s/it]

Error downloading https://media.nu.nl/m/oakx04najwbx_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/oakx04najwbx_wd854
Error downloading https://media.nu.nl/m/oakx04najwbx_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/oakx04najwbx_wd854


Processing...:   7%|▋         | 6460/86862 [4:27:27<104:33:59,  4.68s/it]

Error downloading https://media.nu.nl/m/pspxyaaas3g8_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/pspxyaaas3g8_wd854


Processing...:   7%|▋         | 6487/86862 [4:29:09<52:54:11,  2.37s/it] 

Error downloading https://media.nu.nl/m/aycxaxxamphf_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/aycxaxxamphf_wd854
Error downloading https://media.nu.nl/m/aycxaxxamphf_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/aycxaxxamphf_wd854


Processing...:   7%|▋         | 6510/86862 [4:31:26<74:10:05,  3.32s/it] 

Error downloading https://media.nu.nl/m/81exhoqao2c2_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/81exhoqao2c2_wd854


Processing...:   8%|▊         | 6516/86862 [4:32:07<84:36:25,  3.79s/it] 

Error downloading https://media.nu.nl/m/mofxfcia4vgr_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/mofxfcia4vgr_wd854


Processing...:   8%|▊         | 6529/86862 [4:33:40<109:25:58,  4.90s/it]

Error downloading https://media.nu.nl/m/0jwxbmya7wqh_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/0jwxbmya7wqh_wd854


Processing...:   8%|▊         | 6583/86862 [4:37:11<72:25:59,  3.25s/it] 

Error downloading https://media.nu.nl/m/qflx01ga1ssb_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/qflx01ga1ssb_wd854


Processing...:   8%|▊         | 6585/86862 [4:37:42<186:30:56,  8.36s/it]

Error downloading https://media.nu.nl/m/gjzxdtraa2r7_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/gjzxdtraa2r7_wd854


Processing...:   8%|▊         | 6607/86862 [4:39:28<77:55:07,  3.50s/it] 

Error downloading https://media.nu.nl/m/5gjxypoaptn2_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/5gjxypoaptn2_wd854
Error downloading https://media.nu.nl/m/5gjxypoaptn2_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/5gjxypoaptn2_wd854


Processing...:   8%|▊         | 6612/86862 [4:40:38<151:02:57,  6.78s/it]

Error downloading https://media.nu.nl/m/3d3x7qwamjg5_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/3d3x7qwamjg5_wd854
Error downloading https://media.nu.nl/m/3d3x7qwamjg5_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/3d3x7qwamjg5_wd854


Processing...:   8%|▊         | 6629/86862 [4:42:53<114:04:20,  5.12s/it]

Error downloading https://media.nu.nl/m/49pxparahmhe_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/49pxparahmhe_wd854


Processing...:   8%|▊         | 6671/86862 [4:46:01<63:43:17,  2.86s/it] 

Error downloading https://media.nu.nl/m/3qmx17pawqxe_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/3qmx17pawqxe_wd854


Processing...:   8%|▊         | 6678/86862 [4:47:02<153:24:57,  6.89s/it]

Error downloading https://media.nu.nl/m/3zax5uoad2gy_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/3zax5uoad2gy_wd854


Processing...:   8%|▊         | 6715/86862 [4:50:50<120:53:32,  5.43s/it]

Error downloading https://media.nu.nl/m/gnzxx9ea7ya2_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/gnzxx9ea7ya2_wd854


Processing...:  11%|█         | 9187/86862 [7:24:36<57:56:41,  2.69s/it] 

Error downloading https://media.nu.nl/m/ve2xecqa8pre_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/ve2xecqa8pre_wd854


Processing...:  12%|█▏        | 10055/86862 [8:19:12<71:10:16,  3.34s/it] 

Error downloading https://media.nu.nl/m/e6mxupvafq1c_wd854: 404 Client Error: Not Found for url: https://media.nu.nl/m/e6mxupvafq1c_wd854


Processing...:  12%|█▏        | 10275/86862 [8:35:55<46:58:57,  2.21s/it] 

Error downloading https://media.nu.nl/m/4zux03xap9k6_wd854: 504 Server Error: Gateway Time-out for url: https://media.nu.nl/m/4zux03xap9k6_wd854
Error downloading https://media.nu.nl/m/4zux03xap9k6_wd854: 504 Server Error: Gateway Time-out for url: https://media.nu.nl/m/4zux03xap9k6_wd854
Error downloading https://media.nu.nl/m/4zux03xap9k6_wd854: 504 Server Error: Gateway Time-out for url: https://media.nu.nl/m/4zux03xap9k6_wd854


Processing...:  14%|█▎        | 11807/86862 [9:44:04<30:13:21,  1.45s/it] 

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/B186/production/_133064454_boatrepairstill.jpg.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/B186/production/_133064454_boatrepairstill.jpg.webp


Processing...:  16%|█▌        | 13672/86862 [10:23:39<110:25:54,  5.43s/it]

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/c4ac/live/f1ef6600-f71b-11ee-af97-c31fb967c02d.jpg.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/c4ac/live/f1ef6600-f71b-11ee-af97-c31fb967c02d.jpg.webp


Processing...:  22%|██▏       | 18913/86862 [12:21:02<13:09:01,  1.44it/s] 

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/6c8a/live/fa58ea10-0548-11ef-b9d8-4f52aebe147d.jpg.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/6c8a/live/fa58ea10-0548-11ef-b9d8-4f52aebe147d.jpg.webp


Processing...:  23%|██▎       | 20216/86862 [12:50:41<14:43:15,  1.26it/s] 

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/c287/live/d8d7ac20-087a-11ef-806d-f7f83a5f5a2f.jpg.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/c287/live/d8d7ac20-087a-11ef-806d-f7f83a5f5a2f.jpg.webp


Processing...:  25%|██▍       | 21352/86862 [13:23:46<41:06:03,  2.26s/it] 


KeyboardInterrupt: 

In [36]:
print(f"Starting to process {len(img_articles[21350:])} articles...")
successful_saves2, failed_articles2 = process_and_save_faces(
    df=img_articles[21350:],
    output_base=OUTPUT_BASE,
    face_app=app,
    skip_existing=True
)

print("\n--- Processing Complete ---")
print(f"Successfully saved {len(successful_saves2)} face images.")
if failed_articles2:
    print(f"Encountered {len(failed_articles2)} issues.")
    for failure in failed_articles2:
        print(f"-Outlet {failure['outlet']} - article id {failure['id']} - unique id {failure['id_unique']}:")
        print(f"    for url {failure['url']} with image {failure['image']}:")
        print(f"    {failure['reason']}")
        print('* * * ' * 3)

Starting to process 65512 articles...


Processing...:   2%|▏         | 1180/65512 [27:24<23:14:01,  1.30s/it]

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/1841F/production/_133295399_bruceandfarrah.png.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/1841F/production/_133295399_bruceandfarrah.png.webp


Processing...:  15%|█▍        | 9750/65512 [3:45:45<38:31:07,  2.49s/it] 

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/1ed7/live/4346bd10-259c-11ef-8997-c3654bee9602.jpg.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/1ed7/live/4346bd10-259c-11ef-8997-c3654bee9602.jpg.webp


Processing...:  16%|█▋        | 10745/65512 [4:08:47<26:42:15,  1.76s/it] 

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/c8c9/live/52e6a7f0-2711-11ef-a82d-c9f935a67f4f.jpg.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/c8c9/live/52e6a7f0-2711-11ef-a82d-c9f935a67f4f.jpg.webp


Processing...:  21%|██        | 13892/65512 [5:24:19<20:05:07,  1.40s/it] 


KeyboardInterrupt: 

In [37]:
print(f"Starting to process {len(img_articles[35240:])} articles...")
successful_saves3, failed_articles3 = process_and_save_faces(
    df=img_articles[35240:],
    output_base=OUTPUT_BASE,
    face_app=app,
    skip_existing=True
)

print("\n--- Processing Complete ---")
print(f"Successfully saved {len(successful_saves3)} face images.")
if failed_articles3:
    print(f"Encountered {len(failed_articles3)} issues.")
    for failure in failed_articles3:
        print(f"-Outlet {failure['outlet']} - article id {failure['id']} - unique id {failure['id_unique']}:")
        print(f"    for url {failure['url']} with image {failure['image']}:")
        print(f"    {failure['reason']}")
        print('* * * ' * 3)

Starting to process 51622 articles...


Processing...:   0%|          | 58/51622 [01:30<10:21:34,  1.38it/s]

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/7a15/live/922aeb70-2fe4-11ef-9b42-87ce3f31eabc.jpg.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/7a15/live/922aeb70-2fe4-11ef-9b42-87ce3f31eabc.jpg.webp


Processing...:  16%|█▌        | 8336/51622 [3:23:07<26:50:20,  2.23s/it] 

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/abbb/live/24c394e0-4783-11ef-b74b-5f98efd74680.jpg.webp: OpenCV(4.5.5) /Users/runner/miniforge3/conda-bld/libopencv_1656375809040/work/modules/imgcodecs/src/loadsave.cpp:816: error: (-215:Assertion failed) !buf.empty() in function 'imdecode_'

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/abbb/live/24c394e0-4783-11ef-b74b-5f98efd74680.jpg.webp: OpenCV(4.5.5) /Users/runner/miniforge3/conda-bld/libopencv_1656375809040/work/modules/imgcodecs/src/loadsave.cpp:816: error: (-215:Assertion failed) !buf.empty() in function 'imdecode_'

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/abbb/live/24c394e0-4783-11ef-b74b-5f98efd74680.jpg.webp: OpenCV(4.5.5) /Users/runner/miniforge3/conda-bld/libopencv_1656375809040/work/modules/imgcodecs/src/loadsave.cpp:816: error: (-215:Assertion failed) !buf.empty() in function 'imdecode_'



Processing...:  27%|██▋       | 14013/51622 [5:37:16<7:49:52,  1.33it/s]  

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/31e3/live/52b29640-558d-11ef-9ae6-47d584e77449.jpg.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/31e3/live/52b29640-558d-11ef-9ae6-47d584e77449.jpg.webp


Processing...:  31%|███       | 15909/51622 [6:20:58<13:43:41,  1.38s/it] 

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/e5d5/live/bf235bb0-5a25-11ef-9d2d-89abc1f1e271.png.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/e5d5/live/bf235bb0-5a25-11ef-9d2d-89abc1f1e271.png.webp


Processing...:  35%|███▌      | 18155/51622 [7:14:08<8:45:11,  1.06it/s]  

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/59c1/live/6d26e1b0-60b0-11ef-b76f-f1f593af629e.jpg.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/59c1/live/6d26e1b0-60b0-11ef-b76f-f1f593af629e.jpg.webp


Processing...:  39%|███▉      | 20053/51622 [7:58:10<18:24:37,  2.10s/it] 

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/2442/live/018f9390-64f0-11ef-a457-abfbbac808ad.jpg.webp: OpenCV(4.5.5) /Users/runner/miniforge3/conda-bld/libopencv_1656375809040/work/modules/imgcodecs/src/loadsave.cpp:816: error: (-215:Assertion failed) !buf.empty() in function 'imdecode_'

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/2442/live/018f9390-64f0-11ef-a457-abfbbac808ad.jpg.webp: OpenCV(4.5.5) /Users/runner/miniforge3/conda-bld/libopencv_1656375809040/work/modules/imgcodecs/src/loadsave.cpp:816: error: (-215:Assertion failed) !buf.empty() in function 'imdecode_'

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/2442/live/018f9390-64f0-11ef-a457-abfbbac808ad.jpg.webp: OpenCV(4.5.5) /Users/runner/miniforge3/conda-bld/libopencv_1656375809040/work/modules/imgcodecs/src/loadsave.cpp:816: error: (-215:Assertion failed) !buf.empty() in function 'imdecode_'



Processing...:  79%|███████▊  | 40626/51622 [19:29:28<5:44:31,  1.88s/it]  

Error downloading https://i.guim.co.uk/img/media/0a549ae55c32d22dab4676ca53c11139d5b05170/0_21_5472_3283/master/5472.jpg?width=465&dpr=1&s=none&crop=none: HTTPSConnectionPool(host='i.guim.co.uk', port=443): Max retries exceeded with url: /img/media/0a549ae55c32d22dab4676ca53c11139d5b05170/0_21_5472_3283/master/5472.jpg?width=465&dpr=1&s=none&crop=none (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1007)')))


Processing...: 100%|██████████| 51622/51622 [25:54:01<00:00,  1.81s/it]    


--- Processing Complete ---
Successfully saved 93108 face images.
Encountered 5 issues.
-Outlet BBC - article id c3g68g11445o - unique id id49220:
    for url https://www.bbc.com/news/articles/c3g68g11445o with image https://ichef.bbci.co.uk/news/480/cpsprodpb/abbb/live/24c394e0-4783-11ef-b74b-5f98efd74680.jpg.webp:
    Failed to download image
* * * * * * * * * 
-Outlet BBC - article id c9wjxl0wwwjo - unique id id62146:
    for url https://www.bbc.com/news/articles/c9wjxl0wwwjo with image https://ichef.bbci.co.uk/news/480/cpsprodpb/2442/live/018f9390-64f0-11ef-a457-abfbbac808ad.jpg.webp:
    Failed to download image
* * * * * * * * * 
-Outlet The Guardian - article id 9388442333b1ead - unique id id71763:
    for url https://www.theguardian.com/technology/article/2024/jul/20/google-is-the-worlds-biggest-search-engine-broken with image https://i.guim.co.uk/img/uploads/2024/07/18/search-ani.gif?width=465&dpr=1&s=none&crop=none:
    Failed to download image
* * * * * * * * * 
-Outlet The




In [40]:
failed_articles3

[{'outlet': 'BBC',
  'id': 'c3g68g11445o',
  'id_unique': 'id49220',
  'url': 'https://www.bbc.com/news/articles/c3g68g11445o',
  'image': 'https://ichef.bbci.co.uk/news/480/cpsprodpb/abbb/live/24c394e0-4783-11ef-b74b-5f98efd74680.jpg.webp',
  'reason': 'Failed to download image'},
 {'outlet': 'BBC',
  'id': 'c9wjxl0wwwjo',
  'id_unique': 'id62146',
  'url': 'https://www.bbc.com/news/articles/c9wjxl0wwwjo',
  'image': 'https://ichef.bbci.co.uk/news/480/cpsprodpb/2442/live/018f9390-64f0-11ef-a457-abfbbac808ad.jpg.webp',
  'reason': 'Failed to download image'},
 {'outlet': 'The Guardian',
  'id': '9388442333b1ead',
  'id_unique': 'id71763',
  'url': 'https://www.theguardian.com/technology/article/2024/jul/20/google-is-the-worlds-biggest-search-engine-broken',
  'image': 'https://i.guim.co.uk/img/uploads/2024/07/18/search-ani.gif?width=465&dpr=1&s=none&crop=none',
  'reason': 'Failed to download image'},
 {'outlet': 'The Guardian',
  'id': 'b0e7eaf8c72cf5b',
  'id_unique': 'id73014',
  'u

Re-iterate over the error logs to parse urls

In [42]:
error1_string = '''
Starting to process 86862 articles...
Processing...:   7%|▋         | 5746/86862 [3:27:12<54:16:40,  2.41s/it]  Error downloading https://media.nu.nl/m/6hyxw7ka42lm_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/6hyxw7ka42lm_wd854
Processing...:   7%|▋         | 5752/86862 [3:28:05<127:55:53,  5.68s/it]Error downloading https://media.nu.nl/m/k0jx258aztth_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/k0jx258aztth_wd854
Processing...:   7%|▋         | 5761/86862 [3:28:53<59:01:09,  2.62s/it] Error downloading https://media.nu.nl/m/4qkxds1aqs4h_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/4qkxds1aqs4h_wd854
Processing...:   7%|▋         | 5827/86862 [3:33:09<82:22:18,  3.66s/it] Error downloading https://media.nu.nl/m/1kax8eqaahtn_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/1kax8eqaahtn_wd854
Processing...:   7%|▋         | 5843/86862 [3:34:40<71:12:45,  3.16s/it] Error downloading https://media.nu.nl/m/1vmx6d1a0glg_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/1vmx6d1a0glg_wd854
Processing...:   7%|▋         | 5883/86862 [3:37:43<105:57:02,  4.71s/it]Error downloading https://media.nu.nl/m/rbsx7qiawj7u_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/rbsx7qiawj7u_wd854
Processing...:   7%|▋         | 5885/86862 [3:38:18<219:29:08,  9.76s/it]Error downloading https://media.nu.nl/m/uy2xoynaandt_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/uy2xoynaandt_wd854
Error downloading https://media.nu.nl/m/uy2xoynaandt_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/uy2xoynaandt_wd854
Error downloading https://media.nu.nl/m/uy2xoynaandt_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/uy2xoynaandt_wd854
Processing...:   7%|▋         | 5912/86862 [3:41:19<57:05:09,  2.54s/it] Error downloading https://media.nu.nl/m/t83xev9aokel_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/t83xev9aokel_wd854
Processing...:   7%|▋         | 5946/86862 [3:43:47<72:45:16,  3.24s/it] Error downloading https://media.nu.nl/m/jb3x2eaadznc_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/jb3x2eaadznc_wd854
Processing...:   7%|▋         | 5973/86862 [3:45:56<82:41:16,  3.68s/it] Error downloading https://media.nu.nl/m/67wxibvam1vq_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/67wxibvam1vq_wd854
Processing...:   7%|▋         | 5997/86862 [3:47:43<72:02:22,  3.21s/it] Error downloading https://media.nu.nl/m/qxnx14nad9gb_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/qxnx14nad9gb_wd854
Processing...:   7%|▋         | 6059/86862 [3:51:59<95:09:29,  4.24s/it] Error downloading https://media.nu.nl/m/l8dx7ugabzcn_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/l8dx7ugabzcn_wd854
Processing...:   7%|▋         | 6064/86862 [3:52:41<127:01:54,  5.66s/it]Error downloading https://media.nu.nl/m/hfexuw7af88e_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/hfexuw7af88e_wd854
Error downloading https://media.nu.nl/m/hfexuw7af88e_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/hfexuw7af88e_wd854
Error downloading https://media.nu.nl/m/hfexuw7af88e_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/hfexuw7af88e_wd854
Processing...:   7%|▋         | 6068/86862 [3:54:03<247:02:08, 11.01s/it]Error downloading https://media.nu.nl/m/isfxzkbagj4j_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/isfxzkbagj4j_wd854
Processing...:   7%|▋         | 6140/86862 [3:59:21<67:31:54,  3.01s/it] Error downloading https://media.nu.nl/m/ro6xcmya1z47_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/ro6xcmya1z47_wd854
Processing...:   7%|▋         | 6156/86862 [4:00:46<66:24:12,  2.96s/it] Error downloading https://media.nu.nl/m/zc3xdsaatel2_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/zc3xdsaatel2_wd854
Processing...:   7%|▋         | 6196/86862 [4:03:34<56:42:41,  2.53s/it] Error downloading https://media.nu.nl/m/zehx18yapc3x_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/zehx18yapc3x_wd854
Processing...:   7%|▋         | 6215/86862 [4:05:01<125:09:54,  5.59s/it]Error downloading https://media.nu.nl/m/g55xm00ane5z_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/g55xm00ane5z_wd854
Processing...:   7%|▋         | 6226/86862 [4:06:16<88:53:18,  3.97s/it] Error downloading https://media.nu.nl/m/qx6xjjcaxslq_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/qx6xjjcaxslq_wd854
Processing...:   7%|▋         | 6239/86862 [4:07:38<97:41:38,  4.36s/it] Error downloading https://media.nu.nl/m/dwhx6vwa2z96_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/dwhx6vwa2z96_wd854
Processing...:   7%|▋         | 6276/86862 [4:10:54<73:41:57,  3.29s/it] Error downloading https://media.nu.nl/m/8a1xltsa5w4h_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/8a1xltsa5w4h_wd854
Processing...:   7%|▋         | 6296/86862 [4:12:19<48:25:42,  2.16s/it] Error downloading https://media.nu.nl/m/pxsxys0a6x4q_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/pxsxys0a6x4q_wd854
Processing...:   7%|▋         | 6317/86862 [4:13:58<52:04:11,  2.33s/it] Error downloading https://media.nu.nl/m/fsjxteeadvax_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/fsjxteeadvax_wd854
Processing...:   7%|▋         | 6356/86862 [4:17:11<89:29:38,  4.00s/it] Error downloading https://media.nu.nl/m/z70xbdba4cd7_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/z70xbdba4cd7_wd854
Processing...:   7%|▋         | 6363/86862 [4:18:09<130:03:15,  5.82s/it]Error downloading https://media.nu.nl/m/3o9xz33a42xo_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/3o9xz33a42xo_wd854
Error downloading https://media.nu.nl/m/3o9xz33a42xo_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/3o9xz33a42xo_wd854
Processing...:   7%|▋         | 6416/86862 [4:21:58<78:24:25,  3.51s/it] Error downloading https://media.nu.nl/m/j6exfjsaefzr_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/j6exfjsaefzr_wd854
Processing...:   7%|▋         | 6417/86862 [4:22:24<235:16:50, 10.53s/it]Error downloading https://media.nu.nl/m/ql7xks0aq2rm_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/ql7xks0aq2rm_wd854
Processing...:   7%|▋         | 6440/86862 [4:24:01<79:52:54,  3.58s/it] Error downloading https://media.nu.nl/m/fgtxur1ampo9_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/fgtxur1ampo9_wd854
Processing...:   7%|▋         | 6442/86862 [4:24:49<283:30:36, 12.69s/it]Error downloading https://media.nu.nl/m/hhzxyimaopyk_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/hhzxyimaopyk_wd854
Processing...:   7%|▋         | 6446/86862 [4:25:37<211:37:13,  9.47s/it]Error downloading https://media.nu.nl/m/oakx04najwbx_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/oakx04najwbx_wd854
Error downloading https://media.nu.nl/m/oakx04najwbx_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/oakx04najwbx_wd854
Processing...:   7%|▋         | 6460/86862 [4:27:27<104:33:59,  4.68s/it]Error downloading https://media.nu.nl/m/pspxyaaas3g8_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/pspxyaaas3g8_wd854
Processing...:   7%|▋         | 6487/86862 [4:29:09<52:54:11,  2.37s/it] Error downloading https://media.nu.nl/m/aycxaxxamphf_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/aycxaxxamphf_wd854
Error downloading https://media.nu.nl/m/aycxaxxamphf_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/aycxaxxamphf_wd854
Processing...:   7%|▋         | 6510/86862 [4:31:26<74:10:05,  3.32s/it] Error downloading https://media.nu.nl/m/81exhoqao2c2_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/81exhoqao2c2_wd854
Processing...:   8%|▊         | 6516/86862 [4:32:07<84:36:25,  3.79s/it] Error downloading https://media.nu.nl/m/mofxfcia4vgr_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/mofxfcia4vgr_wd854
Processing...:   8%|▊         | 6529/86862 [4:33:40<109:25:58,  4.90s/it]Error downloading https://media.nu.nl/m/0jwxbmya7wqh_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/0jwxbmya7wqh_wd854
Processing...:   8%|▊         | 6583/86862 [4:37:11<72:25:59,  3.25s/it] Error downloading https://media.nu.nl/m/qflx01ga1ssb_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/qflx01ga1ssb_wd854
Processing...:   8%|▊         | 6585/86862 [4:37:42<186:30:56,  8.36s/it]Error downloading https://media.nu.nl/m/gjzxdtraa2r7_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/gjzxdtraa2r7_wd854
Processing...:   8%|▊         | 6607/86862 [4:39:28<77:55:07,  3.50s/it] Error downloading https://media.nu.nl/m/5gjxypoaptn2_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/5gjxypoaptn2_wd854
Error downloading https://media.nu.nl/m/5gjxypoaptn2_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/5gjxypoaptn2_wd854
Processing...:   8%|▊         | 6612/86862 [4:40:38<151:02:57,  6.78s/it]Error downloading https://media.nu.nl/m/3d3x7qwamjg5_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/3d3x7qwamjg5_wd854
Error downloading https://media.nu.nl/m/3d3x7qwamjg5_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/3d3x7qwamjg5_wd854
Processing...:   8%|▊         | 6629/86862 [4:42:53<114:04:20,  5.12s/it]Error downloading https://media.nu.nl/m/49pxparahmhe_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/49pxparahmhe_wd854
Processing...:   8%|▊         | 6671/86862 [4:46:01<63:43:17,  2.86s/it] Error downloading https://media.nu.nl/m/3qmx17pawqxe_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/3qmx17pawqxe_wd854
Processing...:   8%|▊         | 6678/86862 [4:47:02<153:24:57,  6.89s/it]Error downloading https://media.nu.nl/m/3zax5uoad2gy_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/3zax5uoad2gy_wd854
Processing...:   8%|▊         | 6715/86862 [4:50:50<120:53:32,  5.43s/it]Error downloading https://media.nu.nl/m/gnzxx9ea7ya2_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/gnzxx9ea7ya2_wd854
Processing...:  11%|█         | 9187/86862 [7:24:36<57:56:41,  2.69s/it] Error downloading https://media.nu.nl/m/ve2xecqa8pre_wd854: 500 Server Error: Internal Server Error for url: https://media.nu.nl/m/ve2xecqa8pre_wd854
Processing...:  12%|█▏        | 10055/86862 [8:19:12<71:10:16,  3.34s/it] Error downloading https://media.nu.nl/m/e6mxupvafq1c_wd854: 404 Client Error: Not Found for url: https://media.nu.nl/m/e6mxupvafq1c_wd854
Processing...:  12%|█▏        | 10275/86862 [8:35:55<46:58:57,  2.21s/it] Error downloading https://media.nu.nl/m/4zux03xap9k6_wd854: 504 Server Error: Gateway Time-out for url: https://media.nu.nl/m/4zux03xap9k6_wd854
Error downloading https://media.nu.nl/m/4zux03xap9k6_wd854: 504 Server Error: Gateway Time-out for url: https://media.nu.nl/m/4zux03xap9k6_wd854
Error downloading https://media.nu.nl/m/4zux03xap9k6_wd854: 504 Server Error: Gateway Time-out for url: https://media.nu.nl/m/4zux03xap9k6_wd854
Processing...:  14%|█▎        | 11807/86862 [9:44:04<30:13:21,  1.45s/it] Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/B186/production/_133064454_boatrepairstill.jpg.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/B186/production/_133064454_boatrepairstill.jpg.webp
Processing...:  16%|█▌        | 13672/86862 [10:23:39<110:25:54,  5.43s/it]Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/c4ac/live/f1ef6600-f71b-11ee-af97-c31fb967c02d.jpg.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/c4ac/live/f1ef6600-f71b-11ee-af97-c31fb967c02d.jpg.webp
Processing...:  22%|██▏       | 18913/86862 [12:21:02<13:09:01,  1.44it/s] Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/6c8a/live/fa58ea10-0548-11ef-b9d8-4f52aebe147d.jpg.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/6c8a/live/fa58ea10-0548-11ef-b9d8-4f52aebe147d.jpg.webp
Processing...:  23%|██▎       | 20216/86862 [12:50:41<14:43:15,  1.26it/s] Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/c287/live/d8d7ac20-087a-11ef-806d-f7f83a5f5a2f.jpg.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/c287/live/d8d7ac20-087a-11ef-806d-f7f83a5f5a2f.jpg.webp
Processing...:  25%|██▍       | 21352/86862 [13:23:46<41:06:03,  2.26s/it]'''

In [43]:
error2_string = '''
Starting to process 65512 articles...
Processing...:   2%|▏         | 1180/65512 [27:24<23:14:01,  1.30s/it]Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/1841F/production/_133295399_bruceandfarrah.png.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/1841F/production/_133295399_bruceandfarrah.png.webp
Processing...:  15%|█▍        | 9750/65512 [3:45:45<38:31:07,  2.49s/it] Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/1ed7/live/4346bd10-259c-11ef-8997-c3654bee9602.jpg.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/1ed7/live/4346bd10-259c-11ef-8997-c3654bee9602.jpg.webp
Processing...:  16%|█▋        | 10745/65512 [4:08:47<26:42:15,  1.76s/it] Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/c8c9/live/52e6a7f0-2711-11ef-a82d-c9f935a67f4f.jpg.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/c8c9/live/52e6a7f0-2711-11ef-a82d-c9f935a67f4f.jpg.webp
Processing...:  21%|██        | 13892/65512 [5:24:19<20:05:07,  1.40s/it]
'''

In [45]:
error3_string = '''
Starting to process 51622 articles...
Processing...:   0%|          | 58/51622 [01:30<10:21:34,  1.38it/s]Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/7a15/live/922aeb70-2fe4-11ef-9b42-87ce3f31eabc.jpg.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/7a15/live/922aeb70-2fe4-11ef-9b42-87ce3f31eabc.jpg.webp
Processing...:  16%|█▌        | 8336/51622 [3:23:07<26:50:20,  2.23s/it] Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/abbb/live/24c394e0-4783-11ef-b74b-5f98efd74680.jpg.webp: OpenCV(4.5.5) /Users/runner/miniforge3/conda-bld/libopencv_1656375809040/work/modules/imgcodecs/src/loadsave.cpp:816: error: (-215:Assertion failed) !buf.empty() in function 'imdecode_'

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/abbb/live/24c394e0-4783-11ef-b74b-5f98efd74680.jpg.webp: OpenCV(4.5.5) /Users/runner/miniforge3/conda-bld/libopencv_1656375809040/work/modules/imgcodecs/src/loadsave.cpp:816: error: (-215:Assertion failed) !buf.empty() in function 'imdecode_'

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/abbb/live/24c394e0-4783-11ef-b74b-5f98efd74680.jpg.webp: OpenCV(4.5.5) /Users/runner/miniforge3/conda-bld/libopencv_1656375809040/work/modules/imgcodecs/src/loadsave.cpp:816: error: (-215:Assertion failed) !buf.empty() in function 'imdecode_'

Processing...:  27%|██▋       | 14013/51622 [5:37:16<7:49:52,  1.33it/s]  Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/31e3/live/52b29640-558d-11ef-9ae6-47d584e77449.jpg.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/31e3/live/52b29640-558d-11ef-9ae6-47d584e77449.jpg.webp
Processing...:  31%|███       | 15909/51622 [6:20:58<13:43:41,  1.38s/it] Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/e5d5/live/bf235bb0-5a25-11ef-9d2d-89abc1f1e271.png.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/e5d5/live/bf235bb0-5a25-11ef-9d2d-89abc1f1e271.png.webp
Processing...:  35%|███▌      | 18155/51622 [7:14:08<8:45:11,  1.06it/s]  Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/59c1/live/6d26e1b0-60b0-11ef-b76f-f1f593af629e.jpg.webp: 502 Server Error: Bad Gateway for url: https://ichef.bbci.co.uk/news/480/cpsprodpb/59c1/live/6d26e1b0-60b0-11ef-b76f-f1f593af629e.jpg.webp
Processing...:  39%|███▉      | 20053/51622 [7:58:10<18:24:37,  2.10s/it] Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/2442/live/018f9390-64f0-11ef-a457-abfbbac808ad.jpg.webp: OpenCV(4.5.5) /Users/runner/miniforge3/conda-bld/libopencv_1656375809040/work/modules/imgcodecs/src/loadsave.cpp:816: error: (-215:Assertion failed) !buf.empty() in function 'imdecode_'

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/2442/live/018f9390-64f0-11ef-a457-abfbbac808ad.jpg.webp: OpenCV(4.5.5) /Users/runner/miniforge3/conda-bld/libopencv_1656375809040/work/modules/imgcodecs/src/loadsave.cpp:816: error: (-215:Assertion failed) !buf.empty() in function 'imdecode_'

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/2442/live/018f9390-64f0-11ef-a457-abfbbac808ad.jpg.webp: OpenCV(4.5.5) /Users/runner/miniforge3/conda-bld/libopencv_1656375809040/work/modules/imgcodecs/src/loadsave.cpp:816: error: (-215:Assertion failed) !buf.empty() in function 'imdecode_'

Processing...:  79%|███████▊  | 40626/51622 [19:29:28<5:44:31,  1.88s/it]  Error downloading https://i.guim.co.uk/img/media/0a549ae55c32d22dab4676ca53c11139d5b05170/0_21_5472_3283/master/5472.jpg?width=465&dpr=1&s=none&crop=none: HTTPSConnectionPool(host='i.guim.co.uk', port=443): Max retries exceeded with url: /img/media/0a549ae55c32d22dab4676ca53c11139d5b05170/0_21_5472_3283/master/5472.jpg?width=465&dpr=1&s=none&crop=none (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1007)')))
Processing...: 100%|██████████| 51622/51622 [25:54:01<00:00,  1.81s/it]    
'''

In [46]:
error_strings = error1_string + '' + error2_string + '' + error3_string

In [61]:
error_urls = []
patterns = [
    r'Error downloading (https?://[^\s:]+)', 
    r'Error downloading\s+(https?://[^\s:]+)'
]

for pattern in patterns:
    error_urls.extend(re.findall(pattern, error_strings))

unique_error_urls = list(set(error_urls))

In [71]:
print(f'A total number of {len(unique_error_urls)} image urls out of the original {img_articles.shape[0]} articles got an error.')

A total number of 60 image urls out of the original 86862 articles got an error.


In [72]:
retry_img_articles = img_articles[img_articles['images'].isin(unique_error_urls)]

In [73]:
retry_img_articles.shape

(70, 11)

In [74]:
print(f"Starting to process {len(retry_img_articles)} articles...")
successful_saves4, failed_articles4 = process_and_save_faces(
    df=retry_img_articles,
    output_base=OUTPUT_BASE,
    face_app=app,
    skip_existing=True
)

print("\n--- Processing Complete ---")
print(f"Successfully saved {len(successful_saves4)} face images.")
if failed_articles4:
    print(f"Encountered {len(failed_articles4)} issues.")
    for failure in failed_articles4:
        print(f"-Outlet {failure['outlet']} - article id {failure['id']} - unique id {failure['id_unique']}:")
        print(f"    for url {failure['url']} with image {failure['image']}:")
        print(f"    {failure['reason']}")
        print('* * * ' * 3)

Starting to process 70 articles...


Processing...:  91%|█████████▏| 64/70 [01:33<00:02,  2.96it/s]

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/abbb/live/24c394e0-4783-11ef-b74b-5f98efd74680.jpg.webp: OpenCV(4.5.5) /Users/runner/miniforge3/conda-bld/libopencv_1656375809040/work/modules/imgcodecs/src/loadsave.cpp:816: error: (-215:Assertion failed) !buf.empty() in function 'imdecode_'

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/abbb/live/24c394e0-4783-11ef-b74b-5f98efd74680.jpg.webp: OpenCV(4.5.5) /Users/runner/miniforge3/conda-bld/libopencv_1656375809040/work/modules/imgcodecs/src/loadsave.cpp:816: error: (-215:Assertion failed) !buf.empty() in function 'imdecode_'

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/abbb/live/24c394e0-4783-11ef-b74b-5f98efd74680.jpg.webp: OpenCV(4.5.5) /Users/runner/miniforge3/conda-bld/libopencv_1656375809040/work/modules/imgcodecs/src/loadsave.cpp:816: error: (-215:Assertion failed) !buf.empty() in function 'imdecode_'



Processing...:  97%|█████████▋| 68/70 [01:37<00:01,  1.68it/s]

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/2442/live/018f9390-64f0-11ef-a457-abfbbac808ad.jpg.webp: OpenCV(4.5.5) /Users/runner/miniforge3/conda-bld/libopencv_1656375809040/work/modules/imgcodecs/src/loadsave.cpp:816: error: (-215:Assertion failed) !buf.empty() in function 'imdecode_'

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/2442/live/018f9390-64f0-11ef-a457-abfbbac808ad.jpg.webp: OpenCV(4.5.5) /Users/runner/miniforge3/conda-bld/libopencv_1656375809040/work/modules/imgcodecs/src/loadsave.cpp:816: error: (-215:Assertion failed) !buf.empty() in function 'imdecode_'

Error downloading https://ichef.bbci.co.uk/news/480/cpsprodpb/2442/live/018f9390-64f0-11ef-a457-abfbbac808ad.jpg.webp: OpenCV(4.5.5) /Users/runner/miniforge3/conda-bld/libopencv_1656375809040/work/modules/imgcodecs/src/loadsave.cpp:816: error: (-215:Assertion failed) !buf.empty() in function 'imdecode_'



Processing...: 100%|██████████| 70/70 [01:41<00:00,  1.45s/it]


--- Processing Complete ---
Successfully saved 3 face images.
Encountered 2 issues.
-Outlet BBC - article id c3g68g11445o - unique id id49220:
    for url https://www.bbc.com/news/articles/c3g68g11445o with image https://ichef.bbci.co.uk/news/480/cpsprodpb/abbb/live/24c394e0-4783-11ef-b74b-5f98efd74680.jpg.webp:
    Failed to download image
* * * * * * * * * 
-Outlet BBC - article id c9wjxl0wwwjo - unique id id62146:
    for url https://www.bbc.com/news/articles/c9wjxl0wwwjo with image https://ichef.bbci.co.uk/news/480/cpsprodpb/2442/live/018f9390-64f0-11ef-a457-abfbbac808ad.jpg.webp:
    Failed to download image
* * * * * * * * * 



