# This is a web scraping demo, for:


*   extracting all images from an article (url)
*   filtering them (e.g. ignoring small icons, duplicates etc.)
* testing bulk requests to the external Hive API
* aggregating the results into a "trustworthiness score" for an article, post, etc.



## Configuring the Environment

In [1]:
!pip install -q selenium
!pip install -q webdriver-manager
!pip install -q google-colab-selenium

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.9/492.9 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [91]:
import requests
import re
import time
import google_colab_selenium as gcs

import time
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, urljoin
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor

In [3]:
# Read the API key from the Colab user-secrets store under the name 'API_KEY'.
# NOTE(review): presumably the key for the Hive API mentioned in the intro;
# it is not used by any cell in this part of the notebook - confirm.
from google.colab import userdata
API_KEY = userdata.get('API_KEY')

## Extracting Images

### Extract ALL images

In [None]:
def get_images_from_page_basic(url, timeout=10):
    """Return the absolute URL of every ``<img src=...>`` found on a page.

    No filtering or de-duplication is applied; the page is fetched with a
    plain HTTP request, so images inserted by JavaScript are not seen.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled server.

    Returns:
        A list of image URLs in document order, resolved against ``url``.
        An empty list is returned (after printing the status code) when the
        server does not answer with HTTP 200.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    # Skip <img> tags without a (non-empty) src, then make relative paths
    # absolute so the caller can fetch them directly.
    sources = (img.get("src") for img in soup.find_all("img"))
    return [urljoin(url, src) for src in sources if src]

In [None]:
# Sample pages used to try out the unfiltered extractor.
page_url_1 = "https://www.bbc.com/news/articles/c5yv5976z9po"
page_url_2 = "https://www.spectator.co.uk/article/is-this-new-chinese-ai-even-better-than-deepseek/"
page_url_3 = "https://www.theverge.com/24353060/deepseek-ai-china-nvidia-openai"
page_url_4 = "https://www.bu.edu/articles/2025/does-chinas-deepseek-represent-a-new-frontier-in-ai/"

test_urls = [page_url_1, page_url_2, page_url_3, page_url_4]

# Print every image URL found on each page, separated by " --- ".
# NOTE: print_found_imgs is defined in a later cell of this notebook; that
# cell has to be executed before this one.
for url in test_urls:
  images = get_images_from_page_basic(url)
  print_found_imgs(images)
  print(" --- ")

Found 2 images:
https://www.bbc.com/bbcx/grey-placeholder.png
https://ichef.bbci.co.uk/news/480/cpsprodpb/3cb2/live/61ec64f0-dd4c-11ef-b20c-cf1b3bd7a488.jpg.webp
 --- 
Found 15 images:
https://www.spectator.co.uk/wp-content/uploads/2025/03/cover-15032025-issue.jpg?w=358
https://www.spectator.co.uk/wp-content/uploads/2025/03/cover-15032025-issue.jpg?w=358
https://www.spectator.co.uk/wp-content/uploads/2025/03/iStock-1456167226.jpg?w=1280
https://www.spectator.co.uk/wp-content/uploads/2025/03/iStock-1456167226.jpg?w=1365
https://src.spectator.co.uk/wp-content/uploads/2023/01/Newsletter-plane_-footer_1000w.jpg
https://www.spectator.co.uk/wp-content/uploads/2025/03/iStock-1456167226.jpg?w=1365
https://www.spectator.co.uk/wp-content/uploads/2025/03/GettyImages-1246845745-2.jpg?resize=378,213
https://www.spectator.co.uk/wp-content/uploads/2025/03/POLCOL.png?resize=200,133
https://www.spectator.co.uk/wp-content/uploads/2025/01/PA-78818805.jpg?resize=200,133
https://www.spectator.co.uk/wp-cont

### Extract AND filter images

#### Filter rules:

* skip **small** images (width or height below the minimum size, e.g. 200 px)
* skip images whose URL, class or alt text contains: **avatar, loading, icon, logo, banner, footer, placeholder, teaser**
* skip **invisible** images (inline style `display: none` or `opacity: 0`)
* skip images without a usable source URL


#### Using **Selenium**

In [5]:
def set_options():
    """Build the Chrome ``Options`` object used by the Selenium scraper.

    Returns:
        An ``Options`` instance carrying the command-line flags listed below
        and pointing at the Chrome binary in ``/usr/bin/google-chrome``.
    """
    chrome_flags = (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-blink-features=AutomationControlled",
    )

    options = Options()
    for flag in chrome_flags:
        options.add_argument(flag)
    options.binary_location = "/usr/bin/google-chrome"
    return options

In [6]:
def scroll_page(driver, pause=2, max_scrolls=50):
    """Scroll to the bottom of the page until its height stops growing.

    Scrolling gives lazily loaded content a chance to be inserted into the
    DOM before the images are collected.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium
            WebDriver).
        pause: Seconds to wait after each scroll so new content can load.
        max_scrolls: Upper bound on the number of scrolls. Pages with
            infinite scrolling keep growing, so without a bound the loop
            would never finish. Pass ``None`` to scroll without limit.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls_done += 1
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Nothing new was appended: the bottom has been reached.
            break
        last_height = new_height

In [70]:
# JavaScript executed inside the page via driver.execute_script(js_code, min_size);
# arguments[0] is the minimum size passed from Python.
#
# For every <img> element the script:
#   * takes `src`; when it is missing or does not start with "http", and a
#     `srcset` exists, it uses the first srcset entry that starts with "http"
#     (or drops the image when there is none);
#   * prefixes protocol-relative ("//...") sources with the page protocol;
#   * drops the image when clientWidth or clientHeight is below the minimum;
#   * drops the image when the inline style contains "display: none" or
#     "opacity: 0" (exact substrings, so "display:none" without the space is
#     not detected);
#   * drops the image when src, class or alt matches the exclusion pattern.
# The result is an array of the remaining URLs without duplicates.
js_code = """
        const exclusionPattern = /avatar|loading|icon|logo|banner|footer|placeholder|teaser/i;

        function isValidImage(img, minSize) {
            let src = img.getAttribute('src');
            let srcset = img.getAttribute('srcset');
            let clientWidth = img.clientWidth;
            let clientHeight = img.clientHeight;
            let className = img.getAttribute('class') || "";
            let alt = img.getAttribute('alt') || "";
            let style = img.getAttribute('style') || "";

            if (!src || !src.startsWith('http')) {
                if (srcset) {
                    let srcsetUrls = srcset.split(',')
                        .map(s => s.trim().split(' ')[0])
                        .filter(url => url && url.startsWith('http'));
                    src = srcsetUrls.length > 0 ? srcsetUrls[0] : null;
                }
            }
            if (!src) return null;
            if (src && src.startsWith('//')) {
                src = window.location.protocol + src;
            }

            if ((clientWidth < minSize) || (clientHeight < minSize)) return null;


            if (style.includes("display: none") || style.includes("opacity: 0")) return null;
            if (exclusionPattern.test(src)) return null;
            if (exclusionPattern.test(className)) return null;
            if (exclusionPattern.test(alt)) return null;
            return src;
        }

        let imgSet = new Set();
        document.querySelectorAll('img').forEach(img => {
            let validSrc = isValidImage(img, arguments[0]);
            if (validSrc) {
                imgSet.add(validSrc);
            }
        });
        return Array.from(imgSet);
        """

In [71]:
def get_images_selenium(url, min_size=200):
    """Collect filtered image URLs from a page rendered in headless Chrome.

    The page is loaded, scrolled to the bottom so lazily loaded images get a
    chance to appear, and then filtered in the browser by ``js_code``.

    Args:
        url: Address of the page to scrape.
        min_size: Minimum rendered width and height, passed to ``js_code``
            as ``arguments[0]``.

    Returns:
        The value returned by ``js_code``: a list of unique image URLs.

    Raises:
        Exception: any error raised while loading or scripting the page is
            printed and re-raised; the browser is closed in every case.
    """
    options = set_options()
    driver = gcs.Chrome(options=options)

    try:
        driver.get(url)
        # Wait (up to 10 s) for the document itself to finish loading.
        WebDriverWait(driver, 10).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )

        # NOTE(review): '.popup-class' looks like a placeholder selector for
        # overlays that should be removed before scraping - confirm.
        driver.execute_script("document.querySelectorAll('.popup-class').forEach(e => e.remove());")

        scroll_page(driver)

        img_urls = driver.execute_script(js_code, min_size)

    except Exception as e:
        # print() does not apply %-formatting; interpolate explicitly so the
        # message no longer contains a literal "%s".
        print(f"An error occurred: {e}")
        raise
    finally:
        driver.quit()

    return img_urls


#### Using **BeautifulSoup**

In [86]:
# Substrings that mark an image as page decoration rather than article content.
_EXCLUSION_PATTERN = re.compile(
    r"avatar|loading|icon|logo|banner|footer|placeholder|teaser", re.IGNORECASE
)
# srcset candidates are separated by a comma followed by whitespace, or by a
# comma directly after a width/density descriptor ("a.jpg 2x,b.jpg 3x").
# Commas inside a URL (e.g. "?resize=378,213") must not split a candidate.
_SRCSET_SEPARATOR = re.compile(r",\s+|(?<=\d[wx]),")
# Matches "opacity:0" / "opacity:0.0" but not partial values such as "0.5".
_ZERO_OPACITY = re.compile(r"opacity:0(?:\.0*)?(?![\d.])")


def _srcset_urls(srcset):
    """Return the candidate URLs of a ``srcset`` attribute, in order."""
    urls = []
    for candidate in _SRCSET_SEPARATOR.split(srcset):
        parts = candidate.split()
        if not parts:
            continue  # empty candidate, e.g. produced by a trailing comma
        candidate_url = parts[0].rstrip(",")
        if candidate_url:
            urls.append(candidate_url)
    return urls


def _absolute_url(raw_url, base_url):
    """Resolve ``raw_url`` against ``base_url``; http(s) URLs pass through."""
    if raw_url.startswith("http"):
        return raw_url
    if raw_url.startswith("//"):
        # Protocol-relative: reuse the page scheme, defaulting to https.
        scheme = urlparse(base_url).scheme or "https"
        return f"{scheme}:{raw_url}"
    return urljoin(base_url, raw_url)


def _declared_too_small(value, min_size):
    """Return True when a width/height attribute is an integer below ``min_size``."""
    if value is None:
        return False
    try:
        return int(value) < min_size
    except ValueError:
        # Non-integer values such as "100%" or "auto" give no size information.
        return False


def is_valid_image(img, min_size, base_url):
    """Return the usable URL of an ``<img>`` tag, or ``None`` to skip it.

    Args:
        img: Tag-like object exposing ``get(name[, default])`` for the
            attributes ``src``, ``srcset``, ``width``, ``height``, ``class``,
            ``alt`` and ``style`` (a BeautifulSoup ``Tag``).
        min_size: Minimum value accepted for the declared ``width`` and
            ``height`` attributes; missing or non-integer values are ignored.
        base_url: URL of the page, used to resolve relative and
            protocol-relative image URLs.

    Returns:
        The absolute image URL, or ``None`` when the image has no http(s)
        source, is declared smaller than ``min_size``, is hidden through its
        inline style, or its URL, class or alt text matches the exclusion
        pattern.
    """
    src = img.get("src")

    # A missing or non-http src (data URI, relative placeholder) often means
    # the real image is listed in srcset; prefer the first usable candidate.
    if not src or not src.startswith("http"):
        srcset = img.get("srcset")
        if srcset:
            candidates = (_absolute_url(u, base_url) for u in _srcset_urls(srcset))
            src = next((u for u in candidates if u.startswith("http")), None)

    if not src:
        return None

    src = _absolute_url(src, base_url)
    if not src.startswith("http"):
        return None  # e.g. data: URIs

    if _declared_too_small(img.get("width"), min_size):
        return None
    if _declared_too_small(img.get("height"), min_size):
        return None

    compact_style = (img.get("style") or "").replace(" ", "")
    if "display:none" in compact_style or _ZERO_OPACITY.search(compact_style):
        return None

    # The class attribute may be a list of class names or a plain string.
    classes = img.get("class") or ""
    class_attr = classes if isinstance(classes, str) else " ".join(classes)
    alt = img.get("alt") or ""

    if any(_EXCLUSION_PATTERN.search(text) for text in (src, class_attr, alt)):
        return None

    return src

In [None]:
def normalize_url(url):
    """
    Reduce a URL to its ``scheme://netloc/path`` part.

    Everything after the path (parameters, query string, fragment) is
    dropped, so differently sized variants of one image share the same key.
    """
    return "{0.scheme}://{0.netloc}{0.path}".format(urlparse(url))

In [None]:
def extract_width(url):
    """
    Attempt to extract a size value from the URL query parameters.

    The parameters are tried in this order and the first one that parses as
    an integer wins:

    * ``w``      - used as is
    * ``resize`` - first item of a comma-separated list ('width,height')
    * ``h``      - height used as a fallback, an optional 'px' is removed
    * ``crop``   - third item of a comma-separated list, 'px' removed

    Returns 0 when none of them yields an integer.
    """
    qs = parse_qs(urlparse(url).query)

    raw_values = []
    if 'w' in qs:
        raw_values.append(qs['w'][0])
    if 'resize' in qs:
        raw_values.append(qs['resize'][0].split(',')[0])
    if 'h' in qs:
        raw_values.append(qs['h'][0].replace('px', ''))
    if 'crop' in qs:
        parts = qs['crop'][0].split(',')
        if len(parts) >= 3:
            raw_values.append(parts[2].replace('px', ''))

    for raw in raw_values:
        try:
            return int(raw)
        except ValueError:
            # Only a malformed number is expected here; a bare ``except``
            # would also swallow KeyboardInterrupt and real bugs.
            continue
    return 0

In [None]:
def get_images_bs(url, min_size=300, timeout=10):
    """Collect filtered image URLs from the static HTML of a page.

    Images are validated with ``is_valid_image`` and grouped by
    ``normalize_url``; from each group the variant with the largest width
    reported by ``extract_width`` is kept.

    Args:
        url: Address of the page to scrape.
        min_size: Minimum declared width/height passed to ``is_valid_image``.
        timeout: Seconds to wait for the server; without it ``requests`` can
            block indefinitely on a stalled server.

    Returns:
        A list with one image URL per group, in order of first appearance.

    Raises:
        Exception: request failures (connection error, timeout, HTTP error
            status) are printed and re-raised.
    """
    # Same User-Agent as get_images_from_page_basic, so both requests-based
    # scrapers are served the same page.
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        print("An error occurred while fetching the page:", e)
        raise

    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): '.popup-class' looks like a placeholder selector - confirm.
    for popup in soup.select('.popup-class'):
        popup.decompose()

    # normalized URL -> (width, full URL) of the best variant seen so far
    best_by_key = {}
    for img in soup.find_all('img'):
        valid_src = is_valid_image(img, min_size, url)
        if not valid_src:
            continue
        key = normalize_url(valid_src)
        width = extract_width(valid_src)
        # Keep the first variant unless a strictly wider one shows up.
        if key not in best_by_key or width > best_by_key[key][0]:
            best_by_key[key] = (width, valid_src)

    return [src for _, src in best_by_key.values()]

### Testing:

Selenium VS Beautiful Soup

In [10]:
def print_found_imgs(images):
    """Print a 'Found N image(s):' header followed by one image per line."""
    report_lines = [f"Found {len(images)} image(s):"]
    report_lines.extend(str(image) for image in images)
    print("\n".join(report_lines))

In [78]:
# First batch of pages for the Selenium vs. BeautifulSoup comparison.
page_url_1 = "https://www.bbc.com/news/articles/c5yv5976z9po"
page_url_2 = "https://www.spectator.co.uk/article/is-this-new-chinese-ai-even-better-than-deepseek/"
page_url_3 = "https://www.theverge.com/24353060/deepseek-ai-china-nvidia-openai"
page_url_4 = "https://www.bu.edu/articles/2025/does-chinas-deepseek-represent-a-new-frontier-in-ai/"

# Module-level list read by the test cells below.
test_urls = [page_url_1, page_url_2, page_url_3, page_url_4]

In [88]:
def test(urls):
    """Compare the Selenium and BeautifulSoup scrapers on a list of pages.

    Args:
        urls: Iterable of page URLs to scrape with both approaches.

    Returns:
        A pandas DataFrame with one row per URL holding the number of images
        found and the elapsed time in seconds (rounded to 2 decimals) for
        each scraper.
    """
    results = []

    # Iterate over the argument; the global ``test_urls`` was used here
    # before, which silently ignored whatever the caller passed in.
    for url in urls:
        row = {"URL": url}

        # Test Selenium
        start = time.time()
        images_selenium = get_images_selenium(url)
        elapsed_selenium = time.time() - start
        row["Selenium Images Found"] = len(images_selenium)
        row["Selenium Time Taken (s)"] = round(elapsed_selenium, 2)

        # Test BeautifulSoup
        start = time.time()
        images_soup = get_images_bs(url)
        elapsed_soup = time.time() - start
        row["BeautifulSoup Images Found"] = len(images_soup)
        row["BeautifulSoup Time Taken (s)"] = round(elapsed_soup, 2)

        results.append(row)

    return pd.DataFrame(results)


# Run the comparison; the bare name on the last line makes the notebook
# display the resulting table.
df_comparison = test(test_urls)
df_comparison

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,URL,Selenium Images Found,Selenium Time Taken (s),BeautifulSoup Images Found,BeautifulSoup Time Taken (s)
0,https://www.bbc.com/news/articles/c5yv5976z9po,1,16.07,1,0.3
1,https://www.spectator.co.uk/article/is-this-ne...,2,5.54,2,0.32
2,https://www.theverge.com/24353060/deepseek-ai-...,10,18.39,10,0.19
3,https://www.bu.edu/articles/2025/does-chinas-d...,0,4.83,1,0.14


In [82]:
# Test Selenium: print the filtered image URLs found on each page in
# test_urls, separated by " --- ".
for url in test_urls:
    images = get_images_selenium(url)
    print_found_imgs(images)
    print(" --- ")

<IPython.core.display.Javascript object>

Found 1 image(s):
https://ichef.bbci.co.uk/news/480/cpsprodpb/3cb2/live/61ec64f0-dd4c-11ef-b20c-cf1b3bd7a488.jpg.webp
 --- 


<IPython.core.display.Javascript object>

Found 2 image(s):
https://www.spectator.co.uk/wp-content/uploads/2025/03/iStock-1456167226.jpg?w=1280
https://www.spectator.co.uk/wp-content/uploads/2025/03/RossClarkSchoolsVATWeb.jpg?resize=378,213
 --- 


<IPython.core.display.Javascript object>

Found 10 image(s):
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/STKB320_DEEPSEEK_AI_CVIRGINIA_B.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/VST_0131_Site.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/247141_NOTEPAD_DEEPSEEK_AI_MICROSOFT_CVIRGINIA-1.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/DCD_013024_v1.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/chorus/uploads/chorus_asset/file/25546251/STK169_Mark_Zuckerburg_CVIRGINIA_C.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/STKB320_DEEPSEEK_AI_CVIRGINIA_D.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/si

<IPython.core.display.Javascript object>

Found 0 image(s):
 --- 


In [87]:
# Test BeautifulSoup: print the filtered image URLs found on each page in
# test_urls (no separator line is printed between pages here).
for url in test_urls:
    images = get_images_bs(url)
    print_found_imgs(images)

Found 1 image(s):
https://ichef.bbci.co.uk/news/480/cpsprodpb/3cb2/live/61ec64f0-dd4c-11ef-b20c-cf1b3bd7a488.jpg.webp
Found 2 image(s):
https://www.spectator.co.uk/wp-content/uploads/2025/03/cover-15032025-issue.jpg?w=358
https://www.spectator.co.uk/wp-content/uploads/2025/03/iStock-1456167226.jpg?w=1365
Found 10 image(s):
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/STKB320_DEEPSEEK_AI_CVIRGINIA_B.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/VST_0131_Site.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/247141_NOTEPAD_DEEPSEEK_AI_MICROSOFT_CVIRGINIA-1.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/DCD_013024_v1.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/chorus/uploads/chorus_asset/f

The only difference is that BeautifulSoup scraped one image whose rendered size is below the minimum, although the size declared in the HTML is larger. In this case it is the author's avatar, so it is not a photograph directly related to the content of the article.


---

TEST 2


---





In [89]:
# Second batch of pages. page_url_1 is not reassigned here and is not part
# of this batch.
# NOTE(review): judging by their paths, several of these look like live-blog
# pages - confirm.
page_url_2 = "https://wiadomosci.onet.pl/kraj/lukaszenko-na-kremlu-ujawnil-zamiary-putina-w-sprawie-usa-relacja-na-zywo/vj1vlcg?utm_campaign=cb"
page_url_3 = "https://c.newsnow.co.uk/A/1265886131?-2645%3A2959%3Ann_topic_top"
page_url_4 = "https://news.sky.com/story/politics-latest-live-starmer-speech-ukraine-zelenskyy-war-trump-welfare-cuts-tories-reform-12593360"
page_url_5 = "https://www.bbc.com/news/live/cg70jylp32gt"

# Replaces the module-level list read by the test cells below.
test_urls = [page_url_2, page_url_3, page_url_4, page_url_5]

In [90]:
def test(urls):
    """Compare the Selenium and BeautifulSoup scrapers on a list of pages.

    Args:
        urls: Iterable of page URLs to scrape with both approaches.

    Returns:
        A pandas DataFrame with one row per URL holding the number of images
        found and the elapsed time in seconds (rounded to 2 decimals) for
        each scraper.
    """
    results = []

    # Iterate over the argument; the global ``test_urls`` was used here
    # before, which silently ignored whatever the caller passed in.
    for url in urls:
        row = {"URL": url}

        # Test Selenium
        start = time.time()
        images_selenium = get_images_selenium(url)
        elapsed_selenium = time.time() - start
        row["Selenium Images Found"] = len(images_selenium)
        row["Selenium Time Taken (s)"] = round(elapsed_selenium, 2)

        # Test BeautifulSoup
        start = time.time()
        images_soup = get_images_bs(url)
        elapsed_soup = time.time() - start
        row["BeautifulSoup Images Found"] = len(images_soup)
        row["BeautifulSoup Time Taken (s)"] = round(elapsed_soup, 2)

        results.append(row)

    return pd.DataFrame(results)


# Run the comparison on the second batch; the bare name on the last line
# makes the notebook display the resulting table.
df_comparison = test(test_urls)
df_comparison

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,URL,Selenium Images Found,Selenium Time Taken (s),BeautifulSoup Images Found,BeautifulSoup Time Taken (s)
0,https://wiadomosci.onet.pl/kraj/lukaszenko-na-...,1,8.29,1,1.3
1,https://c.newsnow.co.uk/A/1265886131?-2645%3A2...,12,9.77,0,0.3
2,https://news.sky.com/story/politics-latest-liv...,9,7.2,0,0.12
3,https://www.bbc.com/news/live/cg70jylp32gt,12,10.1,14,0.26


In [92]:
# Test Selenium: print the filtered image URLs found on each page in
# test_urls, separated by " --- ".
for url in test_urls:
    images = get_images_selenium(url)
    print_found_imgs(images)
    print(" --- ")

<IPython.core.display.Javascript object>

Found 1 image(s):
https://ocdn.eu/pulscms-transforms/1/7VAk9kpTURBXy84ZjhkMjkwZGMyNTIyMDkwMGQyYjNjZjBiNDE5NzRlZC5qcGeTlQPMxM0BrM0GAM0DYJMFzQSwzQKkkwmmYTgzODE4Bt4AAaEwAQ/wolodymyr-zelenski.jpg
 --- 


<IPython.core.display.Javascript object>

Found 12 image(s):
https://img.wort.lu/public/luxemburg/dgqieg-itv-zur-europawahl-monica-semedo-focus-31922534.jpeg/alternates/SIXTEEN_NINE_1920/itv-zur-europawahl-monica-semedo-focus-31922534.jpeg
https://img.wort.lu/public/luxemburg/c9cp3a-fernand-kartheiser-donald-trump-lettre.jpeg/alternates/SIXTEEN_NINE_1920/Fernand-Kartheiser-Donald-Trump-lettre.jpeg
https://img.virgule.lu/public/luxembourg/o31pg8-tour-alcide-de-gasperi-1965-2025.jpg/alternates/SIXTEEN_NINE_1920/tour-alcide-de-gasperi-1965-2025.jpg
https://img.luxtimes.lu/public/luxembourg/6jhqgy-th-26323415-20220607.jpg/alternates/SIXTEEN_NINE_1920/TH_26323415_20220607.jpg
https://img.wort.lu/public/luxemburg/q3xtol-th-30254049-20231117.jpg/alternates/SIXTEEN_NINE_1920/TH_30254049_20231117.jpg
https://img.luxtimes.lu/public/luxembourg/piy1lb-kirchberg-is-home-to-most-of-luxembourgs-eu-institutions-photo-pierre-matge/alternates/SIXTEEN_NINE_1920/Kirchberg%20is%20home%20to%20most%20of%20Luxembourgs%20EU%20institutions%20Photo%20Pi

<IPython.core.display.Javascript object>

Found 9 image(s):
https://liveblog.digitalimages.sky/lc-images-sky/lcimg-cdeee651-6941-4577-915b-f03c06c1d24d.jpg
https://liveblog.digitalimages.sky/lc-images-sky/lcimg-18ab2bf0-752f-46e0-a264-074061f2e7f8.jpg
https://liveblog.digitalimages.sky/lc-images-sky/lcimg-aecb8d20-6a2b-4544-b950-f3ae715e9da2.jpg
https://liveblog.digitalimages.sky/lc-images-sky/lcimg-b77869ee-46d2-402e-b5b0-2c57a199db65.jpeg
https://liveblog.digitalimages.sky/lc-images-sky/lcimg-7066197a-aa91-49b4-8d79-fbdceeb91e80.jpg
https://liveblog.digitalimages.sky/lc-images-sky/lcimg-60a11157-3576-4b7f-b61d-ab906953ecfb.jpeg
https://liveblog.digitalimages.sky/lc-images-sky/lcimg-065e90e5-7418-4a5c-ab55-1187ec66b3bb.jpg
https://liveblog.digitalimages.sky/lc-images-sky/lcimg-3c46d87f-eea0-4ed8-ad55-a7faf6ee6025.jpeg
https://liveblog.digitalimages.sky/lc-images-sky/lcimg-a6da1bf3-f875-491f-a5b7-5979bb67a5d4.jpeg
 --- 


<IPython.core.display.Javascript object>

Found 12 image(s):
https://ichef.bbci.co.uk/ace/standard/1024/cpsprodpb/04a6/live/ac7b8e00-fc01-11ef-896e-d7e7fb1719a4.jpg
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/436c2105-59ed-434e-8551-bee3f52bcba4.jpg.webp
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/acba735e-4465-4111-bc0c-1e1f3d4c2045.jpg.webp
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/022049d6-3042-41f0-a019-237d6afeff36.jpg.webp
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/a4ab30fa-fbb2-4113-8f26-dc1afc7d2a6c.jpg.webp
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/89c70375-0afa-4e88-b3aa-e179f926989f.jpg.webp
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/9a965970-a856-436d-9994-3b85b9ec88ff.jpg.webp
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/85bc566d-e6bd-45e9-af66-42cacf1932cc.jpg.webp
https://i

In [93]:
# Test BeautifulSoup: print the filtered image URLs found on each page in
# test_urls (no separator line is printed between pages here).
for url in test_urls:
    images = get_images_bs(url)
    print_found_imgs(images)

Found 1 image(s):
https://ocdn.eu/pulscms-transforms/1/7VAk9kpTURBXy84ZjhkMjkwZGMyNTIyMDkwMGQyYjNjZjBiNDE5NzRlZC5qcGeTlQPMxM0BrM0GAM0DYJMFzQSwzQKkkwmmYTgzODE4Bt4AAaEwAQ/wolodymyr-zelenski.jpg
Found 0 image(s):
Found 0 image(s):
Found 14 image(s):
https://ichef.bbci.co.uk/ace/standard/1024/cpsprodpb/04a6/live/ac7b8e00-fc01-11ef-896e-d7e7fb1719a4.jpg
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/436c2105-59ed-434e-8551-bee3f52bcba4.jpg.webp
https://static.files.bbci.co.uk/core/website/assets/static/news/incident-types/bbc-verify.bac8ff232a.svg
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/acba735e-4465-4111-bc0c-1e1f3d4c2045.jpg.webp
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/022049d6-3042-41f0-a019-237d6afeff36.jpg.webp
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/a4ab30fa-fbb2-4113-8f26-dc1afc7d2a6c.jpg.webp
https://ichef.bbci.co.uk/ace/standard/640/cpspro



---

CONCLUSIONS:


---

BeautifulSoup is **significantly** faster; however, since it does not render JavaScript, it cannot scrape images from longer articles that use lazy loading. Selenium not only lets us scrape more, but can also check the rendered size of an image, which is very useful for telling article photos apart from thumbnails of other articles, avatars, etc. Ultimately, the choice is between speed and accuracy. Perhaps we should also consider:
* limiting the number of scraped images to top X (e.g. first 5)
* limiting the images to bigger sizes (sometimes thumbnails can be quite big, more than 300px even)



Worth noting:
* some websites block web scraping, for example Politico and X (formerly Twitter)