# Image scraping demo, for:


*   extracting all images from an article (url)
*   filtering them (e.g. ignoring small icons, duplicates etc.)
* testing bulk requests to the external Hive API
* aggregating the results into a "trustworthiness score" for an article/ post etc.



## Configuring the Environment

In [1]:
!pip install -q selenium
!pip install -q webdriver-manager
!pip install -q google-colab-selenium

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.9/492.9 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [115]:
import requests
import re
import time
import google_colab_selenium as gcs

import time
import pandas as pd

from selenium import webdriver
from selenium_stealth import stealth
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, urljoin
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor

In [3]:
from google.colab import userdata
API_KEY = userdata.get('API_KEY')

## Extracting Images

### Extract ALL images

In [None]:
def get_images_from_page_basic(url):
    """Return the absolute URL of every <img src=...> found in the page's static HTML.

    No filtering or de-duplication is applied.

    Args:
        url: Address of the page to download.

    Returns:
        A list of image URLs resolved against ``url``; an empty list when the
        server answers with a status other than 200.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    # Without a timeout a single unresponsive host would block the cell forever.
    response = requests.get(url, headers=headers, timeout=10)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    # Skip <img> tags without a src, and resolve relative paths against the page URL.
    return [
        urljoin(url, img.get("src"))
        for img in soup.find_all("img")
        if img.get("src")
    ]

In [None]:
page_url_1 = "https://www.bbc.com/news/articles/c5yv5976z9po"
page_url_2 = "https://www.spectator.co.uk/article/is-this-new-chinese-ai-even-better-than-deepseek/"
page_url_3 = "https://www.theverge.com/24353060/deepseek-ai-china-nvidia-openai"
page_url_4 = "https://www.bu.edu/articles/2025/does-chinas-deepseek-represent-a-new-frontier-in-ai/"

test_urls = [page_url_1, page_url_2, page_url_3, page_url_4]

for url in test_urls:
  images = get_images_from_page_basic(url)
  # Printed inline: the print_found_imgs() helper is only defined further down
  # the notebook, so calling it here fails on a fresh kernel (Restart & Run All).
  print(f"Found {len(images)} images:")
  for image_url in images:
    print(image_url)
  print(" --- ")

Found 2 images:
https://www.bbc.com/bbcx/grey-placeholder.png
https://ichef.bbci.co.uk/news/480/cpsprodpb/3cb2/live/61ec64f0-dd4c-11ef-b20c-cf1b3bd7a488.jpg.webp
 --- 
Found 15 images:
https://www.spectator.co.uk/wp-content/uploads/2025/03/cover-15032025-issue.jpg?w=358
https://www.spectator.co.uk/wp-content/uploads/2025/03/cover-15032025-issue.jpg?w=358
https://www.spectator.co.uk/wp-content/uploads/2025/03/iStock-1456167226.jpg?w=1280
https://www.spectator.co.uk/wp-content/uploads/2025/03/iStock-1456167226.jpg?w=1365
https://src.spectator.co.uk/wp-content/uploads/2023/01/Newsletter-plane_-footer_1000w.jpg
https://www.spectator.co.uk/wp-content/uploads/2025/03/iStock-1456167226.jpg?w=1365
https://www.spectator.co.uk/wp-content/uploads/2025/03/GettyImages-1246845745-2.jpg?resize=378,213
https://www.spectator.co.uk/wp-content/uploads/2025/03/POLCOL.png?resize=200,133
https://www.spectator.co.uk/wp-content/uploads/2025/01/PA-78818805.jpg?resize=200,133
https://www.spectator.co.uk/wp-cont

### Extract AND filter images

#### Filter rules:

* skip **small** images (rendered width or height below 200 px)
* skip classes/ alt containing: **avatar, loading, icon, logo, banner, footer, placeholder, teaser**
* skip **invisible** images
* skip images without a source url


#### Using **Selenium**

In [113]:
def set_options():
  """Create the Chrome ``Options`` object shared by the Selenium scrapers.

  Returns:
      An ``Options`` instance with the headless / sandbox / automation flags
      below applied and the browser binary set to /usr/bin/google-chrome.
  """
  chrome_flags = (
      "--headless=new",
      "--no-sandbox",
      "--disable-dev-shm-usage",
      "--disable-blink-features=AutomationControlled",
  )

  chrome_options = Options()
  for flag in chrome_flags:
    chrome_options.add_argument(flag)
  chrome_options.binary_location = "/usr/bin/google-chrome"
  return chrome_options

In [114]:
def scroll_page(driver, pause=2, max_scrolls=50):
    """Scroll to the bottom of the page repeatedly so lazily loaded content appears.

    Scrolling stops as soon as the document height stops growing, or after
    ``max_scrolls`` rounds, whichever comes first. The cap prevents an endless
    loop on pages that keep appending content (infinite feeds, live blogs).

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver).
        pause: Seconds to wait after each scroll before re-measuring the height.
        max_scrolls: Maximum number of scroll rounds; ``None`` removes the cap.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls_done += 1
        # Give the page time to fetch and lay out newly revealed content.
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

In [None]:
# JavaScript snippet executed inside the loaded page through
# driver.execute_script(js_code, min_size); arguments[0] is the minimum size.
# For every <img> it:
#   * picks `src`, falling back to the first http(s) entry of `srcset` when
#     `src` is missing or does not start with "http";
#   * prefixes protocol-relative ("//...") URLs with the page's protocol;
#   * drops images whose clientWidth or clientHeight is below the minimum;
#   * drops images whose inline style contains "display: none" or "opacity: 0"
#     (only the inline style attribute is inspected);
#   * drops images whose src, class or alt matches the exclusion pattern.
# The surviving URLs are returned as an array without duplicates.
js_code = """
        const exclusionPattern = /avatar|loading|icon|logo|banner|footer|placeholder|teaser/i;

        function isValidImage(img, minSize) {
            let src = img.getAttribute('src');
            let srcset = img.getAttribute('srcset');
            let clientWidth = img.clientWidth;
            let clientHeight = img.clientHeight;
            let className = img.getAttribute('class') || "";
            let alt = img.getAttribute('alt') || "";
            let style = img.getAttribute('style') || "";

            if (!src || !src.startsWith('http')) {
                if (srcset) {
                    let srcsetUrls = srcset.split(',')
                        .map(s => s.trim().split(' ')[0])
                        .filter(url => url && url.startsWith('http'));
                    src = srcsetUrls.length > 0 ? srcsetUrls[0] : null;
                }
            }
            if (!src) return null;
            if (src && src.startsWith('//')) {
                src = window.location.protocol + src;
            }

            if ((clientWidth < minSize) || (clientHeight < minSize)) return null;


            if (style.includes("display: none") || style.includes("opacity: 0")) return null;
            if (exclusionPattern.test(src)) return null;
            if (exclusionPattern.test(className)) return null;
            if (exclusionPattern.test(alt)) return null;
            return src;
        }

        let imgSet = new Set();
        document.querySelectorAll('img').forEach(img => {
            let validSrc = isValidImage(img, arguments[0]);
            if (validSrc) {
                imgSet.add(validSrc);
            }
        });
        return Array.from(imgSet);
        """

In [None]:
def get_images_selenium(url, min_size=200):
    """Collect filtered image URLs from a page rendered in headless Chrome.

    The page is loaded, elements with class ``popup-class`` are removed, the
    page is scrolled to the bottom, and ``js_code`` is then run in the browser
    to pick the images that pass the filter rules.

    Args:
        url: Address of the page to scrape.
        min_size: Minimum size passed to ``js_code`` as its first argument.

    Returns:
        Whatever ``js_code`` returns: a list of unique image URLs.

    Raises:
        Any exception raised while driving the browser is printed and re-raised.
    """
    driver = gcs.Chrome(options=set_options())

    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )

        driver.execute_script("document.querySelectorAll('.popup-class').forEach(e => e.remove());")

        scroll_page(driver)

        return driver.execute_script(js_code, min_size)

    except Exception as e:
        # print() does not apply %-formatting, so the message is built explicitly.
        print(f"An error occurred: {e}")
        raise
    finally:
        # Always release the browser, including on the error path.
        driver.quit()


#### Using **Soup**

In [None]:
def is_valid_image(img, min_size, base_url):
    """Apply the notebook's filter rules to one <img> element.

    Args:
        img: Object exposing ``.get(attribute)`` (a BeautifulSoup tag).
        min_size: Minimum value accepted for the declared ``width`` and
            ``height`` attributes; non-numeric values are ignored.
        base_url: URL of the page; its scheme is used to complete
            protocol-relative ("//host/...") sources.

    Returns:
        The image URL (always starting with "http") when the image passes every
        rule, otherwise ``None``.
    """
    src = img.get('src')
    srcset = img.get('srcset')

    # BeautifulSoup returns `class` as a list of names; accept a plain string too.
    class_value = img.get('class') or ""
    class_attr = class_value if isinstance(class_value, str) else " ".join(class_value)
    alt = img.get('alt') or ""
    style = img.get('style') or ""

    if not src or not src.startswith('http'):
        if srcset:
            # Each candidate is "<url> [descriptor]". Empty candidates (for
            # example after a trailing comma) are skipped instead of raising
            # IndexError.
            candidates = [entry.split() for entry in srcset.split(',')]
            srcset_urls = [
                parts[0] for parts in candidates
                if parts and parts[0].startswith('http')
            ]
            src = srcset_urls[0] if srcset_urls else None

    if not src:
        return None

    if src.startswith("//"):
        scheme = urlparse(base_url).scheme or "https"
        src = f"{scheme}:{src}"

    if not src.startswith('http'):
        return None

    for declared in (img.get('width'), img.get('height')):
        if declared is None:
            continue
        try:
            if int(declared) < min_size:
                return None
        except (TypeError, ValueError):
            # Values such as "100%" or "auto" carry no usable pixel size.
            pass

    compact_style = style.replace(" ", "")
    if "display:none" in compact_style:
        return None
    # Only a true zero hides the image: "opacity:0" or "opacity:0.0" match,
    # "opacity:0.9" does not.
    if re.search(r"opacity:0(?:\.0*)?(?![\d.])", compact_style):
        return None

    exclusion_pattern = re.compile(r"avatar|loading|icon|logo|banner|footer|placeholder|teaser", re.IGNORECASE)
    if any(exclusion_pattern.search(text) for text in (src, class_attr, alt)):
        return None

    return src

In [None]:
def normalize_url(url):
    """
    Reduce a URL to its scheme, host and path.

    The query string and fragment are discarded, so variants of one image that
    differ only in their parameters end up with the same key.
    """
    scheme, netloc, path = urlparse(url)[:3]
    return "://".join((scheme, netloc + path))

In [None]:
def extract_width(url):
    """
    Attempt to extract a width value from the URL query parameters.

    Parameters are tried in this order, and the first one that yields an
    integer wins:
      * ``w``      - the value itself
      * ``resize`` - first item of a comma-separated "width,height" value
      * ``h``      - the value with any "px" removed (height used as a stand-in)
      * ``crop``   - third item of a comma-separated value, "px" removed

    Returns 0 when no parameter gives a usable number.
    """
    qs = parse_qs(urlparse(url).query)

    def _from_crop(value):
        parts = value.split(',')
        # Fewer than three items: nothing to read, fall through to the default.
        return int(parts[2].replace('px', '')) if len(parts) >= 3 else None

    extractors = (
        ('w', int),
        ('resize', lambda value: int(value.split(',')[0])),
        ('h', lambda value: int(value.replace('px', ''))),
        ('crop', _from_crop),
    )

    for key, parse in extractors:
        if key not in qs:
            continue
        try:
            width = parse(qs[key][0])
        except ValueError:
            # Non-numeric value: try the next parameter. Only ValueError is
            # expected here, so nothing else is swallowed.
            continue
        if width is not None:
            return width
    return 0

In [None]:
def get_images_bs(url, min_size=300):
    """Collect filtered image URLs from a page's static HTML with BeautifulSoup.

    Images are validated with ``is_valid_image`` and grouped by
    ``normalize_url``; within a group the variant with the largest width
    reported by ``extract_width`` is kept.

    Args:
        url: Address of the page to scrape.
        min_size: Minimum declared width/height passed to ``is_valid_image``.
            NOTE(review): this default (300) differs from the 200 used by
            get_images_selenium - confirm this is intended for the comparison.

    Returns:
        A list with one image URL per group, in order of first appearance.

    Raises:
        Any exception raised while fetching the page is printed and re-raised.
    """
    # Same User-Agent as get_images_from_page_basic.
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # The timeout stops an unresponsive host from hanging the cell.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except Exception as e:
        print("An error occurred while fetching the page:", e)
        raise

    soup = BeautifulSoup(response.text, 'html.parser')

    for popup in soup.select('.popup-class'):
        popup.decompose()

    # key -> (width, url); storing the width avoids re-parsing the kept URL.
    best_by_key = {}
    for img in soup.find_all('img'):
        valid_src = is_valid_image(img, min_size, url)
        if not valid_src:
            continue
        key = normalize_url(valid_src)
        current_width = extract_width(valid_src)
        # Keep the first candidate unless a later one is strictly wider.
        if key not in best_by_key or current_width > best_by_key[key][0]:
            best_by_key[key] = (current_width, valid_src)

    return [src for _, src in best_by_key.values()]

### Testing:

Selenium VS Beautiful Soup

In [None]:
def print_found_imgs(images):
  """Print a count header followed by one image URL per line."""
  print(f"Found {len(images)} image(s):", *images, sep="\n")

In [None]:
page_url_1 = "https://www.bbc.com/news/articles/c5yv5976z9po"
page_url_2 = "https://www.spectator.co.uk/article/is-this-new-chinese-ai-even-better-than-deepseek/"
page_url_3 = "https://www.theverge.com/24353060/deepseek-ai-china-nvidia-openai"
page_url_4 = "https://www.bu.edu/articles/2025/does-chinas-deepseek-represent-a-new-frontier-in-ai/"

test_urls = [page_url_1, page_url_2, page_url_3, page_url_4]

In [None]:
def test(urls):
  """Time both scrapers on each URL and tabulate how many images each found.

  Args:
      urls: Iterable of page URLs to scrape.

  Returns:
      A DataFrame with one row per URL holding the image count and elapsed
      seconds for the Selenium and BeautifulSoup scrapers.
  """
  results = []

  # Iterate the argument, not the global `test_urls`, so the function
  # measures exactly what the caller passes in.
  for url in urls:
    row = {"URL": url}

    # Test Selenium
    start = time.time()
    images_selenium = get_images_selenium(url)
    elapsed_selenium = time.time() - start
    row["Selenium Images Found"] = len(images_selenium)
    row["Selenium Time Taken (s)"] = round(elapsed_selenium, 2)

    # Test BeautifulSoup
    start = time.time()
    images_soup = get_images_bs(url)
    elapsed_soup = time.time() - start
    row["BeautifulSoup Images Found"] = len(images_soup)
    row["BeautifulSoup Time Taken (s)"] = round(elapsed_soup, 2)

    results.append(row)

  return pd.DataFrame(results)


df_comparison = test(test_urls)
df_comparison

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,URL,Selenium Images Found,Selenium Time Taken (s),BeautifulSoup Images Found,BeautifulSoup Time Taken (s)
0,https://www.bbc.com/news/articles/c5yv5976z9po,1,16.07,1,0.3
1,https://www.spectator.co.uk/article/is-this-ne...,2,5.54,2,0.32
2,https://www.theverge.com/24353060/deepseek-ai-...,10,18.39,10,0.19
3,https://www.bu.edu/articles/2025/does-chinas-d...,0,4.83,1,0.14


In [None]:
# Test Selenium
for url in test_urls:
    images = get_images_selenium(url)
    print_found_imgs(images)
    print(" --- ")

<IPython.core.display.Javascript object>

Found 1 image(s):
https://ichef.bbci.co.uk/news/480/cpsprodpb/3cb2/live/61ec64f0-dd4c-11ef-b20c-cf1b3bd7a488.jpg.webp
 --- 


<IPython.core.display.Javascript object>

Found 2 image(s):
https://www.spectator.co.uk/wp-content/uploads/2025/03/iStock-1456167226.jpg?w=1280
https://www.spectator.co.uk/wp-content/uploads/2025/03/RossClarkSchoolsVATWeb.jpg?resize=378,213
 --- 


<IPython.core.display.Javascript object>

Found 10 image(s):
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/STKB320_DEEPSEEK_AI_CVIRGINIA_B.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/VST_0131_Site.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/247141_NOTEPAD_DEEPSEEK_AI_MICROSOFT_CVIRGINIA-1.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/DCD_013024_v1.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/chorus/uploads/chorus_asset/file/25546251/STK169_Mark_Zuckerburg_CVIRGINIA_C.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/STKB320_DEEPSEEK_AI_CVIRGINIA_D.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/si

<IPython.core.display.Javascript object>

Found 0 image(s):
 --- 


In [None]:
# Test BeautifulSoup
for url in test_urls:
    images = get_images_bs(url)
    print_found_imgs(images)

Found 1 image(s):
https://ichef.bbci.co.uk/news/480/cpsprodpb/3cb2/live/61ec64f0-dd4c-11ef-b20c-cf1b3bd7a488.jpg.webp
Found 2 image(s):
https://www.spectator.co.uk/wp-content/uploads/2025/03/cover-15032025-issue.jpg?w=358
https://www.spectator.co.uk/wp-content/uploads/2025/03/iStock-1456167226.jpg?w=1365
Found 10 image(s):
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/STKB320_DEEPSEEK_AI_CVIRGINIA_B.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/VST_0131_Site.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/247141_NOTEPAD_DEEPSEEK_AI_MICROSOFT_CVIRGINIA-1.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/DCD_013024_v1.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/chorus/uploads/chorus_asset/f

The only difference is that BeautifulSoup scraped an image whose rendered size is smaller than the minimum, but whose size declared in the HTML is larger. In this case it is the author's avatar, so it is not a photograph directly related to the contents of the article.


---

TEST 2


---





In [None]:
# Second test set (live blogs / aggregator pages).
# NOTE: this rebinds page_url_2..page_url_4 and test_urls from the first test
# set, so any earlier cell re-run after this one will use these URLs instead.
page_url_2 = "https://wiadomosci.onet.pl/kraj/lukaszenko-na-kremlu-ujawnil-zamiary-putina-w-sprawie-usa-relacja-na-zywo/vj1vlcg?utm_campaign=cb"
page_url_3 = "https://c.newsnow.co.uk/A/1265886131?-2645%3A2959%3Ann_topic_top"
page_url_4 = "https://news.sky.com/story/politics-latest-live-starmer-speech-ukraine-zelenskyy-war-trump-welfare-cuts-tories-reform-12593360"
page_url_5 = "https://www.bbc.com/news/live/cg70jylp32gt"

test_urls = [page_url_2, page_url_3, page_url_4, page_url_5]

In [None]:
def test(urls):
  """Time both scrapers on each URL and tabulate how many images each found.

  Args:
      urls: Iterable of page URLs to scrape.

  Returns:
      A DataFrame with one row per URL holding the image count and elapsed
      seconds for the Selenium and BeautifulSoup scrapers.
  """
  results = []

  # Iterate the argument, not the global `test_urls`, so the function
  # measures exactly what the caller passes in.
  for url in urls:
    row = {"URL": url}

    # Test Selenium
    start = time.time()
    images_selenium = get_images_selenium(url)
    elapsed_selenium = time.time() - start
    row["Selenium Images Found"] = len(images_selenium)
    row["Selenium Time Taken (s)"] = round(elapsed_selenium, 2)

    # Test BeautifulSoup
    start = time.time()
    images_soup = get_images_bs(url)
    elapsed_soup = time.time() - start
    row["BeautifulSoup Images Found"] = len(images_soup)
    row["BeautifulSoup Time Taken (s)"] = round(elapsed_soup, 2)

    results.append(row)

  return pd.DataFrame(results)


df_comparison = test(test_urls)
df_comparison

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,URL,Selenium Images Found,Selenium Time Taken (s),BeautifulSoup Images Found,BeautifulSoup Time Taken (s)
0,https://wiadomosci.onet.pl/kraj/lukaszenko-na-...,1,8.29,1,1.3
1,https://c.newsnow.co.uk/A/1265886131?-2645%3A2...,12,9.77,0,0.3
2,https://news.sky.com/story/politics-latest-liv...,9,7.2,0,0.12
3,https://www.bbc.com/news/live/cg70jylp32gt,12,10.1,14,0.26


In [None]:
# Test Selenium
for url in test_urls:
    images = get_images_selenium(url)
    print_found_imgs(images)
    print(" --- ")

<IPython.core.display.Javascript object>

Found 1 image(s):
https://ocdn.eu/pulscms-transforms/1/7VAk9kpTURBXy84ZjhkMjkwZGMyNTIyMDkwMGQyYjNjZjBiNDE5NzRlZC5qcGeTlQPMxM0BrM0GAM0DYJMFzQSwzQKkkwmmYTgzODE4Bt4AAaEwAQ/wolodymyr-zelenski.jpg
 --- 


<IPython.core.display.Javascript object>

Found 12 image(s):
https://img.wort.lu/public/luxemburg/dgqieg-itv-zur-europawahl-monica-semedo-focus-31922534.jpeg/alternates/SIXTEEN_NINE_1920/itv-zur-europawahl-monica-semedo-focus-31922534.jpeg
https://img.wort.lu/public/luxemburg/c9cp3a-fernand-kartheiser-donald-trump-lettre.jpeg/alternates/SIXTEEN_NINE_1920/Fernand-Kartheiser-Donald-Trump-lettre.jpeg
https://img.virgule.lu/public/luxembourg/o31pg8-tour-alcide-de-gasperi-1965-2025.jpg/alternates/SIXTEEN_NINE_1920/tour-alcide-de-gasperi-1965-2025.jpg
https://img.luxtimes.lu/public/luxembourg/6jhqgy-th-26323415-20220607.jpg/alternates/SIXTEEN_NINE_1920/TH_26323415_20220607.jpg
https://img.wort.lu/public/luxemburg/q3xtol-th-30254049-20231117.jpg/alternates/SIXTEEN_NINE_1920/TH_30254049_20231117.jpg
https://img.luxtimes.lu/public/luxembourg/piy1lb-kirchberg-is-home-to-most-of-luxembourgs-eu-institutions-photo-pierre-matge/alternates/SIXTEEN_NINE_1920/Kirchberg%20is%20home%20to%20most%20of%20Luxembourgs%20EU%20institutions%20Photo%20Pi

<IPython.core.display.Javascript object>

Found 9 image(s):
https://liveblog.digitalimages.sky/lc-images-sky/lcimg-cdeee651-6941-4577-915b-f03c06c1d24d.jpg
https://liveblog.digitalimages.sky/lc-images-sky/lcimg-18ab2bf0-752f-46e0-a264-074061f2e7f8.jpg
https://liveblog.digitalimages.sky/lc-images-sky/lcimg-aecb8d20-6a2b-4544-b950-f3ae715e9da2.jpg
https://liveblog.digitalimages.sky/lc-images-sky/lcimg-b77869ee-46d2-402e-b5b0-2c57a199db65.jpeg
https://liveblog.digitalimages.sky/lc-images-sky/lcimg-7066197a-aa91-49b4-8d79-fbdceeb91e80.jpg
https://liveblog.digitalimages.sky/lc-images-sky/lcimg-60a11157-3576-4b7f-b61d-ab906953ecfb.jpeg
https://liveblog.digitalimages.sky/lc-images-sky/lcimg-065e90e5-7418-4a5c-ab55-1187ec66b3bb.jpg
https://liveblog.digitalimages.sky/lc-images-sky/lcimg-3c46d87f-eea0-4ed8-ad55-a7faf6ee6025.jpeg
https://liveblog.digitalimages.sky/lc-images-sky/lcimg-a6da1bf3-f875-491f-a5b7-5979bb67a5d4.jpeg
 --- 


<IPython.core.display.Javascript object>

Found 12 image(s):
https://ichef.bbci.co.uk/ace/standard/1024/cpsprodpb/04a6/live/ac7b8e00-fc01-11ef-896e-d7e7fb1719a4.jpg
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/436c2105-59ed-434e-8551-bee3f52bcba4.jpg.webp
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/acba735e-4465-4111-bc0c-1e1f3d4c2045.jpg.webp
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/022049d6-3042-41f0-a019-237d6afeff36.jpg.webp
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/a4ab30fa-fbb2-4113-8f26-dc1afc7d2a6c.jpg.webp
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/89c70375-0afa-4e88-b3aa-e179f926989f.jpg.webp
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/9a965970-a856-436d-9994-3b85b9ec88ff.jpg.webp
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/85bc566d-e6bd-45e9-af66-42cacf1932cc.jpg.webp
https://i

In [None]:
# Test BeautifulSoup
for url in test_urls:
    images = get_images_bs(url)
    print_found_imgs(images)

Found 1 image(s):
https://ocdn.eu/pulscms-transforms/1/7VAk9kpTURBXy84ZjhkMjkwZGMyNTIyMDkwMGQyYjNjZjBiNDE5NzRlZC5qcGeTlQPMxM0BrM0GAM0DYJMFzQSwzQKkkwmmYTgzODE4Bt4AAaEwAQ/wolodymyr-zelenski.jpg
Found 0 image(s):
Found 0 image(s):
Found 14 image(s):
https://ichef.bbci.co.uk/ace/standard/1024/cpsprodpb/04a6/live/ac7b8e00-fc01-11ef-896e-d7e7fb1719a4.jpg
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/436c2105-59ed-434e-8551-bee3f52bcba4.jpg.webp
https://static.files.bbci.co.uk/core/website/assets/static/news/incident-types/bbc-verify.bac8ff232a.svg
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/acba735e-4465-4111-bc0c-1e1f3d4c2045.jpg.webp
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/022049d6-3042-41f0-a019-237d6afeff36.jpg.webp
https://ichef.bbci.co.uk/ace/standard/640/cpsprodpb/vivo/live/images/2025/3/8/a4ab30fa-fbb2-4113-8f26-dc1afc7d2a6c.jpg.webp
https://ichef.bbci.co.uk/ace/standard/640/cpspro



---

CONCLUSIONS:


---

BeautifulSoup is **significantly** faster; however, since it does not render JavaScript, it cannot scrape images from longer articles that use lazy loading. Selenium not only lets us scrape more, but can also check the rendered size of an image, which is very useful for telling article photos apart from thumbnails of other articles, avatars, etc. Ultimately, the choice is between speed and accuracy. Perhaps we should also consider:
* limiting the number of scraped images to top X (e.g. first 5)
* limiting the images to bigger sizes (sometimes thumbnails can be quite big, more than 300px even)



Worth noting:
* some websites block web scraping, for example Politico and X (formerly Twitter)

## Sending Requests to API in bulk

### Example image set:

# Text scraping demo for:
- text scraping
- preprocessing scraped text (remove ads, comments, footnotes)
- identify named entities (extract names of people, places, organizations)
- segment into claims (e.g. NLTK tokenization)
- cross check

## Configuring the environment

In [29]:
!pip install -q newspaper3k
!pip install -q lxml_html_clean
!pip install -q trafilatura

In [45]:
import trafilatura
import pandas as pd
import time
import difflib
import requests

from newspaper import Article
from difflib import SequenceMatcher

In [60]:
import spacy
import pandas as pd

nlp = spacy.load("en_core_web_sm")

## Test data

In [3]:
page_url_1 = "https://www.bbc.com/news/articles/c5yv5976z9po"
page_url_2 = "https://www.spectator.co.uk/article/is-this-new-chinese-ai-even-better-than-deepseek/"
page_url_3 = "https://www.theverge.com/24353060/deepseek-ai-china-nvidia-openai"
page_url_4 = "https://www.bu.edu/articles/2025/does-chinas-deepseek-represent-a-new-frontier-in-ai/"

test_urls_1 = [page_url_1, page_url_2, page_url_3, page_url_4]

In [4]:
page_url_2 = "https://wiadomosci.onet.pl/kraj/lukaszenko-na-kremlu-ujawnil-zamiary-putina-w-sprawie-usa-relacja-na-zywo/vj1vlcg?utm_campaign=cb"
page_url_3 = "https://c.newsnow.co.uk/A/1265886131?-2645%3A2959%3Ann_topic_top"
page_url_4 = "https://news.sky.com/story/politics-latest-live-starmer-speech-ukraine-zelenskyy-war-trump-welfare-cuts-tories-reform-12593360"
page_url_5 = "https://www.bbc.com/news/live/cg70jylp32gt"

test_urls_2 = [page_url_2, page_url_3, page_url_4, page_url_5]

In [131]:
test_url = ["https://www.nature.com/articles/d41586-025-00259-0"] #not good example, as it blocks web scraping
test_url_2 = ["https://notthebee.com/article/california-is-preparing-to-forbid-school-employees-from-informing-parents-if-their-child-identifies-as-the-opposite-sex"]

## Scrape text

In [62]:
def scrape_text_newspaper3k(url):
  """Download a page with requests and extract its article text with newspaper3k.

  Args:
      url: Address of the article.

  Returns:
      A ``(text, seconds)`` tuple. On success ``text`` is the stripped article
      text and ``seconds`` the HTTP round-trip time reported by requests. On a
      non-200 response or any exception, ``text`` is an error message instead
      (with ``seconds`` set to 0 for exceptions); nothing is raised.
  """
  headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept-Language": "en-US,en;q=0.9",
  }
  try:
    # The context manager closes the session's connections when done.
    with requests.Session() as session:
      response = session.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
      article = Article(url)
      article.set_html(response.text)   # Use our session-fetched HTML
      article.parse()
      return article.text.strip(), response.elapsed.total_seconds()
    else:
      # requests exposes the HTTP status as `status_code`; `status` does not
      # exist and used to turn this branch into an AttributeError.
      return (f"Newspaper fetch failed, status: {response.status_code}", response.elapsed.total_seconds())
  except Exception as e:
    return (f"Newspaper error on: {e}", 0)

In [85]:
def scrape_text_trafilatura(url, debug_html_path="raw_html.html"):
  """Download a page with requests and extract its main text with trafilatura.

  Args:
      url: Address of the article.
      debug_html_path: File that receives the raw HTML of the response for
          inspection; pass ``None`` to skip writing it. The default keeps the
          previous behaviour of overwriting "raw_html.html" on every call.

  Returns:
      A ``(text, seconds)`` tuple. On success ``text`` is the extracted text
      ("" when trafilatura finds nothing) and ``seconds`` the HTTP round-trip
      time reported by requests. On a non-200 response or any exception,
      ``text`` is an error message instead (with ``seconds`` set to 0 for
      exceptions); nothing is raised.
  """
  headers = {
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
      "Accept-Language": "en-US,en;q=0.9",
  }

  try:
    # The context manager closes the session's connections when done.
    with requests.Session() as session:
      response = session.get(url, headers=headers, timeout=10)
    if debug_html_path is not None:
      with open(debug_html_path, "w", encoding="utf-8") as file:
        file.write(response.text)
    if response.status_code == 200:
      text = trafilatura.extract(response.text)
      return text if text else "", response.elapsed.total_seconds()
    else:
      return (f"[trafilatura] fetch failed, status: {response.status_code}", response.elapsed.total_seconds())
  except Exception as e:
    return (f"[trafilatura] error on: {e}", 0)

In [7]:
def compare_texts(text1, text2):
    """Return the words unique to each text according to a word-level diff.

    Args:
        text1: First text (the newspaper3k output in this notebook).
        text2: Second text (the trafilatura output in this notebook).

    Returns:
        ``(only_in_text1, only_in_text2)``: two space-joined strings holding the
        words the diff marks as removed ("- ") and added ("+ ") respectively.
    """
    # ndiff yields lazily; materialise it so it can be scanned twice. Reusing
    # the generator left the second result permanently empty.
    diff = list(difflib.ndiff(text1.split(), text2.split()))
    only_newspaper = ' '.join(word[2:] for word in diff if word.startswith('- '))
    only_trafilatura = ' '.join(word[2:] for word in diff if word.startswith('+ '))
    return only_newspaper, only_trafilatura

In [8]:
def compare_np_tr(urls):
  """Scrape every URL with both extractors and tabulate how the outputs differ.

  For each URL the two texts are aligned character by character with
  ``SequenceMatcher``; spans tagged 'delete' are reported as newspaper3k-only
  and spans tagged 'insert' as trafilatura-only.

  Returns:
      A DataFrame with one row per URL: fetch times, text lengths and the
      space-joined one-sided spans.
  """
  rows = []

  for url in urls:
    np_text, np_time = scrape_text_newspaper3k(url)
    tr_text, tr_time = scrape_text_trafilatura(url)

    opcodes = SequenceMatcher(None, np_text, tr_text).get_opcodes()
    np_only_spans = [np_text[i1:i2] for tag, i1, i2, _, _ in opcodes if tag == 'delete']
    tr_only_spans = [tr_text[j1:j2] for tag, _, _, j1, j2 in opcodes if tag == 'insert']

    row = {}
    row['URL'] = url
    row['Newspaper3k_Time'] = round(np_time, 2)
    row['Trafilatura_Time'] = round(tr_time, 2)
    row['Newspaper3k_Chars'] = len(np_text)
    row['Trafilatura_Chars'] = len(tr_text)
    row['Only_in_Newspaper3k'] = ' '.join(np_only_spans).strip()
    row['Only_in_Trafilatura'] = ' '.join(tr_only_spans).strip()
    rows.append(row)

  return pd.DataFrame(rows)

#### Comparison

In [144]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

test_1 = compare_np_tr(test_urls_1)
test_1

Unnamed: 0,URL,Newspaper3k_Time,Trafilatura_Time,Newspaper3k_Chars,Trafilatura_Chars,Only_in_Newspaper3k,Only_in_Trafilatura
0,https://www.bbc.com/news/articles/c5yv5976z9po,0.03,0.02,7057,7522,"4 February 2025 Share Save Kelly Ng, Brandon Drenon, Tom Gerken and Marc Cieslak BBC News Share Save\n\nGetty Images DeepSeek has stunned the world - what do we know about it?\n \n \n \n \n The BBC's AI correspondent explains\n\nWho is behind DeepSeek?","To understand why DeepSeek has made such a stir, it helps to start with AI and its capability to make a computer seem like a person.\nA machine uses the technology to learn and solve problems, typically by being trained on massive amounts of information and recognising patterns. \nHow has reacted to DeepSeek impact?"
1,https://www.spectator.co.uk/article/is-this-new-chinese-ai-even-better-than-deepseek/,0.58,0.04,1125,1528,,Get Britain's best politics newsletters\nRegister to get The Spectator's insight and opinion straight to your inbox. You can then read two free articles each week.\nAlready a subscriber? Log in\nComments\nJoin the debate for just $5 for 3 months\nBe part of the conversation with other Spectator readers by getting your first three months for $5.\nUNLOCK ACCESS Just $5 for 3 monthsAlready a subscriber? Log in
2,https://www.theverge.com/24353060/deepseek-ai-china-nvidia-openai,0.02,0.02,1115,17405,,"DeepSeek’s ChatGPT competitor quickly soared to the top of the App Store, and the company is disrupting financial markets, with shares of Nvidia dipping 17 percent to cut nearly $600 billion from its market cap on January 27th, which CNBC said is the biggest single-day drop in US history. \nHere’s all the latest on DeepSeek.\n- DeepSeek engineers have handed in their China passports.\nThe China-based AI startup that made headline news earlier this year after releasing a highly capable, cost-efficient AI model, has taken the passports of some key employees, per The Information. The travel ban is intended to prevent the leak of “confidential information that could constitute trade secrets or even state secrets,” the report says, citing three anonymous sources.\nDeepSeek, a National Treasure in China, is Now Being Closely Guarded[The Information]\n- The US might restrict DeepSeek on government devices.\nOfficials are also weighing banning DeepSeek from US app stores and “putting limits” on cloud providers’ ability to offer access to its cost-efficient AI models, according to a report from The Wall Street Journal. Both the Navy and NASA have already blocked the Chinese startup’s technology over security concerns.\n- South Korea blocks DeepSeek.\nThe Chinese AI app is no longer available on local app stores after acknowledging it had failed to meet Korea’s data protection laws. The web version is still accessible, and the app will return if and when it complies with the rules.\nItaly blocked the app on similar grounds earlier this month, while the US and other countries are exploring bans for government and military devices.\n- Apple is reportedly working with Alibaba to launch AI features in China.\nWhile Apple Intelligence has reached the EU -- and, according to some, devices where it had already been declined -- the company hasn’t launched its AI features in China yet. 
A report by The Information on Tuesday indicates it could be getting closer, saying that after evaluating models from Tencent, ByteDance, Alibaba, and DeepSeek, Apple has submitted some features co-developed with Alibaba for approval by Chinese regulators.\nThe Information[theinformation.com]\n- DeepSeek gets the TikTok treatment.\nA new bipartisan bill seeks to ban Chinese AI chatbot DeepSeek from US government-owned devices to “prevent our enemy from getting information from our government.” A similar ban on TikTok was proposed in 2020, one of the first steps on the path to its recent brief shutdown and forced sale.\nAustralia, Italy, and South Korea have already enacted similar bans, as has Texas, while the US Navy and NASA have blocked the app internally.\nNvidia says its new GPUs are the fastest for DeepSeek AI, which kind of misses the point\nNvidia is touting the performance of DeepSeek’s open source AI models on its just-launched RTX 50-series GPUs, claiming that they can “run the DeepSeek family of distilled models faster than anything on the PC market.” But this announcement from Nvidia might be somewhat missing the point.\nThis week, Nvidia’s market cap suffered the single biggest one-day market cap loss for a US company ever, a loss widely attributed to DeepSeek. DeepSeek said that its new R1 reasoning model didn’t require powerful Nvidia hardware to achieve comparable performance to OpenAI’s o1 model, letting the Chinese company train it at a significantly lower cost. 
What DeepSeek accomplished with R1 appears to show that Nvidia’s best chips may not be strictly needed to make strides in AI, which could affect the company’s fortunes in the future.\nRead Article >AI is ‘an energy hog,’ but DeepSeek could change that\nDeepSeek startled everyone last month with the claim that its AI model uses roughly one-tenth the amount of computing power as Meta’s Llama 3.1 model, upending an entire worldview of how much energy and resources it’ll take to develop artificial intelligence.\nTaken at face value, that claim could have tremendous implications for the environmental impact of AI. Tech giants are rushing to build out massive AI data centers, with plans for some to use as much electricity as small cities. Generating that much electricity creates pollution, raising fears about how the physical infrastructure undergirding new generative AI tools could exacerbate climate change and worsen air quality.\nRead Article >How DeepSeek crashed the AI party\nThe DeepSeek story contains multitudes. It’s a story about the stock market, whether there’s an AI bubble, and how important Nvidia has become to so many people’s financial future. It’s also a story about China, export controls, and American AI dominance. And then, somewhere in there, there’s a story about technology: about how a startup managed to build cheaper, more efficient AI models with few of the capital and technological advantages its competitors have.\nOn this episode of The Vergecast, we talk about all these angles and a few more, because DeepSeek is the story of the moment on so many levels. Nilay and David discuss whether companies like OpenAI and Anthropic should be nervous, why reasoning models are such a big deal, and whether all this extra training and advancement actually adds up to much of anything at all. 
(Nilay has a long comparison to Bluetooth, in case that helps you guess where we land.)\nRead Article >- The too-online finance dorks are at it again.\nIncredible kicker from FT Alphaville, on top of some truly bizarre memes from Deutsche Bank. We love our blogging brethren.\nDeutsche Bank has published a bunch of memes about DeepSeek[Financial Times]\nDeepSeek database left user data, chat histories exposed for anyone to see\nDeepSeek has secured a “completely open” database that exposed user chat histories, API authentication keys, system logs, and other sensitive information, according to cloud security firm Wiz. The security researchers said they found the Chinese AI startup’s publicly accessible database in “minutes,” with no authentication required.\nThe exposed information was housed within an open-source data management system called ClickHouse and consisted of more than 1 million log lines. As noted by Wiz, the exposure “allowed for full database control and potential privilege escalation within the DeepSeek environment,” which could’ve given bad actors access to the startup’s internal systems. These findings were first reported by Wired.\nRead Article >Inside Microsoft’s quick embrace of DeepSeek\nThe Chinese startup DeepSeek shook up the world of AI last week after showing its supercheap R1 model could compete directly with OpenAI’s o1. While it wiped nearly $600 billion off Nvidia’s market value, Microsoft engineers were quietly working at pace to embrace the partially open- source R1 model and get it ready for Azure customers. It was a decision that came from the very top of Microsoft.\nSources familiar with Microsoft’s DeepSeek R1 deployment tell me that the company’s senior leadership team and CEO Satya Nadella moved with haste to get engineers to test and deploy R1 on Azure AI Foundry and GitHub over the past 10 days. 
For a corporation the size of Microsoft, it was an unusually quick turnaround, but there are plenty of signs that Nadella was ready and waiting for this exact moment.\nRead Article >DeepSeek, Stargate, and the new AI arms race\nOn today’s episode of Decoder, we’re talking about the only thing the AI industry — and pretty much the entire tech world — has been able to talk about for the last week: that is, of course, DeepSeek, and how the open-source AI model built by a Chinese startup has completely upended the conventional wisdom around chatbots, what they can do, and how much they should cost to develop.\nDeepSeek, for those unaware, is a lot like ChatGPT — there’s a website and a mobile app, and you can type into a little text box and have it talk back to you. What makes it special is how it was built. On January 20th, the startup’s most recent major release, a reasoning model called R1, dropped just weeks after the company’s last model V3, both of which began showing some very impressive AI benchmark performance. It quickly became clear that DeepSeek’s models perform at the same level, or in some cases even better, as competing ones from OpenAI, Meta, and Google. Also: they’re totally free to use.\nRead Article >Mark Zuckerberg says Meta isn’t worried about DeepSeek\nNearly everyone seems to be suddenly freaking out about the rise of DeepSeek. Meta isn’t worried, though.\nThat was CEO Mark Zuckerberg’s message to investors during his company’s fourth-quarter earnings call on Wednesday. During the Q&A portion of the call with Wall Street analysts, Zuckerberg fielded multiple questions about DeepSeek’s impressive AI models and what the implications are for Meta’s AI strategy. 
He said that what DeepSeek was able to accomplish with relatively little money has “only strengthened our conviction that this is the right thing to be focused on.”\nRead Article >- Someone might be squatting on DeepSeek’s trademark.\nJust days before DeepSeek filed an application with the US Patent and Trademark Office for its name, a company called Delson Group swooped in and filed one before it, as reported by TechCrunch. The outlet found that Delson Group’s owner has a “history of trademark squatting,” which could prove inconvenient for DeepSeek.\nBut like my colleague Sarah Jeong writes, just because someone files for a trademark doesn’t mean they’ll actually get it.\nDeepSeek might have a trademark problem in the U.S. | TechCrunch[techcrunch.com]\nMicrosoft makes DeepSeek’s R1 model available on Azure AI and GitHub\nMicrosoft is bringing Chinese AI company DeepSeek’s R1 model to its Azure AI Foundry platform and GitHub today. The R1 model, which has rocked US financial markets this week because it can be trained at a fraction of the cost of leading models from OpenAI, is now part of a model catalog on Azure AI Foundry and GitHub — allowing Microsoft’s customers to integrate it into their AI applications.\n“One of the key advantages of using DeepSeek R1 or any other model on Azure AI Foundry is the speed at which developers can experiment, iterate, and integrate AI into their workflows,” says Asha Sharma, Microsoft’s corporate vice president of AI platform. 
“DeepSeek R1 has undergone rigorous red teaming and safety evaluations, including automated assessments of model behavior and extensive security reviews to mitigate potential risks.”\nRead Article >OpenAI has evidence that its models helped train China’s DeepSeek\nChinese artificial intelligence company DeepSeek disrupted Silicon Valley with the release of cheaply developed AI models that compete with flagship offerings from OpenAI — but the ChatGPT maker suspects they were built upon OpenAI data.\nOpenAI and Microsoft are investigating whether the Chinese rival used OpenAI’s API to integrate OpenAI’s AI models into DeepSeek’s own models, according to Bloomberg. The outlet’s sources said Microsoft security researchers detected that large amounts of data were being exfiltrated through OpenAI developer accounts in late 2024, which the company believes are affiliated with DeepSeek.\nRead Article >Why everyone is freaking out about DeepSeek\nIt took about a month for the finance world to start freaking out about DeepSeek, but when it did, it took more than half a trillion dollars — or one entire Stargate — off Nvidia’s market cap. It wasn’t just Nvidia, either: Tesla, Google, Amazon, and Microsoft tanked.\nDeepSeek’s two AI models, released in quick succession, put it on par with the best available from American labs, according to Alexandr Wang, Scale AI CEO. And DeepSeek seems to be working within constraints that mean it trained much more cheaply than its American peers. One of its recent models is said to cost just $5.6 million in the final training run, which is about the salary an American AI expert can command. Last year, Anthropic CEO Dario Amodei said the cost of training models ranged from $100 million to $1 billion. OpenAI’s GPT-4 cost more than $100 million, according to CEO Sam Altman. 
DeepSeek seems to have just upended our idea of how much AI costs, with potentially enormous implications across the industry.\nRead Article >- OpenAI CEO Sam Altman on DeepSeek R1: “an impressive model.”\nNow that a Chinese startup has captured a lot of the AI buzz, what happens next?\nThe ChatGPT boss says of his company, “we will obviously deliver much better models and also it’s legit invigorating to have a new competitor,” then, naturally, turns the conversation to AGI.\nTrump says he’ll put tariffs on imported chips ‘in the near future’\nWithout going into detail about what might happen to the $52 billion in subsidies from the CHIPS Act under his administration, Donald Trump said tariffs on foreign computer chips, semiconductors, and pharmaceuticals are coming “in the near future.” He also namechecked DeepSeek’s AI releases, saying, “...coming up with a faster method of AI and less expensive, that’s good. I view that as a positive if it is fact and it is true, and nobody knows, but I view that as a positive.”\nIn the speech at the House GOP Issues Conference held at the Trump National Doral Resort in Miami Monday afternoon, he said that to return the production of these goods to the US, “we don’t want to give them billions of dollars like this ridiculous program Biden has.” Instead the incentive for manufacturers will be “they will not want to pay a tax.”\nRead Article >- Nvidia’s market cap drops by almost $600 billion amid DeepSeek R1 hype.\nAs Chinese AI startup DeepSeek draws attention for open-source AI models that it says are cheaper than the competition while providing similar or better performance, AI chip king Nvidia’s stock price dropped today.\nCNBC said that after closing at $118.58, down 17 percent, this was “the biggest drop ever for a U.S. 
company.”\n- Nvidia responds to the DeepSeek hype.\nIn a statement to Bloomberg, an Nvidia spokesperson said DeepSeek is an “excellent AI advancement” and shows how a company can create new AI models using the test-time scaling method, while “leveraging widely-available models and compute that is fully export control compliant.”\n- DeepSeek says its newest AI model, Janus-Pro can outperform Stable Diffusion and DALL-E 3.\nAlready riding a wave of hype over its R1 “reasoning” AI that is atop the app store charts and shifting the stock market, Chinese startup DeepSeek has released another new open-source AI model: Janus-Pro.\nInput image analysis is limited to 384x384 resolution, but the company says the largest version, Janus-Pro-7b, beat comparable models on two AI benchmark tests.\nCorrection: As TechCrunch notes, Janus-Pro image input is listed as limited to low resolution, not its output.\nDeepSeek’s top-ranked AI app is restricting sign-ups due to ‘malicious attacks’\nAfter surging to the top of Apple’s App Store charts in the US, DeepSeek’s AI Assistant is now restricting new user sign-ups. According to an incident report page, registrations are being temporarily limited “due to large-scale malicious attacks on DeepSeek’s services,” though it’s unclear how these limitations are being applied.\n“Existing users can log in as usual,” DeepSeek said in its update. “Thanks for your understanding and support.” An alert banner on the DeepSeek web sign-up page says that “registration may be busy,” rather than entirely restricted, however, and encourages users to wait and “try again” if their application is unsuccessful.\nRead Article >China’s DeepSeek AI is hitting Nvidia where it hurts\nA chatbot made by Chinese artificial intelligence startup DeepSeek has rocketed to the top of Apple’s App Store charts in the US this week, dethroning OpenAI’s ChatGPT as the most downloaded free app. 
The eponymous AI assistant is powered by DeepSeek’s open-source models, which the company says can be trained at a fraction of the cost using far fewer chips than the world’s leading models. The claim has riled financial markets, with Nvidia’s share price dropping over 12 percent in pre-market trading.\nDownloads for the app exploded shortly after DeepSeek released its new R1 reasoning model on January 20th, which is designed for solving complex problems and reportedly performs as well as OpenAI’s o1 on certain benchmarks.\nRead Article >"
3,https://www.bu.edu/articles/2025/does-chinas-deepseek-represent-a-new-frontier-in-ai/,0.04,0.02,7302,8011,"Photo by Jaap Arriens/NurPhoto via AP Science & Technology Does China’s DeepSeek Represent a New—and Much Cheaper—Frontier in AI Technology? While not exactly like the Space Race, China’s bold advancement may herald a reckoning in the United States, BU computer science professor says\n\nAs tech companies in the United States collectively pour billions—soon maybe trillions—of dollars into developing powerful artificial intelligence tools, a small Chinese technology start-up has shown the world that it might be possible to do it for less. A lot less. Raising all sorts of questions about the future of AI.\n\nThe scrappy Chinese start-up DeepSeek splashed onto the scene and upended US financial markets when it recently revealed that DeepSeek-R1, an AI model that rivals the best technology from domestic companies such as Microsoft and Google, was built for about $6 million—a sliver of what Meta is spending on its latest AI program.\n \n \n \n \n Q & A with Mark Crovella \n\n\nThe other improvement that they made is that they’ve adopted a different strategy for training these models from a nearby technology called reinforcement learning. For computer engineers, this is a very well understood concept, but [the DeepSeek engineers] thought about using it in a slightly new way and it turned out to work extremely well. How big a deal is the DeepSeek technology? Can you put it into context? Crovella: There has been a scaling law that engineers have noticed over the past roughly 6 to 10 years. The scaling law says that every time we increase the amount of data and the amount of computation in these models—every time we make a major increase—we see a major increase in performance. And that’s been empirically borne out for quite a few years now. 
And so, that’s where the motivation comes from for people and companies to spend hundreds of billions of dollars, because they think that that gives them the potential for a corresponding improvement in performance, and that would give them a business advantage.\n\n\n\nAnd so the DeepSeek announcement doesn’t completely negate all of the empirical evidence from the past, but what we’re seeing is that maybe if you want to get a 10 times increase in performance, maybe you don’t have to buy 10 times as much hardware. Maybe there are algorithmic and methodological improvements that could get us there instead. BU Today:","Does China’s DeepSeek Represent a New—and Much Cheaper—Frontier in AI Technology?\nWhile not exactly like the Space Race, China’s bold advancement may herald a reckoning in the United States, BU computer science professor says\nAs tech companies in the United States collectively pour billions—soon maybe trillions—of dollars into developing powerful artificial intelligence tools, a small Chinese technology start-up has shown the world that it might be possible to do it for less. A lot less. Raising all sorts of questions about the future of AI. BU Today spoke with Crovella about the technology and what it means for the AI race.\nQ&A\nwith Mark Crovella The other improvement that they made is that they’ve adopted a different strategy for training these models from a nearby technology called reinforcement learning. For computer engineers, this is a very well understood concept, but [the DeepSeek engineers] thought about using it in a slightly new way and it turned out to work extremely well. big a deal s been empirically borne out for quite a few years now. And so, that’s where the moti \nBU Today: Tech venture capitalist Marc Andreessen said that DeepSeek is “AI’s Sputnik moment.” Do you agree?\nI think the thing to recognize is that there’s no real barrier to the flow of ideas. 
[The United States] tried to put up a barrier to the flow of hardware and prevent China from using our latest [graphics processing unit] hardware, but the ideas flow completely freely across borders, and so you can’t really stop folks in other countries from advancing. So, I don’t see it exactly like the Space Race, but I do see it as making clear that there’s now a critical mass of machine learning expertise in China that’s capable of, at least at times, creating advances that haven’t occurred to anyone in the United States.\nComments & Discussion\nBoston University moderates comments to facilitate an informed, substantive, civil conversation. Abusive, profane, self-promotional, misleading, incoherent or off-topic comments will be rejected. Moderators are staffed during regular business hours (EST) and can only accept comments written in English. Statistics or facts must include a citation or a link to the citation."



###conclusions
---

At this point it seems difficult to decide which library is better for text extraction. In the first example, Newspaper3k (np) extracted 'rubbish' info like *'4 February 2025 Share Save Kelly Ng, Brandon Drenon, Share Save\n\nGetty Images'*, which seems to be a byline and image credit rather than article text. But in the second example, it was Trafilatura (tr) that extracted useless info: *'\nUNLOCK ACCESS Just $5 for 3 monthsAlready a subscriber? Log in'*

---



In [49]:
# Lift pandas' display limits so long extracted-text cells and all rows are shown untruncated.
# Note: these options are global and stay in effect for every later cell.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

# compare_np_tr is defined outside this section; its result is displayed as the cell output.
test_1 = compare_np_tr(test_url)
test_1

Unnamed: 0,URL,Newspaper3k_Time,Trafilatura_Time,Newspaper3k_Chars,Trafilatura_Chars,Only_in_Newspaper3k,Only_in_Trafilatura
0,https://www.nature.com/articles/d41586-025-00259-0,0.31,0.33,333,683,"In the meantime, to ensure continued support, we are displaying the site without styles and JavaScript.",


In [125]:
def get_trafilatura_text(urls):
    """Print the Trafilatura-extracted text for every URL in ``urls``.

    Args:
        urls: Iterable of URL strings. Each one is passed to
            ``scrape_text_trafilatura`` (defined outside this section), which
            returns a ``(text, elapsed_time)`` pair; only the text is used.

    Returns:
        None. The URL and its extracted text are printed.
    """
    for url in urls:
        tr_text, _ = scrape_text_trafilatura(url)
        print(f"url: {url}")
        # Guard against a missing/empty extraction so one failed page does not
        # abort the whole loop with an AttributeError on ``.strip()``.
        # NOTE(review): presumably the scraper can return None when
        # trafilatura finds no main text -- confirm against its definition.
        if tr_text:
            print(tr_text.strip())
        else:
            print("(no text extracted)")
get_trafilatura_text(test_url)

url: https://www.nature.com/articles/d41586-025-00259-0
- NEWS
How China created AI model DeepSeek and shocked the world
Access options
Access Nature and 54 other Nature Portfolio journals
Get Nature+, our best-value online-access subscription
$29.99 / 30 days
cancel any time
Subscribe to this journal
Receive 51 print issues and online access
$199.00 per year
only $3.90 per issue
Rent or buy this article
Prices vary by article type
from$1.95
to$39.95
Prices may be subject to local taxes which are calculated during checkout
Nature 638, 300-301 (2025)
doi: https://doi.org/10.1038/d41586-025-00259-0
Additional reporting by Elizabeth Gibney.
References
Liu, A. et al. Preprint at arXiv https://doi.org/10.48550/arXiv.2412.19437 (2024).


Here it's evident that only the page header and boilerplate get scraped, not the article body. Let's try using Selenium to render the JavaScript-driven content.

In [106]:
# !apt-get update -qq
# !apt-get install -qq -y chromium-chromedriver
# !cp /usr/lib/chromium-browser/chromedriver /usr/bin

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
cp: '/usr/lib/chromium-browser/chromedriver' and '/usr/bin/chromedriver' are the same file


In [120]:
def scrape_article_selenium(url, timeout=10, settle_seconds=3):
    """Load ``url`` in a Selenium-driven Chrome and return the rendered HTML.

    The page is opened, a cookie banner with an "Accept" button is dismissed
    if one shows up, the page is scrolled via ``scroll_page`` and the final
    ``driver.page_source`` is returned. The browser is always closed.

    Args:
        url: Address of the page to render.
        timeout: Maximum number of seconds to wait for the document to reach
            ``readyState == "complete"`` and, separately, for the cookie
            button to become clickable.
        settle_seconds: Fixed pause after the cookie step, before scrolling.

    Returns:
        The page source (HTML string) after scrolling.

    Raises:
        Exception: Any error raised while loading or scrolling the page is
            printed and re-raised. A missing cookie banner is not an error.
    """
    options = set_options()
    driver = gcs.Chrome(options=options)

    try:
        # Apply the stealth patches BEFORE navigating. selenium-stealth
        # registers its evasion scripts to run on newly loaded documents, so
        # calling it after driver.get() leaves the already-loaded page
        # unpatched.
        stealth(driver,
                languages=["en-US", "en"],
                vendor="Google Inc.",
                platform="Win32",
                webgl_vendor="Intel Inc.",
                renderer="Intel Iris OpenGL Engine",
                fix_hairline=True,
               )

        driver.get(url)
        WebDriverWait(driver, timeout).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )

        # Best-effort cookie consent: many pages have no such banner, so a
        # failure here is reported and otherwise ignored.
        try:
            cookie_button = WebDriverWait(driver, timeout).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Accept')]"))
            )
            cookie_button.click()
            print("Cookie popup accepted.")
        except Exception as e:
            print("Cookie popup not found or already dismissed:", e)

        # Give the page a moment to settle after the banner is dismissed.
        time.sleep(settle_seconds)

        scroll_page(driver)

        html = driver.page_source

    except Exception as e:
        print("An error occurred:", e)
        raise

    finally:
        # Always release the browser, even when scraping failed.
        driver.quit()

    return html

In [121]:
# Render the page with Selenium first, then let Trafilatura extract the main
# text from the rendered HTML instead of from a plain HTTP response.
url = "https://www.nature.com/articles/d41586-025-00259-0"
html = scrape_article_selenium(url)

extracted_text = trafilatura.extract(html)
print(extracted_text)

<IPython.core.display.Javascript object>

Cookie popup accepted.
- NEWS
How China created AI model DeepSeek and shocked the world
Access options
Access Nature and 54 other Nature Portfolio journals
Get Nature+, our best-value online-access subscription
$29.99 / 30 days
cancel any time
Subscribe to this journal
Receive 51 print issues and online access
$199.00 per year
only $3.90 per issue
Rent or buy this article
Prices vary by article type
from$1.95
to$39.95
Prices may be subject to local taxes which are calculated during checkout
Nature 638, 300-301 (2025)
doi: https://doi.org/10.1038/d41586-025-00259-0
Additional reporting by Elizabeth Gibney.
References
Liu, A. et al. Preprint at arXiv https://doi.org/10.48550/arXiv.2412.19437 (2024).


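**conclusions**: we still got only the headers. nature.com may have anti-scraping measures, but the "Access options" block in the output suggests the article body is simply behind a paywall, so rendering JavaScript does not help here.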

-
-
-

note: there are pages where you can scroll forever (infinite scroll), so we definitely need to set limits on scrolling

##Identify named entities

In [31]:
!pip install spacy -q
!python -m spacy download en_core_web_sm  # Small English model

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m76.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [32]:
def extract_entities(text):
    """Return the unique named entities found in ``text``.

    Args:
        text: Text to analyse. ``None`` or an empty string yields an empty
            set instead of being passed to the pipeline.

    Returns:
        A set of ``(entity_text, entity_label)`` tuples, with surrounding
        whitespace stripped from the entity text.

    Relies on the module-level ``nlp`` pipeline, which is created outside
    this section (presumably the spaCy ``en_core_web_sm`` model downloaded
    above -- confirm).
    """
    # A failed scrape can hand us no text at all; treat that as "no entities".
    if not text:
        return set()

    doc = nlp(text)
    return {(ent.text.strip(), ent.label_) for ent in doc.ents}

def compare_entities_bulk(urls):
    """Compare the named entities found by both extractors for each URL.

    Every URL is scraped with Newspaper3k and with Trafilatura (helpers
    defined outside this section), entities are pulled from both texts with
    ``extract_entities``, and the two entity sets are diffed.

    Args:
        urls: Iterable of URL strings.

    Returns:
        A DataFrame with one row per URL holding the URL, the entity count
        per extractor, and the set of entities unique to each extractor.
    """
    def _entity_row(page_url):
        # The scrapers return (text, elapsed_time); timing is not needed here.
        newspaper_text, _newspaper_time = scrape_text_newspaper3k(page_url)
        trafilatura_text, _trafilatura_time = scrape_text_trafilatura(page_url)

        newspaper_ents = extract_entities(newspaper_text)
        trafilatura_ents = extract_entities(trafilatura_text)

        return {
            'URL': page_url,
            'Newspaper3k_Entities_Count': len(newspaper_ents),
            'Trafilatura_Entities_Count': len(trafilatura_ents),
            'Entities_Only_in_Newspaper3k': newspaper_ents - trafilatura_ents,
            'Entities_Only_in_Trafilatura': trafilatura_ents - newspaper_ents,
        }

    return pd.DataFrame([_entity_row(page_url) for page_url in urls])

In [17]:
# Run the entity comparison over the test URLs (test_urls_1 is defined outside
# this section) and display the resulting table as the cell output.
entity_comparison_df = compare_entities_bulk(test_urls_1)

entity_comparison_df

Unnamed: 0,URL,Newspaper3k_Entities_Count,Trafilatura_Entities_Count,Entities_Only_in_Newspaper3k,Entities_Only_in_Trafilatura
0,https://www.bbc.com/news/articles/c5yv5976z9po,74,73,"{(Brandon Drenon, PERSON), (February 2025, DATE), (Tom Gerken, PERSON), (BBC News Share Save, ORG), (Marc Cieslak, PERSON), (America, GPE), (Nvidia, ORG)}","{(the University of Technology Sydney, ORG), (West, LOC), (the Communist Party, ORG), (Marina Zhang, PERSON), (João da Silva, PERSON), (Liv McMahon, PERSON)}"
1,https://www.spectator.co.uk/article/is-this-new-chinese-ai-even-better-than-deepseek/,11,23,{},"{(Register, ORG), (Just $5, MONEY), (each week, DATE), (Britain, GPE), (5, MONEY), (first three months, DATE), (3, CARDINAL), (UNLOCK, ORG), (just $5, MONEY), (3 months, DATE), (two, CARDINAL), (The Spectator's, WORK_OF_ART)}"
2,https://www.theverge.com/24353060/deepseek-ai-china-nvidia-openai,17,151,{},"{(AGI, ORG), (earlier this month, DATE), (17, CARDINAL), (Delson Group’s, ORG), (Alexandr Wang, PERSON), (earlier this year, DATE), (Stargate, ORG), (more than $100 million, MONEY), (Tuesday, DATE), (two, CARDINAL), (ClickHouse, ORG), (R1, ORG), (Q&A, ORG), (Wired, GPE), (Navy, ORG), (last week, DATE), (Monday, DATE), (Dario Amodei, PERSON), (Donald Trump, PERSON), (NASA, ORG), (minutes, TIME), (David, PERSON), (DeepSeek’s R1, PERSON), (nearly $600 billion, MONEY), (the last week, DATE), (Nintendo, ORG), (R1, PRODUCT), (afternoon, TIME), (fourth-quarter, DATE), (Being Closely Guarded[The Information, ORG), (TikTok, ORG), (just weeks, DATE), (the past 10 days, DATE), (Starlink, ORG), (DeepSeek AI, PRODUCT), (Meta’s AI, ORG), (Miami, GPE), (17 percent, PERCENT), (January 27th, DATE), (Scale AI, PERSON), (TechCrunch[techcrunch.com, ORG), (bill seeks, PERSON), (Sarah Jeong, PERSON), (DeepSeek R1, WORK_OF_ART), (50, CARDINAL), (118.58, MONEY), (Bloomberg, GPE), (U.S., GPE), (DeepSeek R1, ORG), (Nvidia, PERSON), (DeepSeek, NORP), (the Trump National Doral Resort, ORG), (Delson Group, ORG), (the App Store, PERSON), (Mark Zuckerberg, PERSON), (Anthropic, NORP), (Air, ORG), (the US Navy, ORG), (OpenAI’s AI, ORG), (American, NORP), (Australia, GPE), (The Wall Street Journal, ORG), (DeepSeek, CARDINAL), (DeepSeek R1, PRODUCT), (Amazon, ORG), (Biden, PERSON), (Apple’s App Store, ORG), (GitHub, ORG), (Satya Nadella, PERSON), (EU, ORG), (South Korea, GPE), ($100 million to $1 billion, MONEY), (Trump, PERSON), (Alibaba, GPE), (Azure AI Foundry and, WORK_OF_ART), (Nvidia, ORG), (today, DATE), (Nilay, ORG), (Korea, GPE), (DeepSeek, LAW), (Decoder, PRODUCT), (just $5.6 million, MONEY), (almost $600 billion, MONEY), (Siri, GPE), (Deutsche Bank, ORG), (this week, DATE), (billions of dollars, MONEY), (OpenAI’s API, ORG), (API, ORG), (Nadella, PERSON), (GitHub, PRODUCT), (Just days, DATE), (roughly one-tenth, 
CARDINAL), (Apple, ORG), (TechCrunch, ORG), (Asha Sharma, PERSON), (Last year, DATE), (more than 1 million, QUANTITY), (Meta’s Llama 3.1, ORG), (three, CARDINAL), ...}"
3,https://www.bu.edu/articles/2025/does-chinas-deepseek-represent-a-new-frontier-in-ai/,51,54,"{(Mark Crovella BU, PERSON), (NurPhoto, ORG), (Jaap Arriens, PERSON), (Today, DATE), (AP Science & Technology Does, ORG)}","{(Mark Crovella\nBU, PERSON), (AI’s Sputnik, WORK_OF_ART), (Comments & Discussion\nBoston University, ORG), (The United States, GPE), (Marc Andreessen, PERSON), (regular business hours, TIME), (English, LANGUAGE), (BU Today, ORG)}"




---

It seems that Trafilatura extracts more text than Newspaper3k. It is also a little faster, so perhaps we should focus on this library.


---



##Extracting claims

###Let's start with rule-based method, for an easy, quick check (dependency parsing)

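We'll be looking for VERB + ENTITY + NUMERIC/DATE patterns. As a first pass, the function below only checks that a sentence contains at least one verb and at least one named entity of any type; it does not yet require a numeric or date entity.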

In [82]:
def extract_potential_claims(text):
    """Return the sentences of ``text`` that look like factual claims.

    A sentence qualifies when it contains at least one token that belongs to
    a named entity and at least one token tagged as a verb.

    Args:
        text: Text to analyse. ``None`` or an empty string yields an empty
            list instead of being passed to the pipeline.

    Returns:
        A list of stripped sentence strings, in document order.

    Relies on the module-level ``nlp`` pipeline, which is created outside
    this section.
    """
    # A failed scrape can hand us no text at all; treat that as "no claims".
    if not text:
        return []

    doc = nlp(text)
    return [
        sent.text.strip()
        for sent in doc.sents
        # ent_type_ is non-empty only for tokens that are part of an entity.
        if any(tok.ent_type_ for tok in sent)
        and any(tok.pos_ == "VERB" for tok in sent)
    ]

In [137]:
def compare_claims_aligned(urls):
    """List every potential claim per URL and which extractor produced it.

    Each URL is scraped with Newspaper3k and with Trafilatura (helpers
    defined outside this section); ``extract_potential_claims`` is run on
    both texts and the union of the claims is reported, one row per claim.

    Rows are emitted in a deterministic order: the Newspaper3k claims in the
    order they were extracted, followed by the claims found only in the
    Trafilatura text. Iterating a plain set union would make the row order
    vary between kernel sessions.

    Args:
        urls: Iterable of URL strings.

    Returns:
        A DataFrame with the columns ``URL``, ``Claim``,
        ``Exists_in_Newspaper3k`` and ``Exists_in_Trafilatura``. The columns
        are present even when no claims were found.
    """
    columns = ['URL', 'Claim', 'Exists_in_Newspaper3k', 'Exists_in_Trafilatura']
    results = []

    for url in urls:
        np_text, _ = scrape_text_newspaper3k(url)
        tr_text, _ = scrape_text_trafilatura(url)

        np_claim_list = extract_potential_claims(np_text)
        tr_claim_list = extract_potential_claims(tr_text)

        # Sets for fast membership tests.
        np_claims = set(np_claim_list)
        tr_claims = set(tr_claim_list)

        # dict.fromkeys de-duplicates while keeping first-seen order.
        all_claims = dict.fromkeys([*np_claim_list, *tr_claim_list])

        for claim in all_claims:
            results.append({
                'URL': url,
                'Claim': claim,
                'Exists_in_Newspaper3k': claim in np_claims,
                'Exists_in_Trafilatura': claim in tr_claims
            })

    return pd.DataFrame(results, columns=columns)

In [154]:
# compare_claims_aligned iterates over its argument, so page_url_4 is wrapped in a list.
df_claims = compare_claims_aligned([page_url_4])
# Show the full claim text instead of pandas' truncated preview.
pd.set_option('display.max_colwidth', None)

df_claims

Unnamed: 0,URL,Claim,Exists_in_Newspaper3k,Exists_in_Trafilatura
0,https://news.sky.com/story/politics-latest-live-starmer-speech-ukraine-zelenskyy-war-trump-welfare-cuts-tories-reform-12593360,"Oxfam's domestic poverty lead Silvia Galandini said in a statement that the plan to slash the welfare bill by £5bn is ""another deplorable political choice"".",True,True
1,https://news.sky.com/story/politics-latest-live-starmer-speech-ukraine-zelenskyy-war-trump-welfare-cuts-tories-reform-12593360,"""It unnecessarily risks pushing more people into poverty and hardship while the ballooning bank balances of the UK's super-rich once again escape scot-free,"" she said.",True,True
2,https://news.sky.com/story/politics-latest-live-starmer-speech-ukraine-zelenskyy-war-trump-welfare-cuts-tories-reform-12593360,"She called for a 2% tax on people with assets of over £10m, which she said would raise £24bn each year.",True,True
3,https://news.sky.com/story/politics-latest-live-starmer-speech-ukraine-zelenskyy-war-trump-welfare-cuts-tories-reform-12593360,"Meanwhile, executive director of strategy at disability charity Scope, James Taylor, said today's announcement ""should shame the government to its core"".",True,True
4,https://news.sky.com/story/politics-latest-live-starmer-speech-ukraine-zelenskyy-war-trump-welfare-cuts-tories-reform-12593360,"""They are choosing to penalise some of the poorest people in our society. Almost half of families in poverty include someone who is disabled,"" he said, and went on to say that ""life costs more if you are disabled"".",True,True
5,https://news.sky.com/story/politics-latest-live-starmer-speech-ukraine-zelenskyy-war-trump-welfare-cuts-tories-reform-12593360,"""Ripping £5bn out of the system by 2030 will be a catastrophe for disabled peoples' living standards and independence,"" he continued.",True,True
6,https://news.sky.com/story/politics-latest-live-starmer-speech-ukraine-zelenskyy-war-trump-welfare-cuts-tories-reform-12593360,"""The government will be picking up the pieces in other parts of the system with pressure on an already overwhelmed NHS and social care, as more disabled people are pushed into poverty.""",True,True


In [185]:
# Same comparison for a second article; this rebinds df_claims from the previous run.
# NOTE(review): test_url_2 is passed as-is, so it is presumably already a list of URLs -- confirm.
df_claims = compare_claims_aligned(test_url_2)
pd.set_option('display.max_colwidth', None)

df_claims

Unnamed: 0,URL,Claim,Exists_in_Newspaper3k,Exists_in_Trafilatura
0,https://notthebee.com/article/california-is-preparing-to-forbid-school-employees-from-informing-parents-if-their-child-identifies-as-the-opposite-sex,The California Senate has approved a bill that bans schools from telling parents if their children want to change their pronouns unless the children give consent or other law requires it.,False,True
1,https://notthebee.com/article/california-is-preparing-to-forbid-school-employees-from-informing-parents-if-their-child-identifies-as-the-opposite-sex,"The California Senate has approved a bill that bans schools from telling parents if their children want to change their pronouns unless the children give consent or other law requires it. AB-1955 passed along party lines, with all 29 Democrats voting in favor and their eight Republican colleagues voting against. It now moves to the state Assembly, where it must be passed by committees and on the floor before it can be sent to Gov. Gavin Newsom's desk.",True,False
2,https://notthebee.com/article/california-is-preparing-to-forbid-school-employees-from-informing-parents-if-their-child-identifies-as-the-opposite-sex,"Seems like California is really, really interested in funneling kids into a lifetime of delusion, irreversible surgery, misery, and medical malpractice.",True,True
3,https://notthebee.com/article/california-is-preparing-to-forbid-school-employees-from-informing-parents-if-their-child-identifies-as-the-opposite-sex,"So, in other words, if the bill passes then every parent in California will have to assume as a matter of course that their child's school might not be telling them about their child's dangerous LGBTQIA+ delusion.",True,True
4,https://notthebee.com/article/california-is-preparing-to-forbid-school-employees-from-informing-parents-if-their-child-identifies-as-the-opposite-sex,"Gee, I wonder how likely it is that it'll pass and get signed by Comrade Newsom?",True,True
5,https://notthebee.com/article/california-is-preparing-to-forbid-school-employees-from-informing-parents-if-their-child-identifies-as-the-opposite-sex,P.S. Now check out our latest video 👇,True,True
6,https://notthebee.com/article/california-is-preparing-to-forbid-school-employees-from-informing-parents-if-their-child-identifies-as-the-opposite-sex,"AB-1955 passed along party lines, with all 29 Democrats voting in favor and their eight Republican colleagues voting against. It now moves to the state Assembly, where it must be passed by committees and on the floor before it can be sent to Gov. Gavin Newsom's desk.",False,True


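It seems that Trafilatura is doing better: its text yields shorter claims (rows 0 and 6), whereas the Newspaper3k text produced one merged multi-sentence claim (row 1). Note, however, that in the table above boilerplate like '*P.S. Now check out our latest video*' is present in both extractions (row 5).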
However, let's see if we can extract more relevant claims using pre-trained claim detection models

###Pre-trained Claim Detection Models

In [155]:
!pip install transformers -qq torch -qq

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m88.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [158]:
import nltk
from nltk.tokenize import sent_tokenize

# Download the Punkt tokenizer resources (both the legacy "punkt" package and
# the newer "punkt_tab" one) used for sentence splitting.
for resource_name in ('punkt', 'punkt_tab'):
    nltk.download(resource_name)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [165]:
from torch.nn.functional import softmax

In [187]:
# Scrape the text behind page_url_4; scrape_text_trafilatura returns a pair and
# only its first element (the text) is kept here.
text_1, _ = scrape_text_trafilatura(page_url_4)
# Split the scraped text into a list of sentences for per-sentence classification.
test_1 = sent_tokenize(text_1)

# Same two steps for the first URL in test_url_2.
text_2, _ = scrape_text_trafilatura(test_url_2[0])
test_2 = sent_tokenize(text_2)

####[Nithiwat/bert-base_claimbuster](https://huggingface.co/Nithiwat/bert-base_claimbuster)

In [166]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Hugging Face Hub id of the checkpoint; the tokenizer and the classifier must
# come from the same checkpoint, so the id is named once and reused.
CLAIMBUSTER_CHECKPOINT = "Nithiwat/bert-base_claimbuster"

tokenizer = AutoTokenizer.from_pretrained(CLAIMBUSTER_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(CLAIMBUSTER_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/348 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/881 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [171]:
def check_for_claims(sentences, threshold=0.5):
  """Print a claim / not-claim verdict for every sentence.

  Each sentence is encoded with the notebook-level `tokenizer`, passed through
  the notebook-level `model`, and the logits are converted to probabilities
  with `softmax`. The probability at class index 1 is used as the claim score.

  Args:
    sentences: iterable of sentence strings to classify.
    threshold: a sentence is reported as a claim when its score is strictly
      greater than this value. Defaults to 0.5.

  Returns:
    None. One line per sentence is printed, with the score to two decimals.
  """
  for sentence in sentences:
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True)
    # Inference only: disable autograd so no computation graph is built and
    # kept alive for each sentence.
    with torch.no_grad():
      outputs = model(**inputs)
      probs = softmax(outputs.logits, dim=-1)
    # Batch of one sentence -> row 0; column 1 is the score treated as "claim".
    claim_prob = probs[0][1].item()

    verdict = "✅ CLAIM" if claim_prob > threshold else "❌ NOT"
    print(f"{verdict}: {sentence} (Score: {claim_prob:.2f})")

In [172]:
# Print a claim verdict for each sentence in test_1 (tokenized from page_url_4).
check_for_claims(test_1)

❌ NOT: Charities say benefit cuts 'should shame government to its core'
We've had some reaction from charities to the government's welfare reforms - and it is not exactly positive. (Score: 0.00)
✅ CLAIM: Oxfam's domestic poverty lead Silvia Galandini said in a statement that the plan to slash the welfare bill by £5bn is "another deplorable political choice". (Score: 1.00)
✅ CLAIM: "It unnecessarily risks pushing more people into poverty and hardship while the ballooning bank balances of the UK's super-rich once again escape scot-free," she said. (Score: 1.00)
❌ NOT: "While it's critical to break down barriers to employment for everyone - including people with disabilities and long-term health issues who can and want to be in paid work - this cannot be achieved by further restricting and cutting an already inadequate social security system." (Score: 0.00)
✅ CLAIM: She called for a 2% tax on people with assets of over £10m, which she said would raise £24bn each year. (Score: 1.00)
❌ NOT:

In [188]:
# Print a claim verdict for each sentence in test_2 (tokenized from test_url_2[0]).
check_for_claims(test_2)

❌ NOT: California really is the worst state, isn't it? (Score: 0.00)
✅ CLAIM: The California Senate has approved a bill that bans schools from telling parents if their children want to change their pronouns unless the children give consent or other law requires it. (Score: 1.00)
✅ CLAIM: AB-1955 passed along party lines, with all 29 Democrats voting in favor and their eight Republican colleagues voting against. (Score: 1.00)
❌ NOT: It now moves to the state Assembly, where it must be passed by committees and on the floor before it can be sent to Gov. (Score: 0.02)
❌ NOT: Gavin Newsom's desk. (Score: 0.00)
❌ NOT: Gee, I wonder how likely it is that it'll pass and get signed by Comrade Newsom? (Score: 0.00)
✅ CLAIM: It's always best in these cases to take a look at what the legislation says, and rest assured, it very much says that,
This bill would prohibit school districts, county offices of education, charter schools, and the state special schools, and a member of the governing board o