# This is a web scraping demo, for:


*   extracting all images from an article (url)
*   filtering them (e.g. ignoring small icons, duplicates etc.)
* testing bulk requests to the external Hive API
* aggregating the results into a "trustworthiness score" for an article, post, etc.



## Environment Config

In [1]:
!pip install -q selenium
!pip install -q webdriver-manager
!pip install -q google-colab-selenium

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.9/492.9 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import requests
import re
import time
import google_colab_selenium as gcs

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup
from urllib.parse import urljoin
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor

In [3]:
# Read the API key from the Colab user-data store instead of hardcoding it here.
# NOTE(review): presumably this is the key for the Hive API mentioned in the intro — confirm.
from google.colab import userdata
API_KEY = userdata.get('API_KEY')

## Extract all images

### Basic version

In [12]:
def get_images_from_page_basic(url, timeout=10):
    """Collect the image URLs found in the static HTML of a page.

    The page is fetched with a plain HTTP GET (no JavaScript is executed), so
    only img tags present in the initial HTML are seen.

    Args:
        url: Address of the page to scan; also used as the base for resolving
            relative ``src`` values.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the notebook indefinitely.

    Returns:
        A list of absolute image URLs in document order. An empty list is
        returned (after printing an error) when the request fails or the
        response status is not 200.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Same contract as a bad status code: report and return nothing.
        print(f"Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    # Skip img tags without a usable src, then make every src absolute.
    sources = (img.get("src") for img in soup.find_all("img"))
    return [urljoin(url, src) for src in sources if src]

In [13]:
# Demo of the basic extractor on one article.
# The commented-out URL is an alternative test page; swap it with the active one to try it.
#page_url = "https://www.politico.com/news/2025/03/11/trump-pardons-white-collar-defendant-interest-00219801"
page_url = "https://www.bu.edu/articles/2025/does-chinas-deepseek-represent-a-new-frontier-in-ai/"

# Fetch the static HTML and collect every img src (no filtering is applied here).
image_urls = get_images_from_page_basic(page_url)

# List the results so they can be compared with the filtered extractor further down.
print(f"Found {len(image_urls)} images:")
for img_url in image_urls:
    print(img_url)

Found 6 images:
https://www.bu.edu/files/2025/02/deepseek-butoday_feat-crop.jpg
https://www.bu.edu/files/2022/08/resize-22-1553-MARCOMHEAD-161-3-500x500.jpg
https://secure.gravatar.com/avatar/4c5df37ceca4c17f8e763e49d4ea689e?s=150&d=mm&r=g
https://secure.gravatar.com/avatar/4fea0a9368359cc8611acb0921f361b0?s=150&d=mm&r=g
https://secure.gravatar.com/avatar/89d0948216fdd4952797c389d36c7b39?s=150&d=mm&r=g
https://www.bu.edu/wp-content/plugins/bu-sharing/images/loading.gif


### More advanced version

In [None]:
def click_element(driver, xpath, timeout=3):
  """Best-effort click on the element located by an XPath expression.

  Waits up to `timeout` seconds for the element to become clickable, logs the
  XPath, clicks it, and pauses for half a second afterwards.

  Returns:
    True if the click went through; False if any exception was raised along
    the way (for example the wait giving up). Nothing is re-raised.
  """
  try:
    locator = (By.XPATH, xpath)
    clickable = EC.element_to_be_clickable(locator)
    target = WebDriverWait(driver, timeout).until(clickable)
    print(f"Clicking {xpath}")
    target.click()
    # Short pause after the click before the caller continues.
    time.sleep(0.5)
  except Exception:
    return False
  return True



---



In [4]:
def set_options():
  """Build the Chrome options object used by the Selenium-based extractor.

  Adds the four command-line flags listed below and points the driver at the
  Chrome binary under /usr/bin.

  Returns:
    The configured Options instance.
  """
  chrome_flags = (
    "--headless=new",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    # NOTE(review): presumably there to make the browser look less like automation — confirm.
    "--disable-blink-features=AutomationControlled",
  )

  chrome_options = Options()
  for flag in chrome_flags:
    chrome_options.add_argument(flag)
  chrome_options.binary_location = "/usr/bin/google-chrome"
  return chrome_options

In [5]:
def scroll_page(driver, pause=2, max_scrolls=50):
    """Scroll to the bottom of the page until its height stops growing.

    After each scroll the function sleeps for `pause` seconds and re-reads
    ``document.body.scrollHeight``; it stops as soon as two consecutive
    readings are equal.

    Args:
        driver: Object exposing ``execute_script`` (the Selenium driver).
        pause: Seconds to sleep after each scroll before measuring again.
        max_scrolls: Upper bound on the number of scrolls, so that a page
            whose height keeps growing (endless feed) cannot keep the loop
            running forever. Pass ``None`` to scroll without a limit.

    Returns:
        None.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls_done += 1
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Height unchanged after a scroll: nothing more was loaded.
            break
        last_height = new_height

In [23]:
def get_images_from_page(url):
  """Render a page in headless Chrome and return the filtered image URLs.

  Steps: open the page, wait (up to 10 s) for ``document.readyState`` to be
  "complete", remove elements matching ``.popup-class``, scroll to the bottom
  via ``scroll_page``, then collect the img elements with an in-page script.

  The in-page script drops an image when:
    * its width or height attribute is below 100,
    * its class contains "avatar", "loading" or "icon",
    * its alt text contains "loading" (case-insensitive),
    * no http(s) URL can be derived from its src / srcset.

  The browser is always shut down, even if one of the steps raises.

  Args:
    url: Address of the page to scan.

  Returns:
    A list of absolute http(s) image URLs in document order.
  """
  options = set_options()
  driver = gcs.Chrome(options=options)
  try:
    driver.get(url)

    print("waiting for doc loading")
    WebDriverWait(driver, 10).until(lambda d: d.execute_script("return document.readyState") == "complete")

    print("waiting for removing pop-ups")
    driver.execute_script("document.querySelectorAll('.popup-class').forEach(e => e.remove());")

    print("scrolling...")
    scroll_page(driver)

    print("extracting Imgs")
    img_urls = driver.execute_script("""
      return Array.from(document.querySelectorAll('img'))
          .map(img => {
              let src = img.getAttribute('src');
              let srcset = img.getAttribute('srcset');

              let width = img.getAttribute('width');
              let height = img.getAttribute('height');
              let className = img.getAttribute('class') || "";
              let alt = img.getAttribute('alt') || "";

              // Missing or non-absolute src: prefer an absolute srcset candidate.
              if (!src || !src.startsWith('http')) {
                  if (srcset) {
                      let srcsetUrls = srcset.split(',').map(s => s.trim().split(' ')[0]);
                      src = srcsetUrls.find(url => url.startsWith('http')) || srcsetUrls[0];
                  }
              }

              let minSize = 100;
              if ((width && width < minSize) || (height && height < minSize)) return null;
              if (className.includes("avatar") || className.includes("loading") || className.includes("icon")) return null;
              if (alt.toLowerCase().includes("loading")) return null;

              if (!src) return null;
              // Resolve relative and protocol-relative URLs against the page
              // instead of silently dropping them.
              if (!src.startsWith('http')) {
                  try {
                      src = new URL(src, document.baseURI).href;
                  } catch (e) {
                      return null;
                  }
              }

              return src;
          })
          .filter(src => src !== null && src.startsWith('http'));
    """)
  finally:
    # Release the browser process whether or not the scrape succeeded.
    driver.quit()

  return img_urls


Filter rules:


*   skip **small** images (a `width` or `height` attribute below 100)
*   skip images whose class contains **avatar**, **loading**, or **icon**, or whose alt text contains **loading**



In [7]:

def print_found_imgs(images):
  """Print a "Found N images:" header followed by one image per line.

  Args:
    images: Sized collection of image URLs (anything supporting len() and
      iteration).
  """
  # One print call: the header and every entry, each terminated by a newline.
  print(f"Found {len(images)} images:", *images, sep="\n")

In [24]:
# Demo of the browser-based extractor.
# Exactly one page_url should be active; the commented-out lines are alternative test articles.
#page_url = "https://www.politico.com/news/2025/03/11/trump-pardons-white-collar-defendant-interest-00219801"
page_url = "https://www.bu.edu/articles/2025/does-chinas-deepseek-represent-a-new-frontier-in-ai/"
#page_url = "https://www.bbc.com/news/articles/c5yv5976z9po"
#page_url = "https://www.spectator.co.uk/article/is-this-new-chinese-ai-even-better-than-deepseek/"
#page_url = "https://www.theverge.com/24353060/deepseek-ai-china-nvidia-openai"

# Render the page in headless Chrome and collect the filtered image URLs.
images = get_images_from_page(page_url)

# Print the count followed by one URL per line.
print_found_imgs(images)

<IPython.core.display.Javascript object>

waiting for doc loading
waiting for removing pop-ups
['<img width="2000" height="1335" src="/files/2025/02/deepseek-butoday_feat-crop.jpg" class="" alt="Photo: The logo of DeepSeek overlayed on a pixel background" srcset="https://www.bu.edu/files/2025/02/deepseek-butoday_feat-crop.jpg 2000w, https://www.bu.edu/files/2025/02/deepseek-butoday_feat-crop-636x425.jpg 636w, https://www.bu.edu/files/2025/02/deepseek-butoday_feat-crop-1024x684.jpg 1024w, https://www.bu.edu/files/2025/02/deepseek-butoday_feat-crop-768x512.jpg 768w, https://www.bu.edu/files/2025/02/deepseek-butoday_feat-crop-1536x1025.jpg 1536w, https://www.bu.edu/files/2025/02/deepseek-butoday_feat-crop-1498x1000.jpg 1498w, https://www.bu.edu/files/2025/02/deepseek-butoday_feat-crop-900x601.jpg 900w, https://www.bu.edu/files/2025/02/deepseek-butoday_feat-crop-450x300.jpg 450w, https://www.bu.edu/files/2025/02/deepseek-butoday_feat-crop-600x401.jpg 600w, https://www.bu.edu/files/2025/02/deepseek-butoday_feat-crop-220x147.jpg 220