# This is a web scraping demo, for:


*   extracting all images from an article (url)
*   filtering them (e.g. ignoring small icons and duplicates)
* testing bulk requests to the external Hive API
* aggregating the results into a "trustworthiness score" for an article, post, etc.



## Environment Config

In [1]:
!pip install -q selenium
!pip install -q webdriver-manager
!pip install -q google-colab-selenium

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.9/492.9 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import requests
import re
import time
import google_colab_selenium as gcs

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup
from urllib.parse import urljoin
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor

In [3]:
from google.colab import userdata
# Read the key from Colab's userdata store rather than hard-coding it in the
# notebook. NOTE(review): presumably this is the key for the external Hive API
# named in the notebook intro — confirm.
API_KEY = userdata.get('API_KEY')

## Extract all images

### Basic version

In [12]:
def get_images_from_page_basic(url, timeout=10):
    """Return the absolute URL of every ``<img src>`` in the static HTML of *url*.

    The page is fetched with a plain HTTP GET (no JavaScript is executed), so
    images injected by scripts or lazy-loaded on scroll are not seen.  No
    filtering or de-duplication is applied.

    Args:
        url: Address of the page to scan.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        A list of image URLs resolved against *url*, in document order.  An
        empty list is returned (after printing the error) when the request
        fails or the response status is not 200.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Treat connection problems like a bad status code: report and
        # return "no images" instead of aborting the notebook cell.
        print(f"Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    # Skip <img> tags with a missing/empty src, and resolve relative paths
    # against the page URL.
    sources = (img.get("src") for img in soup.find_all("img"))
    return [urljoin(url, src) for src in sources if src]

In [28]:
# Alternative article to try:
#page_url = "https://www.politico.com/news/2025/03/11/trump-pardons-white-collar-defendant-interest-00219801"
page_url = "https://www.theverge.com/24353060/deepseek-ai-china-nvidia-openai"

# Baseline run: collect every <img src> from the static HTML, unfiltered.
image_urls = get_images_from_page_basic(page_url)

# Emit the summary line followed by one URL per line.
print("\n".join([f"Found {len(image_urls)} images:", *image_urls]))

Found 52 images:
https://www.google-analytics.com/g/collect?v=2&tid=G-C3QZPB4GVE&cid=555&en=noscript_page_view
https://platform.theverge.com/wp-content/uploads/sites/2/chorus/author_profile_images/195810/EMMA_ROTH.0.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/chorus/author_profile_images/197794/dominic-preston.0.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/0007835275_20.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/chorus/author_profile_images/197794/dominic-preston.0.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/chorus/author_profile_images/195819/JAY_PETERS.0.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/chorus/uploads/chorus_asset/file/25728971/STK

### More advanced version

In [None]:
def click_element(driver, xpath, timeout=3):
    """Best-effort click on the element located by *xpath*.

    Waits up to *timeout* seconds for the element to become clickable,
    clicks it, then pauses for half a second.

    Returns:
        True when the click went through; False when anything raised while
        waiting for or clicking the element (the exception is swallowed).
    """
    try:
        locator = (By.XPATH, xpath)
        waiter = WebDriverWait(driver, timeout)
        target = waiter.until(EC.element_to_be_clickable(locator))
        print(f"Clicking {xpath}")
        target.click()
        time.sleep(0.5)
    except Exception:
        # Deliberately broad: a missing or unclickable element is an
        # expected outcome here, reported to the caller as False.
        return False
    else:
        return True



---



In [4]:
def set_options():
    """Build the Chrome ``Options`` object used by the Selenium scraper.

    Returns:
        An ``Options`` instance configured with the headless/sandbox flags
        listed below and pointing at the Chrome binary in /usr/bin.
    """
    chrome_flags = (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-blink-features=AutomationControlled",
    )

    chrome_options = Options()
    chrome_options.binary_location = "/usr/bin/google-chrome"
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    return chrome_options

In [5]:
def scroll_page(driver, pause=2, max_scrolls=50):
    """Scroll to the bottom of the page until its height stops growing.

    Each step scrolls to the current bottom, waits *pause* seconds so that
    lazy-loaded content can arrive, and re-reads ``document.body.scrollHeight``.
    Scrolling stops as soon as the height is unchanged after a step.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium
            WebDriver in this notebook).
        pause: Seconds to wait after each scroll before measuring again.
        max_scrolls: Upper bound on scroll steps.  Pages with endless feeds
            keep growing forever, so without a cap the loop would never
            finish.  Pass ``None`` to scroll without a limit.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls = 0
    while max_scrolls is None or scrolls < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls += 1
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Nothing new was appended: the real bottom has been reached.
            break
        last_height = new_height

In [46]:
def get_images_from_page(url):
    """Render *url* in headless Chrome and return the filtered image URLs.

    Steps: load the page, wait for ``document.readyState == "complete"``,
    remove elements matching ``.popup-class``, scroll to the bottom so
    lazy-loaded images appear, then collect ``<img>`` URLs in the browser.

    An image is skipped when:
      * it has no usable URL (neither ``src`` nor ``srcset`` yields an
        http(s) address),
      * its ``width`` or ``height`` attribute is below 100,
      * its inline style contains ``display: none`` or ``opacity: 0``,
      * its URL, ``class`` or ``alt`` contains (case-insensitively) one of
        avatar, loading, icon, thumbnail, logo.

    Returns:
        A de-duplicated list of absolute http(s) image URLs in document order.

    Raises:
        Whatever Selenium raises (e.g. on load timeout); the browser is shut
        down in every case.
    """
    extract_images_js = """
        const MIN_SIZE = 100;
        const SKIP_WORDS = ['avatar', 'loading', 'icon', 'thumbnail', 'logo'];

        const mentionsSkipWord = text => {
            const lowered = text.toLowerCase();
            return SKIP_WORDS.some(word => lowered.includes(word));
        };

        // First srcset candidate that is already an absolute http(s) URL.
        const firstHttpCandidate = srcset => (srcset || '')
            .split(',')
            .map(entry => entry.trim().split(' ')[0])
            .find(candidate => candidate && candidate.startsWith('http')) || null;

        // Resolve relative / protocol-relative URLs; reject data:, blob: etc.
        const toAbsoluteHttp = raw => {
            try {
                const resolved = new URL(raw, document.baseURI);
                const isHttp = resolved.protocol === 'http:'
                    || resolved.protocol === 'https:';
                return isHttp ? resolved.href : null;
            } catch (err) {
                return null;
            }
        };

        const found = new Set();
        for (const img of document.querySelectorAll('img')) {
            let src = img.getAttribute('src');
            if (!src || !src.startsWith('http')) {
                src = firstHttpCandidate(img.getAttribute('srcset'))
                    || (src ? toAbsoluteHttp(src) : null);
            }
            if (!src) continue;

            // Attribute values are strings; the comparison coerces them.
            const width = img.getAttribute('width');
            const height = img.getAttribute('height');
            if ((width && width < MIN_SIZE) || (height && height < MIN_SIZE)) continue;

            const style = img.getAttribute('style') || '';
            if (style.includes('display: none') || style.includes('opacity: 0')) continue;

            const className = img.getAttribute('class') || '';
            const alt = img.getAttribute('alt') || '';
            if ([src, className, alt].some(mentionsSkipWord)) continue;

            found.add(src);
        }
        return Array.from(found);
    """

    driver = gcs.Chrome(options=set_options())
    try:
        driver.get(url)

        print("waiting for doc loading")
        WebDriverWait(driver, 10).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )

        print("waiting for removing pop-ups")
        driver.execute_script(
            "document.querySelectorAll('.popup-class').forEach(e => e.remove());"
        )

        print("scrolling...")
        scroll_page(driver)

        print("extracting Imgs")
        img_urls = driver.execute_script(extract_images_js)
    finally:
        # Always release the browser, even when loading or scripting fails;
        # otherwise every failed run leaves a Chrome process behind.
        driver.quit()

    return img_urls


Filter rules:


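*   skip **small** images (`width` or `height` attribute below 100)
* skip images whose `src`, `class` or `alt` contains: **avatar, loading, icon, thumbnail, logo**
* skip **invisible** images (inline style with `display: none` or `opacity: 0`)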



In [7]:

def print_found_imgs(images):
    """Print how many images were found, then each entry on its own line."""
    report = [f"Found {len(images)} images:"]
    report.extend(str(entry) for entry in images)
    print("\n".join(report))

In [47]:
# Sample articles for manual testing; exactly one page_url assignment should
# be active at a time.
#page_url = "https://www.politico.com/news/2025/03/11/trump-pardons-white-collar-defendant-interest-00219801"
#page_url = "https://www.bu.edu/articles/2025/does-chinas-deepseek-represent-a-new-frontier-in-ai/"
#page_url = "https://www.bbc.com/news/articles/c5yv5976z9po"
#page_url = "https://www.spectator.co.uk/article/is-this-new-chinese-ai-even-better-than-deepseek/"
page_url = "https://www.theverge.com/24353060/deepseek-ai-china-nvidia-openai"

# Run the Selenium-based extractor on the selected page.
images = get_images_from_page(page_url)

# List the image URLs that passed the filters.
print_found_imgs(images)

<IPython.core.display.Javascript object>

waiting for doc loading
waiting for removing pop-ups
scrolling...
extracting Imgs
Found 10 images:
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/STKB320_DEEPSEEK_AI_CVIRGINIA_B.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/VST_0131_Site.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/247141_NOTEPAD_DEEPSEEK_AI_MICROSOFT_CVIRGINIA-1.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/DCD_013024_v1.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/chorus/uploads/chorus_asset/file/25546251/STK169_Mark_Zuckerburg_CVIRGINIA_C.jpg?quality=90&strip=all&crop=0%2C0%2C100%2C100&w=2400
https://platform.theverge.com/wp-content/uploads/sites/2/2025/01/STKB320_DEEPSEEK_AI_CVIRGINIA_D.jpg?quality=90&strip=all&c