# This is a web-scraping demo for:


* extracting all images from an article (URL)
* filtering them (e.g. ignoring small icons, duplicates, etc.)
* testing bulk requests to the external Hive API
* aggregating the results into a "trustworthiness score" for an article, post, etc.



## Environment Config

In [1]:
# Install the Python packages used for browser automation (quiet output via -q).
!pip install -q selenium
!pip install -q webdriver-manager

In [None]:
# Refresh the package index, then install Chrome and chromedriver system packages.
# NOTE(review): no -y flag is passed to `apt-get install` — confirm these run
# unattended in the target environment.
!apt-get update -qq
!apt-get install -q google-chrome-stable
!apt-get install -q chromedriver

# Filesystem paths for the Chrome binary and the chromedriver executable.
# NOTE(review): neither variable is referenced by the cells visible here —
# confirm they are still needed.
chrome_binary = "/usr/bin/google-chrome-stable"
chrome_driver = "/usr/lib/chromium-browser/chromedriver"

In [47]:
import requests
import re
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup
from urllib.parse import urljoin
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor

In [3]:
# Read the API key from Colab's user-data store, where it is saved under the
# name 'API_KEY', so the key is never hard-coded in the notebook.
# NOTE(review): presumably this is the Hive API key mentioned in the intro — confirm.
from google.colab import userdata
API_KEY = userdata.get('API_KEY')

## Extract all images

### Basic version

In [None]:
def get_images_from_page_basic(url, timeout=10):
    """Return absolute URLs of every ``<img src=...>`` in the static HTML of *url*.

    The page is fetched with a plain HTTP GET, so images injected by
    JavaScript after load are not seen.

    Args:
        url: Address of the page to scan.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely.

    Returns:
        A list of absolute image URLs in document order (duplicates are
        kept). An empty list is returned, after printing an error line, when
        the request fails or the response status is not 200.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Treat transport failures (timeout, DNS, refused connection, ...)
        # like a bad status code: report and return nothing.
        print(f"Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    # Look up each tag's src once; tags with a missing or empty src are skipped.
    sources = (img.get("src") for img in soup.find_all("img"))

    # Resolve relative paths against the page URL.
    return [urljoin(url, src) for src in sources if src]

In [None]:
# Alternative test article (kept for quick switching):
#page_url = "https://www.politico.com/news/2025/03/11/trump-pardons-white-collar-defendant-interest-00219801"
page_url = "https://www.bu.edu/articles/2025/does-chinas-deepseek-represent-a-new-frontier-in-ai/"

image_urls = get_images_from_page_basic(page_url)

# Emit the count header followed by every URL, one per line.
report_lines = [f"Found {len(image_urls)} images:", *image_urls]
print("\n".join(report_lines))

Found 6 images:
https://www.bu.edu/files/2025/02/deepseek-butoday_feat-crop.jpg
https://www.bu.edu/files/2022/08/resize-22-1553-MARCOMHEAD-161-3-500x500.jpg
https://secure.gravatar.com/avatar/4c5df37ceca4c17f8e763e49d4ea689e?s=150&d=mm&r=g
https://secure.gravatar.com/avatar/4fea0a9368359cc8611acb0921f361b0?s=150&d=mm&r=g
https://secure.gravatar.com/avatar/89d0948216fdd4952797c389d36c7b39?s=150&d=mm&r=g
https://www.bu.edu/wp-content/plugins/bu-sharing/images/loading.gif


### More advanced version

In [25]:
def set_options():
  """Build the Chrome ``Options`` object used by the Selenium-based scraper.

  Returns:
    An ``Options`` instance carrying the command-line flags and experimental
    options listed below, applied in that order.
  """
  chrome_flags = (
    "--headless=new",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-blink-features=AutomationControlled",
  )
  experimental_settings = {
    "excludeSwitches": ["enable-automation"],
    "useAutomationExtension": False,
  }

  options = Options()
  for flag in chrome_flags:
    options.add_argument(flag)
  for setting, value in experimental_settings.items():
    options.add_experimental_option(setting, value)
  return options

In [48]:
def click_element(driver, xpath, timeout=3):
  """Best-effort click on the first element matching *xpath*.

  Args:
    driver: Selenium WebDriver to search in.
    xpath: XPath expression locating the element to click.
    timeout: Seconds to wait for the element to become clickable.

  Returns:
    True if the element became clickable and was clicked; False if any
    exception occurred while waiting or clicking (nothing is raised).
  """
  locator = (By.XPATH, xpath)
  try:
    target = WebDriverWait(driver, timeout).until(EC.element_to_be_clickable(locator))
    print(f"Clicking {xpath}")
    target.click()
    # Short pause after the click before the caller continues.
    time.sleep(0.5)
  except Exception:
    # Deliberately broad: a missing or unclickable popup is not an error here.
    return False
  return True

In [49]:
def get_images_from_page(url):
  """Return absolute URLs of every ``<img src=...>`` on *url* after rendering it in Chrome.

  The page is loaded in a Chrome instance configured by ``set_options()``.
  Once the document reports ``readyState == "complete"``, a few best-effort
  clicks try to dismiss close buttons, cookie banners and newsletter popups.
  The page is then scrolled to the bottom once and the resulting DOM is parsed.

  Args:
    url: Address of the page to scan.

  Returns:
    A list of absolute image URLs in document order (duplicates are kept).

  Raises:
    Whatever Selenium raises while loading the page or waiting for it to be
    ready (the readiness wait gives up after 5 seconds). The browser is shut
    down in every case.
  """
  # XPath 1.0 has no lower-case(); translate() is the usual substitute.
  lower_text = "translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"
  popup_xpaths = (
    # Close button
    "//*[contains(@class, 'close') or contains(@ng-click, 'close') or contains(text(), 'close')]",
    # Cookie popup
    f"//button[contains({lower_text}, 'accept') or contains({lower_text}, 'agree')]",
    # Newsletter
    f"//button[contains({lower_text}, 'continue') or contains({lower_text}, 'close')]",
  )

  options = set_options()
  driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
  try:
    driver.get(url)

    WebDriverWait(driver, 5).until(lambda d: d.execute_script("return document.readyState") == "complete")

    for xpath in popup_xpaths:
      click_element(driver, xpath)

    # Scroll to the bottom once and pause briefly before reading the DOM.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)

    page_source = driver.page_source
  finally:
    # Always release the browser process, even if loading or waiting failed.
    driver.quit()

  soup = BeautifulSoup(page_source, "html.parser")

  # Look up each tag's src once; tags with a missing or empty src are skipped.
  sources = (img.get("src") for img in soup.find_all("img"))
  return [urljoin(url, src) for src in sources if src]


In [50]:
def print_found_imgs(images):
  """Print a ``Found N images:`` header followed by one image per line.

  Args:
    images: Sized collection of image URLs (or any printable items).
  """
  report = [f"Found {len(images)} images:"]
  report.extend(str(img) for img in images)
  print("\n".join(report))

In [53]:
# Candidate test articles; exactly one page_url assignment is left uncommented.
#page_url = "https://www.politico.com/news/2025/03/11/trump-pardons-white-collar-defendant-interest-00219801"
page_url = "https://www.bu.edu/articles/2025/does-chinas-deepseek-represent-a-new-frontier-in-ai/"
#page_url = "https://www.bbc.com/news/articles/c5yv5976z9po"
#page_url = "https://www.spectator.co.uk/article/is-this-new-chinese-ai-even-better-than-deepseek/"


# Scrape the selected page with the Selenium-based extractor.
images = get_images_from_page(page_url)

# List what was found: a count header, then one URL per line.
print_found_imgs(images)

Found 6 images:
https://www.bu.edu/files/2025/02/deepseek-butoday_feat-crop.jpg
https://www.bu.edu/files/2022/08/resize-22-1553-MARCOMHEAD-161-3-500x500.jpg
https://secure.gravatar.com/avatar/4c5df37ceca4c17f8e763e49d4ea689e?s=150&d=mm&r=g
https://secure.gravatar.com/avatar/4fea0a9368359cc8611acb0921f361b0?s=150&d=mm&r=g
https://secure.gravatar.com/avatar/89d0948216fdd4952797c389d36c7b39?s=150&d=mm&r=g
https://www.bu.edu/wp-content/plugins/bu-sharing/images/loading.gif
