# This is a web scraping demo for:


* extracting all images from an article (URL)
* filtering them (e.g. ignoring small icons, duplicates, etc.)
* testing bulk requests to the external Hive API
* aggregating the results into a "trustworthiness score" for an article, post, etc.



## Environment Config

In [None]:
from google.colab import userdata
# Read the value stored under the name 'API_KEY' through Colab's `userdata`
# helper, so the key is not hard-coded in the notebook.
# NOTE(review): presumably the Hive API key mentioned in the intro — confirm.
API_KEY = userdata.get('API_KEY')

In [None]:
import requests
from bs4 import BeautifulSoup

import re

from urllib.parse import urljoin

## Extract all images

In [None]:
def get_images_from_page(url, timeout=10):
    """Fetch a web page and return the absolute URLs of its <img> tags.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, `requests.get` can block indefinitely on an
            unresponsive host.

    Returns:
        A list of absolute image URLs in document order (duplicates are
        kept; <img> tags without a non-empty `src` are skipped). An empty
        list is returned, after printing an error message, when the
        request fails or the server answers with a status other than 200.
    """
    # Send a browser-like User-Agent instead of the requests default.
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors, timeouts, invalid URLs, etc. follow the same
        # "report and return nothing" path as a non-200 response.
        print(f"Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    # Resolve every src against the page URL so relative paths become absolute.
    sources = (img.get("src") for img in soup.find_all("img"))
    return [urljoin(url, src) for src in sources if src]

In [None]:
# Alternative test article (currently disabled):
#page_url = "https://www.politico.com/news/2025/03/11/trump-pardons-white-collar-defendant-interest-00219801"
# Article whose images are scraped in this cell.
page_url = "https://www.bu.edu/articles/2025/does-chinas-deepseek-represent-a-new-frontier-in-ai/"

# Collect the absolute URL of every <img> with a src found on the page.
image_urls = get_images_from_page(page_url)

# Print a count followed by one URL per line.
print(f"Found {len(image_urls)} images:")
for img_url in image_urls:
    print(img_url)

Found 6 images:
https://www.bu.edu/files/2025/02/deepseek-butoday_feat-crop.jpg
https://www.bu.edu/files/2022/08/resize-22-1553-MARCOMHEAD-161-3-500x500.jpg
https://secure.gravatar.com/avatar/4c5df37ceca4c17f8e763e49d4ea689e?s=150&d=mm&r=g
https://secure.gravatar.com/avatar/4fea0a9368359cc8611acb0921f361b0?s=150&d=mm&r=g
https://secure.gravatar.com/avatar/89d0948216fdd4952797c389d36c7b39?s=150&d=mm&r=g
https://www.bu.edu/wp-content/plugins/bu-sharing/images/loading.gif
