### Google Image Search Scraper

In [96]:
import pandas as pd
import time

In [1]:
from dotenv import load_dotenv
import os
from serpapi import GoogleSearch
# Load variables from a local .env file into the process environment, then
# read the SerpApi key from it. The value is None when "serp_api_key" is unset.
load_dotenv()
SCRAPERAPI_KEY = os.environ.get("serp_api_key")

In [None]:
# Names used as search queries by the scraping loops in this notebook.
top_search_people = [
    'Donald Trump',
    'Kamala Harris',
    'JD Vance',
    'Joe Biden',
    'Catherine, Princess of Wales',
    'Tim Walz',
    'Mike Tyson',
    'Jill Stein',
    'Usher',
    'Imane Khelif',
]

In [None]:
# Request template for the SerpApi Google Images engine. The per-person
# query ("q") is added by the search loop before each request.
params = dict(
    engine="google_images",
    hl="en",
    gl="us",
    api_key=SCRAPERAPI_KEY,
)

In [11]:
# Run one Google Images search per person and flatten every hit into a
# single record, so the whole crawl ends up in one DataFrame.
image_fields = ("position", "thumbnail", "source", "source_logo", "title", "link", "original")
all_results = []

for ppl in top_search_people:
    params["q"] = ppl  # reuse the shared request template, swapping only the query
    results = GoogleSearch(params).get_dict()

    # A response without "images_results" simply contributes no rows.
    for image in results.get("images_results", []):
        record = {"ppl": ppl}
        for field in image_fields:
            record[field] = image.get(field)  # missing keys become None
        all_results.append(record)

df = pd.DataFrame(all_results)

    
    

In [13]:
# Sanity check: (rows, columns) of the collected image results.
df.shape

(1000, 8)

In [20]:
# Preview the first rows of the collected image results.
df.head()

Unnamed: 0,ppl,position,thumbnail,source,source_logo,title,link,original
0,Donald Trump,1,https://encrypted-tbn0.gstatic.com/images?q=tb...,"Wikipedia, the free encyclopedia",https://encrypted-tbn1.gstatic.com/faviconV2?u...,Donald Trump - Wikipedia,https://en.wikipedia.org/wiki/Donald_Trump,https://upload.wikimedia.org/wikipedia/commons...
1,Donald Trump,2,https://encrypted-tbn0.gstatic.com/images?q=tb...,"Wikipedia, the free encyclopedia",https://encrypted-tbn1.gstatic.com/faviconV2?u...,Donald Trump - Wikipedia,https://en.wikipedia.org/wiki/Donald_Trump,https://upload.wikimedia.org/wikipedia/commons...
2,Donald Trump,3,https://encrypted-tbn0.gstatic.com/images?q=tb...,The White House,https://encrypted-tbn1.gstatic.com/faviconV2?u...,President Donald J. Trump,https://www.whitehouse.gov/administration/dona...,https://www.whitehouse.gov/wp-content/uploads/...
3,Donald Trump,4,https://encrypted-tbn0.gstatic.com/images?q=tb...,History.com,https://encrypted-tbn1.gstatic.com/faviconV2?u...,"Donald Trump: Presidency, Family, Education | ...",https://www.history.com/articles/donald-trump,https://res.cloudinary.com/aenetworks/image/up...
4,Donald Trump,5,https://encrypted-tbn0.gstatic.com/images?q=tb...,Biography,https://encrypted-tbn1.gstatic.com/faviconV2?u...,"Donald Trump: Biography, U.S. President, Busin...",https://www.biography.com/political-figures/do...,https://hips.hearstapps.com/hmg-prod/images/ge...


In [15]:
# Persist the search results; index=False keeps the DataFrame index out of the file.
df.to_csv('top_search_ppl.csv', index=False)

### Sightengine: AI-generated image detection

In [14]:
import requests
import json

In [17]:
# Reload the .env file and read the Sightengine API secret used by
# check_ai_generated. The value is None when "sight_engine" is unset.
load_dotenv()
api_secret = os.environ.get("sight_engine")


In [19]:
def check_ai_generated(url, results_list, timeout=30):
    """Score one image URL with Sightengine's ``genai`` model.

    Exactly one entry is appended to ``results_list`` per call, so the list
    stays aligned with the sequence of URLs being checked.

    Args:
        url: Image URL passed to the Sightengine check endpoint.
        results_list: List mutated in place. Receives the ``type.ai_generated``
            value from the JSON response, or the string ``"error"`` when the
            request or the response parsing fails.
        timeout: Seconds to wait for the HTTP request. Without a timeout a
            single stalled connection would hang the whole scoring loop.

    Returns:
        None. The result is delivered through ``results_list``.
    """
    try:
        params = {
            'url': url,
            'models': 'genai',
            # NOTE(review): the account id is hard-coded; consider loading it
            # from the environment the same way api_secret is.
            'api_user': '909396887',
            'api_secret': api_secret,
        }
        r = requests.get(
            'https://api.sightengine.com/1.0/check.json',
            params=params,
            timeout=timeout,
        )
        score = r.json()['type']['ai_generated']
    except Exception as e:
        # Deliberately best-effort: one bad URL must not abort a long batch.
        # The failure is reported so it is not silently lost.
        print(f"sightengine check failed for {url!r}: {e}")
        score = "error"

    results_list.append(score)


In [22]:
# Thumbnail URLs, in row order, to be scored by check_ai_generated.
urls = df['thumbnail'].tolist()

In [23]:
# Score every thumbnail in order. check_ai_generated appends exactly one
# entry per URL, so the list lines up with the DataFrame rows.
sightengine = list()
for thumbnail_url in urls:
    check_ai_generated(thumbnail_url, sightengine)

df['sightengine'] = sightengine

In [46]:
# Failed checks were stored as the string "error"; coercing to numeric turns
# them into NaN so the column can be compared against a threshold.
sightengine_scores = pd.to_numeric(df['sightengine'], errors='coerce')
df['sightengine'] = sightengine_scores

# Rows with a score above 0.1.
df.loc[df['sightengine'].gt(0.1)]

Unnamed: 0,ppl,position,thumbnail,source,source_logo,title,link,original,sightengine


In [29]:
# Thumbnail URLs of rows with a score above 0.5.
df.loc[df['sightengine'] > 0.5, 'thumbnail'].to_list()

[]

In [36]:
# Persist the Google results together with the sightengine column, without the index.
df.to_csv('google_top_search_ppl.csv', index=False)

### Bing Top people search

In [38]:
# Request template for the SerpApi Bing Images engine. The per-person
# query ("q") is added by the search loop before each request.
# NOTE(review): hl/gl are carried over from the Google request; confirm that
# the bing_images engine honours them.
params = dict(
    engine="bing_images",
    hl="en",
    gl="us",
    api_key=SCRAPERAPI_KEY,
)

In [41]:
# Run one Bing Images search per person and collect one record per hit.
all_results = []

for ppl in top_search_people:
    params["q"] = ppl  # only the query changes between requests
    results = GoogleSearch(params).get_dict()

    # A response without "images_results" simply contributes no rows.
    all_results.extend(
        {
            "ppl": ppl,
            "position": image.get("position"),
            "thumbnail": image.get("thumbnail"),
            "source": image.get("source"),
            "source_logo": image.get("source_logo"),
            "title": image.get("title"),
            "link": image.get("link"),
            "original": image.get("original"),
        }
        for image in results.get("images_results", [])
    )

df2 = pd.DataFrame(all_results)

In [43]:
# Score every Bing thumbnail in row order. check_ai_generated appends exactly
# one entry per URL, so the scores line up with df2's rows.
urls = df2['thumbnail'].tolist()
sightengine = list()
for thumbnail_url in urls:
    check_ai_generated(thumbnail_url, sightengine)

df2['sightengine'] = sightengine


In [48]:
# "error" placeholders become NaN so the column can be compared numerically.
sightengine_scores = pd.to_numeric(df2['sightengine'], errors='coerce')
df2['sightengine'] = sightengine_scores

# Rows with a score above 0.3.
df2.loc[df2['sightengine'].gt(0.3)]

Unnamed: 0,ppl,position,thumbnail,source,source_logo,title,link,original,sightengine
452,Mike Tyson,20,https://ts2.explicit.bing.net/th?id=OIP.eQmvmF...,,,,https://www.bing.com/images/search?view=detail...,https://tattoogunmachine.com/wp-content/upload...,0.99
508,Jill Stein,11,https://tse3.mm.bing.net/th/id/OIP._X5tf5rs9M8...,https://www.deviantart.com/suitan760/art/Jill-...,,Jill Valentine - RE3 Remake by Suitan760 on De...,https://www.bing.com/images/search?view=detail...,https://images-wixmp-ed30a86b8c4ca887773594c2....,0.99


In [49]:
# Persist the Bing results together with the sightengine column, without the index.
df2.to_csv('bing_top_search_ppl.csv', index=False)

In [50]:
# Thumbnail URLs of rows with a score above 0.3.
df2.loc[df2['sightengine'] > 0.3, 'thumbnail'].to_list()

['https://ts2.explicit.bing.net/th?id=OIP.eQmvmF7Bnu_exh1WWXNE1wHaHa&pid=15.1',
 'https://tse3.mm.bing.net/th/id/OIP._X5tf5rs9M8R9o0-6NOXxAHaLZ?w=194&h=299&c=7&r=0&o=7&pid=1.7&rm=3']

### Important people

In [68]:
# Names used as search queries for the "important people" crawls.
# 'Benjamin "Bibi" Netanyahu' is a single entry: a stray comma previously
# split it into two separate queries ('Benjamin "Bibi"' and 'Netanyahu').
important_ppl = [
    'Hillary Clinton',
    'Bill Clinton',
    'George Soros',
    'Alex Soros',
    'Vladimir Putin',
    'Volodymyr Zelenskyy',
    'Benjamin "Bibi" Netanyahu',
    'Mike Johnson',
    'Stephen Miller',
    'Karoline Leavitt',
    'Eric Adams',
    'Zohran Mamdani',
    'Elon Musk',
]

In [None]:
# Search Bing Images (via SerpApi) for each entry in important_ppl and
# collect one record per returned image.
# NOTE(review): hl/gl are carried over from the Google request; confirm that
# the bing_images engine honours them.
params = dict(
    engine="bing_images",
    hl="en",
    gl="us",
    api_key=SCRAPERAPI_KEY,
)
image_fields = ("position", "thumbnail", "source", "source_logo", "title", "link", "original")
all_results = []

for ppl in important_ppl:
    params["q"] = ppl  # only the query changes between requests
    results = GoogleSearch(params).get_dict()

    # A response without "images_results" simply contributes no rows.
    for image in results.get("images_results", []):
        record = {"ppl": ppl}
        for field in image_fields:
            record[field] = image.get(field)  # missing keys become None
        all_results.append(record)

df3 = pd.DataFrame(all_results)

In [56]:
# Score every thumbnail in row order. check_ai_generated appends exactly one
# entry per URL, so the scores line up with df3's rows.
urls = df3['thumbnail'].tolist()
sightengine = list()
for thumbnail_url in urls:
    check_ai_generated(thumbnail_url, sightengine)

df3['sightengine'] = sightengine

In [60]:
# "error" placeholders become NaN so the column can be compared numerically.
sightengine_scores = pd.to_numeric(df3['sightengine'], errors='coerce')
df3['sightengine'] = sightengine_scores

# Rows with a score above 0.4.
df3.loc[df3['sightengine'].gt(0.4)]

Unnamed: 0,ppl,position,thumbnail,source,source_logo,title,link,original,sightengine


In [63]:
# Persist the Bing "important people" results with their sightengine scores, without the index.
df3.to_csv('bing_important_ppl.csv', index=False)

In [70]:
# Search DuckDuckGo (via SerpApi) for each entry in important_ppl and keep
# the images returned under the response's "inline_images" key.
params = dict(
    engine="duckduckgo",
    api_key=SCRAPERAPI_KEY,
)
all_results = []

for ppl in important_ppl:
    params["q"] = ppl  # only the query changes between requests
    results = GoogleSearch(params).get_dict()

    # A response without "inline_images" simply contributes no rows.
    all_results.extend(
        {
            "ppl": ppl,
            "position": image.get("position"),
            "image": image.get("image"),
            "title": image.get("title"),
            "link": image.get("link"),
        }
        for image in results.get("inline_images", [])
    )

df4 = pd.DataFrame(all_results)


In [74]:
# Persist the DuckDuckGo results, without the DataFrame index.
df4.to_csv('duckduckgo_important_ppl.csv', index=False)

# YouTube

In [None]:
import asyncio
from playwright.async_api import async_playwright
import nest_asyncio
nest_asyncio.apply()

async def scrape_thumbnails(search_query, scroll_times=3):
    """Collect thumbnail image URLs from a YouTube search results page.

    Opens the results page in headless Chromium, scrolls to trigger lazy
    loading, then reads the ``src`` of every ``ytd-thumbnail img`` element.

    Args:
        search_query: Free-text query; it is URL-encoded before use.
        scroll_times: Number of mouse-wheel scrolls, each followed by a
            two-second pause.

    Returns:
        list[str]: ``src`` values that contain ``"https://"``, in page order.
    """
    from urllib.parse import quote_plus  # local import keeps this cell self-contained

    thumbnails = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            # quote_plus still turns spaces into '+', and additionally escapes
            # characters such as '"', '&' and '#' that would otherwise
            # corrupt the query string.
            search_url = f"https://www.youtube.com/results?search_query={quote_plus(search_query)}"
            await page.goto(search_url)
            await page.wait_for_selector("ytd-thumbnail img")

            for _ in range(scroll_times):
                await page.mouse.wheel(0, 3000)
                await asyncio.sleep(2)  # give newly revealed results time to load

            elements = await page.query_selector_all("ytd-thumbnail img")
            for el in elements:
                src = await el.get_attribute("src")
                # Skip images with no src or a non-https src.
                if src and "https://" in src:
                    thumbnails.append(src)
        finally:
            # Close the browser even when navigation or the selector wait fails.
            await browser.close()

    return thumbnails


In [99]:
all_results = []
for ppl in top_search_people:
    print(f"Scraping thumbnails for {ppl}")
    time.sleep(5) 
    thumbnails = await scrape_thumbnails(ppl, scroll_times=20)
    for thumbnail in thumbnails:
        all_results.append({
            "ppl": ppl,
            "thumbnail": thumbnail
        })

df7 = pd.DataFrame(all_results)

Scraping thumbnails for Donald Trump
Scraping thumbnails for Kamala Harris
Scraping thumbnails for JD Vance
Scraping thumbnails for Joe Biden
Scraping thumbnails for Catherine, Princess of Wales
Scraping thumbnails for Tim Walz
Scraping thumbnails for Mike Tyson
Scraping thumbnails for Jill Stein
Scraping thumbnails for Usher
Scraping thumbnails for Imane Khelif


In [101]:
# Persist the YouTube thumbnail records, without the DataFrame index.
df7.to_csv('youtube_top_search_ppl.csv', index=False)

# Rumble

In [None]:
async def scrape_rumble_thumbnails(search_query, scroll_times=3):
    """Collect thumbnail image URLs from a Rumble video search results page.

    Opens the results page in headless Chromium, scrolls to load more
    results, then reads the ``src`` of every ``.video-item--a img`` element.

    Args:
        search_query: Free-text query; it is URL-encoded before use.
        scroll_times: Number of mouse-wheel scrolls, each followed by a
            two-second pause.

    Returns:
        list[str]: ``src`` values that contain ``"https://"`` and do not end
        with ``"placeholder.jpg"``, in page order.
    """
    from urllib.parse import quote_plus  # local import keeps this cell self-contained

    thumbnails = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            # Search URL for Rumble. quote_plus still turns spaces into '+',
            # and additionally escapes characters such as '"', '&' and '#'
            # that would otherwise corrupt the query string.
            search_url = f"https://rumble.com/search/video?q={quote_plus(search_query)}"
            await page.goto(search_url)
            await page.wait_for_selector(".video-item--a")

            # Scroll to load more
            for _ in range(scroll_times):
                await page.mouse.wheel(0, 3000)
                await asyncio.sleep(2)

            # Get thumbnail <img> elements
            elements = await page.query_selector_all(".video-item--a img")
            for el in elements:
                src = await el.get_attribute("src")
                # Skip images with no src, a non-https src, or the placeholder image.
                if src and "https://" in src and not src.endswith("placeholder.jpg"):
                    thumbnails.append(src)
        finally:
            # Close the browser even when navigation or the selector wait fails.
            await browser.close()

    return thumbnails

Unnamed: 0,Person,Image_URL,sightengine
6,Donald Trump,https://i.ytimg.com/vi/Ytsnss5oJns/maxresdefau...,0.99
32,Donald Trump,https://i.ytimg.com/vi/PExqrkMVmek/maxres2.jpg...,0.99
42,Donald Trump,https://i.ytimg.com/vi/q6OAzlc-9VM/maxres2.jpg...,0.99


In [None]:
all_results = []
for ppl in top_search_people:
    print(f"Scraping thumbnails for {ppl}")
    time.sleep(5) 
    thumbnails = await scrape_rumble_thumbnails(search_term, scroll_times=20)
    for thumbnail in thumbnails:
        all_results.append({
            "ppl": ppl,
            "thumbnail": thumbnail
        })
df = pd.DataFrame(all_results)

In [None]:
# Persist the Rumble thumbnail records. index=False matches every other
# export in this notebook and avoids a stray "Unnamed: 0" column when the
# file is read back.
df.to_csv('rumble_top_ppl_search.csv', index=False)

In [102]:
# Load 'Arts_Entertainmenet.csv' and show its (rows, columns) shape.
# NOTE(review): the filename is spelled 'Entertainmenet' — confirm it matches the file on disk.
arts = pd.read_csv('Arts_Entertainmenet.csv')
arts.shape

(1578, 2)

In [104]:
# Load 'Auto.csv' and show its (rows, columns) shape.
auto = pd.read_csv('Auto.csv')
auto.shape

(1271, 2)

In [103]:
# Load 'Beauty.csv' and show its (rows, columns) shape.
Beauty = pd.read_csv('Beauty.csv')
Beauty.shape

(1595, 2)

In [106]:
# Load 'Book.csv' and show its (rows, columns) shape.
Books = pd.read_csv('Book.csv')
Books.shape

(1481, 2)

In [107]:
# Load 'Computer.csv' and show its (rows, columns) shape.
Computers = pd.read_csv('Computer.csv')
Computers.shape

(1222, 2)

In [108]:
# Load 'Finance.csv' and show its (rows, columns) shape.
Finance = pd.read_csv('Finance.csv')
Finance.shape

(1027, 2)