In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from skimage.metrics import structural_similarity as ssim

import cv2
import numpy as np
import requests

In [None]:
# Start page handed to the crawler.
URL = 'https://pixabay.com/images/search/portrait%20woman%20people/'
# Reference image; despite the name it is read from disk with cv2.imread,
# so this is a local file path rather than a remote URL.
IMAGE_REFERENCE_URL = 'photo6.jpg'
# Passed as ``max_depth`` to the recursive crawl; 0 analyses only the start page.
DEPTH_SCRAPPING = 0
# SSIM score an image must exceed to be reported as a match.
ACCURACY = 0.8

In [None]:
def go_to_url(url):
    """Load ``url`` in the shared browser session.

    Operates on the module-level ``driver``. After requesting the page it
    sets the session's implicit wait to 10 seconds.

    Args:
        url: Address to navigate to.
    """
    browser = driver
    browser.get(url)
    browser.implicitly_wait(10)

In [None]:
def get_images_urls(url):
    """Collect the distinct image URLs of the page currently loaded in ``driver``.

    Reads the ``src`` attribute of every ``<img>`` element and keeps the
    values ending in ``.png``, ``.jpeg``, ``.jpg`` or ``.webp``.

    Args:
        url: Address of the page being inspected. It is only used in the
            summary message; this function does not navigate to it.

    Returns:
        list[str]: Unique image URLs in order of first appearance.

    Raises:
        Exception: Anything raised by ``driver.find_elements`` propagates
            to the caller after the summary line has been printed.
    """
    image_extensions = ('.png', '.jpeg', '.jpg', '.webp')
    # Bound before the ``try`` so the ``finally`` summary can never hit an
    # unbound name, which would mask the real Selenium error.
    unique_image_urls = []
    try:
        seen = set()
        for img in driver.find_elements(By.TAG_NAME, 'img'):
            try:
                src = img.get_attribute('src')
            except Exception as e:
                # Best effort: one unreadable element must not abort the scan.
                print(f"Error obteniendo datos de una imagen: {type(e).__name__}")
                continue
            # Order-preserving de-duplication keeps the result deterministic.
            if src and src.endswith(image_extensions) and src not in seen:
                seen.add(src)
                unique_image_urls.append(src)
        return unique_image_urls
    finally:
        print(f"Se obtuvieron {len(unique_image_urls)} imágenes de: {url}")

In [None]:
def get_anchor_urls(url):
    """Collect the link targets of the page currently loaded in ``driver``.

    Reads the ``href`` attribute of every ``<a>`` element and keeps the
    non-empty values. Duplicates are kept.

    Args:
        url: Address of the page being inspected. It is only used in the
            summary message; this function does not navigate to it.

    Returns:
        list[str]: ``href`` values in document order.

    Raises:
        Exception: Anything raised by ``driver.find_elements`` propagates
            to the caller after the summary line has been printed.
    """
    # Bound before the ``try`` so the ``finally`` summary can never hit an
    # unbound name, which would mask the real Selenium error.
    anchor_urls = []
    try:
        for anchor in driver.find_elements(By.TAG_NAME, 'a'):
            try:
                href = anchor.get_attribute('href')
            except Exception as e:
                # Best effort: one unreadable element must not abort the scan.
                print(f"Error obteniendo datos de un anchor: {type(e).__name__}")
                continue
            if href:
                anchor_urls.append(href)
        return anchor_urls
    finally:
        print(f"Se obtuvieron {len(anchor_urls)} anclas de: {url}")

In [None]:
def download_and_format_image(url, timeout=10):
    """Download an image and decode it with OpenCV.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so
            that an unresponsive server cannot block the crawl forever.

    Returns:
        The array produced by ``cv2.imdecode(..., cv2.IMREAD_COLOR)``.

    Raises:
        Exception: If the response status is not 200, the body is empty, or
            OpenCV cannot decode the payload.
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` (connection failure, timeout, ...).
    """
    # The body is consumed in full right away, so streaming is not requested.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"No se pudo descargar la imagen desde la URL: {url}")

    payload = response.content
    if not payload:
        raise Exception(f"La respuesta no contiene datos de imagen: {url}")

    image = cv2.imdecode(np.frombuffer(payload, dtype=np.uint8), cv2.IMREAD_COLOR)
    # imdecode signals failure by returning None instead of raising; turn that
    # into an explicit error here so callers never receive a None "image".
    if image is None:
        raise Exception(f"No se pudo decodificar la imagen desde la URL: {url}")
    return image

In [None]:
def compare_images(reference_image_url, image_url):
    """Compute the SSIM score between a local reference image and a remote one.

    Both images are resized to 300x300 and converted to grayscale before
    being compared.

    Args:
        reference_image_url: Path of the reference image; it is read from
            disk with ``cv2.imread``.
        image_url: Address of the image to download and compare.

    Returns:
        The value returned by ``structural_similarity`` for the two
        grayscale images.

    Raises:
        Exception: If the reference image cannot be loaded, or the
            downloaded image could not be decoded. Errors raised by the
            download itself also propagate.
    """
    target_size = (300, 300)

    image_local = cv2.imread(reference_image_url)
    if image_local is None:
        raise Exception(f"No se pudo cargar la imagen local: {reference_image_url}")

    # Kept in its own name so the ``image_url`` parameter is not rebound to
    # an image array.
    image_remote = download_and_format_image(image_url)
    if image_remote is None:
        # Fail with a clear message rather than an opaque cv2.resize error.
        raise Exception(f"No se pudo decodificar la imagen descargada: {image_url}")

    gray_local = cv2.cvtColor(cv2.resize(image_local, target_size), cv2.COLOR_BGR2GRAY)
    gray_remote = cv2.cvtColor(cv2.resize(image_remote, target_size), cv2.COLOR_BGR2GRAY)

    # Only the score is needed, so the full SSIM map is not requested.
    return ssim(gray_local, gray_remote)


In [None]:
def scrap_and_analyze(url):
    """Analyse the images of a single page against the reference image.

    Navigates the shared ``driver`` to ``url``, collects its image URLs,
    closes the browser, and then prints every image whose SSIM score
    against ``IMAGE_REFERENCE_URL`` exceeds ``ACCURACY``.

    Note:
        This function quits the module-level ``driver`` (even when
        navigation fails), so the browser session cannot be reused
        afterwards.

    Args:
        url: Address of the page to analyse.
    """
    try:
        go_to_url(url)
        images = get_images_urls(url)
    finally:
        # The browser is only needed to collect URLs; the comparisons below
        # download the images over plain HTTP.
        driver.quit()

    for image_url in images:
        try:
            similarity_score = compare_images(IMAGE_REFERENCE_URL, image_url)
        except Exception as e:
            # Best effort: report why this image failed and keep going.
            print(f"Error al comparar imagenes ({image_url}): {e}")
            continue
        if similarity_score > ACCURACY:
            print(f"Coincidencia encontrada! : {image_url}")

In [None]:
def scrape_and_analyze_recursive(url, depth=0, max_depth=2):
    """Analyse a page and, depth-first, the pages it links to.

    For every page visited, each image is compared with
    ``IMAGE_REFERENCE_URL`` and reported when its SSIM score exceeds
    ``ACCURACY``. Links are followed until ``max_depth`` is reached; the
    module-level ``visited_urls`` set prevents following a link twice.

    Args:
        url: Address of the page to analyse.
        depth: Depth of ``url`` relative to the start page.
        max_depth: Deepest level that is still analysed.
    """
    if depth > max_depth:
        return

    print(f"Accediendo a {url} (nivel {depth})")
    go_to_url(url)
    images = get_images_urls(url)
    # Collected before recursing: following a link navigates the shared
    # driver away from this page.
    anchors = get_anchor_urls(url)

    for image_url in images:
        try:
            similarity_score = compare_images(IMAGE_REFERENCE_URL, image_url)
        except Exception as e:
            # Best effort: report why this image failed and keep going.
            print(f"Error comparando imagen ({image_url}): {e}")
            continue
        if similarity_score > ACCURACY:
            print(f"Coincidencia encontrada en {image_url} con puntuación SSIM: {similarity_score}")

    # Stop before touching ``visited_urls`` when the children would be too
    # deep. Marking them as visited here would stop them from ever being
    # scraped when they are also reachable at an allowed depth.
    if depth >= max_depth:
        return

    # Recurse into the links that have not been followed yet.
    for anchor_url in anchors:
        if anchor_url not in visited_urls:
            visited_urls.add(anchor_url)
            scrape_and_analyze_recursive(anchor_url, depth + 1, max_depth)

In [None]:
# Browser session shared, as the global ``driver``, by the helper functions.
driver = webdriver.Chrome()
# Seeded with the start page so links pointing back to it are not followed.
visited_urls = {URL}

try:
    scrape_and_analyze_recursive(URL, depth=0, max_depth=DEPTH_SCRAPPING)
except Exception as e:
    # Report the reason for the failure instead of discarding it.
    print(f"Error durante el scraping: {e}")
finally:
    # Always release the browser, whether or not the crawl succeeded.
    driver.quit()