# Descomposición del DOM

In [1]:
from bs4 import BeautifulSoup
from collections import Counter
from pathlib import Path

## Definiciones y utilidades estructurales

In [2]:
# Deepest nesting level (the walk starts at depth 0) that still gets a fingerprint.
MAX_DEPTH = 12

# Element names that never receive a structural fingerprint.
IGNORED_TAGS = {
    "script",
    "style",
    "link",
    "meta",
    "svg",
    "noscript",
}

def normalize_classes(tag):
    """Return the element's CSS classes as a sorted tuple.

    Args:
        tag: Any object exposing ``get(key)`` for attribute lookup
            (a BeautifulSoup ``Tag`` in this notebook).

    Returns:
        tuple[str, ...]: Class names in sorted order; ``()`` when the
        element has no ``class`` attribute or it is empty.
    """
    raw = tag.get("class")
    if not raw:
        return ()
    # Some parser configurations expose ``class`` as one whitespace-separated
    # string instead of a list; sorting that string directly would split it
    # into single characters.
    if isinstance(raw, str):
        raw = raw.split()
    return tuple(sorted(raw))


def structural_fingerprint(tag, depth):
    """Build the ``(name, classes, depth)`` key that identifies an element.

    Args:
        tag: Element to describe; must expose ``name`` and ``get``.
        depth: Nesting level at which the element was found.

    Returns:
        A ``(tag name, sorted class tuple, depth)`` tuple, or ``None`` when
        the element is deeper than ``MAX_DEPTH``, has no name, or its name
        is listed in ``IGNORED_TAGS``.
    """
    # Short-circuit order matters: the depth test runs before ``tag`` is touched.
    skipped = depth > MAX_DEPTH or not tag.name or tag.name in IGNORED_TAGS
    if skipped:
        return None
    return tag.name, normalize_classes(tag), depth


def children_fingerprints(tag, depth):
    """Count the structural fingerprints of an element's direct children.

    Args:
        tag: Parent element; its ``children`` are iterated.
        depth: Depth of ``tag`` itself; children are fingerprinted at
            ``depth + 1``.

    Returns:
        Counter: fingerprint -> number of direct children sharing it.
        Children without a name and children whose fingerprint is ``None``
        are not counted.
    """
    child_depth = depth + 1
    tally = Counter()
    for node in tag.children:
        # Text-like nodes have no usable ``name``; skip them.
        if not getattr(node, "name", None):
            continue
        signature = structural_fingerprint(node, child_depth)
        if signature:
            tally[signature] += 1
    return tally


In [3]:
def extract_structure(soup):
    """Map every structural fingerprint under ``<body>`` to its child profiles.

    Args:
        soup: Parsed document exposing a ``body`` attribute.

    Returns:
        dict: fingerprint -> list of ``Counter`` objects, one per element
        carrying that fingerprint, in traversal order. Each counter is the
        result of ``children_fingerprints`` for that element. The mapping is
        empty when the document has no ``<body>``.
    """
    structure = {}

    root = soup.body
    if root is None:
        # No <body> (e.g. an empty or fragment-only file): nothing to walk.
        return structure

    def walk(tag, depth=0):
        fp = structural_fingerprint(tag, depth)
        if not fp:
            # Elements without a fingerprint are skipped together with
            # their whole subtree.
            return

        structure.setdefault(fp, []).append(
            children_fingerprints(tag, depth)
        )

        for child in tag.children:
            if getattr(child, "name", None):
                walk(child, depth + 1)

    walk(root)
    return structure

In [4]:
def compare_structures(struct_a, struct_b):
    """List fingerprints present in both structures whose child profiles differ.

    Args:
        struct_a: Mapping fingerprint -> list of child-profile counters.
        struct_b: Mapping of the same shape for the second document.

    Returns:
        list: Fingerprints found in both mappings whose associated lists are
        not equal, in the insertion order of ``struct_a``. Fingerprints that
        appear in only one mapping are not reported.
    """
    # Iterating ``struct_a`` directly (instead of a set intersection) keeps the
    # result order stable across kernel restarts; set order over tuples that
    # contain strings depends on per-process hash randomisation.
    return [
        key
        for key, profiles in struct_a.items()
        if key in struct_b and profiles != struct_b[key]
    ]

In [5]:
def load_html(path):
    """Read ``path`` as UTF-8 text and parse it with the ``html.parser`` backend.

    Args:
        path: Location of the HTML file (anything ``Path`` accepts).

    Returns:
        BeautifulSoup: The parsed document.
    """
    markup = Path(path).read_text(encoding="utf-8")
    return BeautifulSoup(markup, "html.parser")

## Carga del HTML y extracción estructural

In [None]:
# Parse the two saved copies of the page.
# NOTE(review): file names suggest one capture per browser — confirm provenance.
soup1 = load_html("data/raw/chrome.html")
soup2 = load_html("data/raw/firefox.html")

# Build the fingerprint -> child-profile mapping for each capture.
struct1 = extract_structure(soup1)
struct2 = extract_structure(soup2)

Tag: div, Classes: ('MjjYud',), Depth: 9
Tag: div, Classes: ('A6K0A',), Depth: 10
Tag: div, Classes: (), Depth: 8
Tag: div, Classes: (), Depth: 10
Tag: div, Classes: ('e9EfHf',), Depth: 2
Tag: div, Classes: (), Depth: 3
Tag: div, Classes: ('dURPMd',), Depth: 8
Tag: div, Classes: (), Depth: 12
Tag: span, Classes: (), Depth: 6
Tag: span, Classes: ('oUAcPd',), Depth: 10
Tag: div, Classes: ('Tg0csd',), Depth: 3
Tag: div, Classes: (), Depth: 9
Tag: div, Classes: (), Depth: 11
Tag: div, Classes: ('Fgyi2e', 'caNvfd', 'rZj61'), Depth: 3
Tag: div, Classes: (), Depth: 4
Tag: div, Classes: (), Depth: 6


## Identificación de nodos dinámicos

In [None]:
# Fingerprints present in both captures whose child profiles differ.
dynamic_nodes = compare_structures(struct1, struct2)

# Each fingerprint is a (tag name, class tuple, depth) triple.
for tag, classes, depth in dynamic_nodes:
    print(f"Tag: {tag}, Classes: {classes}, Depth: {depth}")

## Filtrado semántico de candidatos

In [7]:
def is_relevant_candidate(
    tag,
    classes,
    depth,
    min_depth=2,
    max_depth=9,
    allowed_tags=frozenset({"div", "section", "article"}),
):
    """Decide whether a fingerprint is worth inspecting as a dynamic zone.

    Args:
        tag: Element name from the fingerprint.
        classes: Class tuple from the fingerprint.
        depth: Depth from the fingerprint.
        min_depth: Shallowest accepted depth (inclusive). Defaults to 2.
        max_depth: Deepest accepted depth (inclusive). Defaults to 9.
        allowed_tags: Container of accepted element names. Defaults to
            ``div``, ``section`` and ``article``.

    Returns:
        bool: ``True`` only when the element has at least one class, its
        name is in ``allowed_tags`` and ``min_depth <= depth <= max_depth``.
    """
    if not classes:
        return False
    if tag not in allowed_tags:
        return False
    return min_depth <= depth <= max_depth

In [8]:
print("Zonas dinámicas candidatas:\n")

# Show only the differing fingerprints that pass the tag/class/depth filter.
for tag, classes, depth in dynamic_nodes:
    if is_relevant_candidate(tag, classes, depth):
        print(f"Tag: {tag}, Classes: {classes}, Depth: {depth}")

Zonas dinámicas candidatas:

Tag: div, Classes: ('MjjYud',), Depth: 9
Tag: div, Classes: ('e9EfHf',), Depth: 2
Tag: div, Classes: ('dURPMd',), Depth: 8
Tag: div, Classes: ('Tg0csd',), Depth: 3
Tag: div, Classes: ('Fgyi2e', 'caNvfd', 'rZj61'), Depth: 3


## Análisis de patrones de hijos

In [9]:
def item_fingerprint(tag):
    """Lightweight key for spotting repeated items inside a container.

    Unlike the structural fingerprint, depth is not part of the key.

    Args:
        tag: Element exposing ``name`` and ``get``.

    Returns:
        tuple: ``(tag name, sorted class tuple)``.
    """
    name = tag.name
    class_names = list(tag.get("class", []))
    class_names.sort()
    return name, tuple(class_names)


In [10]:
def extract_child_patterns(tag):
    """Count the item fingerprints of an element's direct children.

    Args:
        tag: Container element; its ``children`` are iterated.

    Returns:
        Counter: ``(tag name, class tuple)`` -> number of direct children
        with that key. Nameless nodes and elements listed in
        ``IGNORED_TAGS`` are left out.
    """
    return Counter(
        item_fingerprint(node)
        for node in tag.children
        if getattr(node, "name", None) and node.name not in IGNORED_TAGS
    )


In [11]:
def is_feed_container(child_counter, min_items=3, dominance_ratio=0.6):
    """Tell whether one child pattern dominates a container's children.

    Args:
        child_counter: Mapping of child pattern -> occurrence count.
        min_items: Minimum total number of children required to consider
            the container a feed.
        dominance_ratio: Minimum share of the children that the most
            frequent pattern must represent.

    Returns:
        bool: ``True`` when there are at least ``min_items`` children and
        the most frequent pattern accounts for at least ``dominance_ratio``
        of them; ``False`` otherwise, including for an empty counter.
    """
    total = sum(child_counter.values())

    # An empty counter has no dominant pattern; the explicit zero check keeps
    # ``min_items <= 0`` from reaching the division below.
    if total <= 0 or total < min_items:
        return False

    dominant_count = max(child_counter.values())

    return (dominant_count / total) >= dominance_ratio


In [None]:
def find_real_nodes(soup, target_fp):
    """Collect the elements under ``<body>`` that match a structural fingerprint.

    The traversal follows the same rules used when fingerprints are
    extracted: it starts at ``soup.body`` with depth 0, never goes past
    ``MAX_DEPTH`` and does not enter elements listed in ``IGNORED_TAGS``.

    Args:
        soup: Parsed document exposing a ``body`` attribute.
        target_fp: ``(tag name, sorted class tuple, depth)`` triple.

    Returns:
        list: Matching elements in document order; empty when nothing
        matches or the document has no ``<body>``.
    """
    tag_name, classes, depth = target_fp
    results = []

    root = soup.body
    if root is None:
        # No <body>: nothing to search.
        return results

    # Matches can only occur at ``depth``, so there is no point in walking
    # any deeper than that (or deeper than the global limit).
    deepest = min(depth, MAX_DEPTH)

    def walk(tag, current_depth=0):
        if current_depth > deepest:
            return
        # Ignored elements never get a fingerprint, and neither does anything
        # nested inside them, so they cannot be the element being looked for.
        if tag.name in IGNORED_TAGS:
            return

        if current_depth == depth:
            if tag.name == tag_name and normalize_classes(tag) == classes:
                results.append(tag)
            # Descendants sit below the target depth and cannot match.
            return

        for child in tag.children:
            if getattr(child, "name", None):
                walk(child, current_depth + 1)

    walk(root)
    return results

## Emisión de selectores CSS estructurales

In [26]:
def _css_escape_identifier(ident):
    """Escape ``ident`` so it is a valid CSS identifier (CSSOM serialisation rules)."""
    pieces = []
    for index, char in enumerate(ident):
        code = ord(char)
        is_ascii_digit = "0" <= char <= "9"
        if code == 0:
            pieces.append("\ufffd")
        elif code < 0x20 or code == 0x7F:
            # Control characters are written as a hex escape.
            pieces.append(f"\\{code:x} ")
        elif is_ascii_digit and (index == 0 or (index == 1 and ident[0] == "-")):
            # An identifier may not start with a digit (or "-" + digit).
            pieces.append(f"\\{code:x} ")
        elif char == "-" and len(ident) == 1:
            pieces.append("\\-")
        elif (
            code >= 0x80
            or char in "-_"
            or is_ascii_digit
            or "a" <= char <= "z"
            or "A" <= char <= "Z"
        ):
            pieces.append(char)
        else:
            # Any other ASCII character (":", "/", ".", "[", ...) needs a backslash.
            pieces.append("\\" + char)
    return "".join(pieces)


def build_css_selector(tag, classes):
    """Build a ``tag.class1.class2`` CSS selector.

    Args:
        tag: Element name, used verbatim.
        classes: Iterable of class names; may be empty.

    Returns:
        str: ``tag`` alone when there are no classes, otherwise ``tag``
        followed by one ``.class`` segment per class. Class names are
        escaped so that characters with a meaning in CSS (for example
        ``:`` or ``/``) or a leading digit still yield a valid selector;
        plain alphanumeric names are emitted unchanged.
    """
    if classes:
        return tag + "." + ".".join(_css_escape_identifier(name) for name in classes)
    return tag


In [27]:
print("Feeds dinámicos detectados:\n")

# Accumulates one {"container", "item_selector"} dict per detected feed.
feeds = []

for fp in dynamic_nodes:
    tag, classes, depth = fp

    # Final semantic filters: same class/depth limits as is_relevant_candidate,
    # but without its tag-name restriction.
    if not classes or depth < 2 or depth > 9:
        continue

    # Look up the real node in one of the two sessions (the first capture).
    nodes = find_real_nodes(soup1, fp)
    if not nodes:
        continue

    # Only the first matching element is inspected.
    node = nodes[0]

    child_patterns = extract_child_patterns(node)

    
    if is_feed_container(child_patterns):
        # The most frequent child pattern becomes the item selector.
        (item_tag, item_classes), _ = child_patterns.most_common(1)[0]

        feed = {
            "container": build_css_selector(tag, classes),
            "item_selector": "> " + build_css_selector(item_tag, item_classes)
        }

        feeds.append(feed)
# Last expression of the cell: displays the collected feeds.
feeds

Feeds dinámicos detectados:



[{'container': 'div.dURPMd', 'item_selector': '> div.MjjYud'}]

# Motor de Inferencia

In [None]:
from dotenv import load_dotenv
import google.generativeai as genai
import zipfile
import os
import re

In [None]:
# Load environment variables via python-dotenv (presumably from a local .env file).
load_dotenv()

# os.getenv returns None when GEMINI_API_KEY is unset; no check is made here.
api_key = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=api_key)
# Model handle; not used within this section — presumably consumed by later cells.
model = genai.GenerativeModel("gemini-2.5-flash")