In [None]:
# Notebook setup cell: install a pinned ipywidgets release, then turn on
# Colab's custom widget manager. The google.colab import is unguarded here,
# so this cell presumably only runs on Colab -- TODO confirm.
!pip -q install ipywidgets==8.1.2
from google.colab import output
output.enable_custom_widget_manager()

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.4/139.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# -*- coding: utf-8 -*-
# Scraper creokitchens.it — UI a checkbox + log essenziali
# - Selezioni le cucine (solo URL /it/cucine/<slug>/)
# - Imposti "Immagini/cucina"
# - Scarica descrizione + N immagini full-size (href degli <a.gb-item-link> in .gb-media-wrapper)
# - Pulisce la cartella di output all'inizio
# - Log ridotti all'essenziale

import os, re, time, csv, io, zipfile, shutil, traceback
from urllib.parse import urljoin, urlparse
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup

# Enable the custom widget manager (Colab).
# Best-effort: any exception raised by the import or by the call is ignored,
# so the rest of the script still runs when this step is unavailable.
try:
    from google.colab import output as colab_output
    colab_output.enable_custom_widget_manager()
except Exception:
    pass

import ipywidgets as widgets
from IPython.display import display

# Listing page that links to the kitchen detail pages, and the site origin
# whose host is used to validate candidate detail URLs.
BASE_URL   = "https://www.creokitchens.it/it/cucine"
SITE_ROOT  = "https://www.creokitchens.it"
# Working directory (removed and recreated at the start of every download run)
# and the ZIP archive built from it.
OUTPUT_DIR = "/content/creo_cucine"
ZIP_PATH   = "/content/creo_cucine.zip"

# Sent as the User-Agent header on every request made through the shared session.
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122 Safari/537.36 (+personal non-commercial scraping)"
# Passed as `timeout=` to requests; a 2-tuple means (connect, read) seconds.
REQUEST_TIMEOUT = (10, 25)
# Pause in seconds after each kitchen page has been processed.
SLEEP_BETWEEN_REQUESTS = 0.3
# Upper bound on a single image: 40 MiB. Larger images are rejected.
MAX_IMAGE_BYTES = 40 * 1024 * 1024
# Per-kitchen time budget in seconds; once exceeded, the remaining image
# downloads for that kitchen are skipped.
GLOBAL_PER_PAGE_TIMEOUT = 120

def build_session():
    """Create the HTTP session used by the scraper.

    The session sends USER_AGENT on every request and mounts, for both
    http:// and https://, one adapter with a pool size of 10 and a retry
    policy: up to 4 retries (total/connect/read), backoff factor 0.5,
    retrying GET and HEAD on status 429/500/502/503/504.

    Returns:
        The configured ``requests.Session``.
    """
    retry_policy = Retry(
        total=4,
        connect=4,
        read=4,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(["GET", "HEAD"]),
    )
    pooled_adapter = HTTPAdapter(
        max_retries=retry_policy,
        pool_connections=10,
        pool_maxsize=10,
    )

    http = requests.Session()
    http.headers.update({"User-Agent": USER_AGENT})
    # The same adapter instance serves both schemes.
    for scheme in ("http://", "https://"):
        http.mount(scheme, pooled_adapter)
    return http

# Module-level session shared by get_soup, head_ok_image and download_image.
session = build_session()

def slugify(text, maxlen=80):
    """Turn free text into a name that is safe to use as a single path component.

    Steps: collapse whitespace runs to one space, replace ``/`` and ``\\`` with
    ``-``, drop every character that is not an ASCII/Latin-1 letter, digit,
    space, ``_``, ``-``, ``.``, ``(`` or ``)``, truncate to ``maxlen``
    characters, then trim again (truncation can leave a trailing space).

    Args:
        text: Source text; ``None`` is treated as an empty string.
        maxlen: Maximum length of the result before the final trim.

    Returns:
        The cleaned name, or ``"senza_nome"`` when nothing usable is left.
        A result made only of dots and spaces (``"."``, ``".."``) also counts
        as unusable: joined onto a directory it would point at that directory
        or its parent instead of a new sub-folder.
    """
    cleaned = re.sub(r"\s+", " ", text or "").strip()
    cleaned = cleaned.replace("/", "-").replace("\\", "-")
    cleaned = re.sub(r"[^0-9A-Za-zÀ-ÖØ-öø-ÿ _\-\.\(\)]", "", cleaned)
    cleaned = cleaned[:maxlen]
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    # Reject empty names and dot-only names such as "." / "..".
    if not cleaned.strip(". "):
        return "senza_nome"
    return cleaned

def get_soup(url):
    """Download ``url`` with the shared session and parse it as HTML.

    Args:
        url: Page address to fetch.

    Returns:
        A ``BeautifulSoup`` tree (``html.parser``) when the response status is
        exactly 200; ``None`` for any other status or when requests raises a
        ``RequestException``.
    """
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        page_ok = response.status_code == 200
        return BeautifulSoup(response.text, "html.parser") if page_ok else None
    except requests.RequestException:
        return None

def head_ok_image(url):
    """Check with a HEAD request whether ``url`` looks like a downloadable image.

    The URL is accepted when the response (redirects followed) has a status
    below 400, a Content-Type containing ``image``, and no numeric
    Content-Length above MAX_IMAGE_BYTES. A missing or non-numeric
    Content-Length does not cause rejection.

    Args:
        url: Image address to probe.

    Returns:
        ``True`` when all checks pass; ``False`` otherwise, including when
        requests raises a ``RequestException``.
    """
    try:
        probe = session.head(url, timeout=REQUEST_TIMEOUT, allow_redirects=True)
        if probe.status_code >= 400:
            return False

        content_type = (probe.headers.get("Content-Type") or "").lower()
        if "image" not in content_type:
            return False

        declared = probe.headers.get("Content-Length")
        too_big = (
            bool(declared)
            and declared.isdigit()
            and int(declared) > MAX_IMAGE_BYTES
        )
        return not too_big
    except requests.RequestException:
        return False

def infer_ext(url):
    """Guess the file extension to use when saving ``url``.

    Only the path component of the URL is inspected. The extension, including
    its leading dot, is returned as-is when it exists and is at most five
    characters long; otherwise ``".jpg"`` is used.

    Args:
        url: Address of the resource.

    Returns:
        An extension string such as ``".png"``, defaulting to ``".jpg"``.
    """
    _, suffix = os.path.splitext(urlparse(url).path)
    if not suffix or len(suffix) > 5:
        return ".jpg"
    # Drop anything after a "?" should one ever be present in the suffix.
    return suffix.partition("?")[0]

def download_image(url, dest_path):
    """Stream the image at ``url`` into ``dest_path``.

    The URL is first vetted with ``head_ok_image``. The body is then streamed
    in 64 KiB chunks and the transfer is abandoned as soon as more than
    MAX_IMAGE_BYTES have been received (the HEAD Content-Length may be absent
    or wrong, so the limit is enforced again while reading).

    If the download does not complete, whether through the size limit, an
    HTTP/network error or any other exception, the partially written file is
    removed so that truncated images never end up in the output folder.

    Args:
        url: Address of the image.
        dest_path: File to write; its parent directory is created if needed.

    Returns:
        ``True`` when the whole image was saved, ``False`` when it was
        rejected or a ``requests.RequestException`` occurred.

    Raises:
        OSError: If the destination cannot be created or written (not caught
            here, as in the original behaviour).
    """
    if not head_ok_image(url):
        return False

    file_created = False  # True once dest_path has been opened for writing
    completed = False
    try:
        with session.get(url, stream=True, timeout=REQUEST_TIMEOUT) as r:
            r.raise_for_status()
            parent = os.path.dirname(dest_path)
            if parent:  # os.makedirs("") raises for a bare filename
                os.makedirs(parent, exist_ok=True)
            received = 0
            with open(dest_path, "wb") as f:
                file_created = True
                for chunk in r.iter_content(chunk_size=64 * 1024):
                    if not chunk:
                        continue
                    received += len(chunk)
                    # Check before writing so the file never exceeds the cap.
                    if received > MAX_IMAGE_BYTES:
                        return False
                    f.write(chunk)
        completed = True
        return True
    except requests.RequestException:
        return False
    finally:
        # Runs after the file has been closed by the inner ``with``.
        if file_created and not completed:
            try:
                os.remove(dest_path)
            except OSError:
                # Best-effort cleanup; the caller already gets the failure.
                pass

# ---------- UI ----------
# Heading, status line and log area. The heading and the log output are
# displayed immediately, before the listing page is fetched -- presumably so
# that early error messages written through log() are visible.
title = widgets.HTML("<h3>Seleziona le cucine da scaricare</h3>")
status = widgets.HTML("")
out = widgets.Output()
display(title, out)

def log(s):
    """Append ``s`` (converted with ``str``) plus a newline to the log widget."""
    line = str(s) + "\n"
    out.append_stdout(line)

# Fetch the page that lists all kitchens. get_soup returns None on any
# failure; in that case the error is logged and execution stops here.
listing = get_soup(BASE_URL)
if not listing:
    log("[ERRORE] Impossibile caricare la pagina elenco.")
    raise SystemExit

# Filter: only /it/cucine/<slug>/
def is_kitchen_detail_url(href_abs: str) -> bool:
    """Tell whether ``href_abs`` is a kitchen detail page of the form /it/cucine/<slug>/.

    A URL qualifies when it is on the same host as SITE_ROOT, carries no query
    string or fragment, its path starts with ``/it/cucine/`` and consists of
    exactly three non-empty segments, and the last segment contains no dot
    (which rules out file-like names).

    Args:
        href_abs: Absolute URL to test.

    Returns:
        ``True`` for a detail-page URL, ``False`` otherwise or if any
        exception is raised while examining the URL.
    """
    try:
        parsed = urlparse(href_abs)
        same_host = parsed.netloc == urlparse(SITE_ROOT).netloc
        if not same_host or parsed.query or parsed.fragment:
            return False
        if not parsed.path.startswith("/it/cucine/"):
            return False
        # Expected segments: ["it", "cucine", "<slug>"]
        segments = [part for part in parsed.path.split("/") if part]
        return len(segments) == 3 and "." not in segments[-1]
    except Exception:
        return False

# Map each kitchen slug to the first detail URL found for it on the listing page.
slug2url = {}
for a in listing.select("a.gb-item-link"):
    abs_url = urljoin(BASE_URL, a.get("href") or "")
    if not is_kitchen_detail_url(abs_url):
        continue
    slug = urlparse(abs_url).path.rstrip("/").rsplit("/", 1)[-1]
    if slug not in slug2url:
        slug2url[slug] = abs_url

# Visit every detail page once to get a display name from its <h1>;
# fall back to the slug when the page or the heading is missing.
resolved = []
for slug, url in slug2url.items():
    s = get_soup(url)
    time.sleep(0.02)
    heading = s.find("h1") if s else None
    if heading:
        name = slugify(heading.get_text(strip=True))
    else:
        name = slugify(slug)
    resolved.append((name, url))
# Case-insensitive alphabetical order by display name.
resolved.sort(key=lambda pair: pair[0].lower())

# One checkbox per kitchen, created in the same order as `resolved`; the
# confirm handler relies on that order when it zips the two lists together.
checkboxes = [widgets.Checkbox(value=False, description=name, indent=False) for name, _ in resolved]
select_all = widgets.ToggleButton(value=False, description="Seleziona/Deseleziona tutto", icon="check")
btn_confirm = widgets.Button(description="Conferma selezione", button_style="primary", icon="check")
# Number of images to download per kitchen (bounded to 1-99, default 3).
img_num = widgets.BoundedIntText(value=3, min=1, max=99, step=1, description="Immagini/cucina:")
btn_start = widgets.Button(description="Avvia download", button_style="success", icon="play")

# Two-column grid holding the kitchen checkboxes.
box_checks = widgets.GridBox(
    checkboxes,
    layout=widgets.Layout(grid_template_columns="repeat(2, 48%)", grid_gap="6px")
)

def on_toggle_all(change):
    """Copy the state of the select-all toggle onto every kitchen checkbox.

    Args:
        change: Change notification supplied by the observer; not inspected,
            the toggle's current value is read directly.
    """
    target_state = select_all.value
    for checkbox in checkboxes:
        checkbox.value = target_state


select_all.observe(on_toggle_all, 'value')

# (name, url) pairs confirmed by the user; filled in by on_confirm_clicked.
selected_pairs = []


def on_confirm_clicked(b):
    """Freeze the current checkbox selection and reveal the download controls.

    With nothing ticked, an error is shown in the status line and nothing else
    changes. Otherwise the chosen (name, url) pairs are stored in the global
    ``selected_pairs``, the selection widgets are disabled, and the image
    count field and start button become visible.

    Args:
        b: The clicked button (unused).
    """
    global selected_pairs

    picked = []
    for box, (name, url) in zip(checkboxes, resolved):
        if box.value:
            picked.append((name, url))

    if len(picked) == 0:
        status.value = "<span style='color:#b00'>Seleziona almeno una cucina.</span>"
        return

    selected_pairs = picked
    status.value = f"<span style='color:#060'>Selezionate {len(picked)} cucine. Imposta il numero di immagini e premi 'Avvia download'.</span>"

    # Lock the selection so it cannot change once confirmed.
    select_all.disabled = True
    for box in checkboxes:
        box.disabled = True

    # Show the second step of the workflow.
    img_num.layout.display = "block"
    btn_start.layout.display = "inline-block"

btn_confirm.on_click(on_confirm_clicked)

# The image count field and the start button stay hidden until a selection
# has been confirmed; on_confirm_clicked makes them visible.
img_num.layout.display = "none"
btn_start.layout.display = "none"

# Lower-case URL endings accepted as gallery images.
_IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png", ".webp")


def _extract_description(soup):
    """Return the non-empty <p> texts inside `.gb-text-and-link`, separated by blank lines."""
    container = soup.select_one(".gb-text-and-link")
    if not container:
        return ""
    texts = (p.get_text(" ", strip=True) for p in container.find_all("p"))
    return "\n\n".join(t for t in texts if t).strip()


def _gallery_image_urls(soup, page_url, limit):
    """Return up to `limit` distinct full-size image URLs linked from the gallery, in page order."""
    anchors = [
        a
        for wrapper in soup.select(".gb-media-wrapper")
        for a in wrapper.select("a.gb-item-link")
    ]
    seen = set()
    ordered = []
    for a in anchors:
        href = a.get("href")
        if not href:
            continue
        abs_url = urljoin(page_url, href)
        if abs_url.lower().endswith(_IMAGE_SUFFIXES) and abs_url not in seen:
            seen.add(abs_url)
            ordered.append(abs_url)
        if len(ordered) >= limit:
            break
    return ordered


def _write_manifest(rows):
    """Write one CSV row per processed kitchen to OUTPUT_DIR/manifest.csv."""
    man_path = os.path.join(OUTPUT_DIR, "manifest.csv")
    with open(man_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(
            f, fieldnames=["kitchen_name", "url", "description_chars", "images_saved"]
        )
        writer.writeheader()
        writer.writerows(rows)


def _build_zip():
    """Recreate ZIP_PATH from the contents of OUTPUT_DIR, under a top-level `creo_cucine/` folder."""
    if os.path.exists(ZIP_PATH):
        os.remove(ZIP_PATH)
    with zipfile.ZipFile(ZIP_PATH, "w", zipfile.ZIP_DEFLATED) as z:
        for root, _, filenames in os.walk(OUTPUT_DIR):
            for filename in filenames:
                full = os.path.join(root, filename)
                rel = os.path.relpath(full, OUTPUT_DIR)
                z.write(full, arcname=os.path.join("creo_cucine", rel))


def _offer_colab_download():
    """Trigger the browser download of the ZIP on Colab; elsewhere just say it is unavailable."""
    if not os.path.exists(ZIP_PATH):
        return
    try:
        # Imported lazily and guarded, like the widget-manager import at the
        # top of the script, so a finished run is not reported as an error
        # when google.colab is missing.
        from google.colab import files as colab_files
    except ImportError:
        log("Download automatico non disponibile: google.colab non trovato.")
        return
    colab_files.download(ZIP_PATH)


def scrape_sync(b):
    """Download description and images for every confirmed kitchen, then zip the result.

    Click handler of the start button. It wipes OUTPUT_DIR, and for each
    (name, url) in ``selected_pairs`` saves ``descrizione.txt`` plus up to
    ``img_num.value`` gallery images into a folder named after the page's
    <h1>. It then writes ``manifest.csv``, builds the ZIP archive and, on
    Colab, starts the browser download. Progress goes to the log widget.

    Image downloads for one kitchen stop once GLOBAL_PER_PAGE_TIMEOUT seconds
    have elapsed since that kitchen was started.

    Any exception is logged with its traceback instead of propagating. The
    controls are disabled while the run is in progress and stay disabled after
    a successful run; they are re-enabled when the run ends early or fails, so
    the user can retry without re-running the cell.

    Args:
        b: The clicked button (unused).
    """
    btn_start.disabled = True
    btn_confirm.disabled = True
    img_num.disabled = True

    finished = False
    try:
        if not selected_pairs:
            log("[ERRORE] Nessuna cucina selezionata.")
            return

        # Start from a clean output directory.
        if os.path.exists(OUTPUT_DIR):
            shutil.rmtree(OUTPUT_DIR)
        os.makedirs(OUTPUT_DIR, exist_ok=True)

        total = len(selected_pairs)
        max_images = img_num.value
        log(f"Avvio: {total} cucine | {max_images} immagini/cucina")

        manifest = []

        for idx, (name, url) in enumerate(selected_pairs, 1):
            start_t = time.time()
            soup = get_soup(url)
            if not soup:
                log(f"- {idx}/{total} {name}: pagina non caricata, salto.")
                continue

            h1 = soup.find("h1")
            name2 = slugify(h1.get_text(strip=True)) if h1 else slugify(name)
            kdir = os.path.join(OUTPUT_DIR, name2)
            os.makedirs(kdir, exist_ok=True)

            # Description text.
            desc = _extract_description(soup)
            with open(os.path.join(kdir, "descrizione.txt"), "w", encoding="utf-8") as f:
                f.write(desc)

            # Images linked from the gallery.
            ordered = _gallery_image_urls(soup, url, max_images)

            saved = 0
            for i, img_url in enumerate(ordered, 1):
                # Give up on this kitchen's remaining images once over budget.
                if time.time() - start_t > GLOBAL_PER_PAGE_TIMEOUT:
                    break
                dest = os.path.join(kdir, f"{i:02d}{infer_ext(img_url)}")
                if download_image(img_url, dest):
                    saved += 1
                time.sleep(0.1)

            manifest.append({
                "kitchen_name": name2,
                "url": url,
                "description_chars": len(desc),
                "images_saved": saved,
            })
            log(f"- {idx}/{total} {name2}: immagini trovate {len(ordered)}, salvate {saved}")

            time.sleep(SLEEP_BETWEEN_REQUESTS)

        # Manifest and archive.
        _write_manifest(manifest)
        _build_zip()
        log(f"ZIP pronto: {ZIP_PATH}")

        _offer_colab_download()
        finished = True

    except Exception:
        # Top-level UI boundary: report the failure in the log widget.
        log("[ERRORE] Qualcosa è andato storto:\n" + traceback.format_exc())
    finally:
        if not finished:
            btn_start.disabled = False
            btn_confirm.disabled = False
            img_num.disabled = False

btn_start.on_click(scrape_sync)

# Assemble the full interface and show it. `out` was already passed to the
# earlier display(title, out) call, so the log widget is displayed a second
# time here, inside the box.
ui = widgets.VBox([
    widgets.HTML("<h3>Seleziona le cucine da scaricare</h3>"),
    select_all,
    box_checks,
    widgets.HBox([btn_confirm]),
    widgets.HBox([img_num, btn_start]),
    status,
    widgets.HTML("<hr>"),
    widgets.HTML("<b>Log</b>"),
    out
])
display(ui)

HTML(value='<h3>Seleziona le cucine da scaricare</h3>')

Output()

VBox(children=(HTML(value='<h3>Seleziona le cucine da scaricare</h3>'), ToggleButton(value=False, description=…