# In [None]:
# PROWOOD Scraper (Voilà-friendly per Binder)
# - Prompt: quante Moderne, quante Classiche, quanti Arredi
# - Moderne: https://prowoodsrl.it/cucine-moderne
# - Classiche: https://prowoodsrl.it/cucine-classiche/
#   -> seleziona random N link 'a.zolo_portfolio_link'
#   -> in pagina: immagini da 'li.slick-slide:not(.slick-cloned) img' (usa solo src)
#      descrizione = primo <p> dentro '.wpb_text_column'
# - Arredi: 7 pagine -> raccoglie prodotti come caroselli '.vc_slide.vc_images_carousel'
#      per ogni prodotto scarica TUTTE le immagini in '.vc_item img' (usa solo src)
#      nome prodotto dall'H3 in '.zolo_heading_element' più vicino precedente (fallback title/slug)
# - ZIP finale con tre cartelle: cucine_moderne, cucine_classiche, arredi
# - Controlli download robusti: FileDownload + data-URI + link /voila/files e ../files

import os, re, io, time, json, shutil, base64, random
from urllib.parse import urljoin, urlparse
from datetime import datetime

import requests
from bs4 import BeautifulSoup
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

# ---------- Config ----------
# User-Agent header value installed on the shared HTTP session.
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123 Safari/537.36 (+educational scraper)"
# Value passed as `timeout` to every session.get() call (seconds).
REQUEST_TIMEOUT = 30
# Pause passed to time.sleep() between scraped pages/items.
DELAY_SEC = 0.6  # politeness delay

# Listing pages from which the kitchen detail links are collected.
URL_MODERNE   = "https://prowoodsrl.it/cucine-moderne"
URL_CLASSICHE = "https://prowoodsrl.it/cucine-classiche/"
# Furniture ("arredi") pages that are scanned for product image carousels.
ARREDI_PAGINE = [
    "https://prowoodsrl.it/zt_portfolio/pianca/",
    "https://prowoodsrl.it/zt_portfolio/connubia/",
    "https://prowoodsrl.it/zt_portfolio/baxar/",
    "https://prowoodsrl.it/zt_portfolio/neff/",
    "https://prowoodsrl.it/zt_portfolio/turati-t4/",
    "https://prowoodsrl.it/zt_portfolio/barzaghi/",
    "https://prowoodsrl.it/zt_portfolio/radice-mobili/",
]

# Inline CSS for a green download button.
# NOTE(review): the download-link markup in _show_download_controls uses its
# own style string, so this constant appears to be unused in this file.
BTN_STYLE = "display:inline-block;padding:10px 18px;background:#1e7e34;color:#fff;text-decoration:none;border-radius:8px;font-weight:600;font-size:15px;border:1px solid #17642a;"

# ---------- HTTP session ----------
# Single requests.Session shared by fetch_soup() and download_file(); the
# default headers below are sent with every request made through it.
session = requests.Session()
session.headers.update({"User-Agent": USER_AGENT, "Accept-Language": "it-IT,it;q=0.9"})

def fetch_soup(url: str) -> BeautifulSoup:
    """Fetch url through the shared session and parse it with the lxml parser.

    Raises whatever ``session.get`` raises, plus the HTTP error produced by
    ``raise_for_status()`` for 4xx/5xx responses.
    """
    response = session.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on HTTP errors instead of parsing an error page.
    response.raise_for_status()
    markup = response.text
    return BeautifulSoup(markup, "lxml")

# ---------- Utils ----------
def slugify(text: str, maxlen=90) -> str:
    """Turn arbitrary text into a string that is safe to use as a folder name.

    Whitespace runs collapse to a single space, path separators become "-",
    and every character outside letters (ASCII and Latin-1 accented), digits,
    space and ``_ - . ( )`` is dropped. The result is cut to maxlen characters
    and then stripped of leading/trailing spaces and dots, so the special
    names "." and ".." (which would escape the target folder) and names that
    start or end with a dot can never be produced.

    Args:
        text: Source text; None is treated as an empty string.
        maxlen: Maximum length applied before the final trimming.

    Returns:
        The cleaned name, or "senza_nome" when nothing usable is left.
    """
    text = (text or "").strip()
    text = re.sub(r"\s+", " ", text)
    text = text.replace("/", "-").replace("\\", "-")
    text = re.sub(r"[^0-9A-Za-zÀ-ÖØ-öø-ÿ _\-\.\(\)]", "", text)
    text = text[:maxlen]
    # Removing characters can leave double spaces behind; collapse them again,
    # then trim spaces AND dots from both ends.
    text = re.sub(r"\s+", " ", text).strip(" .")
    return text or "senza_nome"

def ensure_dir(path: str) -> str:
    """Make sure the directory path exists (creating parents) and return path."""
    if not os.path.isdir(path):
        os.makedirs(path, exist_ok=True)
    return path

def download_file(url: str, dest_path: str, retries=2) -> bool:
    """Download url to dest_path through the shared session, retrying on failure.

    The payload is streamed into a temporary ``<dest_path>.part`` file which
    is moved over dest_path only once the download has completed, so a failed
    attempt never leaves a truncated file at dest_path.

    Args:
        url: Address of the file to fetch.
        dest_path: Destination file path; a missing parent folder is created.
        retries: Number of extra attempts made after the first one fails.

    Returns:
        True when the file was saved; False when every attempt failed, or
        when retries is negative and therefore no attempt was made.
    """
    tmp_path = dest_path + ".part"
    parent = os.path.dirname(dest_path)
    for attempt in range(retries + 1):
        try:
            with session.get(url, stream=True, timeout=REQUEST_TIMEOUT) as r:
                r.raise_for_status()
                # dirname is "" for a bare file name and makedirs("") raises.
                if parent:
                    os.makedirs(parent, exist_ok=True)
                with open(tmp_path, "wb") as f:
                    for chunk in r.iter_content(64 * 1024):
                        if chunk:
                            f.write(chunk)
            os.replace(tmp_path, dest_path)
            return True
        except Exception:
            # Deliberately broad: this is a best-effort download and callers
            # rely on a boolean result rather than on exceptions.
            try:
                os.remove(tmp_path)
            except OSError:
                # The partial file may not have been created yet.
                pass
            if attempt < retries:
                time.sleep(0.8)
    return False

# helper per riconoscere solo immagini “reali” della media library
def _looks_upload(u: str) -> bool:
    return bool(re.search(r"/wp-content/uploads/.*\.(jpe?g|png|webp)(\?.*)?$", u, re.I))

# ---------- Parsers Prowood ----------
def collect_portfolio_links(list_url: str) -> list:
    """
    Collect every 'a.zolo_portfolio_link' link found on a 'cucine-*' listing page.

    Links are made absolute against list_url and returned without duplicates,
    in the order in which they first appear in the page.
    """
    page = fetch_soup(list_url)
    anchors = page.select("a.zolo_portfolio_link[href]")
    absolute = (urljoin(list_url, anchor.get("href")) for anchor in anchors)
    # dict keys keep first-seen order while dropping repeated URLs.
    return list(dict.fromkeys(absolute))

def parse_kitchen_detail(detail_url: str) -> dict:
    """
    Scrape name, image URLs and description from a kitchen detail page.

    - name: text of the first <h1>, else the <title>, else the last segment
      of the URL path; always passed through slugify().
    - images: taken ONLY from the 'src' attribute
        1) first try: 'li.slick-slide:not(.slick-cloned) img'
        2) fallback when the slider yields nothing: every <img> inside the
           typical content containers (.portfolio_single, .zt-portfolio-single,
           .single-portfolio, .wpb_gallery, .content-area) or, if none of them
           exists, the whole page; images nested in an element with class
           'related_portfolio_list' are skipped, and URLs containing 'slider'
           are sorted first.
      In both cases only URLs accepted by _looks_upload() are kept, without
      duplicates.
    - description: text of the first <p> inside the first '.wpb_text_column'.

    Returns:
        {"name": str, "images": list of absolute URLs, "description": str}
    """
    soup = fetch_soup(detail_url)

    # ---- name
    h1 = soup.select_one("h1")
    h1_text = h1.get_text(strip=True) if h1 else ""
    if h1_text:
        name = h1_text
    elif soup.title and soup.title.string:
        name = soup.title.string.strip()
    else:
        name = urlparse(detail_url).path.rstrip("/").split("/")[-1]
    name = slugify(name)

    # ---- images (src only) - "canonical" slider attempt
    img_urls, seen = [], set()
    for img in soup.select("li.slick-slide:not(.slick-cloned) img"):
        src = img.get("src")
        if not src or src.startswith("data:"):
            continue
        absu = urljoin(detail_url, src)
        if _looks_upload(absu) and absu not in seen:
            seen.add(absu)
            img_urls.append(absu)

    # ---- fallback when the slider is empty
    if not img_urls:
        containers = soup.select(
            ".portfolio_single, .zt-portfolio-single, .single-portfolio, .wpb_gallery, .content-area"
        ) or [soup]

        slider_like = re.compile(r"slider", re.I)

        for cont in containers:
            for img in cont.select("img"):
                src = img.get("src")
                if not src or src.startswith("data:"):
                    continue

                # Skip thumbnails of the "related projects" list. find_parent()
                # takes a tag name as first argument, not a CSS selector, so
                # the class has to be matched through class_.
                if img.find_parent(class_="related_portfolio_list"):
                    continue

                absu = urljoin(detail_url, src)
                # keep only images of the WordPress media library
                if _looks_upload(absu) and absu not in seen:
                    seen.add(absu)
                    img_urls.append(absu)

        # Put the images with 'slider' in their path first, then sort by URL.
        img_urls.sort(key=lambda u: (0 if slider_like.search(u) else 1, u))

    # ---- description
    descr = ""
    text_col = soup.select_one(".wpb_text_column")
    if text_col:
        p = text_col.find("p")
        if p:
            descr = p.get_text(" ", strip=True)

    return {"name": name, "images": img_urls, "description": descr}

def _arredi_page_title(soup, page_url: str) -> str:
    """Return the page <title> text, or the last URL path segment when there is none."""
    if soup.title and soup.title.string:
        return soup.title.string.strip()
    return urlparse(page_url).path.rstrip("/").split("/")[-1]


def _arredi_image_urls(img_tags, base_url: str) -> list:
    """Return the de-duplicated absolute 'src' URLs of the given <img> tags, in order."""
    urls, seen = [], set()
    for img in img_tags:
        src = img.get("src")
        # Inline data: URIs cannot be downloaded over HTTP, so skip them
        # (same rule used for the kitchen detail pages).
        if not src or src.startswith("data:"):
            continue
        absu = urljoin(base_url, src)
        if absu not in seen:
            seen.add(absu)
            urls.append(absu)
    return urls


def _arredi_heading_for(carousel):
    """Return the text of the <h3> heading closest to carousel, or None if not found."""
    # 1) look at up to 15 previous siblings, nearest first
    node = carousel
    for _ in range(15):
        node = node.find_previous_sibling()
        if not node:
            break
        h3 = node.select_one(".zolo_heading_element h3") or node.find("h3")
        if h3 and h3.get_text(strip=True):
            return h3.get_text(strip=True)

    # 2) otherwise climb up to 5 ancestors and take their first heading
    parent = carousel.parent
    hops = 0
    while parent and hops < 5:
        h3 = parent.select_one(".zolo_heading_element h3") or parent.find("h3")
        if h3 and h3.get_text(strip=True):
            return h3.get_text(strip=True)
        parent = parent.parent
        hops += 1
    return None


def collect_arredi_products(pages: list) -> list:
    """
    Collect PRODUCTS (not single images) from the furniture pages.

    For every '.vc_slide.vc_images_carousel' (or, failing that, every
    '.vc_images_carousel') one entry is built:
      { "name": <product_name>, "images": [url1, url2, ...] }

    - name: the nearest <h3> (preferably inside '.zolo_heading_element')
      found BEFORE the carousel among its siblings, then among its ancestors;
      fallback: page <title>, then URL slug. Always passed through slugify().
    - images: every '.vc_item img' inside the carousel (ONLY the 'src'
      attribute), absolute, without duplicates and without data: URIs.

    A page without any carousel yields a single product named after the page
    and holding all of its images.

    The politeness delay DELAY_SEC is applied after every page, including
    pages without carousels.
    """
    products = []
    for pg in pages:
        soup = fetch_soup(pg)

        carousels = soup.select(".vc_slide.vc_images_carousel") or soup.select(".vc_images_carousel")

        if not carousels:
            # Fallback: no carousel at all, group every image under the page name.
            products.append({
                "name": slugify(_arredi_page_title(soup, pg)),
                "images": _arredi_image_urls(soup.select("img"), pg),
            })
        else:
            for car in carousels:
                product_name = _arredi_heading_for(car) or _arredi_page_title(soup, pg)
                products.append({
                    "name": slugify(product_name),
                    "images": _arredi_image_urls(car.select(".vc_item img"), pg),
                })

        time.sleep(DELAY_SEC)
    return products

# ---------- Download controls (robust) ----------
# Output widget into which _show_download_controls() renders the download
# button and links for the generated ZIP.
download_area = widgets.Output()

def _voila_prefix():
    root = os.environ.get("JUPYTERHUB_SERVICE_PREFIX", "/")
    if not root.endswith("/"): root += "/"
    return root

def _show_download_controls(zip_path: str):
    """Render in download_area every available way to download the ZIP at zip_path.

    Shown, in this order:
      1) a FileDownload widget, when the installed ipywidgets exposes one
         (otherwise an informational message is printed);
      2) a link embedding the whole archive as a base64 data-URI;
      3) two plain links, one absolute under the Voilà prefix and one
         relative ('../files/...').
    """
    with download_area:
        download_area.clear_output()
        zip_name = os.path.basename(zip_path)
        megabytes = os.path.getsize(zip_path) / (1024 * 1024)
        print(f"[INFO] ZIP: {zip_name} ({megabytes:.2f} MB)")

        # 1) FileDownload when available
        if not hasattr(widgets, "FileDownload"):
            print("[INFO] ipywidgets<8: uso fallback senza FileDownload")
        else:
            def read_zip():
                # Called by the widget to obtain the bytes to send.
                with open(zip_path, "rb") as fh:
                    return fh.read()

            display(widgets.FileDownload(
                data=read_zip,
                filename=zip_name,
                description="⬇️ Scarica ZIP",
                button_style="primary",
                icon="download",
            ))

        # 2) data-URI link (always shown)
        with open(zip_path, "rb") as fh:
            payload = fh.read()
        encoded = base64.b64encode(payload).decode("ascii")
        link_css = "display:inline-block;margin-top:30px;padding:10px 20px;border-radius:25px;background-color:#333333;color:#fff;text-decoration:none;font-weight:bold;font-size:16px;"
        display(HTML(
            f'<p><a download="{zip_name}" href="data:application/zip;base64,{encoded}" style="{link_css}">Scarica ZIP</a></p>'
        ))

        # 3) /voila/files link plus the relative ../files one
        abs_href = _voila_prefix() + "voila/files/" + zip_name
        rel_href = "../files/" + zip_name
        display(HTML(
            '<p>Altri link (se necessario): '
            f'<a href="{abs_href}" target="_blank">{abs_href}</a> | '
            f'<a href="{rel_href}" target="_blank">{rel_href}</a></p>'
        ))

# ---------- UI ----------
# Heading shown above the controls.
title = widgets.HTML("<h3>Scarica cucine moderne/classiche e arredi da prowoodsrl.it</h3>")
# How many modern kitchens, classic kitchens and furniture products to sample;
# run_scraper() reads the .value of each field.
moderne_n   = widgets.BoundedIntText(value=2, min=0, max=50, step=1, description="Moderne:")
classiche_n = widgets.BoundedIntText(value=2, min=0, max=50, step=1, description="Classiche:")
arredi_n    = widgets.BoundedIntText(value=4, min=0, max=200, step=1, description="Arredi:")
# Button that starts the scraping run.
btn_start   = widgets.Button(description="Avvia scraping", button_style="success", icon="play")
# Output widget that captures the progress messages printed by run_scraper().
out         = widgets.Output()

def _unique_subdir(parent: str, name: str) -> str:
    """Create and return a sub-folder of parent called name, suffixing _2, _3, ... if taken."""
    path = os.path.join(parent, name)
    n = 2
    while os.path.exists(path):
        path = os.path.join(parent, f"{name}_{n}")
        n += 1
    return ensure_dir(path)


def _save_images(urls, folder: str) -> int:
    """Download urls into folder as 001.ext, 002.ext, ... and return how many were saved."""
    saved = 0
    for j, img in enumerate(urls, 1):
        # Extension from the URL path (".jpg" when missing), capped at 5 chars.
        ext = (os.path.splitext(urlparse(img).path)[1] or ".jpg")[:5]
        if download_file(img, os.path.join(folder, f"{j:03d}{ext}")):
            saved += 1
    return saved


def _export_kitchens(label: str, list_url: str, count: int, dest_dir: str, entries: list) -> None:
    """Download up to count random kitchens listed at list_url into dest_dir.

    One manifest row per exported kitchen is appended to entries as soon as
    it is done, so rows already collected survive a later failure.
    """
    links = collect_portfolio_links(list_url)
    print(f"[{label}] trovati {len(links)} link")
    sample = random.sample(links, k=min(count, len(links))) if links else []
    for idx, u in enumerate(sample, 1):
        try:
            info = parse_kitchen_detail(u)
            kfolder = _unique_subdir(dest_dir, info["name"])
            with open(os.path.join(kfolder, "descrizione.txt"), "w", encoding="utf-8") as f:
                f.write(info["description"] or "")
            saved = _save_images(info["images"], kfolder)
            print(f"  - {idx}/{len(sample)} {info['name']}: img {saved}/{len(info['images'])}")
            entries.append({
                "url": u,
                "name": info["name"],
                "folder": os.path.basename(kfolder),
                "images": saved,
            })
        except Exception as e:
            # One broken detail page must not abort the remaining kitchens.
            print(f"  - {idx}/{len(sample)} ERRORE {u}: {e}")
        time.sleep(DELAY_SEC)


def _export_arredi(pages: list, count: int, dest_dir: str, entries: list) -> None:
    """Download up to count random furniture products from pages into dest_dir.

    One manifest row per exported product is appended to entries.
    """
    products = collect_arredi_products(pages)  # list of {name, images[]}
    print(f"[Arredi] prodotti trovati: {len(products)}")
    sample = random.sample(products, k=min(count, len(products))) if products else []
    for idx, prod in enumerate(sample, 1):
        pname = prod["name"]
        urls = prod["images"]
        # Several carousels can resolve to the same heading; a unique folder
        # keeps their images from overwriting each other.
        pfolder = _unique_subdir(dest_dir, slugify(pname))
        saved = _save_images(urls, pfolder)
        print(f"  - {idx}/{len(sample)} {pname}: img {saved}/{len(urls)}")
        entries.append({"name": pname, "folder": os.path.basename(pfolder), "images": saved})
        time.sleep(0.2)


def run_scraper(b):
    """Button callback: scrape the requested items, zip them and show the download controls.

    Reads the three counters from the UI, exports the sampled kitchens and
    furniture products into a fresh timestamped folder
    (./prowood_export_<YYYYmmdd_HHMMSS>) with the sub-folders cucine_moderne,
    cucine_classiche and arredi, writes manifest.json, builds the ZIP next to
    the folder and hands it to _show_download_controls(). A failure in one
    category is printed and does not stop the others.

    Args:
        b: The object passed by the button's on_click hook (not used).
    """
    out.clear_output()
    with out:
        n_mod = int(moderne_n.value)
        n_cla = int(classiche_n.value)
        n_arr = int(arredi_n.value)

        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        root = ensure_dir(f"./prowood_export_{ts}")
        # All three folders are created up front so the ZIP always has them.
        dir_mod = ensure_dir(os.path.join(root, "cucine_moderne"))
        dir_cla = ensure_dir(os.path.join(root, "cucine_classiche"))
        dir_arr = ensure_dir(os.path.join(root, "arredi"))
        manifest = {"moderne": [], "classiche": [], "arredi": []}

        # --- Moderne / Classiche ---
        kitchen_jobs = (
            ("Moderne", URL_MODERNE, n_mod, dir_mod, manifest["moderne"]),
            ("Classiche", URL_CLASSICHE, n_cla, dir_cla, manifest["classiche"]),
        )
        for label, list_url, count, dest_dir, entries in kitchen_jobs:
            try:
                _export_kitchens(label, list_url, count, dest_dir, entries)
            except Exception as e:
                print(f"[{label}] ERRORE:", e)

        # --- Arredi ---
        try:
            _export_arredi(ARREDI_PAGINE, n_arr, dir_arr, manifest["arredi"])
        except Exception as e:
            print("[Arredi] ERRORE:", e)

        # manifest + zip
        with open(os.path.join(root, "manifest.json"), "w", encoding="utf-8") as f:
            json.dump(manifest, f, ensure_ascii=False, indent=2)

        zip_path = shutil.make_archive(root, "zip", root)
        print("\n[OK] ZIP pronto:", zip_path)
        _show_download_controls(zip_path)

# Run the scraper whenever the start button is clicked.
btn_start.on_click(run_scraper)

# Page layout, top to bottom: heading, the three counters on one row, the
# start button, a separator, the progress log and the download controls.
display(widgets.VBox([
    title,
    widgets.HBox([moderne_n, classiche_n, arredi_n]),
    btn_start,
    widgets.HTML("<hr>"),
    out,
    download_area
]))
