In [None]:
# PROWOOD SCRAPER NOTEBOOK (Binder/Voila friendly)

import os, re, io, time, json, shutil, random, base64
from urllib.parse import urljoin, urlparse
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

# ====================== CONFIG ==========================
# User-Agent string sent with every request made through ``session``.
USER_AGENT = "Mozilla/5.0 (compatible; prowood-scraper/1.0; +https://example.com)"
# One shared HTTP session, used by fetch_soup() and download_file(), so the
# headers below are applied to every request.
session = requests.Session()
session.headers.update({"User-Agent": USER_AGENT, "Accept-Language": "it-IT,it;q=0.9"})

# Starting URLs.
# Listing pages from which kitchen detail links are sampled.
URL_MODERNE = "https://prowoodsrl.it/cucine-moderne/"
URL_CLASSICHE = "https://prowoodsrl.it/cucine-classiche/"
# Furniture pages; each one is parsed directly by parse_arredo_detail().
URL_ARREDI = [
    "https://prowoodsrl.it/zt_portfolio/pianca/",
    "https://prowoodsrl.it/zt_portfolio/connubia/",
    "https://prowoodsrl.it/zt_portfolio/baxar/",
    "https://prowoodsrl.it/zt_portfolio/neff/",
    "https://prowoodsrl.it/zt_portfolio/turati-t4/",
    "https://prowoodsrl.it/zt_portfolio/barzaghi/",
    "https://prowoodsrl.it/zt_portfolio/radice-mobili/"
]

# =================== UTILS =============================
def fetch_soup(url):
    """Download ``url`` with the shared session and parse it with lxml.

    Args:
        url: Absolute URL of the HTML page to fetch.

    Returns:
        A ``BeautifulSoup`` document for the response body.

    Raises:
        requests.RequestException: On connection errors, timeouts (30 s) or
            a non-2xx status code.
    """
    r = session.get(url, timeout=30)
    r.raise_for_status()
    # Parse the raw bytes instead of ``r.text``: when the server sends
    # ``text/html`` without a charset, requests decodes as ISO-8859-1, which
    # garbles accented characters. Given bytes, BeautifulSoup can detect the
    # real encoding itself (e.g. from the <meta charset> tag).
    content_type = r.headers.get("Content-Type", "")
    # Forward the HTTP charset only when the server really declared one.
    declared = r.encoding if "charset=" in content_type.lower() else None
    return BeautifulSoup(r.content, "lxml", from_encoding=declared)

def slugify(text, maxlen=80):
    """Turn ``text`` into an ASCII name that is safe to use as a folder name.

    Accented letters are transliterated to their base letter, whitespace
    runs become a single ``-``, and every character outside
    ``[a-zA-Z0-9-_]`` is removed. The result is truncated to ``maxlen``
    characters and stripped of leading/trailing ``-`` and ``_``.

    Args:
        text: Source string (e.g. a page heading).
        maxlen: Maximum length of the slug before edge stripping.

    Returns:
        The slug, or ``"item"`` when nothing usable is left.
    """
    import unicodedata  # local import keeps this block self-contained

    # Decompose accented letters (e.g. "à" -> "a" + combining accent) so the
    # ASCII filter keeps the base letter instead of dropping it entirely.
    text = unicodedata.normalize("NFKD", text)
    text = text.encode("ascii", "ignore").decode("ascii")
    text = re.sub(r"\s+", "-", text.strip())
    text = re.sub(r"[^a-zA-Z0-9\-_]+", "", text)
    # Strip *before* applying the fallback: otherwise input made only of
    # separators (e.g. "- -") would produce an empty slug.
    return text[:maxlen].strip("-_") or "item"

def ensure_dir(path):
    """Create ``path`` (and any missing parents) if needed and return it.

    Returning the path lets callers create and bind a directory in one
    expression.
    """
    if not os.path.isdir(path):
        os.makedirs(path, exist_ok=True)
    return path

def download_file(url, dest_path, retries=2):
    """Stream ``url`` to ``dest_path``, retrying on failure.

    The body is written to ``<dest_path>.part`` and moved into place only
    after the whole response has been received, so a failed or interrupted
    download never leaves a truncated file at ``dest_path``.

    Args:
        url: Absolute URL of the file to download.
        dest_path: Destination file path; its directory must already exist.
        retries: Number of additional attempts after the first one.

    Returns:
        ``True`` when the file was fully written, ``False`` otherwise.
    """
    tmp_path = f"{dest_path}.part"
    for attempt in range(retries + 1):
        try:
            with session.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()
                with open(tmp_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            os.replace(tmp_path, dest_path)
            return True
        except Exception:
            # Deliberately broad: downloads are best-effort and a single bad
            # image must not abort the caller. Failure is reported via the
            # return value.
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # nothing was written, or it is already gone
            if attempt == retries:
                return False
            time.sleep(1)
    # Reached only when ``retries`` is negative and no attempt was made.
    return False

# -------- funzioni scraping -------------------
def _abs(u, base): 
    return urljoin(base, u) if u else None

def _looks_upload(u: str) -> bool:
    return bool(re.search(r"/wp-content/uploads/.*\.(jpe?g|png|webp)(\?.*)?$", u, re.I))

def parse_kitchen_detail(detail_url: str) -> dict:
    """Scrape one kitchen detail page.

    Args:
        detail_url: Absolute URL of the detail page.

    Returns:
        A dict with:
          * ``"name"``: slug built from the ``<h1>`` text, else the page
            ``<title>``, else the last URL path segment;
          * ``"images"``: de-duplicated absolute image URLs that look like
            WordPress uploads;
          * ``"description"``: text of the first ``<p>`` inside the first
            ``.wpb_text_column`` element, or ``""``.
    """
    page = fetch_soup(detail_url)

    # --- name ---------------------------------------------------------
    heading = page.select_one("h1")
    heading_text = heading.get_text(strip=True) if heading else ""
    if heading_text:
        raw_name = heading_text
    elif page.title and page.title.string:
        raw_name = page.title.string.strip()
    else:
        raw_name = urlparse(detail_url).path.rstrip("/").split("/")[-1]

    # --- images -------------------------------------------------------
    collected = []
    known = set()

    def _gather(img_tags):
        # Keep each upload-like URL once, in document order.
        for tag in img_tags:
            resolved = _abs(tag.get("src"), detail_url)
            if not resolved or resolved in known:
                continue
            if not _looks_upload(resolved):
                continue
            known.add(resolved)
            collected.append(resolved)

    # First try the slider, skipping the clones slick adds for looping.
    _gather(page.select("li.slick-slide:not(.slick-cloned) img"))

    # Fallback: scan the known gallery containers, or the whole page.
    if not collected:
        scopes = page.select(
            ".portfolio_single, .zt-portfolio-single, .single-portfolio, .wpb_gallery"
        ) or [page]
        for scope in scopes:
            _gather(scope.select("img"))
        slider_hint = re.compile(r"slider", re.I)
        # URLs mentioning "slider" come first, then alphabetical order.
        collected.sort(key=lambda link: (int(slider_hint.search(link) is None), link))

    # --- description --------------------------------------------------
    summary = ""
    column = page.select_one(".wpb_text_column")
    first_par = column.find("p") if column else None
    if first_par:
        summary = first_par.get_text(" ", strip=True)

    return {"name": slugify(raw_name), "images": collected, "description": summary}

def parse_arredo_detail(arredo_url: str) -> list:
    """Scrape the products listed on one furniture page.

    Each ``.vc_slide.vc_images_carousel`` block is treated as one product
    whose name is the closest preceding ``<h3>``. Blocks without such a
    heading, or without any image, are skipped.

    Args:
        arredo_url: Absolute URL of the page.

    Returns:
        A list of ``{"name": slug, "images": [absolute URLs]}`` dicts, in
        page order, with duplicate image URLs removed per product.
    """
    page = fetch_soup(arredo_url)
    products = []
    for carousel in page.select(".vc_slide.vc_images_carousel"):
        # The product name is the <h3> that precedes the carousel.
        heading = carousel.find_previous("h3")
        if not heading:
            continue
        label = slugify(heading.get_text(strip=True))
        sources = (tag.get("src") for tag in carousel.select(".vc_item img"))
        # dict.fromkeys de-duplicates while keeping first-seen order.
        gallery = list(dict.fromkeys(_abs(src, arredo_url) for src in sources if src))
        if gallery:
            products.append({"name": label, "images": gallery})
    return products

def extract_random_links(list_url: str, selector: str, n=5):
    """Return up to ``n`` distinct, randomly chosen links from a listing page.

    Args:
        list_url: Absolute URL of the listing page.
        selector: CSS selector matching the ``<a>`` elements to sample.
        n: Maximum number of links to return. Negative values are treated
            as 0; ``None`` returns every link.

    Returns:
        A list of absolute URLs in random order, without duplicates.

    Raises:
        requests.RequestException: If the listing page cannot be fetched.
    """
    soup = fetch_soup(list_url)
    # De-duplicate (keeping order) so the same detail page cannot be picked
    # twice when several anchors point at it.
    links = list(dict.fromkeys(
        urljoin(list_url, a["href"]) for a in soup.select(selector) if a.get("href")
    ))
    random.shuffle(links)
    if n is None:
        return links
    # Clamp: a negative slice bound would mean "all but the last |n|".
    return links[:max(n, 0)]

# ========== UI PROMPT =========================
# Let each label take its natural width; with the default fixed width
# ipywidgets truncates long descriptions such as "Cucine classiche:".
_LABEL_STYLE = {"description_width": "initial"}

# Item counts chosen by the user. Bounded at 0 because the values are later
# used as slice limits, where a negative number would select the wrong items.
n_moderne = widgets.BoundedIntText(
    value=2, min=0, max=1000, description='Cucine moderne:', style=_LABEL_STYLE)
n_classiche = widgets.BoundedIntText(
    value=2, min=0, max=1000, description='Cucine classiche:', style=_LABEL_STYLE)
n_arredi = widgets.BoundedIntText(
    value=2, min=0, max=1000, description='Arredi:', style=_LABEL_STYLE)
btn_start = widgets.Button(description="Avvia scraping", button_style="success")
# Output area that receives the download links once the ZIP is ready.
download_area = widgets.Output()

def _voila_prefix():
    return os.environ.get("VOILA_BASE_URL", "/")

def _show_download_controls(zip_path: str):
    """Render the download links for ``zip_path`` inside ``download_area``.

    Two kinds of links are shown: a button whose ``href`` embeds the whole
    archive as a base64 ``data:`` URI, and two plain links (one built from
    the Voila prefix, one relative) pointing at the file by name.

    Args:
        zip_path: Path of an existing ZIP file on disk.
    """
    with download_area:
        download_area.clear_output()

        archive = os.path.basename(zip_path)
        megabytes = os.path.getsize(zip_path) / (1024 * 1024)
        print(f"[INFO] ZIP: {archive} ({megabytes:.2f} MB)")

        # Button with the archive inlined as a data URI.
        with open(zip_path, "rb") as fh:
            payload = base64.b64encode(fh.read()).decode("ascii")
        button_markup = "".join([
            f'<p><a download="{archive}" href="data:application/zip;base64,{payload}" ',
            'style="background:#4CAF50;color:white;padding:10px 15px;text-decoration:none;border-radius:5px;">',
            '⬇️ Scarica ZIP</a></p>',
        ])
        display(HTML(button_markup))

        # Plain links to the file by name (/voila/files and ../files).
        served_href = f"{_voila_prefix()}voila/files/{archive}"
        sibling_href = f"../files/{archive}"
        links_markup = (
            '<div>Altri link: '
            + f'<a href="{served_href}" target="_blank">{served_href}</a> | '
            + f'<a href="{sibling_href}" target="_blank">{sibling_href}</a></div>'
        )
        display(HTML(links_markup))

def _save_images(img_urls, target_dir):
    """Download each URL into ``target_dir`` as ``img_NNN<ext>`` (1-based)."""
    for j, img in enumerate(img_urls, 1):
        # Keep the extension from the URL path; default to .jpg if it has none.
        ext = os.path.splitext(urlparse(img).path)[1] or ".jpg"
        if not download_file(img, os.path.join(target_dir, f"img_{j:03d}{ext}")):
            print(f"[WARN] Download fallito: {img}")


def _scrape_kitchens(list_url, count, out_dir):
    """Scrape up to ``count`` random kitchens from ``list_url`` into ``out_dir``."""
    if count <= 0:
        return  # nothing requested: skip the listing request entirely
    try:
        links = extract_random_links(list_url, "a.zolo_portfolio_link", count)
    except requests.RequestException as exc:
        print(f"[WARN] Elenco non raggiungibile: {list_url} ({exc})")
        return
    for idx, u in enumerate(links, 1):
        try:
            data = parse_kitchen_detail(u)
        except requests.RequestException as exc:
            # One broken page must not abort the whole run.
            print(f"[WARN] Pagina saltata: {u} ({exc})")
            continue
        fdir = ensure_dir(os.path.join(out_dir, f"{idx:02d}_{data['name']}"))
        with open(os.path.join(fdir, "descrizione.txt"), "w", encoding="utf-8") as f:
            f.write(data["description"] or "")
        _save_images(data["images"], fdir)


def scrape_action(b):
    """Button callback: scrape the requested items, zip them, show the links.

    Creates ``./prowood_<timestamp>/`` with one sub-folder per category,
    fills it according to the three count widgets, archives it as a ZIP next
    to the folder and renders the download controls.

    Args:
        b: The clicked button (passed by ipywidgets; unused).
    """
    clear_output(wait=True)
    display(n_moderne, n_classiche, n_arredi, btn_start, download_area)
    print(f"Avvio: {n_moderne.value} moderne, {n_classiche.value} classiche, {n_arredi.value} arredi")
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    root_dir = ensure_dir(f"./prowood_{timestamp}")
    mod_dir = ensure_dir(os.path.join(root_dir, "Cucine_Moderne"))
    class_dir = ensure_dir(os.path.join(root_dir, "Cucine_Classiche"))
    arr_dir = ensure_dir(os.path.join(root_dir, "Arredi"))

    # Modern and classic kitchens share the same page layout.
    _scrape_kitchens(URL_MODERNE, n_moderne.value, mod_dir)
    _scrape_kitchens(URL_CLASSICHE, n_classiche.value, class_dir)

    # Furniture: collect the products of every page, then sample.
    # Clamp the count: a negative slice bound would select the wrong items.
    wanted = max(n_arredi.value, 0)
    if wanted:
        all_arredi = []
        for url in URL_ARREDI:
            try:
                all_arredi.extend(parse_arredo_detail(url))
            except requests.RequestException as exc:
                print(f"[WARN] Pagina saltata: {url} ({exc})")
        random.shuffle(all_arredi)
        for idx, item in enumerate(all_arredi[:wanted], 1):
            fdir = ensure_dir(os.path.join(arr_dir, f"{idx:02d}_{item['name']}"))
            _save_images(item["images"], fdir)

    zip_path = shutil.make_archive(root_dir, "zip", root_dir)
    print(f"[OK] ZIP pronto: {zip_path}")
    _show_download_controls(zip_path)

# Run scrape_action whenever the start button is clicked.
btn_start.on_click(scrape_action)

# Initial render of the form; scrape_action re-displays the same widgets
# after clearing the cell output.
display(n_moderne, n_classiche, n_arredi, btn_start, download_area)
