In [None]:
# Voilà-friendly notebook (Binder)
# Downloads the <p> that follows the divider after the "Descrizione" H2, plus the gallery images, from milanocasa.com

import os, re, io, time, json, shutil, base64
from urllib.parse import urljoin, urlparse
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

# ====== Base parameters ======
# Listing page whose anchors are scanned for property links.
BASE_LIST_URL = "https://www.milanocasa.com/immobili"
# Pause, in seconds, passed to time.sleep() by run_scraper between property pages.
DELAY_SEC = 1
# User-Agent header sent with every request made through `session`.
USER_AGENT = "Mozilla/5.0 (compatible; milanocasa-scraper/1.0; +https://example.com)"

# Shared HTTP session used by fetch_soup and download_file; the headers set
# here (User-Agent, Italian Accept-Language) apply to all of its requests.
session = requests.Session()
session.headers.update({"User-Agent": USER_AGENT, "Accept-Language": "it-IT,it;q=0.9"})

# ---- Funzioni varie scraping ----
def slugify(text, maxlen=80):
    """Turn *text* into an ASCII slug suitable for a folder name.

    Runs of whitespace become a single ``-``, accented letters are reduced to
    their base letter (``à`` -> ``a``), and every remaining character outside
    ``[a-zA-Z0-9_-]`` is removed. The result is cut to *maxlen* characters and
    stripped of leading/trailing ``-`` and ``_``.

    Args:
        text: The string to convert.
        maxlen: Maximum length of the slug before edge stripping.

    Returns:
        The slug, or ``"item"`` when nothing usable is left.
    """
    import unicodedata  # local import: keeps the block self-contained

    text = re.sub(r"\s+", "-", text.strip())
    # Decompose accented characters so the ASCII filter keeps the base letter
    # instead of silently dropping the whole character.
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")
    text = re.sub(r"[^a-zA-Z0-9\-_]+", "", text)
    # Strip first, THEN fall back: otherwise a slug made only of separators
    # (e.g. "---") would be stripped down to an empty string.
    return text[:maxlen].strip("-_") or "item"

def ensure_dir(path):
    """Create directory *path* (with any missing parents) and return *path*.

    An already existing directory is not an error.
    """
    if not os.path.isdir(path):
        os.makedirs(path, exist_ok=True)
    return path

def fetch_soup(url):
    """Download *url* through the shared session and parse it with lxml.

    Uses a 30-second timeout. ``raise_for_status()`` is called on the
    response, so an HTTP error status raises instead of being parsed.

    Returns:
        A ``BeautifulSoup`` tree built from the response text.
    """
    response = session.get(url, timeout=30)
    response.raise_for_status()
    html = response.text
    return BeautifulSoup(html, "lxml")

def is_property_link(href: str, base_domain: str) -> bool:
    """Tell whether *href* looks like a property detail page on *base_domain*.

    A link qualifies when all of the following hold:

    * it is relative, or its host is *base_domain* or one of its subdomains
      (a leading ``www.`` on *base_domain* is ignored, so the apex domain and
      the ``www`` host are treated as the same site);
    * its path does not start with a listing/taxonomy prefix
      (``/immobili``, ``/category/``, ``/categoria/``, ``/tag/``);
    * its path contains one of the detail-page markers
      (``/immobil``, ``/property``, ``/scheda``, ``/vendita``, ``/affitto``).

    Args:
        href: The link to test; empty/``None`` is never a property link.
        base_domain: Host name of the site being scraped.

    Returns:
        ``True`` if the link passes every check, ``False`` otherwise.
    """
    if not href:
        return False

    parsed = urlparse(href)
    path = parsed.path or ""

    host = parsed.netloc.lower()
    base = base_domain.lower()
    if base.startswith("www."):
        base = base[len("www."):]
    # Match on a label boundary: a plain endswith(base) would also accept
    # unrelated hosts such as "evilmilanocasa.com".
    if host and host != base and not host.endswith("." + base):
        return False

    bad_paths = ("/immobili", "/category/", "/categoria/", "/tag/")
    if path.startswith(bad_paths):
        return False

    return bool(re.search(r"/immobil|/property|/scheda|/vendita|/affitto", path, re.I))

def extract_last_n_property_links(list_url: str, n=10):
    """Collect up to *n* distinct property links from the listing page.

    The page at *list_url* is fetched and its anchors are visited in document
    order; the first *n* that pass ``is_property_link`` are returned as
    absolute URLs. Note that, despite the name, no ordering by date is done
    here: the result is simply the first *n* matches found on the page.

    Args:
        list_url: URL of the listing page; also the base for relative hrefs.
        n: Maximum number of links to return. ``n <= 0`` returns ``[]``
            without fetching anything.

    Returns:
        A list of absolute URLs without fragments, free of duplicates.
    """
    if n <= 0:
        return []

    base_domain = urlparse(list_url).netloc or "milanocasa.com"
    soup = fetch_soup(list_url)
    links, seen = [], set()
    for a in soup.select("a[href]"):
        href_abs = urljoin(list_url, a.get("href") or "")
        # The fragment never reaches the server, so "page" and "page#gallery"
        # are the same document and must not be scraped twice.
        href_abs = href_abs.split("#", 1)[0]
        if href_abs in seen or not is_property_link(href_abs, base_domain):
            continue
        seen.add(href_abs)
        links.append(href_abs)
        if len(links) >= n:
            break
    return links

def extract_title(soup: BeautifulSoup):
    """Return a title for the page, never an empty string.

    Preference order: the text of the first ``<h1>``, then the document
    ``<title>``, then the fixed default ``"Immobile"``. Candidates that are
    empty or whitespace-only are skipped.
    """
    h1 = soup.select_one("h1")
    h1_text = h1.get_text(strip=True) if h1 else ""
    if h1_text:
        return h1_text

    # .string is None when <title> has no single text child.
    page_title = soup.title.string if soup.title else None
    page_title = page_title.strip() if page_title else ""
    if page_title:
        return page_title

    return "Immobile"

def extract_gallery_images(soup: BeautifulSoup, base_for_urls: str):
    """Collect image URLs from every ``.elementor-widget-pp-image-gallery``.

    Within each gallery widget three sources are scanned:

    1. ``<img>`` tags: ``data-src``, then ``data-lazy-src``, then ``src``;
       if none is set, the first URL of ``srcset``;
    2. ``<a href>`` values that end in an image extension;
    3. any string attribute of any descendant tag that ends in an image
       extension.

    Args:
        soup: Parsed property page.
        base_for_urls: Base URL used to resolve relative references.

    Returns:
        Absolute URLs in discovery order, without duplicates. Inline
        ``data:`` URIs (lazy-load placeholders) are left out because they
        cannot be downloaded over HTTP.
    """
    image_url_re = re.compile(r"\.(jpe?g|png|webp|gif)(\?.*)?$", re.I)

    candidates = []
    for gallery in soup.select(".elementor-widget-pp-image-gallery"):
        for img in gallery.select("img"):
            cand = img.get("data-src") or img.get("data-lazy-src") or img.get("src")
            if not cand and img.get("srcset"):
                # srcset is "url descriptor, url descriptor, ...": keep the first URL.
                cand = img.get("srcset").split(",")[0].strip().split(" ")[0]
            if cand:
                candidates.append(cand)
        for a in gallery.select("a[href]"):
            href = a["href"]
            if image_url_re.search(href):
                candidates.append(href)
        for tag in gallery.find_all(True):
            for val in tag.attrs.values():
                # Multi-valued attributes (e.g. class) are lists and are skipped.
                if isinstance(val, str) and image_url_re.search(val):
                    candidates.append(val)

    out, seen = [], set()
    for cand in candidates:
        u_abs = urljoin(base_for_urls, cand)
        if u_abs.lower().startswith("data:"):
            continue
        if u_abs not in seen:
            seen.add(u_abs)
            out.append(u_abs)
    return out

def is_divider_like(tag):
    """Tell whether *tag* is a visual divider.

    A node counts as a divider when it is an ``<hr>`` or when its joined
    class names contain one of the divider markers. Nodes without a
    ``name`` (or with an empty one) are never dividers.
    """
    tag_name = getattr(tag, "name", None)
    if not tag_name:
        return False
    if tag_name.lower() == "hr":
        return True

    class_attr = " ".join(tag.get("class", []))
    for marker in ("elementor-divider", "divider", "elementor-divider__separator"):
        if marker in class_attr:
            return True
    return False

# Tags that end the section being scanned.
_HEADING_TAGS = frozenset({"h1", "h2", "h3", "h4", "h5", "h6"})


def _first_paragraph_text(start):
    """Return the text of the first non-empty ``<p>`` after *start*, or ``""``.

    The scan follows document order and gives up at the next heading.
    """
    for tag in start.find_all_next():
        name = (getattr(tag, "name", "") or "").lower()
        if name in _HEADING_TAGS:
            break
        if name == "p":
            text = tag.get_text(separator="\n", strip=True)
            if text:
                return re.sub(r"\n{3,}", "\n\n", text).strip()
    return ""


def extract_description(soup: BeautifulSoup):
    """Extract the description paragraph of a property page.

    For each ``<h2>``/``<h3>`` whose text contains "descrizione"
    (case-insensitive):

    1. look for a divider (see ``is_divider_like``) between that heading and
       the next heading; if there is one, take the first non-empty ``<p>``
       that follows it;
    2. otherwise, or if that gave nothing, take the first non-empty ``<p>``
       that follows the heading itself.

    The first heading that yields text wins; a matching heading with no
    usable paragraph no longer prevents later matching headings from being
    tried.

    Returns:
        The paragraph text (line breaks preserved), or ``""`` if none found.
    """
    for hdr in soup.find_all(["h2", "h3"]):
        if "descrizione" not in hdr.get_text(" ", strip=True).lower():
            continue

        divider = None
        for tag in hdr.find_all_next():
            name = (getattr(tag, "name", "") or "").lower()
            if name in _HEADING_TAGS:
                break
            if is_divider_like(tag):
                divider = tag
                break

        text = _first_paragraph_text(divider) if divider is not None else ""
        if not text:
            # No divider, or nothing after it: fall back to the heading itself.
            text = _first_paragraph_text(hdr)
        if text:
            return text
    return ""

def download_file(url, dest_path, retries=2):
    """Stream *url* to *dest_path*, retrying on failure.

    The body is written to ``<dest_path>.part`` and moved into place only
    after the whole response has been received, so a failed or interrupted
    download never leaves a truncated file at *dest_path*.

    Args:
        url: Resource to download through the shared session.
        dest_path: Final location of the file.
        retries: Extra attempts after the first one; one second is waited
            between attempts.

    Returns:
        ``True`` on success, ``False`` once every attempt has failed.
    """
    tmp_path = f"{dest_path}.part"
    for attempt in range(retries + 1):
        try:
            with session.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()
                with open(tmp_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            os.replace(tmp_path, dest_path)
            return True
        except Exception:
            # Deliberately broad: this is a best-effort download and the
            # caller only needs to know whether the file was obtained.
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # nothing was written, or it is already gone
            if attempt < retries:
                time.sleep(1)
    return False

# ====== Robust download controls ======
# Output widget that _show_download_controls clears and renders the ZIP
# download controls into.
download_area = widgets.Output()

def _voila_prefix():
    root = os.environ.get("JUPYTERHUB_SERVICE_PREFIX", "/")
    if not root.endswith("/"):
        root += "/"
    return root

def _show_download_controls(zip_path: str):
    """Render the download controls for *zip_path* inside ``download_area``.

    The area is cleared first, then it shows:

    * an info line with the archive name and its size in MB;
    * a ``FileDownload`` button when the installed ``ipywidgets`` exposes
      that widget, otherwise a notice that the fallback is being used;
    * in every case, an HTML link whose ``href`` is a ``data:`` URI holding
      the whole archive base64-encoded (the archive is read fully into
      memory to build it).

    Args:
        zip_path: Path of the ZIP archive to offer for download.
    """
    with download_area:
        download_area.clear_output()
        name = os.path.basename(zip_path)
        size_mb = os.path.getsize(zip_path) / (1024*1024)
        print(f"[INFO] ZIP: {name} ({size_mb:.2f} MB)")

        if hasattr(widgets, "FileDownload"):
            def _zip_bytes():
                # Read lazily: only when the button asks for the data.
                with open(zip_path, "rb") as f:
                    return f.read()
            btn = widgets.FileDownload(
                data=_zip_bytes,
                filename=name,
                description="⬇️ Scarica ZIP",
                button_style="primary",
                icon="download"
            )
            display(btn)
        else:
            print("[INFO] ipywidgets<8: uso fallback senza FileDownload")

        # Self-contained link: the archive travels inside the page itself.
        with open(zip_path, "rb") as f:
            b64 = base64.b64encode(f.read()).decode("ascii")
        html_data = (
            f'<p><a download="{name}" href="data:application/zip;base64,{b64}">'
            f'⬇️ Scarica ZIP</a></p>'
        )
        display(HTML(html_data))

# ====== Interface widgets ======
# Number of listings to scrape: integer input bounded to 1-50, default 5.
n_items_widget = widgets.BoundedIntText(value=5, min=1, max=50, step=1, description="Annunci:")
# Button that starts the scraping run.
btn_start = widgets.Button(description="Avvia scraping", button_style="success")
# Output widget that run_scraper clears and prints its progress into.
out = widgets.Output()

def run_scraper(b):
    """Button callback: scrape the listings, save them and offer a ZIP.

    Everything printed here goes to the ``out`` widget, which is cleared
    first. The run:

    1. creates ``./milanocasa_export_<timestamp>``;
    2. collects as many property links from ``BASE_LIST_URL`` as the
       ``n_items_widget`` value asks for;
    3. for each link creates ``<NN>_<slug-of-title>/`` containing
       ``testo.txt`` (the description, possibly empty) and the gallery images
       as ``img_<NNN><ext>``;
    4. writes ``manifest.json`` with one record per link (url, title, folder,
       downloaded images, error message if any);
    5. zips the export folder and shows the download controls.

    A failure on one property is recorded in its manifest entry and does not
    stop the run. ``DELAY_SEC`` is waited after every property, failed or
    not, so an erroring server is not hit with back-to-back requests.

    Args:
        b: The clicked button (unused).
    """
    out.clear_output()
    with out:
        n_items = n_items_widget.value
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        root_dir = ensure_dir(f"./milanocasa_export_{timestamp}")
        manifest = []

        links = extract_last_n_property_links(BASE_LIST_URL, n_items)
        print(f"Trovati {len(links)} link. Avvio download...\n")

        for idx, url in enumerate(links, 1):
            print(f"[{idx}/{len(links)}] {url}")
            # Filled in step by step so a failure keeps what was gathered so far.
            item = {"url": url, "images": [], "title": None, "folder": None, "error": None}
            try:
                soup = fetch_soup(url)
                title = extract_title(soup)
                item["title"] = title
                folder_name = f"{idx:02d}_{slugify(title)}"
                item["folder"] = folder_name
                folder_path = ensure_dir(os.path.join(root_dir, folder_name))

                description = extract_description(soup)
                with open(os.path.join(folder_path, "testo.txt"), "w", encoding="utf-8") as f:
                    f.write(description or "")

                img_urls = extract_gallery_images(soup, url)
                for j, img_url in enumerate(img_urls, 1):
                    # Extension from the URL path, ".jpg" if it has none,
                    # capped at 5 characters (dot included).
                    ext = os.path.splitext(urlparse(img_url).path)[1] or ".jpg"
                    ext = ext[:5]
                    img_name = f"img_{j:03d}{ext}"
                    img_dest = os.path.join(folder_path, img_name)
                    if download_file(img_url, img_dest):
                        item["images"].append({"url": img_url, "file": img_name})
            except Exception as e:
                # Per-item boundary: record the problem and move on.
                item["error"] = str(e)
                print("  -> ERRORE:", e)
            finally:
                time.sleep(DELAY_SEC)
            manifest.append(item)

        with open(os.path.join(root_dir, "manifest.json"), "w", encoding="utf-8") as f:
            json.dump(manifest, f, ensure_ascii=False, indent=2)

        zip_path = shutil.make_archive(root_dir, "zip", root_dir)
        print("\nPronto!")
        print("Cartella:", root_dir)
        print("ZIP:", zip_path)
        _show_download_controls(zip_path)

# Run the scraper whenever the start button is clicked.
btn_start.on_click(run_scraper)

# Stack the UI vertically: item count, start button, progress log, download controls.
display(widgets.VBox([n_items_widget, btn_start, out, download_area]))