# In [None]:  (Jupyter cell marker left over from the notebook export)
# Imports
import logging
import time
import random
from typing import List, Dict, Optional

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options


class ImmoweltScraper:
    """
    Immowelt production scraper using Selenium for JS rendering.

    Search-result pages are loaded in a Firefox instance, parsed with
    BeautifulSoup, and one record per listing link is collected in
    ``scraped_data``. Each record holds the listing's ``detail_url`` and a
    ``raw_info`` string taken from the listing's attributes (selectors are
    based on the Immowelt layout as of 2025).

    The scraper owns a browser process. Call ``close()`` when done, or use
    the instance as a context manager so the browser is closed even when an
    exception occurs::

        with ImmoweltScraper(headless=True) as scraper:
            scraper.run_scraper(urls)
            scraper.save_data("out.csv")
    """

    # CSS selectors for the per-listing covering link and its main image.
    LINK_SELECTOR = 'a[data-testid="card-mfe-covering-link-testid"]'
    IMAGE_SELECTOR = 'img[aria-label="Hauptbild"]'

    # Output formats understood by save_data().
    SUPPORTED_FORMATS = ("csv", "json", "excel")

    def __init__(self, delay_range: tuple = (2, 5), headless: bool = False):
        """
        Start a Firefox session and configure logging.

        Args:
            delay_range: ``(min, max)`` seconds to wait after loading a page;
                the actual wait is drawn uniformly from this range.
            headless: When true, start Firefox with the ``--headless`` flag.
        """
        self.delay_range = delay_range
        self.scraped_data: List[Dict] = []

        # Setup logging (basicConfig is a no-op if logging is already configured)
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

        # Setup Selenium
        options = Options()
        if headless:
            options.add_argument("--headless")
        self.driver = webdriver.Firefox(options=options)

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser on leaving the ``with`` block; never suppresses errors."""
        self.close()

    def scrape_page(self, url: str) -> "Optional[BeautifulSoup]":
        """
        Load ``url`` in the browser and return the rendered page as soup.

        Args:
            url: Address of the page to open.

        Returns:
            The parsed page, or ``None`` if loading or parsing failed (the
            failure is logged rather than raised so a multi-page run can
            continue with the next URL).
        """
        try:
            self.logger.info("Opening %s", url)
            self.driver.get(url)
            # Randomised pause after navigation: human-like wait before the
            # page source is read.
            time.sleep(random.uniform(*self.delay_range))
            return BeautifulSoup(self.driver.page_source, "html.parser")
        except Exception as e:
            self.logger.error("Failed to scrape %s: %s", url, e)
            return None

    def extract_data(self, soup: "BeautifulSoup") -> List[Dict]:
        """
        Extract one record per listing link found in ``soup``.

        ``raw_info`` is the alt text of the listing's main image when
        available, otherwise the link's ``title`` attribute. Links and images
        are matched by position, which is only trustworthy when both lists
        have the same length; if the counts differ, the alt texts are ignored
        for the whole page instead of being attributed to the wrong listing.

        Args:
            soup: Parsed search-result page.

        Returns:
            A list of ``{"detail_url": ..., "raw_info": ...}`` dicts, in
            document order of the links.
        """
        links = soup.select(self.LINK_SELECTOR)
        images = soup.select(self.IMAGE_SELECTOR)

        use_images = len(images) == len(links)
        if images and not use_images:
            self.logger.warning(
                "Found %d listing links but %d main images; ignoring image "
                "alt texts because positional pairing would be unreliable",
                len(links), len(images),
            )

        listings_data = []
        for i, link in enumerate(links):
            try:
                alt_text = images[i].get("alt", "") if use_images else ""
                listings_data.append({
                    "detail_url": link.get("href"),
                    "raw_info": alt_text or link.get("title", ""),
                })
            except Exception as e:
                # Best effort: one malformed card must not drop the whole page.
                self.logger.warning("Error parsing a listing: %s", e)

        self.logger.info("Extracted %d listings from page", len(listings_data))
        return listings_data

    def save_data(self, filename: str, fmt: str = "csv"):
        """
        Save scraped data to file.

        Args:
            filename: Destination path.
            fmt: One of ``"csv"``, ``"json"`` or ``"excel"`` (case-insensitive).

        Raises:
            ValueError: If ``fmt`` is not a supported format. Previously an
                unknown format wrote nothing yet still logged success.
        """
        fmt_key = fmt.lower()
        if fmt_key not in self.SUPPORTED_FORMATS:
            raise ValueError(
                f"Unsupported format {fmt!r}; expected one of "
                f"{', '.join(self.SUPPORTED_FORMATS)}"
            )

        if not self.scraped_data:
            self.logger.warning("No data to save")
            return

        df = pd.DataFrame(self.scraped_data)
        if fmt_key == "csv":
            df.to_csv(filename, index=False)
        elif fmt_key == "json":
            df.to_json(filename, orient="records", indent=2)
        else:  # "excel"
            df.to_excel(filename, index=False)
        self.logger.info("Data saved to %s", filename)

    def run_scraper(self, urls: List[str]):
        """
        Scrape every URL in ``urls`` and append the results to ``scraped_data``.

        Pages that fail to load are skipped; see ``scrape_page``.
        """
        self.logger.info("Starting scraper for %d URL(s)", len(urls))
        for url in urls:
            soup = self.scrape_page(url)
            if soup is not None:
                self.scraped_data.extend(self.extract_data(soup))
        self.logger.info(
            "Scraping complete. Total listings: %d", len(self.scraped_data)
        )

    def close(self):
        """Close Selenium browser"""
        self.driver.quit()


if __name__ == "__main__":
    # Search URL template; "{}" is filled with the page number.
    base_url_template = (
        "https://www.immowelt.de/classified-search"
        "?distributionTypes=Rent"
        "&estateTypes=House,Apartment"
        "&locations=AD08DE8634"
        "&projectTypes=New_Build,Flatsharing,Stock"
        "&page={}"
    )

    # Inclusive page range to scrape. Keep it small per run; raise end_page
    # to the full page count only when needed.
    start_page = 1
    end_page = 4

    # Create all page URLs
    urls = [base_url_template.format(p) for p in range(start_page, end_page + 1)]
    output_filename = f"immowelt_pages_{start_page}_{end_page}.csv"

    # Bigger delay helps avoid being blocked immediately
    scraper = ImmoweltScraper(delay_range=(20, 40), headless=False)
    try:
        scraper.run_scraper(urls)
        scraper.save_data(output_filename, fmt="csv")
        # save_data writes nothing when no listings were collected.
        saved = bool(scraper.scraped_data)
    finally:
        # Always shut the browser down, even if scraping or saving failed.
        scraper.close()

    if saved:
        print(f"✅ Saved {output_filename}")
    else:
        print("⚠️ No listings scraped; nothing was saved")
