## 1. scraping one property using selenium

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from webdriver_manager.firefox import GeckoDriverManager

# URL to scrape
urls = [
    "https://immovlan.be/en/detail/villa/for-sale/9051/sint-denijs-westrem/rbu55821",
]

# Firefox options (headless)
options = Options()
options.add_argument("--headless")

# Loop through URLs
for url in urls:
    driver = None
    try:
        # Launch Firefox
        service = Service(GeckoDriverManager().install())
        driver = webdriver.Firefox(service=service, options=options)
        driver.get(url)

        # -----------------------
        # Extract property ID
        # -----------------------
        try:
            property_id = url.split('/')[-1]
        except:
            property_id = None

        # -----------------------
        # Extract postal code & locality
        # -----------------------
        try:
            city_tag = driver.find_element(By.CSS_SELECTOR, ".city-line")
            city_text = city_tag.text.strip()
            postal_code = city_text.split()[0]
            locality_name = " ".join(city_text.split()[1:])
        except:
            postal_code = None
            locality_name = None

        # -----------------------
        # Extract price
        # -----------------------
        try:
            price_tag = driver.find_element(By.CSS_SELECTOR, ".detail__header_price_data")
            price = price_tag.text.strip()
        except:
            price = None

        # -----------------------
        # Extract property type & subtype
        # -----------------------
        try:
            type_tag = driver.find_element(By.CSS_SELECTOR, ".detail__header_title_main")
            type_words = type_tag.text.strip().split()
            property_type = type_words[0]
            subtype = " ".join(type_words[1:]) if len(type_words) > 1 else ""
        except:
            property_type = None
            #subtype = None
            

        # -----------------------
        # Print results
        # -----------------------
        print("Property ID:", property_id)
        print("Postal Code:", postal_code)
        print("Locality:", locality_name)
        print("Price:", price)
        print("Type:", property_type)
        print("Subtype:", subtype)
        print("-" * 40)

    except:
        print("Error loading page:", url)
    finally:
        if driver:
            driver.quit()

Property ID: rbu55821
Postal Code: 9051
Locality: Sint-Denijs-Westrem
Price: 860 000 €
Type: Villa
Subtype: for sale - Sint-Denijs-Westrem RBU55821
----------------------------------------


## 2. A property scraping function using selenium

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.support.ui import WebDriverWait    # added
from selenium.webdriver.support import expected_conditions as EC #added
import csv  # <-- added for saving results


def scrape_properties(urls, output_file="properties.csv"):  # <-- added function definition
    # Firefox options (headless)
    options = Options()
    options.add_argument("--headless")

    # Prepare CSV file
    with open(output_file, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["URL", "Terrace", "Garden", "Garden area (m²)", "Property ID", "Postal Code", "Locality", "Price", "Type"])

        # Loop through URLs
        for url in urls:
            driver = None
            try:
                # Launch Firefox
                service = Service(GeckoDriverManager().install())
                driver = webdriver.Firefox(service=service, options=options)
                driver.get(url)

                # ---- Handle cookie consent automatically ----
                try:
                    # Wait for the "Agree and close" button to be clickable
                    accept_button = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.ID, "didomi-notice-agree-button"))
                    )
                    accept_button.click()
                    print("Cookies accepted.")
                except Exception as e:
                    print("No cookie popup found or could not click the accept button:", e)
                # ---- End cookie handling ----

                try:
                    terrace_tag = driver.find_element(By.CSS_SELECTOR, "h4:contains('Terrace') + p")
                    terrace_text = terrace_tag.text.strip()
                    terrace = 1 if "yes" in terrace_text.lower() else 0
                except:
                    terrace = None 

                # GARDEN: 1 = Yes, 0 = No
                try:
                    garden_blocks = driver.find_elements(By.CSS_SELECTOR, "div")
                    garden = 0  # default
                    for block in garden_blocks:
                        try:
                            h4_text = block.find_element(By.CSS_SELECTOR, "h4").text.strip().lower()
                            if "garden" == h4_text:  # exact match for Garden
                                garden_text = block.find_element(By.CSS_SELECTOR, "p").text.strip()
                                garden = 1 if "yes" in garden_text.lower() else 0
                                break
                        except:
                            continue
                except:
                    garden = 0

                # GARDEN AREA (m²): integer or None
                try:
                    garden_area = None
                    for block in garden_blocks:
                        try:
                            h4_text = block.find_element(By.CSS_SELECTOR, "h4").text.strip().lower()
                            if "surface garden" in h4_text:
                                garden_area_text = block.find_element(By.CSS_SELECTOR, "p").text.strip()
                                garden_area = int(garden_area_text.replace("m²", "").replace("m", "").strip())
                                break
                        except:
                            continue
                except:
                    garden_area = None

                # -----------------------
                # Extract property ID
                # -----------------------
                try:
                    property_id = url.split('/')[-1]
                except:
                    property_id = None

                # -----------------------
                # Extract postal code & locality
                # -----------------------
                try:
                    city_tag = driver.find_element(By.CSS_SELECTOR, ".city-line")
                    city_text = city_tag.text.strip()
                    postal_code = city_text.split()[0]
                    locality_name = " ".join(city_text.split()[1:])
                except:
                    postal_code = None
                    locality_name = None

                # -----------------------
                # Extract price
                # -----------------------
                try:
                    price_tag = driver.find_element(By.CSS_SELECTOR, ".detail__header_price_data")
                    price = price_tag.text.strip()
                except:
                    price = None

                # -----------------------
                # Extract property type & subtype
                # -----------------------
                try:
                    type_tag = driver.find_element(By.CSS_SELECTOR, ".detail__header_title_main")
                    type_words = type_tag.text.strip().split()
                    property_type = type_words[0]
                except:
                    property_type = None

                # -----------------------
                # Print results
                # -----------------------
                print("Terrace:", terrace)
                print("Garden:", garden)
                print("Garden area (m²):", garden_area)
                print("Property ID:", property_id)
                print("Postal Code:", postal_code)
                print("Locality:", locality_name)
                print("Price:", price)
                print("Type:", property_type)
                print("-" * 40)

                # Write row to CSV file
                writer.writerow([url, terrace, garden, garden_area, property_id, postal_code, locality_name, price, property_type])

            except:
                print("Error loading page:", url)
            finally:
                if driver:
                    driver.quit()


# Example use:
urls = [
    "https://immovlan.be/en/detail/villa/for-sale/9051/sint-denijs-westrem/rbu55821", "https://immovlan.be/en/detail/residence/for-sale/4900/spa/vbd48417","https://immovlan.be/en/detail/apartment/for-sale/9000/gent/rbu59338"
]
scrape_properties(urls)  # <-- added function call


Cookies accepted.
Terrace: None
Garden: 1
Garden area (m²): 470
Property ID: rbu55821
Postal Code: 9051
Locality: Sint-Denijs-Westrem
Price: 860 000 €
Type: Villa
----------------------------------------
Cookies accepted.
Terrace: None
Garden: 1
Garden area (m²): 472
Property ID: vbd48417
Postal Code: 4900
Locality: Spa
Price: 199 000 €
Type: Residence
----------------------------------------
Cookies accepted.
Terrace: None
Garden: 0
Garden area (m²): None
Property ID: rbu59338
Postal Code: 9000
Locality: Gent
Price: 325 000 €
Type: Apartment
----------------------------------------


## 3. Collect apartment URLs, including new construction apartments, based on keywords.

In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException
from webdriver_manager.firefox import GeckoDriverManager
import csv
import time

# -----------------------
# Handle cookies popup
# -----------------------
def handle_cookies(driver, timeout=10):
    selectors = [
        "#onetrust-accept-btn-handler",
        "#didomi-notice-agree-button",
    ]
    for selector in selectors:
        try:
            cookie_btn = WebDriverWait(driver, timeout).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
            )
            try:
                cookie_btn.click()
                print(f"Cookies accepted ({selector})")
                return
            except ElementClickInterceptedException:
                driver.execute_script("arguments[0].scrollIntoView(true);", cookie_btn)
                time.sleep(0.5)
                cookie_btn.click()
                print(f"Cookies accepted after scrolling ({selector})")
                return
        except TimeoutException:
            continue
    print("No cookie popup found.")

# -----------------------
# Collect apartment URLs with precise filter
# -----------------------
def collect_apartment_urls(search_url, output_file="apartment_urls.csv"):
    """
    Collect apartment URLs, including new construction apartments, based on keywords.
    """
    options = Options()
    options.headless = True
    service = Service(GeckoDriverManager().install())
    driver = webdriver.Firefox(service=service, options=options)
    all_urls = set()

    # Keywords for apartment-related listings
    apartment_keywords = [
        "/apartment/",
        "groundfloor",
        "residence",
        "penthouse",
        "studio",
        "duplex",
        "loft",
        "triplex",
        "isnewconstruction=only",
        "/projectdetail/"
    ]

    try:
        driver.get(search_url)
        handle_cookies(driver)
        time.sleep(2)

        while True:
            print("Scraping current page for apartment URLs...")

            # Wait for property cards
            try:
                WebDriverWait(driver, 20).until(
                    EC.presence_of_all_elements_located(
                        (By.CSS_SELECTOR, "article.list-view-item h2.card-title a")
                    )
                )
            except TimeoutException:
                print("No property links found on this page. Skipping...")
                continue

            # Scroll for lazy-loading
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

            # Collect apartment links
            links = driver.find_elements(By.CSS_SELECTOR, "article.list-view-item h2.card-title a")
            for link in links:
                href = link.get_attribute("href")
                if href and any(keyword in href.lower() for keyword in apartment_keywords):
                    all_urls.add(href.split("?")[0])

            print(f"Found {len(links)} links on this page (total apartments so far: {len(all_urls)})")

            # Go to next page if available
            try:
                next_button = driver.find_element(
                    By.XPATH, "//a[contains(@class,'pagination__next') and not(contains(@class,'disabled'))]"
                )
                driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
                time.sleep(1)
                next_button.click()
                time.sleep(3)
            except:
                print("Reached the last page.")
                break

    finally:
        driver.quit()

    # Save URLs to CSV
    with open(output_file, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["URL"])
        for u in sorted(all_urls):
            writer.writerow([u])

    print(f"Saved {len(all_urls)} unique apartment URLs to '{output_file}'")
    return list(sorted(all_urls))


# -----------------------
# Example usage
# -----------------------
if __name__ == "__main__":
    search_url = (
        "https://immovlan.be/en/real-estate?transactiontypes=for-sale,in-public-sale&propertytypes=apartment&propertysubtypes=apartment,penthouse,studio,ground-floor,duplex,loft,triplex&towns=4900-spa&noindex=1"
    )
    apartment_urls = collect_apartment_urls(search_url)
    print(f"Total collected apartment URLs: {len(apartment_urls)}")
    for u in apartment_urls[:]:
        print(u)


No cookie popup found.
Scraping current page for apartment URLs...
Found 20 links on this page (total apartments so far: 20)
Reached the last page.
Saved 20 unique apartment URLs to 'apartment_urls.csv'
Total collected apartment URLs: 20
https://immovlan.be/en/detail/apartment/for-sale/4900/spa/vbb82041
https://immovlan.be/en/detail/apartment/for-sale/4900/spa/vbc15133
https://immovlan.be/en/detail/apartment/for-sale/4900/spa/vbc15134
https://immovlan.be/en/detail/apartment/for-sale/4900/spa/vbc15136
https://immovlan.be/en/detail/apartment/for-sale/4900/spa/vbc15137
https://immovlan.be/en/detail/apartment/for-sale/4900/spa/vbc15150
https://immovlan.be/en/detail/apartment/for-sale/4900/spa/vbc15158
https://immovlan.be/en/detail/apartment/for-sale/4900/spa/vbc53316
https://immovlan.be/en/detail/apartment/for-sale/4900/spa/vbc63045
https://immovlan.be/en/detail/apartment/for-sale/4900/spa/vbc97369
https://immovlan.be/en/detail/apartment/for-sale/4900/spa/vbd26412
https://immovlan.be/en/de

## 4. collecting URL of 5 pages