In [1]:
import time
import csv
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Configure Selenium WebDriver
driver = webdriver.Chrome()
url = "https://www.realestate.com.kh/map/?property_type=residential&search_type=sale"
driver.get(url)

# Initialize CSV file with additional fields
with open('realestate_data_extended.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow([
        'ID', 'Title', 'Price', 'Location', 'Bedrooms', 'Bathrooms', 
        'Floor Area', 'Land Area', 'Floor Number', 'Total Floors', 
        'Unit Number', 'Property Type', 'URL'
    ])

    page = 1
    while True:
        print(f"Processing Page {page}...")

        # Scroll to load all listings
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Parse listings
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        listings = soup.select('div.listingCard')

        for listing in listings:
            try:
                # Extract ID (try data-id attribute or from URL)
                id_attr = listing.get('data-id')
                rel_url = listing.select_one('a.listingCard')['href']
                if not id_attr:
                    id_match = re.search(r'/(\d+)/', rel_url)
                    id_attr = id_match.group(1) if id_match else "N/A"
                full_url = f"https://www.realestate.com.kh{rel_url}"

                # Extract basic info from listing card
                title = listing.select_one('h2.listingCard-title').text.strip()
                price = listing.select_one('div.listingCard-price').text.strip() if listing.select_one('div.listingCard-price') else "N/A"
                location = listing.select_one('div.listingCard-location').text.strip() if listing.select_one('div.listingCard-location') else "N/A"
                
                features = listing.select('div.listingCard-feature')
                bed = bath = "N/A"
                for feature in features:
                    text = feature.text.strip()
                    if 'bed' in text.lower(): bed = text
                    elif 'bath' in text.lower(): bath = text
                
                prop_type = listing.select_one('div.listingCard-propertyType').text.strip() if listing.select_one('div.listingCard-propertyType') else "N/A"

                # Now visit the detail page to get additional information
                driver.execute_script(f"window.open('{full_url}', '_blank');")
                driver.switch_to.window(driver.window_handles[1])
                time.sleep(3)  # Allow detail page to load
                
                # Initialize additional fields
                floor_area = land_area = floor_number = total_floors = unit_number = "N/A"
                
                try:
                    detail_soup = BeautifulSoup(driver.page_source, 'html.parser')
                    
                    # Extract from property details section
                    details = detail_soup.select('div.property-detail-feature')
                    for detail in details:
                        label = detail.select_one('span.label').text.strip().lower() if detail.select_one('span.label') else ""
                        value = detail.select_one('span.value').text.strip() if detail.select_one('span.value') else ""
                        
                        if 'size' in label or 'sqm' in label or 'sqft' in label:
                            floor_area = value
                        elif 'land' in label:
                            land_area = value
                        elif 'floor' in label and 'number' in label:
                            floor_number = value
                        elif 'total floors' in label:
                            total_floors = value
                        elif 'unit' in label:
                            unit_number = value
                    
                    # Close the detail tab
                    driver.close()
                    driver.switch_to.window(driver.window_handles[0])
                    
                except Exception as e:
                    print(f"Error processing detail page: {e}")
                    driver.close()
                    driver.switch_to.window(driver.window_handles[0])

                writer.writerow([
                    id_attr, title, price, location, bed, bath, 
                    floor_area, land_area, floor_number, total_floors, 
                    unit_number, prop_type, full_url
                ])
                
            except Exception as e:
                print(f"Error processing listing: {e}")
                if len(driver.window_handles) > 1:
                    driver.close()
                    driver.switch_to.window(driver.window_handles[0])

        # Pagination
        try:
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.pagination-next'))
            )
            if "disabled" in next_button.get_attribute('class'):
                break
            next_button.click()
            page += 1
            time.sleep(3)
        except Exception:
            break

driver.quit()
print("Scraping completed with extended details!")

Processing Page 1...
Scraping completed with extended details!
