# Install dependencies

In [None]:
!pip install unidecode selenium webdriver_manager
!apt-get update
!apt-get install -y chromium-chromedriver

Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Collecting selenium
  Downloading selenium-4.35.0-py3-none-any.whl.metadata (7.4 kB)
Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting typing_extensions~=4.14.0 (from selenium)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.

# Load more

In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# --- RELEVANT CHANGES ARE HERE ---

def load_full_page_headless(url, clicks_to_perform, output_filename='guland_hanoi_fully_loaded.html'):
    """
    Automates a HEADLESS browser to click the 'Show More' button and save the HTML.

    Args:
        url: Page to open in headless Chrome.
        clicks_to_perform: Maximum number of times to click the button. Clicking
            stops earlier if the button does not become clickable within 10
            seconds or any other error occurs.
        output_filename: Path the final page source is written to (UTF-8).

    Returns:
        None. The page source is written to `output_filename`.
    """
    print("Initializing headless Chrome browser...")

    # 1. Setup Chrome Options for Headless Mode
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # This is the primary flag for headless mode
    options.add_argument('--no-sandbox') # Required for running in a Colab/Linux root environment
    options.add_argument('--disable-dev-shm-usage') # Overcomes limited resource problems in Docker/Colab

    # 2. Initialize the Driver with the new options
    # When running in Colab after the apt-get install, Selenium finds the driver automatically.
    driver = webdriver.Chrome(options=options)

    # Everything after the browser starts runs under try/finally so the Chrome
    # process is shut down even if navigation or the file write fails.
    try:
        print(f"Navigating to {url}...")
        driver.get(url)

        show_more_button_id = "btn-load-more"
        # The wait object is stateless between calls, so one instance serves every iteration.
        wait = WebDriverWait(driver, 10)

        for i in range(clicks_to_perform):
            try:
                button = wait.until(EC.element_to_be_clickable((By.ID, show_more_button_id)))

                # 3. Use a JavaScript click for better reliability in headless mode
                driver.execute_script("arguments[0].click();", button)

                print(f"Clicked 'Show More' button {i + 1}/{clicks_to_perform} times...")
                time.sleep(1.5) # Wait for new content to load

            except TimeoutException:
                print("Could not find the 'Show More' button. Assuming all content is loaded.")
                break
            except Exception as e:
                # Best effort: keep whatever has been loaded so far and still save it below.
                print(f"An unexpected error occurred: {e}")
                break

        print("\nFinished clicking. Saving the fully loaded HTML...")

        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(driver.page_source)

        print(f"Successfully saved the complete HTML to '{output_filename}'")
    finally:
        driver.quit()


# --- Example of how to call it ---
# Run the headless loader against the guland.vn Hanoi page.
if __name__ == "__main__":
    target_url = 'https://guland.vn/bat-dong-san/ha-noi'
    # Upper bound on clicks: the loader stops earlier once the button can no longer be found.
    number_of_clicks = 10000

    load_full_page_headless(target_url, number_of_clicks)

Initializing headless Chrome browser...
Navigating to https://guland.vn/bat-dong-san/ha-noi...
Clicked 'Show More' button 1/1000 times...
Clicked 'Show More' button 2/1000 times...
Clicked 'Show More' button 3/1000 times...
Clicked 'Show More' button 4/1000 times...
Clicked 'Show More' button 5/1000 times...
Clicked 'Show More' button 6/1000 times...
Clicked 'Show More' button 7/1000 times...
Clicked 'Show More' button 8/1000 times...
Clicked 'Show More' button 9/1000 times...
Clicked 'Show More' button 10/1000 times...
Clicked 'Show More' button 11/1000 times...
Clicked 'Show More' button 12/1000 times...
Clicked 'Show More' button 13/1000 times...
Clicked 'Show More' button 14/1000 times...
Clicked 'Show More' button 15/1000 times...
Clicked 'Show More' button 16/1000 times...
Clicked 'Show More' button 17/1000 times...
Clicked 'Show More' button 18/1000 times...
Clicked 'Show More' button 19/1000 times...
Clicked 'Show More' button 20/1000 times...
Clicked 'Show More' button 21/1000

# Scraping

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
from geopy.geocoders import ArcGIS # CHANGED: Imported ArcGIS instead of Nominatim
from geopy.extra.rate_limiter import RateLimiter

def _text_or_na(tag):
    """Return the whitespace-stripped text of `tag`, or 'N/A' when `tag` is missing."""
    return tag.get_text(strip=True) if tag else 'N/A'


def _parse_listing(listing):
    """Extract title, price, area, address, description and link from one listing element."""
    # The title link sits inside a wrapper div; either level may be absent.
    title_box = listing.find('div', class_='c-sdb-card__tle')
    title_tag = title_box.find('a') if title_box else None

    address_tag = listing.find('div', class_='data-type-adr')
    address = 'N/A'
    if address_tag:
        # Some text fragments are bare separators (","); joining them verbatim
        # produced ", ,, " runs in the address, so drop separator-only pieces.
        fragments = (part.strip(' ,') for part in address_tag.stripped_strings)
        address = ', '.join(fragment for fragment in fragments if fragment) or 'N/A'

    return {
        'Title': _text_or_na(title_tag),
        'Price': _text_or_na(listing.find('span', class_='data-color-1')),
        'Area': _text_or_na(listing.find('span', class_='data-size-lg')),
        'Address': address,
        'Description': _text_or_na(listing.find('div', class_='c-sdb-card__exc')),
        'Link': title_tag.get('href', 'N/A') if title_tag else 'N/A',
    }


def scrape_and_geocode_arcgis(file_path, output_filename='guland_hanoi_listings_arcgis1.csv'):
    """
    Parses a local HTML file, scrapes property listings,
    geocodes the addresses USING ARCGIS, and saves the data to a CSV file.

    Args:
        file_path: Path of the saved HTML page to parse.
        output_filename: Path of the CSV file to write.

    Returns:
        The resulting pandas DataFrame (columns Title, Price, Area, Address,
        Description, Link, Latitude, Longitude), or None when the file is
        missing or no listing could be extracted.
    """
    print("Reading HTML file...")
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found. Make sure it's in the same folder as the script.")
        return None

    print("Parsing HTML content...")
    soup = BeautifulSoup(html_content, 'html.parser')

    listings = soup.find_all('div', class_='l-sdb-list__single')

    if not listings:
        print("Could not find any property listings. The HTML structure might have changed.")
        return None

    scraped_data = []
    print(f"Found {len(listings)} listings. Starting extraction...")

    for listing in listings:
        try:
            scraped_data.append(_parse_listing(listing))
        except Exception as e:
            # Best effort: one malformed card must not abort the whole run.
            print(f"Skipping a listing due to an error: {e}")

    print(f"Successfully extracted data for {len(scraped_data)} listings.")

    if not scraped_data:
        # An empty frame has no 'Address' column, so geocoding would raise KeyError.
        print("No listing could be extracted. Nothing to geocode or save.")
        return None

    print("Starting geocoding process using ArcGIS (this may take a while)...")
    df = pd.DataFrame(scraped_data)

    geolocator = ArcGIS(user_agent="real_estate_scraper_arcgis", timeout=10)

    # Use RateLimiter to avoid overwhelming the service.
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=0.5)

    # Many listings share an address, so each distinct address is looked up only once.
    coordinates = {}
    for address in df['Address'].unique():
        if address == 'N/A':
            continue
        location = geocode(f"{address}, Hanoi, Vietnam")
        if location:
            coordinates[address] = (location.latitude, location.longitude)

    # Addresses without a result (or 'N/A') get empty coordinates.
    df['Latitude'] = df['Address'].map(lambda address: coordinates.get(address, (None, None))[0])
    df['Longitude'] = df['Address'].map(lambda address: coordinates.get(address, (None, None))[1])

    print("Geocoding complete.")

    # --- Save to CSV ---
    try:
        # utf-8-sig is kept from the original call; it writes a BOM at the start of the file.
        df.to_csv(output_filename, index=False, encoding='utf-8-sig')
        print(f"Data successfully saved to '{output_filename}'")
    except Exception as e:
        print(f"Error saving to CSV: {e}")

    return df


# --- Main execution ---
# Parse the saved page, geocode the listings and write the CSV.
if __name__ == "__main__":
    # Ensure the HTML file is named correctly and is in the same directory.
    # This is the default filename written by load_full_page_headless.
    html_file = 'guland_hanoi_fully_loaded.html'
    scrape_and_geocode_arcgis(html_file)

Reading HTML file...
Parsing HTML content...
Found 24024 listings. Starting extraction...
Successfully extracted data for 24024 listings.
Starting geocoding process using ArcGIS (this may take a while)...




Geocoding complete.
Data successfully saved to 'guland_hanoi_listings_arcgis1.csv'


# Clean up

In [None]:
# Load the scraped listings from the CSV written by scrape_and_geocode_arcgis (the
# function keeps its DataFrame local) and drop rows with any missing value.
# Reading from disk keeps this cell re-runnable on a fresh kernel.
df = pd.read_csv('guland_hanoi_listings_arcgis1.csv').dropna()

In [None]:
# Rows whose price text contains "triệu" (Vietnamese for "million"); missing prices never match.
price_has_trieu = df['Price'].str.contains("triệu", na=False)
df[price_has_trieu]

Unnamed: 0,Title,Price,Area,Address,Description,Link,Latitude,Longitude
192,"Bán căn góc view sông đuống 82,1m2 3pn2wc tại ...",46 triệu,82m²,Đường Từ Ngã Tư Nhà Máy Ôtô 1/5 Đi Nhà Máy Ô T...,"Em chuyển nhượng căn góc diện tích 82,1 m² thô...",https://guland.vn/post/ban-can-goc-view-song-d...,21.084104,105.871990
193,48tr sở hữu căn góc view vin cổ loa trục 01 vi...,48 triệu,96m²,Đường Từ Ngã Tư Nhà Máy Ôtô 1/5 Đi Nhà Máy Ô T...,"Tìm chủ nhân mới cho căn góc diện tích 96,4m²...",https://guland.vn/post/48tr-so-huu-can-goc-vie...,21.084104,105.871990
194,"Bán căn góc 82,1m2 3pn2wc tại chung cư eurowin...",48 triệu,81m²,Đường Từ Ngã Tư Nhà Máy Ôtô 1/5 Đi Nhà Máy Ô T...,"Em chuyển nhượng căn góc diện tích 82,1 m² thô...",https://guland.vn/post/ban-can-goc-821m2-3pn2w...,21.084104,105.871990
196,"Đất đấu giá dư dụ, thanh oai rẻ nhất thị trường",65 triệu,77m²,"Xã Thanh Thùy, ,, Huyện Thanh Oai, ,, Hà Nội","Đấu giá Dư Dụ, Thanh Thùy, Thanh Oai\nDt 77,5m...",https://guland.vn/post/dat-dau-gia-du-du-thanh...,20.870540,105.806465
230,Cơ hội sở hữu lô đất 105m thôn đồi sen giá tốt...,33 triệu,105m²,"Đường Đoạn Đường 420 - Thị Trấn, ,, Xã Bình Yê...",Chủ cần tiền bán 106m Full thổ cư\nThôn Đồi Se...,https://guland.vn/post/co-hoi-so-huu-lo-dat-10...,21.040741,105.548152
...,...,...,...,...,...,...,...,...
23661,"Nhận booking toà f2, f6 căn hộ 1pn, 2pn, 3pn d...",130 triệu,166m²,"Đường Phạm Hùng, ,, Phường Dịch Vọng Hậu, ,, Q...",Căn hộ hạng sang 6* Sun Feliza Suites Cầu giấy...,https://guland.vn/post/nhan-booking-toa-f2-f6-...,21.034694,105.780455
23664,"Chính thức mở bán bảng giá căn hộ 1pn, 2pn, 3p...",130 triệu,166m²,"Đường Phạm Hùng, ,, Phường Dịch Vọng Hậu, ,, Q...","CHÍNH THỨC MỞ BÁN BẢNG GIÁ + MẶT BẰNG (1PN, 2P...",https://guland.vn/post/chinh-thuc-mo-ban-bang-...,21.034694,105.780455
23683,Bán nhanh khu đô thị hanoi garden city duplex ...,58 triệu,188m²,"Phường Thạch Bàn, ,, Quận Long Biên, ,, Hà Nội",Khu đô thị Garden City CH Duplex 188m² - full ...,https://guland.vn/post/ban-nhanh-khu-do-thi-ha...,21.018836,105.911297
23809,Cam kết rẻ nhất thị trường 2 ngủ/70m2 bán giá ...,44 triệu,70m²,"Đường Tố Hữu, ,, Phường La Khê, ,, Quận Hà Đôn...",Chính chủ tôi bán căn 2 ngủ toà The Pride Hải ...,https://guland.vn/post/cam-ket-re-nhat-thi-tru...,20.979617,105.763552


In [None]:
# Price column of the rows whose price text contains "tỷ" (Vietnamese for "billion").
price_has_ty = df['Price'].str.contains("tỷ", na=False)
df.loc[price_has_ty, 'Price']

Unnamed: 0,Price
0,25 tỷ
1,6.7 tỷ
2,4.98 tỷ
3,6.5 tỷ
4,11.8 tỷ
...,...
24019,37 tỷ
24020,138 tỷ
24021,35 tỷ
24022,189 tỷ


# Map

In [None]:
import folium

# folium refuses marker locations containing NaN, so only rows with both
# coordinates are plotted.
map_points = df.dropna(subset=['Latitude', 'Longitude'])

# Start centred on the fixed coordinates below at zoom level 13.
hanoi_map = folium.Map(location=[21.0278, 105.8342], zoom_start=13)
for index, row in map_points.iterrows():
    # Create a marker for each listing
    folium.Marker(
        location=[row['Latitude'], row['Longitude']],
        # The popup shows the listing title when the marker is clicked
        popup=row['Title'],
        # The tooltip shows the listing title on hover
        tooltip=row['Title']
    ).add_to(hanoi_map)

hanoi_map

# Scratch pad

In [None]:
# Reload the listings from the CSV written by scrape_and_geocode_arcgis.
df = pd.read_csv('guland_hanoi_listings_arcgis1.csv')

In [None]:
# Preview the first rows of the reloaded frame.
df.head()

Unnamed: 0,Title,Price,Area,Address,Description,Link,Latitude,Longitude
0,BÁN NHÀ LẠC LONG QUÂN – PHÂN LÔ – LÔ GÓC VỈA H...,25 tỷ,80m²,"Quận Tây Hồ, ,, Hà Nội",- Vị trí đắc địa khu phân lô vỉa hè otô trán...,https://guland.vn/post/ban-nha-lac-long-quan-p...,21.069448,105.810853
1,"Bán nhà Phố Vĩnh Tuy, Dương Văn Bé, 35m2 x 5 t...",6.7 tỷ,35m²,"Phường Hai Bà Trưng, ,, Hà Nội","- Vị trí nhà nằm ngay gần trường học Vĩnh Tuy,...",https://guland.vn/post/ban-nha-pho-vinh-tuy-du...,21.014789,105.848261
2,"HIẾM, DÂN XÂY ĐỘC LẬP 3PN, PHÚC LỢI, LONG BIÊN...",4.98 tỷ,40m²,"Phường Phúc Lợi, ,, Hà Nội","- Vị trí đẹp tại Phúc Lợi, Long Biên, sát cạnh...",https://guland.vn/post/hiem-dan-xay-doc-lap-3p...,21.045396,105.929435
3,CẦN BÁN CĂN NHÀ 46M_4 TẦNG_MT 4.0M ĐƯỜNG OTO K...,6.5 tỷ,46m²,"Đường Quốc Lộ 32, ,, Huyện Hoài Đức, ,, Hà Nội",🏡 CHÍNH CHỦ CẦN BÁN NHÀ 4 TẦNG TẠI ĐỨC THƯỢNG ...,https://guland.vn/post/can-ban-can-nha-46m-4-t...,21.072572,105.702753
4,SIÊU PHẨM PHÂN LÔ – ĐẠI TỪ HOÀNG MAI –SIÊU PHẨ...,11.8 tỷ,40m²,"Đường Đại Từ, ,, Quận Hoàng Mai, ,, Hà Nội",SIÊU PHẨM PHÂN LÔ – ĐẠI TỪ HOÀNG MAI – Ô TÔ SU...,https://guland.vn/post/sieu-pham-phan-lo-dai-t...,20.968615,105.827617
