In [25]:
# Install required packages
!pip install selenium pandas openpyxl webdriver-manager requests



In [None]:
import random
import re
import time
import urllib.parse
from datetime import datetime

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# ===== CONFIGURATION =====
# Search parameters for one scraping run; main() reads every key below.
CONFIG = {
    "district": "Colombo",     # district part of the search query
    "city": "Nugegoda",        # city part of the query and of the output file name
    "country": "Sri Lanka",    # country part of the search query
    "industry": "gyms",        # business type to search for
    "max_results": 50,         # cap on the links kept by get_business_links()
}

def setup_driver():
    """Start a headless Chrome WebDriver with automation hints suppressed.

    Returns:
        A running ``webdriver.Chrome`` instance. The caller owns it and must
        call ``quit()`` when finished.

    Raises:
        Exception: Whatever the driver raises while being configured; the
            browser is shut down before the error propagates.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        # execute_script() would only patch the document loaded right now and
        # the override would vanish on the first navigation. Registering the
        # script through CDP makes Chrome evaluate it before every new document.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
        )
    except Exception:
        # Do not leave an orphaned browser process behind if setup fails.
        driver.quit()
        raise
    return driver

def search_on_google_maps(driver, district, city, country, industry, *, wait_seconds=8):
    """Open the Google Maps search page for a business type in a location.

    Args:
        driver: Object exposing ``get(url)`` (a Selenium WebDriver).
        district: District name used in the query.
        city: City name used in the query.
        country: Country name used in the query.
        industry: Business type to search for.
        wait_seconds: Seconds to pause after navigation so the page can load.
            Values of zero or less skip the pause.

    Returns:
        Always ``True``.
    """
    search_query = f"{industry} in {city}, {district}, {country}"
    # safe="" also escapes "/", which quote() leaves alone by default; an
    # unescaped slash would split the query into extra URL path segments.
    encoded_query = urllib.parse.quote(search_query, safe="")
    maps_url = f"https://www.google.com/maps/search/{encoded_query}"
    driver.get(maps_url)
    if wait_seconds > 0:
        time.sleep(wait_seconds)
    return True

def scroll_and_load_results(driver, max_scrolls=20, pause_seconds=2):
    """Scroll the results feed until its height stops growing.

    Args:
        driver: Selenium WebDriver showing a search results page.
        max_scrolls: Upper bound on the number of scroll attempts.
        pause_seconds: Delay after each scroll before the height is re-read.

    Returns:
        None. Returns immediately when the page has no ``role="feed"`` element.
    """
    try:
        scrollable = driver.find_element(By.XPATH, '//div[@role="feed"]')
    except NoSuchElementException:
        # No results feed on this page, so there is nothing to scroll.
        return

    last_height = driver.execute_script("return arguments[0].scrollHeight", scrollable)
    for _ in range(max_scrolls):
        driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", scrollable)
        time.sleep(pause_seconds)
        new_height = driver.execute_script("return arguments[0].scrollHeight", scrollable)
        # An unchanged height means the last scroll loaded no further entries.
        if new_height == last_height:
            break
        last_height = new_height

def get_business_links(driver):
    """Collect unique business page links from the current results page.

    Args:
        driver: Selenium WebDriver showing a search results page.

    Returns:
        A list of distinct ``href`` values containing "/maps/place/", in the
        order they appear on the page, limited to ``CONFIG['max_results']``.
    """
    anchors = driver.find_elements(By.XPATH, '//a[contains(@href, "/maps/place/")]')
    hrefs = (anchor.get_attribute("href") for anchor in anchors)
    # dict.fromkeys removes duplicates while keeping page order, so the
    # truncation below keeps the first results instead of an arbitrary subset.
    unique_links = list(dict.fromkeys(href for href in hrefs if href))
    return unique_links[:CONFIG['max_results']]

def _read_field(driver, xpath, reader):
    """Apply *reader* to the first element matching *xpath*; '' if unavailable."""
    try:
        value = reader(driver.find_element(By.XPATH, xpath))
    except WebDriverException:
        # Best effort: a missing or stale element just leaves the field blank.
        return ''
    return value or ''


def extract_business_details(driver, business_url):
    """Load one business page and read its name, rating, phone and address.

    Args:
        driver: Selenium WebDriver used to load the page.
        business_url: URL of the business page.

    Returns:
        A dict with keys ``Name``, ``Phone``, ``Rating``, ``Address`` and
        ``Google_Maps_Link`` (fields that could not be read are empty
        strings), or ``None`` when no name was found.
    """
    details = {
        'Name': '',
        'Phone': '',
        'Rating': '',
        'Address': '',
        'Google_Maps_Link': business_url
    }

    driver.get(business_url)
    time.sleep(4)

    # Name
    details['Name'] = _read_field(driver, '//h1', lambda el: el.text).strip()

    # Rating: prefer the visible text, fall back to the aria-label.
    rating_text = _read_field(
        driver,
        '//span[contains(@aria-label,"stars")]',
        lambda el: el.text or el.get_attribute('aria-label'),
    )
    match = re.search(r'(\d+\.?\d*)', rating_text)
    if match:
        details['Rating'] = match.group(1)

    # Phone: taken from the tel: link target rather than the displayed text.
    phone_href = _read_field(
        driver,
        '//a[contains(@href, "tel:")]',
        lambda el: el.get_attribute('href'),
    )
    details['Phone'] = phone_href.replace("tel:", "").strip()

    # Address
    details['Address'] = _read_field(
        driver,
        '//button[@data-item-id="address"]',
        lambda el: el.text,
    ).strip()

    return details if details['Name'] else None

def _safe_name(text):
    """Replace characters that are illegal in file names or Excel sheet titles."""
    return re.sub(r'[\\/*?:\[\]<>|"]', "_", text)


def save_to_excel(business_data_list, city, industry):
    """Write the scraped records to ``<industry>_<city>.xlsx``.

    Args:
        business_data_list: List of record dicts; nothing is written when it
            is empty or ``None``.
        city: City name, used in the file name and sheet title.
        industry: Business type, used in the file name and sheet title.

    Returns:
        None.
    """
    if not business_data_list:
        print("❌ No data to save!")
        return

    df = pd.DataFrame(business_data_list)
    # Path separators, colons, brackets etc. would break the file path or be
    # rejected as a sheet title, so neutralise them once for both uses.
    base_name = _safe_name(f"{industry}_{city}")
    output_file = f"{base_name}.xlsx".replace(" ", "_")

    with pd.ExcelWriter(output_file, engine='openpyxl', mode='w') as writer:
        # Excel limits sheet titles to 31 characters.
        df.to_excel(writer, sheet_name=base_name[:31], index=False)

    print(f"✅ Data saved: {output_file} ({len(df)} records)")

def main():
    """Run the scrape described by CONFIG and save the results to Excel.

    Searches Google Maps, loads the result list, visits each business link
    and writes the collected records with ``save_to_excel``. The browser is
    always closed, even when an error occurs.
    """
    print(f"🎯 Scraping {CONFIG['industry']} in {CONFIG['city']}, {CONFIG['district']}...")
    driver = setup_driver()
    all_business_data = []

    try:
        search_on_google_maps(driver, CONFIG['district'], CONFIG['city'], CONFIG['country'], CONFIG['industry'])
        scroll_and_load_results(driver)
        links = get_business_links(driver)

        print(f"🔗 Found {len(links)} businesses")

        for i, link in enumerate(links, 1):
            print(f"[{i}/{len(links)}] Extracting...")
            try:
                details = extract_business_details(driver, link)
            except Exception as exc:
                # One broken page must not discard the records already
                # collected; report it and carry on with the next link.
                print(f"⚠️ Skipping {link}: {exc}")
                details = None
            if details:
                all_business_data.append(details)
            # Randomised pause between page loads.
            time.sleep(random.uniform(2, 4))

        save_to_excel(all_business_data, CONFIG['city'], CONFIG['industry'])

    finally:
        driver.quit()
        print("🔒 Browser closed")

# Run the scraper only when this module is the entry point, not on import.
if __name__ == "__main__":
    main()


🎯 Scraping gyms in Nugegoda, Colombo...
