In [2]:
# Install required packages
!pip install selenium pandas openpyxl webdriver-manager requests

Collecting selenium
  Downloading selenium-4.35.0-py3-none-any.whl.metadata (7.4 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting typing_extensions~=4.14.0 (from selenium)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.35.0-py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading webdriver_manager-4.0.

In [10]:
import pandas as pd
import time
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException
import urllib.parse
import re
from datetime import datetime

# ===== CONFIGURATION SECTION =====
# Search parameters read by main(); edit these to target another area/industry.
CONFIG = dict(
    district="Colombo",   # district used in the search query and sheet name
    city="Kotte",         # city used in the search query and output file name
    country="Sri Lanka",  # country appended to the search query
    industry="gyms",      # kind of business to search for
    max_results=10,       # upper bound on business links that get processed
    # NOTE(review): nothing in this notebook assigns or reads this key;
    # save_to_excel() derives its own file name from industry and city.
    output_file=None,
)

def setup_driver():
    """Create and return a headless Chrome WebDriver.

    The driver binary is first resolved through webdriver-manager. If that
    fails for any reason, Chrome is started without an explicit Service so
    Selenium locates a driver on its own.

    Returns:
        The started ``webdriver.Chrome`` instance.

    Raises:
        Exception: whatever the fallback ``webdriver.Chrome(...)`` call raises
            when no driver can be started at all.
    """
    print("Setting up Chrome WebDriver...")
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--disable-software-rasterizer')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl+C / kernel interrupt
        # still stops the cell, and say why the primary path was abandoned.
        print(f"webdriver-manager setup failed ({exc}); falling back to default Chrome driver")
        chrome_options.add_argument('--remote-debugging-port=9222')
        driver = webdriver.Chrome(options=chrome_options)

    # NOTE(review): execute_script runs in the currently loaded document only,
    # so this override presumably does not persist after driver.get() - confirm.
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    print("WebDriver setup completed")
    return driver

def force_full_maps(driver):
    """Switch a lite/preview Google Maps page to the full version.

    Args:
        driver: WebDriver whose current page should be checked.

    Returns:
        bool: True when a switch to the full version was attempted (via the
        "View larger map" link or by rewriting the URL); False when the page
        is not a lite/preview page, no switch was possible, or an error
        occurred. Every path returns a bool; no path falls through to None.
    """
    print("Checking for full Google Maps version...")
    try:
        current_url = driver.current_url
        if "lite" not in current_url and "preview" not in current_url:
            return False

        print("Lite mode detected, forcing full version...")
        try:
            driver.find_element(By.PARTIAL_LINK_TEXT, "View larger map").click()
        except Exception:
            # Link missing or not clickable: fall back to rewriting the URL,
            # which is only possible for "lite" URLs.
            current_url = driver.current_url
            if "lite" not in current_url:
                return False
            full_url = current_url.replace("/lite/", "/").replace("lite.", "")
            driver.get(full_url)
            time.sleep(5)
            print("URL modified to full version")
            return True

        time.sleep(5)
        print("Switched to full version")
        return True
    except Exception:
        # Best effort: a failure here must not abort the search.
        return False

def search_on_google_maps(driver, district, city, country, industry):
    """Open a Google Maps search for ``industry`` in the given location.

    Args:
        driver: WebDriver used to load the page.
        district, city, country: location parts of the search query.
        industry: kind of business to search for.

    Returns:
        bool: True once any of the known result containers is present on the
        page; False if none appears or an error occurs while loading.
    """
    search_query = f"{industry} in {city}, {district}, {country}"
    print(f"Searching for: {search_query}")
    # safe='' also escapes "/", which quote() leaves alone by default; an
    # unescaped slash (e.g. "cafes/bars") would split the query into extra
    # URL path segments.
    encoded_query = urllib.parse.quote(search_query, safe='')
    maps_url = f"https://www.google.com/maps/search/{encoded_query}"

    try:
        driver.get(maps_url)
        print("Loading Google Maps...")
        time.sleep(10)
        force_full_maps(driver)

        # Containers that indicate a rendered result list, tried in order.
        result_selectors = [
            '//div[@role="feed"]',
            '//div[@id="pane"]',
            '//div[contains(@class, "Nv2PK")]'
        ]

        wait = WebDriverWait(driver, 30)
        print("Looking for search results...")
        for selector in result_selectors:
            try:
                wait.until(EC.presence_of_element_located((By.XPATH, selector)))
            except TimeoutException:
                continue
            print("Search results found")
            return True
        print("No search results found")
        return False
    except Exception as exc:
        # Exception rather than a bare except, so interrupts still propagate.
        print(f"Error during search: {exc}")
        return False

def scroll_and_load_results(driver, max_scrolls=20, pause_seconds=3):
    """Scroll the results feed until it stops growing.

    Args:
        driver: WebDriver showing a Google Maps result list.
        max_scrolls: maximum number of scroll steps (default 20).
        pause_seconds: seconds to wait after each scroll so new results can
            load (default 3).

    Returns:
        bool: always True, including when no scrollable feed is found.
    """
    print("Scrolling to load all results...")
    try:
        scrollable_element = driver.find_element(By.XPATH, '//div[@role="feed"]')
    except Exception:
        print("No scrollable element found, skipping scroll")
        return True

    last_height = driver.execute_script("return arguments[0].scrollHeight", scrollable_element)
    for scroll_count in range(1, max_scrolls + 1):
        driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", scrollable_element)
        time.sleep(pause_seconds)
        new_height = driver.execute_script("return arguments[0].scrollHeight", scrollable_element)
        print(f"Scroll {scroll_count}/{max_scrolls} - Loading more results...")
        # An unchanged height after the pause is taken to mean the feed is exhausted.
        if new_height == last_height:
            print("No more results to load")
            break
        last_height = new_height
    print("Scrolling completed")
    return True

def get_business_links(driver, max_results=None):
    """Collect unique place links from the current results page.

    Args:
        driver: WebDriver showing a Google Maps result list.
        max_results: maximum number of links to return; defaults to
            ``CONFIG['max_results']`` when omitted.

    Returns:
        list[str]: unique ``/maps/place/`` URLs in the order they appear on
        the page, truncated to ``max_results``.
    """
    if max_results is None:
        max_results = CONFIG['max_results']

    print("Extracting business links...")
    # A dict de-duplicates in O(1) per link while keeping page order, so the
    # slice below keeps the first results instead of an arbitrary subset
    # (which is what passing the list through set() produced).
    ordered_links = {}
    for element in driver.find_elements(By.XPATH, '//a[contains(@href, "/maps/place/")]'):
        href = element.get_attribute('href')
        if href and '/maps/place/' in href:
            ordered_links.setdefault(href, None)

    unique_links = list(ordered_links)[:max_results]
    print(f"Found {len(unique_links)} business links")
    return unique_links

# XPath fallbacks for each field of a place page, tried in the listed order.
_NAME_SELECTORS = (
    '//h1[@class="DUwDvf lfPIob"]',
    '//h1[contains(@class, "fontHeadlineLarge")]',
    '//div[contains(@class, "fontHeadlineLarge")]',
    '//h1',
    '//span[contains(@class, "DUwDvf")]',
)
_RATING_SELECTORS = (
    '//span[@class="MW4etd"]',
    '//div[contains(@class, "F7nice")]//span[contains(@aria-label, "stars")]',
    '//span[contains(@aria-label, "stars")]',
)
_REVIEW_SELECTORS = (
    '//span[contains(text(), "reviews")]',
    '//span[contains(text(), "Reviews")]',
    '//button[contains(@aria-label, "reviews")]',
    '//*[contains(text(), "reviews") or contains(text(), "Reviews")]',
)
_PHONE_SELECTORS = (
    '//button[contains(@data-item-id, "phone")]//div[contains(@class, "Io6YTe")]',
    '//span[contains(@aria-label, "Phone")]',
    '//a[contains(@href, "tel:")]',
)
_ADDRESS_SELECTORS = (
    '//button[@data-item-id="address"]//div[contains(@class, "Io6YTe")]',
    '//span[contains(@aria-label, "Address")]',
    '//div[contains(@class, "rogA2c")]//div[contains(@class, "Io6YTe")]',
)
_WEBSITE_SELECTORS = (
    '//a[@data-item-id="authority"]//div[contains(@class, "Io6YTe")]',
    '//a[contains(@href, "http") and not(contains(@href, "google"))]',
)

# Review-count patterns. The number is matched as one run of digits and
# commas: a pattern like \d{1,3}(?:,\d{3})* only covers comma-grouped numbers
# and turns "1234 reviews" into "234".
_REVIEW_WORD_PATTERN = re.compile(r'(\d[\d,]*)\s*(?:reviews?|Reviews?)')
_REVIEW_COUNT_PATTERNS = (
    _REVIEW_WORD_PATTERN,
    re.compile(r'\((\d[\d,]*)\)'),
    re.compile(r'Based on (\d[\d,]*)'),
)


def _parse_review_count(text, patterns=_REVIEW_COUNT_PATTERNS):
    """Return the first review count found in ``text`` as digits only, else ''."""
    if not text:
        return ''
    for pattern in patterns:
        match = pattern.search(text)
        if match:
            return match.group(1).replace(',', '')
    return ''


def _first_value(driver, selectors, read):
    """Return the first non-empty ``read(element)`` across ``selectors``, else ''."""
    for selector in selectors:
        try:
            value = read(driver.find_element(By.XPATH, selector))
        except Exception:
            # Selector not present (or element unreadable): try the next one.
            continue
        if value:
            return value
    return ''


def _read_phone(element):
    """Read a phone number from the element text, or from its tel: href."""
    raw = element.text or element.get_attribute('href') or ''
    return raw.replace('tel:', '').strip()


def _read_website(element):
    """Read a website from href or text, rejecting anything mentioning google."""
    website = element.get_attribute('href') or element.text
    if website and 'google' not in website.lower():
        return website.strip()
    return ''


def _extract_rating_and_reviews(driver, details):
    """Fill details['Rating'] and, when found nearby, details['Review_Count']."""
    for selector in _RATING_SELECTORS:
        try:
            rating_element = driver.find_element(By.XPATH, selector)
            rating_text = rating_element.text or rating_element.get_attribute('aria-label')
        except Exception:
            continue
        if not rating_text:
            continue

        rating_match = re.search(r'(\d+\.?\d*)', rating_text)
        if rating_match:
            details['Rating'] = rating_match.group(1)

        # The review count is looked for in the text of the rating element's
        # parent, then its grandparent.
        try:
            parent = rating_element.find_element(By.XPATH, './..')
            count = _parse_review_count(parent.text)
            if count:
                details['Review_Count'] = count
            elif not details['Review_Count']:
                grandparent = parent.find_element(By.XPATH, './..')
                count = _parse_review_count(grandparent.text)
                if count:
                    details['Review_Count'] = count
        except Exception:
            pass  # best effort; the page-wide search is the fallback

        if details['Rating']:
            break


def _find_review_count_on_page(driver):
    """Search elements mentioning "reviews" for an "N reviews" count, else ''."""
    for selector in _REVIEW_SELECTORS:
        try:
            for element in driver.find_elements(By.XPATH, selector):
                text = element.text or element.get_attribute('aria-label')
                count = _parse_review_count(text, (_REVIEW_WORD_PATTERN,))
                if count:
                    return count
        except Exception:
            continue
    return ''


def extract_business_details(driver, business_url, page_load_wait=5):
    """Load a Google Maps place page and scrape its basic details.

    Args:
        driver: WebDriver used to load the page.
        business_url: URL of the place page; also stored in the result.
        page_load_wait: seconds to sleep after navigation before reading the
            page (default 5).

    Returns:
        dict | None: a dict with keys Name, Phone, Rating, Review_Count,
        Address, Website and Google_Maps_Link (fields that were not found are
        empty strings), or None when no name could be extracted or an error
        occurred.
    """
    details = {
        'Name': '',
        'Phone': '',
        'Rating': '',
        'Review_Count': '',
        'Address': '',
        'Website': '',
        'Google_Maps_Link': business_url
    }
    try:
        driver.get(business_url)
        time.sleep(page_load_wait)

        details['Name'] = _first_value(driver, _NAME_SELECTORS, lambda el: el.text.strip())

        _extract_rating_and_reviews(driver, details)
        if not details['Review_Count']:
            details['Review_Count'] = _find_review_count_on_page(driver)

        details['Phone'] = _first_value(driver, _PHONE_SELECTORS, _read_phone)
        details['Address'] = _first_value(driver, _ADDRESS_SELECTORS, lambda el: el.text.strip())
        details['Website'] = _first_value(driver, _WEBSITE_SELECTORS, _read_website)

        if details['Name']:
            print(f"Extracted: {details['Name']} (Rating: {details['Rating']}, Reviews: {details['Review_Count']})")
            return details
        print("Failed to extract business name")
        return None
    except Exception as e:
        print(f"Error extracting business details: {str(e)}")
        return None

def save_to_excel(business_data_list, district, city, industry):
    """Write the scraped records to an .xlsx file in the working directory.

    The file is named ``{industry}_{city}.xlsx`` and the sheet
    ``{industry}_{district}_{city}`` (cut to 31 characters). In both names
    spaces, and characters that are invalid in a file name or an Excel sheet
    title, are replaced with underscores.

    Args:
        business_data_list: list of record dicts; nothing is written when empty.
        district, city, industry: used to build the file and sheet names.

    Returns:
        None.
    """
    if not business_data_list:
        print("No data to save")
        return
    df = pd.DataFrame(business_data_list)

    # A "/" or "\" in the industry/city would be read as a directory
    # separator, and the other characters are rejected by common file systems.
    output_file = re.sub(r'[\\/:*?"<>|]', '_', f"{industry}_{city}.xlsx".replace(" ", "_"))
    # Excel sheet titles may not contain [ ] : * ? / \ and are limited to 31 chars.
    sheet_name = re.sub(r'[\[\]:*?/\\]', '_', f"{industry}_{district}_{city}".replace(" ", "_"))[:31]

    print(f"Saving data to {output_file}...")
    with pd.ExcelWriter(output_file, engine='openpyxl', mode='w') as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False)
    print(f"Data saved to {output_file} ({len(df)} records)")

def main():
    """Run the whole scrape described by CONFIG and save the results.

    Steps: start the browser, run the Maps search, scroll the result list,
    collect place links, visit each link to extract details, and write the
    collected records to Excel. The browser is always closed at the end.
    """
    industry = CONFIG['industry']
    city = CONFIG['city']
    district = CONFIG['district']
    separator = "-" * 50

    print(f"Starting scraping process for {industry} in {city}, {district}")
    print(f"Target: {CONFIG['max_results']} results")
    print(separator)

    driver = None
    collected = []
    try:
        driver = setup_driver()
        search_ok = search_on_google_maps(driver, district, city, CONFIG['country'], industry)
        if not search_ok:
            print("Search failed, exiting...")
            return

        scroll_and_load_results(driver)
        business_links = get_business_links(driver)
        if not business_links:
            print("No business links found")
            return

        total = len(business_links)
        print(f"Processing {total} businesses:")
        print(separator)

        for position, business_url in enumerate(business_links, start=1):
            print(f"[{position}/{total}] Processing business...")
            record = extract_business_details(driver, business_url)
            if not record:
                print("Failed to extract business data")
            else:
                collected.append(record)
                print(f"Successfully extracted data for {record['Name']}")
            # Randomised pause between place pages.
            time.sleep(random.uniform(2, 4))

        print(separator)
        if not collected:
            print("No valid business data extracted")
        else:
            save_to_excel(collected, district, city, industry)
            print(f"Scraping completed successfully! Total records: {len(collected)}")

    except Exception as e:
        print(f"Error occurred: {str(e)}")
    finally:
        # Runs on the early returns above as well, so the browser never leaks.
        if driver:
            driver.quit()
            print("Browser closed")

if __name__ == "__main__":
    main()

Starting scraping process for gyms in Kotte, Colombo
Target: 10 results
--------------------------------------------------
Setting up Chrome WebDriver...
WebDriver setup completed
Searching for: gyms in Kotte, Colombo, Sri Lanka
Loading Google Maps...
Checking for full Google Maps version...
Looking for search results...
Search results found
Scrolling to load all results...
Scroll 1/20 - Loading more results...
Scroll 2/20 - Loading more results...
Scroll 3/20 - Loading more results...
Scroll 4/20 - Loading more results...
Scroll 5/20 - Loading more results...
Scroll 6/20 - Loading more results...
Scroll 7/20 - Loading more results...
Scroll 8/20 - Loading more results...
Scroll 9/20 - Loading more results...
Scroll 10/20 - Loading more results...
Scroll 11/20 - Loading more results...
Scroll 12/20 - Loading more results...
Scroll 13/20 - Loading more results...
Scroll 14/20 - Loading more results...
Scroll 15/20 - Loading more results...
Scroll 16/20 - Loading more results...
No more 