## READ ME - Selenium webscraper with Google Chrome

This notebook is a set of **Selenium** scripts designed to access specific web portals.

Each cell is dedicated to one web-scraping operation, i.e. one particular web portal. This is because each portal is set up differently, with different types of selection boxes and different steps needed to navigate them.

Therefore, each time you use this code for a new webpage, **please copy an existing script into a new cell and adapt it to your use case**.

## Southampton SOLID services directory

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import itertools

# Page to scrape
url = "https://solinked.org.uk/location/citywide/?post_types=community_services"

# Visible-text values to choose in the page's filter dropdowns.
# Full list of service areas:
service_areas = [
    "Access to Food",
    "Advice",
    "Advice, Information & Guidance",
    "Arts & Heritage & Culture",
    "Carers",
    "Computer and IT Support",
    "Children & Young People",
    "Counselling",
    "Domestic Abuse",
    "Events",
    "Information & Guidance",
    "Learning Disability and Health Conditions",
    "LGBTQI+",
    "Mental Health",
    "Nature and Animals",
    "Older People",
    "Physical Activity",
    "Physical Disability",
    "Physical Disability, Learning Disability and Health Conditions",
    "Social Isolation",
    "Statutory Services",
    "Substance Misuse Support",
    "Volunteering & Training",
]
# Shorter alternative list (uncomment to use it in place of the full list):
# service_areas = [
#     "Access to Food",
#     "Advice",
#     "Advice, Information & Guidance",
#     "Children & Young People",
#     "Information & Guidance",
#     "Learning Disability and Health Conditions",
#     "Nature and Animals",
#     "Older People",
#     "Physical Disability, Learning Disability and Health Conditions",
#     "Statutory Services",
# ]
service_types = [
    "Groups & Services",
    "Volunteering",
]
locations = [
    "Citywide",
    "SO14",
    "SO15",
    "SO16",
    "SO17",
    "SO18",
    "SO19",
]

# Rows collected by the scrape
data = []

# Start Chrome, using the driver path returned by ChromeDriverManager().install()
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # no browser window; drop this flag when debugging
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

# Load the page and prepare an explicit wait with a 10-second timeout
driver.get(url)
wait = WebDriverWait(driver, 10)

from selenium.common.exceptions import NoSuchElementException

# Scrape every combination of filters and append one row per result card to
# `data`. The browser is closed in the `finally` clause so that an interrupted
# or failed run does not leave a Chrome process behind.
try:
    for service_area, service_type, location in itertools.product(service_areas, service_types, locations):
        filter_label = f"{service_area} | {service_type} | {location}"
        print(f"\n🔎 Searching for: {filter_label}")

        try:
            # Refresh the page before applying new filters
            driver.get(url)
            time.sleep(5)  # Allow full reload

            # Locate dropdowns again after refresh. wait.until raises
            # TimeoutException if one never appears; the outer handler below
            # reports it and moves on to the next combination.
            service_area_dropdown = wait.until(EC.presence_of_element_located((By.ID, "ofservice_area")))
            service_type_dropdown = wait.until(EC.presence_of_element_located((By.ID, "ofservice_type")))
            location_dropdown = wait.until(EC.presence_of_element_located((By.ID, "oflocation")))

            # Select filters, pausing briefly after each choice
            Select(service_area_dropdown).select_by_visible_text(service_area)
            time.sleep(1)
            Select(service_type_dropdown).select_by_visible_text(service_type)
            time.sleep(1)
            Select(location_dropdown).select_by_visible_text(location)
            time.sleep(1)

            # Click "Search" button. If it is missing the filters were never
            # submitted, so whatever the page lists must not be recorded under
            # this combination: skip it instead of scraping.
            try:
                search_button = driver.find_element(By.XPATH, "//input[@type='submit' and @value='Search']")
            except NoSuchElementException:
                print("❌ 'Search' button not found!")
                continue
            # A failing click propagates to the outer handler, which also skips
            # this combination.
            search_button.click()
            print("✅ Clicked 'Search' button.")
            time.sleep(5)  # Allow time for new results to load

            # Scroll down until the page height stops growing, so that results
            # added on scroll are present before extraction
            last_height = driver.execute_script("return document.body.scrollHeight")
            while True:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            # Extract services
            services = driver.find_elements(By.CSS_SELECTOR, "article.elementor-post")

            if not services:
                print(f"❌ No services found for {filter_label}")
                continue

            for service in services:
                # A card without a title link or description is skipped on its
                # own, so one malformed card does not discard the rest.
                try:
                    title_tag = service.find_element(By.CSS_SELECTOR, "h3.elementor-heading-title a")
                    desc_div = service.find_element(By.CSS_SELECTOR, "div.elementor-widget-container")
                except NoSuchElementException:
                    print(f"⚠️ Skipping a result without title or description for {filter_label}")
                    continue

                title = title_tag.text.strip()
                service_url = title_tag.get_attribute("href")
                description = desc_div.text.strip()

                data.append([title, description, service_url, filter_label])

        except Exception as e:
            # Per-combination boundary: report the problem and carry on with
            # the next combination rather than aborting the whole run.
            print(f"🚨 Error with filters {filter_label}: {e}")
finally:
    # Close browser
    driver.quit()

# Turn the collected rows into a table and write it out as a UTF-8 CSV
# (no index column)
column_names = ["Title", "Description", "URL", "Search Filters"]
df = pd.DataFrame(data, columns=column_names)
df.to_csv(
    "results.csv",
    index=False,
    encoding="utf-8",
)

print("\n✅ Results saved to 'results.csv'!")

## Living Well Warrington

In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

from selenium.common.exceptions import TimeoutException, WebDriverException

# Set up WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Remove this line if you want to see the browser
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 20)  # 20-second explicit wait, available to later steps


driver.get("https://livingwellwarrington.org/activities")

# Try closing cookie banner. This is best-effort: a missing banner or a failed
# click is reported and the scrape carries on, but anything that is not a
# WebDriver error (e.g. KeyboardInterrupt) is no longer swallowed.
try:
    accept_btn = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "button[title='Accept All Cookies']"))
    )
    accept_btn.click()
except TimeoutException:
    # The button did not become clickable within 5 seconds
    print("ℹ️ No cookie banner found.")
except WebDriverException as exc:
    print(f"⚠️ Cookie banner could not be dismissed: {exc}")
else:
    print("✅ Cookie banner dismissed.")
    time.sleep(2)

from selenium.common.exceptions import (
    ElementClickInterceptedException,
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)

# Load all results by clicking the "Load more" button until it stops
# appearing. (An earlier scroll-to-bottom approach was replaced by this loop.)

# Give up after this many failed click attempts in a row, so that a button
# which can never be clicked cannot keep the loop running forever.
MAX_CLICK_RETRIES = 5

print("🔄 Loading all results...")

failed_clicks = 0
while True:
    try:
        load_more_btn = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[wire\\:click='loadMore']"))
        )
        # Centre the button in the viewport, then click it through JavaScript
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", load_more_btn)
        time.sleep(1)
        driver.execute_script("arguments[0].click();", load_more_btn)
    except (TimeoutException, NoSuchElementException):
        # The button did not become clickable within 5 seconds
        print("✅ No more 'Load more' button. Done loading.")
        break
    except ElementClickInterceptedException:
        print("⚠️ Element click intercepted. Trying to scroll more.")
        failed_clicks += 1
        driver.execute_script("window.scrollBy(0, 300);")
        time.sleep(1)
    except StaleElementReferenceException:
        # The button was replaced in the DOM between the wait and the click;
        # look it up again on the next pass.
        print("⚠️ 'Load more' button changed before it could be clicked. Retrying.")
        failed_clicks += 1
    else:
        failed_clicks = 0
        print("➕ Clicked 'Load more'")
        time.sleep(2)  # Wait for new content to load

    if failed_clicks >= MAX_CLICK_RETRIES:
        print(f"⚠️ Giving up on 'Load more' after {failed_clicks} failed attempts in a row.")
        break

print("📜 Finished scrolling.")

# Find result links
results = driver.find_elements(By.CSS_SELECTOR, "a.stream__feed__item__title-area__bglink")
# get_attribute returns None for an anchor without an href; drop those so that
# every collected entry is a URL that can actually be opened.
urls = [href for href in (r.get_attribute("href") for r in results) if href]
print(f"📌 Found {len(urls)} result URLs.")

from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException

# CSS selectors for the fields read from each activity page
TITLE_SELECTOR = "#main > div > div.margin-bottom.activity_page > div.margin-bottom.activity_page__heading > h1"
CONTENT_SELECTOR = "#main > div > div.margin-bottom.activity_page > div.margin-bottom.activity_page__content > div.page-content.contains_vid"
LOCATION_SELECTOR = "#main > div > div.margin-bottom.activity_page > div.activity_page__imageandinfo > div.margin-bottom.activity_page__imageandinfo__info > div:nth-child(3) > div.activity_page__imageandinfo__info__row__details"
NEIGHBOURHOODS_SELECTOR = "#main > div > div.margin-bottom.activity_page > div.activity_page__imageandinfo > div.margin-bottom.activity_page__imageandinfo__info > div:nth-child(4) > div.activity_page__imageandinfo__info__row__details > span > ul > li"

# Column order of the output CSV
RESULT_COLUMNS = ["Title", "URL", "Page Content", "Location", "Neighbourhoods"]


def _text_or_blank(driver, selector):
    """Return the stripped text of the first element matching *selector*.

    Returns "" when no element matches or the element goes stale while it is
    being read. Other errors (for example a lost browser session) propagate.
    """
    try:
        return driver.find_element(By.CSS_SELECTOR, selector).text.strip()
    except (NoSuchElementException, StaleElementReferenceException):
        return ""


def _joined_text_or_blank(driver, selector):
    """Return the stripped texts of all elements matching *selector*, joined by ", ".

    Returns "" when nothing matches or an element goes stale while being read.
    """
    try:
        elements = driver.find_elements(By.CSS_SELECTOR, selector)
        return ", ".join(element.text.strip() for element in elements)
    except StaleElementReferenceException:
        return ""


# Extract data from each URL. The browser is closed in the `finally` clause so
# that a failure part-way through does not leave a Chrome process behind.
data = []

try:
    # `page_url` rather than `url`, so the loop does not overwrite the
    # notebook-level `url` variable used by the other scraper cell.
    for page_url in urls:
        driver.get(page_url)
        time.sleep(3)  # Give the page time to render

        data.append({
            "Title": _text_or_blank(driver, TITLE_SELECTOR),
            "URL": page_url,
            "Page Content": _text_or_blank(driver, CONTENT_SELECTOR),
            "Location": _text_or_blank(driver, LOCATION_SELECTOR),
            "Neighbourhoods": _joined_text_or_blank(driver, NEIGHBOURHOODS_SELECTOR),
        })
finally:
    driver.quit()

# Save to CSV. Passing the columns explicitly keeps the header row in the file
# even when no pages were scraped.
df = pd.DataFrame(data, columns=RESULT_COLUMNS)
df.to_csv("livingwellwarrington_results.csv", index=False, encoding="utf-8")
print("✅ Data saved. Found:", len(df), "items.")


ℹ️ No cookie banner found.
🔄 Loading all results...
➕ Clicked 'Load more'
➕ Clicked 'Load more'
➕ Clicked 'Load more'
➕ Clicked 'Load more'
➕ Clicked 'Load more'
➕ Clicked 'Load more'
➕ Clicked 'Load more'
➕ Clicked 'Load more'
➕ Clicked 'Load more'
➕ Clicked 'Load more'
➕ Clicked 'Load more'
➕ Clicked 'Load more'
➕ Clicked 'Load more'
➕ Clicked 'Load more'
➕ Clicked 'Load more'
➕ Clicked 'Load more'
➕ Clicked 'Load more'
➕ Clicked 'Load more'
➕ Clicked 'Load more'
➕ Clicked 'Load more'
✅ No more 'Load more' button. Done loading.
📜 Finished scrolling.
📌 Found 592 result URLs.
✅ Data saved. Found: 592 items.
