# Web Scraping with Python and Selenium

## Libraries and settings

In [None]:
# Libraries
import os
import re
import json
import time
import random
import pandas as pd

from bs4 import BeautifulSoup

from prettytable import from_csv

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC

# Chrome options shared by every driver created in this notebook.
# '--headless' runs the browser without a graphical user interface.
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Service object pointing at the local ChromeDriver executable
service = Service('/usr/local/bin/chromedriver')

# Settings: suppress all Python warnings from here on
import warnings
warnings.filterwarnings("ignore")

# Show the directory that relative paths (e.g. 'screenshot.png') resolve against
print('Current working directory:', os.getcwd())

## Import a list with user agents

In [None]:
# List of user agents for rotation (one user-agent string per line of the file)
ua_path = "user_agents.txt"
# Read inside a context manager so the file handle is closed right after
# loading; the encoding is explicit so the result does not depend on the
# platform's default locale.
with open(ua_path, encoding="utf-8") as ua_file:
    ua_list = [line.rstrip('\n') for line in ua_file]
# Preview the first five entries (shown as the cell output in a notebook)
ua_list[:5]

## Chrome headless mode (without graphical user interface)

In [None]:
# Initialize the driver
driver = webdriver.Chrome(service=service, options=options)

try:
    # Website to scrape
    driver.get('https://store.dji.com')

    # Get current url
    print(driver.current_url)

    # Return the full page HTML code
    # print(driver.page_source)

    # Gets the page's title
    print(driver.title)
finally:
    # quit() ends the whole WebDriver session, whereas close() only closes
    # the current window. Running it in `finally` releases the browser even
    # when loading the page fails.
    driver.quit()


## Creating screenshots from a website

In [None]:
# Chrome driver
driver = webdriver.Chrome(service=service, options=options)

try:
    # Website to scrape
    driver.get('https://store.dji.com')

    # Save a screenshot of the rendered page to the working directory
    driver.save_screenshot('screenshot.png')

    # Read the screenshot back from disk and plot it without axes
    img = mpimg.imread('screenshot.png')
    plt.figure(figsize=(8,8))
    plt.axis('off')
    imgplot = plt.imshow(img)
finally:
    # quit() ends the whole WebDriver session, whereas close() only closes
    # the current window. Running it in `finally` releases the browser even
    # when a step above fails.
    driver.quit()

## Locating elements

There are different methods available in the Selenium API to select elements on the page. You can use:

- Name
- Tag name
- Class name
- ID
- XPath
- CSS selectors

Use Chrome's "Inspect" (inspect element) tool to look up these details for an element:
https://www.hostinger.com/tutorials/website/how-to-inspect-and-change-style-using-google-chrome

### Locating elements using XPath

In [None]:
# Chrome driver
driver = webdriver.Chrome(service=service, options=options)

try:
    # Website to scrape
    driver.get('https://de.wikipedia.org/wiki/Hot_Rod')

    # Save a screenshot of the rendered page to the working directory
    driver.save_screenshot('screenshot.png')

    # Read the screenshot back from disk and plot it without axes
    img = mpimg.imread('screenshot.png')
    plt.figure(figsize=(8,8))
    plt.axis('off')
    imgplot = plt.imshow(img)

    # Find element using an absolute XPath and print its text.
    # NOTE(review): an absolute path like this depends on the exact page
    # layout and will stop matching if the page structure changes.
    text = driver.find_element(By.XPATH, '/html/body/div[2]/div/div[3]/main/div[3]/div[3]/div[2]/figure[1]/figcaption').text
    print(text)
finally:
    # quit() ends the whole WebDriver session, whereas close() only closes
    # the current window. Running it in `finally` releases the browser even
    # when the element lookup fails.
    driver.quit()

### Locating elements using tag names

In [None]:
# Chrome driver
driver = webdriver.Chrome(service=service, options=options)

try:
    # Website to scrape
    driver.get('https://www.tagesschau.de/wissen/klima/bienen-impfung-faulbrut-107.html')

    # Save a screenshot of the rendered page to the working directory
    driver.save_screenshot('screenshot.png')

    # Read the screenshot back from disk and plot it without axes
    img = mpimg.imread('screenshot.png')
    plt.figure(figsize=(8,8))
    plt.axis('off')
    imgplot = plt.imshow(img)

    # Find the first <h3> element by tag name and print its text
    text = driver.find_element(By.TAG_NAME, 'h3').text
    print(text)
finally:
    # quit() ends the whole WebDriver session, whereas close() only closes
    # the current window. Running it in `finally` releases the browser even
    # when the element lookup fails.
    driver.quit()

## Pagination handling on https://www.minergie.ch

In [None]:
# Chrome driver used by accept_cookies_once(), collect_page() and the
# scraping loop below (they read it as a module-level name)
driver = webdriver.Chrome(service=service, options=options)
# Explicit wait bound to that driver: timeout value 25, polling every 0.3
# (WebDriverWait interprets both values as seconds)
wait = WebDriverWait(driver, 25, poll_frequency=0.3)

def accept_cookies_once():
    """Click the first cookie-consent button that becomes clickable, if any.

    The candidate CSS selectors are tried in order, each with its own wait
    (timeout value 6). The function returns right after the first successful
    click. Any exception raised while waiting or clicking is swallowed, so
    the function returns silently when no banner is present.
    """
    consent_selectors = (
        "#onetrust-accept-btn-handler",
        "button[aria-label*='zustimmen' i]",
        "button[class*='accept' i]",
        "#consent_manager-accept-all, .consent_manager-accept-all",
    )
    for css in consent_selectors:
        locator = (By.CSS_SELECTOR, css)
        try:
            button = WebDriverWait(driver, 6).until(EC.element_to_be_clickable(locator))
            button.click()
        except Exception:
            # Minimalistic best effort: this selector did not work, try the next one.
            continue
        else:
            return

def collect_page(page):
    """Load one page of the Minergie building list and return its card links.

    Args:
        page: Page number substituted into the ``p`` query parameter. For
            ``page == 1`` the cookie banner is handled right after loading.

    Returns:
        The elements found by a final lookup of the card-link selector
        ``.grid.overview.gebaude .item a.overlayLink``.

    Raises:
        Whatever ``wait.until`` raises when the list container does not
        become visible, or when the waits after the reload fallback fail.
    """
    grid_locator = (By.CSS_SELECTOR, ".grid.overview.gebaude")
    card_locator = (By.CSS_SELECTOR, ".grid.overview.gebaude .item a.overlayLink")

    url_template = ("https://www.minergie.ch/de/gebaeude/gebaeudeliste/"
                    "?canton=&country=&zip_place=&street_nr=&gid=&participator=&typeofuse="
                    "&constructiontype=&year=&sortby=date_asc&numres=12&p={page}")
    url = url_template.format(page=page)

    driver.get(url)
    if page == 1:
        accept_cookies_once()

    # 1) The list container must be visible.
    wait.until(EC.visibility_of_element_located(grid_locator))

    # 2) At least one visible card (link) is expected.
    try:
        wait.until(EC.visibility_of_any_elements_located(card_locator))
    except Exception:
        # Retry: scroll to the bottom, pause briefly, then check again.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1.0)
        if not driver.find_elements(*card_locator):
            # Second chance: reload the page and repeat both waits.
            driver.get(url)
            time.sleep(1.0 + random.random())
            accept_cookies_once()
            wait.until(EC.visibility_of_element_located(grid_locator))
            wait.until(EC.visibility_of_any_elements_located(card_locator))

    return driver.find_elements(*card_locator)

from urllib.parse import urlparse, parse_qs

def _child_text(parent, tag_name, newline_replacement):
    """Return the stripped text of the first ``tag_name`` element inside ``parent``.

    Newlines in the text are replaced with ``newline_replacement``. An empty
    string is returned when the lookup or the text access raises: a card
    that lacks the tag must not abort the whole scraping run.
    """
    try:
        child = parent.find_element(By.TAG_NAME, tag_name)
        return child.text.strip().replace("\n", newline_replacement)
    except Exception:
        return ""


# Scrape pages 1-5 and write one "gid | title | meta" line per card to
# minergie_objects.txt, echoing each line to stdout as well.
try:
    with open("minergie_objects.txt", "w", encoding="utf-8") as f:
        for page in range(1, 6):
            cards = collect_page(page)
            print(f"--------- Page: {page} -------------------------------")
            if not cards:
                print("Keine Einträge gefunden.")
                continue

            for a in cards:
                href = a.get_attribute("href") or ""
                # The building id is the "gid" query parameter of the card link
                # (empty string when the parameter is missing).
                gid = parse_qs(urlparse(href).query).get("gid", [""])[0]
                title = _child_text(a, "h2", " ")
                meta = _child_text(a, "p", " | ")
                # Strip leading/trailing separators left over from empty fields.
                line = f"{gid} | {title} | {meta}".strip(" |")
                print(line)
                f.write(line + "\n")
finally:
    # Always end the browser session, even when a page fails to load or parse.
    driver.quit()

## OpenStreetMap search location example

In [None]:
# Chrome driver
driver = webdriver.Chrome(service=service, options=options)

try:
    driver.get('https://www.openstreetmap.org/#map=9/46.7054/8.0283')
    # Fixed pause to give the page time to load before the lookup below
    time.sleep(2)

    # Submit address: type the place name into the input and press RETURN
    element = driver.find_element(By.XPATH, '/html/body/div[3]/div[1]/div[1]/form[1]/div/div/input')
    element.send_keys("Melchsee-Frutt")
    element.send_keys(Keys.RETURN)
    time.sleep(2)

    # Click info boxes
    try:
        # Click 1st info-box
        info = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[3]/div[1]/div[9]/button')))
        info.click()

        # Click 2nd info-box
        info = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[3]/div[1]/div[10]/button')))
        info.click()

    except Exception:
        # Best effort: the screenshot is taken whether or not the boxes could
        # be clicked. Catching Exception instead of using a bare `except`
        # keeps KeyboardInterrupt and SystemExit from being swallowed.
        pass

    # Screenshot of map
    driver.save_screenshot('screenshot.png')
    img = mpimg.imread('screenshot.png')
    plt.figure(figsize=(6,6))
    plt.axis('off')
    imgplot = plt.imshow(img)
finally:
    # quit() ends the whole WebDriver session, whereas close() only closes
    # the current window. Running it in `finally` releases the browser even
    # when a step above fails.
    driver.quit()

### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Footer: OS name, platform and release, current timestamp and Python
# version, framed by two separator lines.
print('-----------------------------------')
print(os.name.upper())
print(f"{platform.system()} | {platform.release()}")
print(f"Datetime: {datetime.now():%Y-%m-%d %H:%M:%S}")
print(f"Python Version: {python_version()}")
print('-----------------------------------')