# Selenium For Webscraping

Selenium was designed for test automation of web applications. Because it drives a real browser it is slow, but it is well suited to intermediate-level scraping tasks.

In [1]:
import shutil
import time
from pathlib import Path

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import Select, WebDriverWait


## Configuring The Driver

In [2]:
# Path to the chromedriver binary (its version must match the installed Chrome).
# Prefer whichever chromedriver is found on PATH so the notebook also runs on
# other machines; fall back to the Homebrew location that was hard-coded here.
driver_path = shutil.which('chromedriver') or '/opt/homebrew/bin/chromedriver'

In [3]:
# Browser options: run Chrome headless (no visible window).
options = Options()
options.add_argument("--headless")

# Start Chrome through the chromedriver binary located at driver_path.
chrome_service = Service(driver_path)
driver = webdriver.Chrome(service=chrome_service, options=options)

In [4]:
# Load the page in the Selenium-controlled browser.
website = 'https://www.adamchoi.co.uk/overs/detailed'
driver.get(website)

## Finding Elements With Selenium

1. Finding elements by id

        singular - driver.find_element(By.ID, 'id')
        multiple - driver.find_elements(By.ID, 'id')

2. Finding elements by class name. 

        singular - driver.find_element(By.CLASS_NAME, 'class_name')
        multiple - driver.find_elements(By.CLASS_NAME, 'class_name')

3. Finding elements by tag name.

        singular - driver.find_element(By.TAG_NAME, 'tag_name')
        multiple - driver.find_elements(By.TAG_NAME, 'tag_name')

4. Finding elements by XPath

        singular - driver.find_element(By.XPATH, 'xpath_expression')
        multiple - driver.find_elements(By.XPATH, 'xpath_expression')

5. Finding elements by css selector

        singular - driver.find_element(By.CSS_SELECTOR, 'css_selector')
        multiple - driver.find_elements(By.CSS_SELECTOR, 'css_selector')

6. Finding elements by name 

        singular - driver.find_element(By.NAME, 'name')
        multiple - driver.find_elements(By.NAME, 'name')

7. Finding elements by link text

        singular - driver.find_element(By.LINK_TEXT, 'link_text')
        multiple - driver.find_elements(By.LINK_TEXT, 'link_text')


In [5]:
# interacting with JS web elements

# Wait (up to 10 seconds) until the "All matches" label is clickable instead of
# assuming it already exists the instant the page has been requested.
# WebDriverWait raises TimeoutException if the label never becomes clickable.
all_matches_button = WebDriverWait(driver, 10).until(
    ec.element_to_be_clickable((By.XPATH, '//label[@analytics-event="All matches"]'))
)
all_matches_button.click()

In [6]:
# Collect every <tr> element currently in the page.
# NOTE(review): the isna() summary shown later reports rows with no td[3]-td[5];
# presumably some of these <tr> elements are not match rows — confirm.
matches = driver.find_elements(By.TAG_NAME, 'tr')


In [57]:
def catch_element_error(root, by, selector, link=False):
    """Look up one child element and return its text or link target, or None.

    Args:
        root: Object exposing ``find_element(by, selector)`` (the driver or a
            previously located element).
        by: Locator strategy passed straight through to ``find_element``
            (e.g. ``By.XPATH``).
        selector: Locator expression evaluated against ``root``.
        link: When true, return the element's ``href`` attribute instead of
            its ``text``.

    Returns:
        The element's ``text`` (or its ``href`` when ``link`` is true), or
        ``None`` if locating or reading the element raised an error.
    """
    try:
        element = root.find_element(by, selector)
        return element.get_attribute("href") if link else element.text
    except Exception:
        # Best-effort lookup: a failed lookup becomes a None entry. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate, so a long scraping loop can still be interrupted.
        return None


## Compiling Data Into A DataFrame 

In [8]:
# Column lists, filled with one entry per table row.
date, home_team, result, away_team = [], [], [], []

# XPath of the cell (relative to its row) that feeds each column list.
cell_targets = (
    ('./td[1]', date),
    ('./td[3]', home_team),
    ('./td[4]', result),
    ('./td[5]', away_team),
)

for match in matches:
    for cell_xpath, column in cell_targets:
        # A row without this cell contributes None.
        column.append(catch_element_error(match, By.XPATH, cell_xpath))

In [9]:
# Assemble the scraped columns into a DataFrame and display it.
data = dict(
    date=date,
    home_team=home_team,
    result=result,
    away_team=away_team,
)

df = pd.DataFrame(data)
df

Unnamed: 0,date,home_team,result,away_team
0,17-08-2024,Arsenal,2 - 0,Wolves
1,24-08-2024,Aston Villa,0 - 2,Arsenal
2,31-08-2024,Arsenal,1 - 1,Brighton
3,15-09-2024,Tottenham,0 - 1,Arsenal
4,22-09-2024,Man City,2 - 2,Arsenal
...,...,...,...,...
473,06-01-2025,Wolves,0 - 3,Nott'm Forest
474,15-01-2025,Newcastle,3 - 0,Wolves
475,20-01-2025,Chelsea,3 - 1,Wolves
476,25-01-2025,Wolves,? - ?,Arsenal


In [10]:
# Count the missing values in each column.
df.isna().sum()

date          0
home_team    28
result       28
away_team    28
dtype: int64

## Selecting Elements Using Dropdowns

In [11]:
# Wrap the <select id="country"> element so an option can be chosen by its label.
country_select = driver.find_element(By.ID, 'country')
dropdown = Select(country_select)
dropdown.select_by_visible_text('Spain')

# Fixed 5-second pause before reading the table again.
time.sleep(5)

# Collect every <tr> element now in the page.
matches = driver.find_elements(By.TAG_NAME, 'tr')

In [12]:
# Column lists, filled with one entry per table row.
date, home_team, result, away_team = [], [], [], []

# XPath of the cell (relative to its row) that feeds each column list.
cell_targets = (
    ('./td[1]', date),
    ('./td[3]', home_team),
    ('./td[4]', result),
    ('./td[5]', away_team),
)

for match in matches:
    for cell_xpath, column in cell_targets:
        # A row without this cell contributes None.
        column.append(catch_element_error(match, By.XPATH, cell_xpath))

In [13]:
# Assemble the scraped columns into a DataFrame and display it.
data = dict(
    date=date,
    home_team=home_team,
    result=result,
    away_team=away_team,
)

df = pd.DataFrame(data)
df

Unnamed: 0,date,home_team,result,away_team
0,16-08-2024,Celta,2 - 1,Alaves
1,25-08-2024,Alaves,0 - 0,Betis
2,28-08-2024,Sociedad,1 - 2,Alaves
3,01-09-2024,Alaves,2 - 0,Las Palmas
4,14-09-2024,Espanol,3 - 2,Alaves
...,...,...,...,...
433,22-12-2024,Leganes,2 - 5,Villarreal
434,13-01-2025,Sociedad,1 - 0,Villarreal
435,20-01-2025,Villarreal,4 - 0,Mallorca
436,25-01-2025,Ath Madrid,? - ?,Villarreal


In [14]:
# Count the missing values in each column.
df.isna().sum()

date          0
home_team    28
result       28
away_team    28
dtype: int64

In [15]:
# Quit the driver instance created earlier; a new one is created in the next section.
driver.quit()

## Dealing With Pagination

In [100]:
# Relative XPath matching every descendant <li> whose class contains "productListItem".
product_selector = './/li[contains(@class, "productListItem")]'

In [101]:
# Browser options: run Chrome headless (no visible window).
options = Options()
options.add_argument("--headless")

# Start Chrome through the chromedriver binary located at driver_path.
chrome_service = Service(driver_path)
driver = webdriver.Chrome(service=chrome_service, options=options)

In [104]:
# Load the Audible search page in the Selenium-controlled browser.
web = "https://www.audible.com/search"
driver.get(web)

In [105]:
# Ask the browser to maximize its window.
# NOTE(review): the driver was started with --headless, so there is no visible
# window — confirm this call has the intended effect on the page layout.
driver.maximize_window()

In [113]:
# Work out how many result pages exist from the pagination bar.
paginator = driver.find_element(By.XPATH, '//ul[contains(@class, "pagingElements")]')
pages = paginator.find_elements(By.TAG_NAME, 'li')

# The second-to-last <li> is read as the highest page number
# (presumably the final <li> is the "next" control — confirm).
last_page_item = pages[-2]
last_page = int(last_page_item.text)
last_page

25

In [114]:
# automate page navigation


In [107]:
# Locate the element with class "adbl-impression-container", then collect the
# product <li> items inside it that match product_selector.
container = driver.find_element(By.CLASS_NAME, 'adbl-impression-container')
products = container.find_elements(By.XPATH, product_selector)

In [115]:
# Column lists, one entry per scraped product.
title = []
subtitle = []
author = []
length = []
language = []
url = []
release_date = []

for current_page in range(1, last_page + 1):
    # Fixed pause so the current results page can finish loading (this is a
    # plain sleep, not a Selenium implicit wait).
    time.sleep(3)

    # Re-locate the products on every page. Element references collected on an
    # earlier page do not point at the content of the page now being shown.
    container = driver.find_element(By.CLASS_NAME, 'adbl-impression-container')
    products = container.find_elements(By.XPATH, product_selector)

    for product in products:
        title.append(catch_element_error(product, By.XPATH, ".//h3[contains(@class, 'bc-heading')]/a"))
        subtitle.append(catch_element_error(product, By.XPATH, ".//li[contains(@class, 'subtitle')]"))
        author.append(catch_element_error(product, By.XPATH, ".//li[contains(@class, 'authorLabel')]"))
        length.append(catch_element_error(product, By.XPATH, ".//li[contains(@class, 'runtimeLabel')]"))
        release_date.append(catch_element_error(product, By.XPATH, ".//li[contains(@class, 'releaseDateLabel')]"))
        language.append(catch_element_error(product, By.XPATH, ".//li[contains(@class, 'languageLabel')]"))
        url.append(catch_element_error(product, By.XPATH, ".//h3[contains(@class, 'bc-heading')]/a", link=True))

    # Nothing left to navigate to after the final page.
    if current_page == last_page:
        break

    try:
        next_page = driver.find_element(By.XPATH, '//span[contains(@class, "nextButton")]')
        next_page.click()
    except Exception as exc:
        # Without a successful click the browser stays on the same page, so
        # continuing would only scrape duplicates. Report and stop instead.
        print(f"Stopping after page {current_page}: could not open the next page ({exc!r})")
        break
    

In [116]:
# Map each output column name to the list scraped for it.
data = dict(
    title=title,
    subtitle=subtitle,
    author=author,
    length=length,
    language=language,
    release_date=release_date,
    url=url,
)

In [117]:
# Build the DataFrame from the column dictionary (one row per scraped product).
df = pd.DataFrame(data)

In [118]:
# Preview the first rows.
df.head()

Unnamed: 0,title,subtitle,author,length,language,release_date,url
0,The King's Captive: Magiford Supernatural City,"Gate of Myth and Power, Book 1",By: K. M. Shea,Length: 9 hrs and 43 mins,Language: English,Release date: 01-24-25,https://www.audible.com/pd/The-Kings-Captive-M...
1,Shadow Heart,"Cursed Legacies, Book 2",By: Morgan B Lee,Length: 13 hrs and 12 mins,Language: English,Release date: 01-24-25,https://www.audible.com/pd/Shadow-Heart-Audiob...
2,A Shot to Kill,"Blake Wilder FBI Mystery Thriller, Book 26",By: Elle Gray,Length: 7 hrs and 31 mins,Language: English,Release date: 01-22-25,https://www.audible.com/pd/A-Shot-to-Kill-Audi...
3,Charlotte's Reject,,By: K. R. Treadway,Length: 11 hrs and 38 mins,Language: English,Release date: 01-22-25,https://www.audible.com/pd/Charlottes-Reject-A...
4,The Runic Artist: Painted Path,,By: Ellake,Length: 17 hrs and 54 mins,Language: English,Release date: 01-22-25,https://www.audible.com/pd/The-Runic-Artist-Pa...


In [120]:
# (rows, columns) of the scraped table.
df.shape

(500, 7)

## Working With Explicit Waits

In [96]:
#driver.quit()