# Selenium For Webscraping

Selenium is designed for test automation of web applications. Because it drives a real browser it is slow, but it is well suited to intermediate-level scraping tasks.

In [1]:
import os
import time
import pandas as pd
from pathlib import Path
from selenium import webdriver
from dotenv import load_dotenv
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Path to the chromedriver binary on this machine (its version must match the installed version of Chrome)
driver_path = '/opt/homebrew/bin/chromedriver'


## Writing Custom Code To Handle Missing Values During Scraping

In [2]:
def catch_element_error(root, by, selector, link=False):
    """Look up one child element and return its text or link, or None on failure.

    Args:
        root: Object exposing ``find_element(by, selector)`` (a driver or a
            web element).
        by: Locator strategy, passed through to ``find_element``.
        selector: Locator expression, passed through to ``find_element``.
        link: When true, return the element's ``href`` attribute instead of
            its text.

    Returns:
        The element's ``text`` (or its ``href`` when ``link`` is true), or
        ``None`` when the lookup or the read raises an ``Exception``.
    """
    try:
        element = root.find_element(by, selector)
        return element.get_attribute("href") if link else element.text
    except Exception:
        # Best-effort by design: a missing field becomes None so the scraped
        # columns stay aligned. Unlike a bare ``except``, this lets
        # KeyboardInterrupt/SystemExit through so a long scrape can be stopped.
        return None

## Configuring The Driver

In [3]:
# Headless Chrome: build the options first, then hand them to the driver
options = Options()
options.add_argument("--headless")

# Start Chrome through the chromedriver binary located at driver_path
service = Service(driver_path)
driver = webdriver.Chrome(options=options, service=service)

In [4]:
# Open the target page in the browser session
website = 'https://www.adamchoi.co.uk/overs/detailed'
driver.get(website)

## Finding Elements With Selenium

1. Finding elements by id

        singular - driver.find_element(By.ID, 'id')
        multiple - driver.find_elements(By.ID, 'id')

2. Finding elements by class name. 

        singular - driver.find_element(By.CLASS_NAME, 'class_name')
        multiple - driver.find_elements(By.CLASS_NAME, 'class_name')

3. Finding elements by tag name.

        singular - driver.find_element(By.TAG_NAME, 'tag_name')
        multiple - driver.find_elements(By.TAG_NAME, 'tag_name')

4. Finding elements by XPath

        singular - driver.find_element(By.XPATH, 'xpath_expression')
        multiple - driver.find_elements(By.XPATH, 'xpath_expression')

5. Finding elements by css selector

        singular - driver.find_element(By.CSS_SELECTOR, 'css_selector')
        multiple - driver.find_elements(By.CSS_SELECTOR, 'css_selector')

6. Finding elements by name 

        singular - driver.find_element(By.NAME, 'name')
        multiple - driver.find_elements(By.NAME, 'name')

7. Finding elements by link text

        singular - driver.find_element(By.LINK_TEXT, 'link_text')
        multiple - driver.find_elements(By.LINK_TEXT, 'link_text')


In [5]:
# Interacting with JS-rendered web elements.
# Wait (up to 10 s) until the "All matches" toggle is clickable before clicking
# it, rather than assuming it is already rendered right after driver.get().

all_matches_button = WebDriverWait(driver, 10).until(
    ec.element_to_be_clickable((By.XPATH, '//label[@analytics-event="All matches"]'))
)
all_matches_button.click()

In [6]:
# Collect every <tr> element currently on the page
matches = driver.find_elements(By.TAG_NAME, 'tr')


## Compiling Data Into A DataFrame 

In [8]:
# One list per output column; index i of every list describes the same table row
date, home_team, result, away_team = [], [], [], []

# (target list, XPath of the wanted cell relative to its row)
cell_targets = (
    (date, './td[1]'),
    (home_team, './td[3]'),
    (result, './td[4]'),
    (away_team, './td[5]'),
)

for row in matches:
    for values, cell_xpath in cell_targets:
        # A failed cell lookup yields None, which keeps the lists the same length
        values.append(catch_element_error(row, By.XPATH, cell_xpath))

In [None]:
# Column name -> list of scraped values, in the order the columns should appear
columns = ('date', 'home_team', 'result', 'away_team')
data = dict(zip(columns, (date, home_team, result, away_team)))

# Assemble the table and display it as the cell output
df = pd.DataFrame(data)
df

In [None]:
# Save the table to ../data/matches/EPL.csv. The folder is created first
# because to_csv fails when the target directory does not exist.
output_path = Path.cwd().parent/'data'/'matches'/'EPL.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)

In [None]:
# Count the missing values in each column
df.isna().sum()

## Selecting Elements Using Dropdowns

In [11]:
# Pick "Spain" in the dropdown element whose id is "country"
dropdown = Select(driver.find_element(By.ID, 'country'))
dropdown.select_by_visible_text('Spain')

# Fixed 5-second pause before re-reading the table
# NOTE(review): presumably gives the page time to load the newly selected country's matches — confirm
time.sleep(5)

# Collect the <tr> elements again now that the selection has changed
matches = driver.find_elements(By.TAG_NAME, 'tr')

In [12]:
# Fresh column lists for this table; entries at the same index belong to the same row
date, home_team, result, away_team = [], [], [], []

# Row-relative XPath of each wanted cell, paired with the list that receives it
cells_by_column = [
    ('./td[1]', date),
    ('./td[3]', home_team),
    ('./td[4]', result),
    ('./td[5]', away_team),
]

for row in matches:
    for cell_xpath, column in cells_by_column:
        # catch_element_error returns None when the cell cannot be read
        column.append(catch_element_error(row, By.XPATH, cell_xpath))

In [None]:
# Map each column name to its list of scraped values (keyword order = column order)
data = dict(
    date=date,
    home_team=home_team,
    result=result,
    away_team=away_team,
)

# Assemble the table and display it as the cell output
df = pd.DataFrame(data)
df

In [None]:
# Save the table to ../data/matches/LaLiga.csv. The folder is created first
# because to_csv fails when the target directory does not exist.
output_path = Path.cwd().parent/'data'/'matches'/'LaLiga.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)

In [None]:
# Count the missing values in each column
df.isna().sum()

In [15]:
# Close driver instance
# Shut down this driver instance and end its browser session
driver.quit()

## Dealing With Pagination

In [101]:
# Create an instance of ChromeOptions
options = Options()
options.add_argument("--headless")

# Pass the options to the Chrome driver
driver = webdriver.Chrome(service=Service(driver_path), options=options)

In [104]:

# Open the Audible search page in the browser session
web = "https://www.audible.com/search"
driver.get(web)

In [105]:
# Maximize the browser window
driver.maximize_window()

In [None]:
# getting all pages
paginator = driver.find_element(By.XPATH, '//ul[contains(@class, "pagingElements")]')
pages = paginator.find_elements(By.TAG_NAME, 'li')
last_page = int(pages[-2].text)
last_page

In [107]:
# Relative XPath matching any <li> whose class attribute contains "productListItem"
product_selector = './/li[contains(@class, "productListItem")]'
# Search for product items only inside the element with class "adbl-impression-container"
container = driver.find_element(By.CLASS_NAME, 'adbl-impression-container')
products = container.find_elements(By.XPATH, product_selector)

In [115]:
# One list per output column; entries at the same index describe the same product
title = []
subtitle = []
author = []
length = []
language = []
url = []
release_date = []

current_page = 1

# Walk the result pages, appending one entry per product to every column list.
while current_page <= last_page:
    time.sleep(3)  # fixed pause (not a Selenium implicit wait) so the page can render
    # Re-locate the products on every page. Handles found before a page change
    # go stale, and catch_element_error records each failed lookup as None.
    container = driver.find_element(By.CLASS_NAME, 'adbl-impression-container')
    products = container.find_elements(By.XPATH, product_selector)
    for product in products:
        title.append(catch_element_error(product, By.XPATH, ".//h3[contains(@class, 'bc-heading')]/a"))
        subtitle.append(catch_element_error(product, By.XPATH, ".//li[contains(@class, 'subtitle')]"))
        author.append(catch_element_error(product, By.XPATH, ".//li[contains(@class, 'authorLabel')]"))
        length.append(catch_element_error(product, By.XPATH, ".//li[contains(@class, 'runtimeLabel')]"))
        release_date.append(catch_element_error(product, By.XPATH, ".//li[contains(@class, 'releaseDateLabel')]"))
        language.append(catch_element_error(product, By.XPATH, ".//li[contains(@class, 'languageLabel')]"))
        url.append(catch_element_error(product, By.XPATH, ".//h3[contains(@class, 'bc-heading')]/a", link=True))
    current_page += 1
    if current_page > last_page:
        break  # the last page has been scraped; there is no next page to open
    try:
        next_page = driver.find_element(By.XPATH, '//span[contains(@class, "nextButton")]')
        next_page.click()
    except Exception as error:
        # Stop instead of silently scraping the same page again.
        print(f"Stopping after page {current_page - 1}: could not open the next page ({error})")
        break
    

In [116]:
# Map each column name to its list of scraped values (keyword order = column order)
data = dict(
    title=title,
    subtitle=subtitle,
    author=author,
    length=length,
    language=language,
    release_date=release_date,
    url=url,
)

In [117]:
# Build a DataFrame with one column per key in `data`
df = pd.DataFrame(data)

In [None]:
# Preview the first rows of the DataFrame
df.head()

In [None]:
# Save the table to ../data/books/MajorTitles.csv. The folder is created first
# because to_csv fails when the target directory does not exist.
output_path = Path.cwd().parent/'data'/'books'/'MajorTitles.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)

In [None]:
# (number of rows, number of columns)
df.shape

In [121]:
# End the whole browser session. close() would only close the current window
# and leave the session running while a new driver is created afterwards.
driver.quit()

## Working With Explicit Waits

In [3]:
# Create an instance of ChromeOptions
options = Options()
options.add_argument("--headless")

# Pass the options to the Chrome driver
driver = webdriver.Chrome(service=Service(driver_path), options=options)

In [4]:
# Load variables from ../.env, then read the page URL from ENV_LINK
load_dotenv(Path.cwd().parent / '.env')
web = os.environ.get('ENV_LINK')
if not web:
    # Fail here with a clear message rather than later inside driver.get(None)
    raise RuntimeError("ENV_LINK is not set; define it in the .env file or the environment")

In [5]:
# Show the URL that will be opened
print(web)

https://www.audible.com/search?node=18573351011&ref_pageloadid=not_applicable&pf_rd_p=daf0f1c8-2865-4989-87fb-15115ba5a6d2&pf_rd_r=RG9WDH6A8G3HKV1HWGX0&plink=q4L5kdRIJSAxY9um&pageLoadId=4lRcejtapgJ27POW&creativeId=9648f6bf-4f29-4fb4-9489-33163c0bb63e&ref=a_search_l1_catRefs_7


In [6]:
# Navigate the browser to the URL held in `web`
driver.get(web)

In [7]:
# Locate the pagination list and its <li> entries
paginator = driver.find_element(By.XPATH, '//ul[contains(@class, "pagingElements")]')
pages = paginator.find_elements(By.TAG_NAME, 'li')
# NOTE(review): assumes the second-to-last entry holds the last page number
# (the final entry presumably being the "next" control) — confirm against the page
last_page = int(pages[-2].text)
last_page

25

In [8]:
# Relative XPath matching any <li> whose class attribute contains "productListItem"
product_selector = './/li[contains(@class, "productListItem")]'


In [9]:
# One list per output column; entries at the same index describe the same product
title = []
subtitle = []
author = []
length = []
language = []
url = []
release_date = []

current_page = 1

# Walk the result pages, appending one entry per product to every column list.
while current_page <= last_page:
    time.sleep(3)  # fixed pause (not a Selenium implicit wait) after a page change
    # Explicit waits: block (up to 10 s each) until the results container and
    # then the product items are present, instead of reading the DOM right away.
    WebDriverWait(driver, 10).until(ec.presence_of_element_located((By.CLASS_NAME, 'adbl-impression-container')))
    products = WebDriverWait(driver, 10).until(ec.presence_of_all_elements_located((By.XPATH, product_selector)))
    for product in products:
        title.append(catch_element_error(product, By.XPATH, ".//h3[contains(@class, 'bc-heading')]/a"))
        subtitle.append(catch_element_error(product, By.XPATH, ".//li[contains(@class, 'subtitle')]"))
        author.append(catch_element_error(product, By.XPATH, ".//li[contains(@class, 'authorLabel')]"))
        length.append(catch_element_error(product, By.XPATH, ".//li[contains(@class, 'runtimeLabel')]"))
        release_date.append(catch_element_error(product, By.XPATH, ".//li[contains(@class, 'releaseDateLabel')]"))
        language.append(catch_element_error(product, By.XPATH, ".//li[contains(@class, 'languageLabel')]"))
        url.append(catch_element_error(product, By.XPATH, ".//h3[contains(@class, 'bc-heading')]/a", link=True))
    current_page += 1
    if current_page > last_page:
        break  # the last page has been scraped; there is no next page to open
    try:
        next_page = driver.find_element(By.XPATH, '//span[contains(@class, "nextButton")]')
        next_page.click()
    except Exception as error:
        # Stop instead of silently scraping the same page again.
        print(f"Stopping after page {current_page - 1}: could not open the next page ({error})")
        break

In [14]:
# Column name -> list of scraped values, in the order the columns should appear
column_names = ('title', 'subtitle', 'author', 'length', 'language', 'release_date', 'url')
column_values = (title, subtitle, author, length, language, release_date, url)
data = dict(zip(column_names, column_values))

In [15]:
# Build a DataFrame with one column per key in `data`
df = pd.DataFrame(data)

In [16]:
# Preview the first rows of the DataFrame
df.head()

Unnamed: 0,title,subtitle,author,length,language,release_date,url
0,The Moon & His Tides,"Impossible Universe Trilogy, Book 1",By: Giana Darling,Length: 10 hrs and 59 mins,Language: English,Release date: 01-23-25,https://www.audible.com/ac/The-Moon-His-Tides-...
1,Caught Up,"Into Darkness, Book 2",By: Navessa Allen,Length: Not Yet Known,Language: English,Release date: 06-10-25,https://www.audible.com/ac/Caught-Up-Audiobook...
2,Madness,,By: Shantel Tessier,Length: 23 hrs and 33 mins,Language: English,Release date: 12-14-24,https://www.audible.com/ac/Madness-Audiobook/B...
3,Inner Voice,I Heard the Inner Voice of My Childhood Friend,By: Aine McCuin,Length: 57 mins,Language: English,Release date: 12-16-24,https://www.audible.com/ac/Inner-Voice-Audiobo...
4,Sabotage,A Dark Enemies to Lovers Romance,By: Shantel Tessier,Length: 7 hrs and 12 mins,Language: English,Release date: 01-02-25,https://www.audible.com/ac/Sabotage-Audiobook/...


In [17]:
# (number of rows, number of columns)
df.shape

(500, 7)

In [18]:
# Save the table to ../data/books/EroticTitles.csv. The folder is created first
# because to_csv fails when the target directory does not exist.
output_path = Path.cwd().parent/'data'/'books'/'EroticTitles.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)

In [13]:
# End the browser session so the headless Chrome started for this section is not left running
driver.quit()