In [None]:
%pip install selenium

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

In [3]:
# Premier League results page; the scraping cells below pass this URL to driver.get().
results_url = "https://www.premierleague.com/results"

In [4]:
def get_driver(headless: bool = False, chromedriver_path: str = './chromedriver.exe'):
    """Start a Chrome WebDriver session and return it.

    Args:
        headless: When True, Chrome is started with the ``--headless`` flag
            so that no browser window is opened.
        chromedriver_path: Path to the chromedriver executable passed to the
            Selenium ``Service``. Defaults to ``./chromedriver.exe`` (the
            value that used to be hard-coded); override it on platforms where
            the binary has a different name or location.

    Returns:
        The ``webdriver.Chrome`` instance. The caller owns the session and
        must call ``quit()`` on it when finished.
    """
    # Set headless mode if specified
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('--headless')

    # Start and return the chrome instance
    service = Service(executable_path=chromedriver_path)
    return webdriver.Chrome(service=service, options=options)

In [5]:
driver = get_driver()

try:
    # Navigate to the url
    driver.get(results_url)
finally:
    # Quit the driver even if navigation fails, so that no Chrome/chromedriver
    # process is left running in the background
    driver.quit()

Using the `By` class, we can locate elements with any of the following strategies:

- `ID = "id"`
- `NAME = "name"`
- `XPATH = "xpath"`
- `LINK_TEXT = "link text"`
- `PARTIAL_LINK_TEXT = "partial link text"`
- `TAG_NAME = "tag name"`
- `CLASS_NAME = "class name"`
- `CSS_SELECTOR = "css selector"`

In [6]:
from selenium.webdriver.common.by import By

In [7]:
# using find_element() and click() APIs
accept_cookies_id, close_advert_id = "onetrust-accept-btn-handler", "advertClose"

driver = get_driver()
try:
    driver.get(results_url)

    # find_element() does not wait for the page: if an element has not been
    # rendered yet it raises NoSuchElementException straight away.
    driver.find_element(By.ID, accept_cookies_id).click()
    driver.find_element(By.ID, close_advert_id).click()
finally:
    # Always close the browser, even when one of the lookups above fails
    driver.quit()

### Explicit and Implicit Waits

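- **Explicit** - wait, up to a time limit, for a specific condition to become true for a specific element (for example, until a button is present on the page).
- **Implicit** - a single, driver-wide time limit that is applied to every element lookup.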

Use explicit - gives us more customization over the code, and can avoid problems from wait times being too great or too little on individual cases.

### Expected Conditions

Can be used in conjunction with waits - we wait EITHER for an expected condition to become true, or until the time limit is exceeded.

Examples of Expected Conditions (EC):
- title_is
- title_contains
- presence_of_element_located


In [8]:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

accept_cookies_id, close_advert_id, invalid_element_id = "onetrust-accept-btn-handler", "advertClose", "not-an-element"

driver = get_driver()
try:
    driver.get(results_url)

    # Wait (up to 10 seconds) until each button can actually be clicked, not
    # merely until it exists in the DOM. click() returns None, so there is
    # nothing useful to store from these calls.
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, accept_cookies_id))
    ).click()

    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, close_advert_id))
    ).click()

    # THE CODE BELOW WILL NOT WORK
    # Uncomment it to see how waiting for an invalid element fails: the wait
    # gives up after 10 seconds and raises a TimeoutException
    # WebDriverWait(driver, 10).until(
    #     EC.presence_of_element_located((By.ID, invalid_element_id))
    # ).click()
finally:
    # Runs on success and on failure, so the browser is never left open
    driver.quit()

# Using XPaths
We can also parse through the DOM with XPaths.

More information: 
- https://www.w3schools.com/xml/xpath_intro.asp
- https://scrapfly.io/blog/parsing-html-with-xpath/

# Scrolling using ActionChains
We can use the `ActionChains` class to queue up browser interactions (such as scrolling, mouse movements and key presses) and then run them with `perform()`.

In [34]:
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup as bs
import pandas as pd

accept_cookies_id, close_advert_id = "onetrust-accept-btn-handler", "advertClose"

# Date heading used as the "everything has loaded" marker; it is the date of
# the last row in the scraped table. Update it when scraping another season.
date_string_to_find = "Friday 11 August 2023"

col_names = "date", "home", "away", "home_score", "away_score", "stadium"

driver = get_driver()
try:
    # --- 1. Drive the browser and grab the fully loaded HTML -----------------
    try:
        driver.get(results_url)
        wait = WebDriverWait(driver, 10)

        # Dismiss the cookie banner and the advert overlay. Wait until each
        # button is clickable (not just present) before clicking it.
        wait.until(EC.element_to_be_clickable((By.ID, accept_cookies_id))).click()
        wait.until(EC.element_to_be_clickable((By.ID, close_advert_id))).click()

        # Scroll to footer to activate JavaScript load of data
        footer = wait.until(
            EC.presence_of_element_located((By.TAG_NAME, "footer"))
        )
        ActionChains(driver).scroll_to_element(footer).perform()

        # waiting until all the data has loaded onto the page
        date_xpath = f"//*[contains(text(),'{date_string_to_find}')]"
        wait.until(EC.presence_of_element_located((By.XPATH, date_xpath)))

        # Once we fetch the HTML, we can use beautifulsoup to parse through the webpage
        pl_html = driver.find_element(By.TAG_NAME, "html").get_attribute("innerHTML")
    finally:
        # Close the browser whether or not the steps above succeeded;
        # a wait that times out must not leave Chrome running.
        driver.quit()

    # --- 2. Parse the HTML into parallel columns -----------------------------
    soup = bs(pl_html, "html.parser")

    # One date entry per fixture found inside each date container.
    # NOTE(review): this assumes the fixtures counted here are the same ones,
    # in the same order, as those selected for `match_list` below. If the
    # counts differ, pd.DataFrame raises and the error is reported below.
    dates = []
    for date in soup.select(".fixtures__date-container"):
        match_date = date.find("time").text
        dates.extend([match_date] * len(date.select(".match-fixture")))

    match_list = soup.select(".matchList > .match-fixture")
    home = [match['data-home'] for match in match_list]
    away = [match['data-away'] for match in match_list]
    stadium = [match['data-venue'] for match in match_list]

    # Score text is split on "-" into [home, away]; values stay strings
    scores = [score.text.split("-") for score in soup.select(".match-fixture__score")]
    home_score = [score[0] for score in scores]
    away_score = [score[1] for score in scores]

    # --- 3. Assemble the table -----------------------------------------------
    cols = [dates, home, away, home_score, away_score, stadium]
    data = dict(zip(col_names, cols))

    df = pd.DataFrame(data=data)
    print(df)
except Exception as e:
    print("There was an error.")
    # Include the exception type: Selenium timeouts carry an almost empty message
    print(f"{type(e).__name__}: {e}")

                          date           home            away home_score   
0     Tuesday 20 February 2024       Man City       Brentford          1  \
1      Monday 19 February 2024        Everton  Crystal Palace          1   
2      Sunday 18 February 2024  Sheffield Utd        Brighton          0   
3      Sunday 18 February 2024          Luton         Man Utd          1   
4    Saturday 17 February 2024      Brentford       Liverpool          1   
..                         ...            ...             ...        ...   
244    Saturday 12 August 2023       Brighton           Luton          4   
245    Saturday 12 August 2023        Everton          Fulham          0   
246    Saturday 12 August 2023  Sheffield Utd  Crystal Palace          0   
247    Saturday 12 August 2023      Newcastle     Aston Villa          5   
248      Friday 11 August 2023        Burnley        Man City          0   

    away_score                             stadium  
0            0          Etihad Sta

# Scrolling using

# Page Object Model Design Pattern

For more information:
- https://selenium-python.readthedocs.io/