# In [None]:
import time
import re
import ctypes
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException

# Ask Windows not to idle-sleep while the (long-running) scrape is in progress.
# 0x80000002 == ES_CONTINUOUS (0x80000000) | ES_DISPLAY_REQUIRED (0x00000002).
# `ctypes.windll` only exists on Windows, so guard the call to keep the script
# runnable on other platforms instead of failing with AttributeError.
if hasattr(ctypes, "windll"):
    ctypes.windll.kernel32.SetThreadExecutionState(0x80000002)

# Start a Chrome session with default ChromeOptions; `driver` is the single
# browser instance shared by every step below.
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)
# NOTE(review): presumably maximised so the site renders its desktop layout — confirm.
driver.maximize_window()

def extract_text(driver, by=None, value=None, script=None, timeout=3, default='N/A'):
    """Return a piece of text from the current page, or ``default`` if unavailable.

    Works in one of two modes:

    * ``script`` is given (truthy): the JavaScript is run through
      ``driver.execute_script`` and its result is returned.  String results
      are stripped of surrounding whitespace (matching the locator mode);
      non-string results are returned unchanged.  A ``None`` result or any
      exception raised while running the script yields ``default``.
    * otherwise: wait up to ``timeout`` seconds for the element located by
      ``(by, value)`` to be present and return its stripped ``.text``.

    Args:
        driver: Object exposing the Selenium WebDriver API.
        by: Locator strategy (e.g. ``By.CLASS_NAME``); used only without ``script``.
        value: Locator value paired with ``by``.
        script: Optional JavaScript source to execute instead of locating an element.
        timeout: Seconds to wait for the element in locator mode.
        default: Value returned when nothing could be extracted.

    Returns:
        The extracted text (or raw script result), or ``default``.
    """
    if script:
        try:
            result = driver.execute_script(script)
        except Exception as e:
            # Best-effort: a failing script must not abort the whole scrape.
            print(f"Javascript execution failed: {e}")
            return default
        if result is None:
            # A script that returns null/undefined found nothing useful.
            return default
        return result.strip() if isinstance(result, str) else result

    try:
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((by, value))
        )
        return element.text.strip()
    except (TimeoutException, StaleElementReferenceException):
        # Element never appeared, or was re-rendered between lookup and read.
        return default

# Open the player index page.
driver.get('https://www.premierleague.com/players')

# Click the cookie-consent button (id 'onetrust-accept-btn-handler') if it
# becomes clickable within 10 seconds; otherwise carry on without it.
cookie_button_locator = (By.ID, 'onetrust-accept-btn-handler')
try:
    accept_cookies = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable(cookie_button_locator)
    )
    accept_cookies.click()
except (TimeoutException, NoSuchElementException) as e:
    print(f"No cookies popup found or could not dismiss it: {e}")
else:
    print("Cookies popup dismissed.")

# Scroll to load all players
SCROLL_PAUSE_TIME = 2
last_height = driver.execute_script("return document.body.scrollHeight")

# Jump to the bottom, pause, and re-measure; stop once a pause passes
# without the document height changing.
page_grew = True
while page_grew:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(SCROLL_PAUSE_TIME)
    new_height = driver.execute_script("return document.body.scrollHeight")
    page_grew = new_height != last_height
    last_height = new_height


# Make sure at least the player-name elements are present before reading
# them; give up (and close the browser) if they do not show within 30 seconds.
try:
    WebDriverWait(driver, 30).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, 'player__name'))
    )
except TimeoutException:
    print("Timeout occurred while waiting for player name class to be present")
    driver.quit()
    # `exit()` is a helper injected by the `site` module for interactive use;
    # raise SystemExit directly and report failure with a non-zero status.
    raise SystemExit(1)


# Collect the profile URL behind every player-name element on the index page.
player_links = driver.find_elements(By.CLASS_NAME, 'player__name')

links = []
for link in player_links:
    # Only the attribute read can hit a stale element, so keep the try narrow.
    try:
        player_url = link.get_attribute('href')
    except StaleElementReferenceException as e:
        print(f"Stale element reference: {e}")
        continue

    # get_attribute returns None when the element has no href; skip those
    # instead of crashing on `.startswith`.
    if not player_url:
        continue

    # Prefix a scheme onto URLs that do not already start with 'http'.
    if not player_url.startswith('http'):
        player_url = f"https:{player_url}"
    links.append(player_url)


players_data = []

# Loop-invariant helpers, built once instead of on every iteration.
# The shirt number is read with JavaScript rather than a Selenium locator.
shirt_number_script = """
    var element = document.querySelector('.player-header__player-number');
    return element ? element.innerText : 'N/A';
"""
# Matches a parenthesised integer such as "(27)" in the info text.
age_pattern = re.compile(r'\((\d+)\)')

# Visit each player page and scrape one record per player.
for i, player_url in enumerate(links):
    try:
        # Navigation is inside the try so that one page failing to load is
        # logged and skipped rather than aborting the run and losing the
        # records collected so far.
        driver.get(player_url)

        first_name = extract_text(driver, By.CLASS_NAME, 'player-header__name-first')
        last_name = extract_text(driver, By.CLASS_NAME, 'player-header__name-last')
        nationality = extract_text(driver, By.CLASS_NAME, 'player-info__player-country')
        shirt_number = extract_text(driver, script=shirt_number_script)
        club = extract_text(driver, By.CSS_SELECTOR, '.player-overview__info a')
        position = extract_text(driver, By.XPATH, "//div[@class='player-overview__col'][div[text()='Position']]/div[2]")

        # The second 'player-info__info' element is searched for "(<digits>)";
        # when there is no match the age is recorded as 'N/A'.
        age_info = extract_text(driver, By.XPATH, "(//div[@class='player-info__info'])[2]")
        age_match = age_pattern.search(age_info)
        age = age_match.group(1) if age_match else 'N/A'

        player_data = {
            'first_name': first_name,
            'last_name': last_name,
            'nationality': nationality,
            'age': age,
            'shirt_number': shirt_number,
            'club': club,
            'position': position
        }
        players_data.append(player_data)
        print(f"Player {i + 1}: {player_data}")
    except Exception as e:
        # Top-level boundary for a single player: report and move on.
        print(f"An error occurred while processing {player_url}: {e}")

# Write the scraped records to CSV. The browser is closed in `finally` so a
# failed write (e.g. the CSV being locked by another program) does not leave
# a Chrome process running.
try:
    df = pd.DataFrame(players_data)

    df.to_csv('premier_league_players.csv', index=False)

    print("Data saved to premier_league_players.csv")
finally:
    driver.quit()
