# Imports

In [235]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

# Webscraper Class

In [294]:
# Webscraper class to scrape the data
class webscraper:
    """Selenium scraper for league-table pages and the club pages linked from them.

    Scraped rows are accumulated on the instance:

    * ``premier_league['premier_league']`` - one dict per team per season.
    * ``team_stats['team_stats']`` - reserved for team statistics
      (``get_team_stats`` currently only navigates; it stores nothing yet).
    * ``player_stats['player_stats']`` - reserved, not written by this class.
    """

    # Keys for the numeric cells of a league-table row, in the order the
    # cells appear in the row.
    _STAT_FIELDS = ('played', 'won', 'drawn', 'lost', 'gf', 'ga', 'gd', 'points')

    # CSS selector of the league table body.
    _TABLE_BODY = 'tbody[class="league-table__tbody isPL"]'

    # CSS selector of the season filter dropdown.
    _SEASON_DROPDOWN = 'div[data-dropdown-block="compSeasons"]'

    # Element IDs of the overlay buttons that have to be dismissed
    # (consent "accept" button and advert "close" button).
    _BLOCKER_IDS = ("onetrust-accept-btn-handler", "advertClose")

    def __init__(self):
        """Create empty containers for the scraped data."""
        self.premier_league = {'premier_league': []}
        self.team_stats = {'team_stats': []}
        self.player_stats = {'player_stats': []}

    def handle_blockers(self, driver):
        """Dismiss the known overlays on the current page, if they are present.

        This is best effort: an overlay that is missing or cannot be clicked
        is skipped. Only Selenium errors are ignored, so Ctrl-C and
        programming errors still propagate.
        """
        for element_id in self._BLOCKER_IDS:
            try:
                driver.find_element(By.ID, element_id).click()
            except WebDriverException:
                # Overlay not shown (or not clickable) - nothing to dismiss.
                continue

    def get_team_stats(self, driver):
        """Visit the 'Stats' tab of each of the 20 teams in the league table.

        ``driver`` must already show the league table page. No data is
        stored yet; the method only performs the navigation.
        """
        self.handle_blockers(driver)

        # Allow page to load
        time.sleep(5)

        for position in range(1, 21):
            # Re-locate the table on every pass: the previous iteration left
            # the page and came back, so older element references are stale.
            body = driver.find_element(By.CSS_SELECTOR, self._TABLE_BODY)
            row = body.find_element(By.CSS_SELECTOR, f'tr[data-position="{position}"]')
            team_cell = row.find_element(By.CSS_SELECTOR, 'td[class="league-table__team team"]')

            # Navigate to the team overview page
            team_cell.find_element(By.TAG_NAME, 'a').click()
            steps_back = 1  # how many history entries separate us from the table

            # Allow page to load
            time.sleep(2)

            # Locate navigation bar tabs
            nav = driver.find_element(By.CSS_SELECTOR, 'nav[class="club-navigation wrapper"]')
            tab_list = nav.find_element(By.CSS_SELECTOR, 'ul[class="tablist club-navigation__nav"]')

            for tab in tab_list.find_elements(By.TAG_NAME, 'li'):
                tab_link = tab.find_element(By.TAG_NAME, 'a')
                if tab_link.get_attribute('data-text') == 'Stats':
                    tab_link.click()
                    steps_back += 1
                    break

            # Allow page to load
            time.sleep(5)

            # Go back to the league table to move on to the next team. Only
            # go back as far as we actually navigated, so a missing 'Stats'
            # tab does not overshoot the table page.
            driver.execute_script(f"window.history.go(-{steps_back})")

            # Allow page to load
            time.sleep(1)

    @staticmethod
    def _parse_stat_cells(cell_texts):
        """Map the integer-looking cell texts of one table row to stat names.

        The cells carry no identifying class, so the values are recognised by
        order: the first eight texts that are (optionally negative) integers
        become played, won, drawn, lost, gf, ga, gd and points. Values stay
        strings, exactly as read from the page. Any field without a matching
        cell is ``None``, so a short row can neither raise nor inherit values
        from the previous team.
        """
        numeric = [
            text for text in cell_texts
            if text and text.lstrip('-').isdigit()
        ]
        stats = dict.fromkeys(webscraper._STAT_FIELDS)
        stats.update(zip(webscraper._STAT_FIELDS, numeric))  # zip stops after 8 values
        return stats

    def get_premier_league_data(self, driver, season):
        """Append every row of the currently displayed league table.

        ``season`` is stored verbatim on each appended record. Rows are
        discovered from the page instead of assuming 20 teams, because some
        seasons had 22. Records are appended in ascending league position.
        """
        body = driver.find_element(By.CSS_SELECTOR, self._TABLE_BODY)

        # Keep the first row seen for each position, as a per-position
        # find_element lookup would.
        rows_by_position = {}
        for row in body.find_elements(By.CSS_SELECTOR, 'tr[data-position]'):
            rows_by_position.setdefault(int(row.get_attribute('data-position')), row)

        for position in sorted(rows_by_position):
            row = rows_by_position[position]
            record = {
                'season': season,  # Season
                'team_name': row.get_attribute('data-filtered-table-row-name'),  # Team name
                'position': position,  # Position in league
            }
            cells = row.find_elements(By.TAG_NAME, 'td')
            record.update(
                self._parse_stat_cells(cell.get_attribute('innerHTML') for cell in cells)
            )
            self.premier_league['premier_league'].append(record)

    def get_all_premier_leagues_data(self, driver):
        """Select every individual season in the season filter and scrape its table.

        The 'All Seasons' entry is skipped.
        """
        self.handle_blockers(driver)

        # Allow page to load
        time.sleep(5)

        # Locate and open season filter dropdown
        dropdown = driver.find_element(By.CSS_SELECTOR, self._SEASON_DROPDOWN)
        dropdown.click()
        dropdown_open = True

        # Allow dropdown to load
        time.sleep(1)

        # Locate all season items in list
        container = dropdown.find_element(By.CSS_SELECTOR, 'div[class="dropdownListContainer"]')
        option_list = container.find_element(By.CSS_SELECTOR, 'ul[class="dropdownList"]')

        for season_li in option_list.find_elements(By.TAG_NAME, 'li'):
            # Read the label before clicking, so it does not depend on the
            # state of the element after the table has been re-filtered.
            season = season_li.get_attribute('data-option-name')
            if season == 'All Seasons':
                continue

            if not dropdown_open:
                # Assumes choosing a season closed the dropdown, so it has
                # to be opened again before the next option can be clicked.
                driver.find_element(By.CSS_SELECTOR, self._SEASON_DROPDOWN).click()

                # Allow dropdown to load
                time.sleep(1)

            # Click season and filter table
            season_li.click()
            dropdown_open = False

            # Allow league table to load
            time.sleep(10)

            self.get_premier_league_data(driver, season)

    def _run_scrape(self, url, scrape_page, label, result):
        """Open ``url`` in a fresh Chrome session and run ``scrape_page(driver)``.

        Returns ``result`` once the page has loaded, even if ``scrape_page``
        failed part-way (the error is printed and whatever was collected is
        kept). Returns ``None`` if the page itself could not be loaded. The
        browser is always closed.
        """
        driver = webdriver.Chrome()
        try:
            try:
                driver.get(url)  # Load the webpage
            except Exception as e:
                print("An error occurred:", str(e))
                return None

            try:
                scrape_page(driver)
            except Exception as e:
                # Report and carry on with the partial data.
                print(f"An error occurred {label}:", str(e))

            return result
        finally:
            # Close the browser window
            driver.quit()

    def scrape_all_premier_league_tables(self, url):
        """Scrape the league table of every season reachable from ``url``.

        Returns ``self.premier_league``, or ``None`` if ``url`` failed to load.
        """
        return self._run_scrape(
            url,
            self.get_all_premier_leagues_data,
            "get_premier_league",
            self.premier_league,
        )

    def scrape_all_team_stats(self, url):
        """Walk the team 'Stats' pages reachable from the league table at ``url``.

        Returns ``self.team_stats``, or ``None`` if ``url`` failed to load.
        """
        return self._run_scrape(
            url,
            self.get_team_stats,
            "get_team_stats",
            self.team_stats,
        )

# Scrape Premier League Table For Each Season

In [295]:
# Set url of the league tables page that the scraper starts from
url = 'https://www.premierleague.com/tables'

# Create webscraper from class
# NOTE(review): this rebinds the name `webscraper` from the class to an
# instance, so running this cell a second time fails ('webscraper' object is
# not callable) until the class cell is re-run.
webscraper = webscraper()

# Scrape premier league data for all seasons
# (the method returns None instead of the data dict if the page fails to load)
premier_league = webscraper.scrape_all_premier_league_tables(url)

# Save data to df: one row per scraped team/season record
df = pd.DataFrame(premier_league['premier_league'])

#### Export Data

In [299]:
# Export data to xlsx
import os

# Make sure the output folder exists first; otherwise to_excel raises and the
# scraped data is not saved.
os.makedirs('data', exist_ok=True)
df.to_excel('data/premier_league_data.xlsx', index=False)

# Scrape Team Stats For All Seasons