# Imports

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd

# Webscraper Classes

### Premier League Table Webscraper

In [332]:
# Webscraper class to scrape the data
class LeagueTableScraper:
    """Scrape the Premier League table for every season offered by the page.

    Rows are accumulated in ``self.premier_league['premier_league']`` as one
    dictionary per team per season (see ``append_data`` for the keys).
    """

    # Names of the numeric columns, in the order they appear in a table row
    STAT_FIELDS = ('played', 'won', 'drawn', 'lost', 'gf', 'ga', 'gd', 'points')

    # Initialise data containers
    def __init__(self):
        self.premier_league = {'premier_league': []}

    # Function to handle ad blockers
    def handle_blockers(self, driver):
        """Click away the overlays that can cover the page, if they are present.

        This is best-effort: an overlay that is missing or cannot be clicked is
        skipped. Only Selenium errors are suppressed, so Ctrl-C and genuine
        programming errors still propagate.
        """
        # Imported here so the exception class is only needed when a browser is driven
        from selenium.common.exceptions import WebDriverException

        # presumably the cookie-consent button and the advert close button
        for element_id in ("onetrust-accept-btn-handler", "advertClose"):
            try:
                driver.find_element(By.ID, element_id).click()
            except WebDriverException:
                # Overlay not shown (or not clickable) - nothing to dismiss
                continue

    # Locate tags for team
    def locate_team_tags(self, body, position):
        """Return the ``<tr>`` for ``position`` and the list of its ``<td>`` cells."""
        tr = body.find_element(By.CSS_SELECTOR, f'tr[data-position="{position}"]')
        td = tr.find_elements(By.TAG_NAME, 'td')

        return tr, td

    # Locate body element
    def locate_body(self, driver):
        """Return the ``<tbody>`` element of the league table."""
        return driver.find_element(By.CSS_SELECTOR, 'tbody[class="league-table__tbody isPL"]')

    # Get team information
    def get_team_info(self, td):
        """Extract the eight numeric statistics from a row's cells.

        The cells carry no identifying classes, so the values are recognised by
        order: the first eight cells whose innerHTML is an integer (optionally
        prefixed with '-') are taken as played, won, drawn, lost, gf, ga, gd
        and points.

        Returns:
            An 8-tuple of strings ``(played, won, drawn, lost, gf, ga, gd, points)``.

        Raises:
            ValueError: if the row holds fewer than eight numeric cells.
        """
        wanted = len(self.STAT_FIELDS)
        values = []

        # Loop through each tag
        for cell in td:
            # Get the text in the tag
            text = cell.get_attribute('innerHTML')

            # Keep only integer cells; everything else (name, form, ...) is skipped
            if text is not None and text.lstrip('-').isdigit():
                values.append(text)

                # All statistics found - later cells are irrelevant
                if len(values) == wanted:
                    break

        if len(values) < wanted:
            raise ValueError(
                f"Expected {wanted} numeric cells in table row, found {len(values)}"
            )

        return tuple(values)

    # Append data
    def append_data(self, season, team_name, position, played, won, drawn, lost, gf, ga, gd, points):
        """Append one team's row for one season to ``self.premier_league``."""
        self.premier_league['premier_league'].append(
            {
                'season': season,  # Season
                'team_name': team_name,  # Team name
                'position': position,  # Position in league
                'played': played,  # Games played
                'won': won,  # Games won
                'drawn': drawn,  # Games drawn
                'lost': lost,  # Games lost
                'gf': gf,  # Goals scored
                'ga': ga,  # Goals conceded
                'gd': gd,  # Goal difference
                'points': points,  # Team league points
            }
        )

    # Open season filter dropdown
    def open_season_dropdown(self, driver):
        """Click the season filter dropdown and return its element."""
        dropdown = driver.find_element(By.CSS_SELECTOR, 'div[data-dropdown-block="compSeasons"]')
        dropdown.click()  # Click season dropdown

        return dropdown

    # Find all the options in the season filter dropdown
    def find_seasons(self, dropdown):
        """Return the list of ``<li>`` option elements inside the season dropdown."""
        seasons_div = dropdown.find_element(By.CSS_SELECTOR, 'div[class="dropdownListContainer"]')
        season_ul = seasons_div.find_element(By.CSS_SELECTOR, 'ul[class="dropdownList"]')

        return season_ul.find_elements(By.TAG_NAME, 'li')

    # Function to get a single premier league table data
    def get_premier_league_data(self, driver, season):
        """Scrape every row of the currently displayed table and store it.

        Positions are read from 1 upwards until no row exists for the next
        position, so the table may hold any number of teams rather than a
        fixed 20.
        """
        # Imported here so the exception class is only needed when a browser is driven
        from selenium.common.exceptions import NoSuchElementException

        # Locate body
        body = self.locate_body(driver)

        position = 1
        while True:
            # Locate elements; a missing row marks the end of the table
            try:
                tr, td = self.locate_team_tags(body, position)
            except NoSuchElementException:
                break

            # Get team name
            team_name = tr.get_attribute('data-filtered-table-row-name')

            # Get team info and append data
            self.append_data(season, team_name, position, *self.get_team_info(td))

            position += 1

        if position == 1:
            # Make an empty table visible instead of silently producing no rows
            print("No league table rows found for season:", season)

    # Get the premier league data for all seasons
    def get_all_premier_leagues_data(self, driver):
        """Select each season in the filter in turn and scrape its table."""
        # Allow page to load
        time.sleep(5)

        # Locate and open season filter dropdown
        dropdown = self.open_season_dropdown(driver)

        # Allow dropdown to load
        time.sleep(1)

        # Locate all season items in list
        seasons_li = self.find_seasons(dropdown)

        # The dropdown was just opened; it has to be re-opened after a season is clicked
        dropdown_open = True

        # Loop through each season filter
        for season_li in seasons_li:
            if not dropdown_open:
                # Locate and open season filter dropdown
                self.open_season_dropdown(driver)
                dropdown_open = True

                # Allow dropdown to load
                time.sleep(1)

            # Read the season name once, before the click reloads the table
            season = season_li.get_attribute('data-option-name')

            # Only click filter for individual seasons and not 'All Seasons'
            if season == 'All Seasons':
                continue

            # Click season and filter table
            season_li.click()
            dropdown_open = False

            # Allow league table to load
            time.sleep(10)

            # Get premier league data for season
            self.get_premier_league_data(driver, season)

    # Function to scrape all premier league tables from all seasons
    def scrape_data(self, url):
        """Open ``url`` in Chrome, scrape every season and return the data.

        Returns:
            ``self.premier_league`` (possibly partially filled if scraping
            failed midway), or ``None`` if the page could not be opened or
            prepared at all. Errors are printed rather than raised.
        """
        # Initialise Selenium webdriver
        driver = webdriver.Chrome()

        try:
            driver.get(url)  # Load the webpage

            # Allow page load
            time.sleep(2)

            # Handle ad blockers
            self.handle_blockers(driver)

            # Allow page load
            time.sleep(2)

            try:
                # Get the premier league data for all seasons
                self.get_all_premier_leagues_data(driver)
            except Exception as e:
                # Keep whatever was scraped so far and report the failure
                print("An error occurred get_premier_league:", str(e))

            # Return data for premier league data for all seasons
            return self.premier_league

        except Exception as e:
            # Print error
            print("An error occurred:", str(e))
            return None

        finally:
            # Close the browser window
            driver.quit()

### Team Stats Webscraper

In [77]:
# Webscraper class to scrape the data
class TeamStatsScraper:
    """Scrape per-season statistics for every team in the league table.

    For each table position the scraper opens the team's page, switches to the
    'Stats' tab and reads the statistics for every season in the season
    filter. Results accumulate in ``self.team_stats['team_stats']`` as one
    dictionary per team per season.
    """

    # Names of the headline statistics that are recorded
    TOP_LEVEL_STATS = ('Matches played', 'Wins', 'Losses')

    # Initialise data containers
    def __init__(self):
        self.team_stats = {'team_stats': []}
        # Number of clicks made via click_filter since the league table was
        # left; used to step the same number of entries back in history
        self.page_change = 0

    # Function to handle ad blockers
    def handle_blockers(self, driver):
        """Click away the overlays that can cover the page, if they are present.

        This is best-effort: an overlay that is missing or cannot be clicked is
        skipped. Only Selenium errors are suppressed, so Ctrl-C and genuine
        programming errors still propagate.
        """
        # Imported here so the exception class is only needed when a browser is driven
        from selenium.common.exceptions import WebDriverException

        # presumably the cookie-consent button and the advert close button
        for element_id in ("onetrust-accept-btn-handler", "advertClose"):
            try:
                driver.find_element(By.ID, element_id).click()
            except WebDriverException:
                # Overlay not shown (or not clickable) - nothing to dismiss
                continue

    # Locate <a> tags for team
    def locate_team_links(self, body, position):
        """Find the link in the row at ``position`` and click it."""
        # Locate a tag for each team's overview page
        tr = body.find_element(By.CSS_SELECTOR, f'tr[data-position="{position}"]')
        td = tr.find_element(By.CSS_SELECTOR, 'td[class="league-table__team team"]')
        a = td.find_element(By.TAG_NAME, 'a')

        # Click and navigate to team overview page
        self.click_filter(a)

    # Locate the nav element and its tabs
    def locate_nav_tabs(self, driver):
        """Return the ``<li>`` tab elements of the club navigation bar."""
        nav = driver.find_element(By.CSS_SELECTOR, 'nav[class="club-navigation wrapper"]')
        ul = nav.find_element(By.CSS_SELECTOR, 'ul[class="tablist club-navigation__nav"]')

        return ul.find_elements(By.TAG_NAME, 'li')

    # Click the 'Stats' tab in the nav
    def click_stats_tab(self, li):
        """Click the first tab whose link has ``data-text="Stats"``, if any."""
        # Loop through navigation tab names
        for item in li:
            # Find link for tab
            a = item.find_element(By.TAG_NAME, 'a')

            # Find link for 'Stats' page
            if a.get_attribute('data-text') == 'Stats':
                # Click 'Stats' tab
                self.click_filter(a)

                # Found 'Stats' tab so stop searching
                break

    # Open seasons filter dropdown
    def open_seasons_dropdown(self, driver):
        """Click the season filter dropdown and return its element."""
        dropdown = driver.find_element(By.CSS_SELECTOR, 'div[data-dropdown-block="compSeasons"]')
        dropdown.click()  # Click season dropdown

        return dropdown

    # Locate the options for the season dropdown filter
    def locate_seasons(self, dropdown):
        """Return the list of ``<li>`` option elements inside the season dropdown."""
        seasons_div = dropdown.find_element(By.CSS_SELECTOR, 'div[class="dropdownListContainer"]')
        season_ul = seasons_div.find_element(By.CSS_SELECTOR, 'ul[class="dropdownList"]')

        return season_ul.find_elements(By.TAG_NAME, 'li')

    # Click to change page
    def click_filter(self, tag):
        """Click ``tag`` and count the click in ``self.page_change``."""
        tag.click()
        self.page_change += 1

    # Get top level stats value
    def get_top_level_stats_value(self, container):
        """Return the innerHTML of the value ``<span>`` in a headline stat container."""
        stat_div = container.find_element(By.CSS_SELECTOR, 'div[class="all-stats__top-stat"]')
        span = stat_div.find_element(By.TAG_NAME, 'span')

        return span.get_attribute('innerHTML')

    # Get regular stats value
    def get_regular_stats_value(self, div_stat):
        """Return the innerHTML of the value ``<span>`` in a regular stat container."""
        stat_span = div_stat.find_element(By.CSS_SELECTOR, 'span[class="all-stats__regular-stat"]')
        value_span = stat_span.find_element(By.TAG_NAME, 'span')

        return value_span.get_attribute('innerHTML')

    # Get top level stats
    def get_top_level_stats(self, driver, dict_):
        """Add the headline stats named in ``TOP_LEVEL_STATS`` to ``dict_``.

        Keys are the stat names exactly as they appear on the page.
        """
        # Locate elements
        div_stats_wrapper = driver.find_element(By.CSS_SELECTOR, 'div[class="all-stats wrapper"]')
        div_all_stats = div_stats_wrapper.find_element(By.CSS_SELECTOR, 'div[data-widget="all-stats"]')
        stats_top_list = div_all_stats.find_element(By.CSS_SELECTOR, 'div[class="all-stats__top-list"]')

        # Locate stats containers
        stats_top_containers = stats_top_list.find_elements(By.CSS_SELECTOR, 'div[class="all-stats__top-stat-container"]')

        # Loop through containers and get values
        for container in stats_top_containers:
            # Get innerHTML for stats name
            name_div = container.find_element(By.CSS_SELECTOR, 'div[class="all-stats__top-stat-name"]')
            stat_name = name_div.get_attribute('innerHTML')

            # Only the selected headline stats are recorded
            if stat_name in self.TOP_LEVEL_STATS:
                dict_[stat_name] = self.get_top_level_stats_value(container)

    # Get regular stats
    def get_regular_stats(self, driver, dict_):
        """Add every regular stat on the page to ``dict_``, keyed by its name."""
        # Locate elements
        main_stats_ul = driver.find_element(By.CSS_SELECTOR, 'ul[class="all-stats__regular-list block-list-4 block-list-2-m"]')

        # Locate list of high level tags (Attack, Defence...)
        main_stats_li = main_stats_ul.find_elements(By.TAG_NAME, 'li')

        # Loop through high level tags
        for li in main_stats_li:
            # Locate elements
            div_li = li.find_element(By.CSS_SELECTOR, 'div[class="all-stats__list-container"]')

            # Locate stats containers
            div_stats = div_li.find_elements(By.CSS_SELECTOR, 'div[class="all-stats__regular-stat-container"]')

            # Loop through stats containers and find values
            for div_stat in div_stats:
                # Get innerHTML for stats name
                name_span = div_stat.find_element(By.CSS_SELECTOR, 'span[class="all-stats__regular-stat-name"]')
                stat_name = name_span.get_attribute('innerHTML')

                # Add value to dict
                dict_[stat_name] = self.get_regular_stats_value(div_stat)

    # Get season value
    def get_season(self, season_li, dict_):
        """Store the option's ``data-option-name`` in ``dict_['Season']``."""
        dict_['Season'] = season_li.get_attribute('data-option-name')

    # Get team name
    def get_team_name(self, driver, dict_):
        """Store the club header's ``<h2>`` innerHTML in ``dict_['Team name']``."""
        # Locate element
        header_container = driver.find_element(By.CSS_SELECTOR, 'header[data-widget="club-header"]')
        header_div = header_container.find_element(By.CSS_SELECTOR, 'div[class="club-header__text-content"]')
        header_h2 = header_div.find_element(By.TAG_NAME, 'h2')

        # Add value to dict
        dict_['Team name'] = header_h2.get_attribute('innerHTML')

    # Get stats data for team name, season and all other stats
    def get_stats_data(self, driver, dict_, season_li):
        """Fill ``dict_`` with team name, season, headline stats and regular stats."""
        # Get team name
        self.get_team_name(driver, dict_)

        # Get season text
        self.get_season(season_li, dict_)

        # Get top level stats
        self.get_top_level_stats(driver, dict_)

        # Get regular stats
        self.get_regular_stats(driver, dict_)

    # Function to get team stats
    def get_team_stats(self, driver):
        """Visit the team at each of the 20 table positions and scrape all its seasons."""
        # For each position in table
        for position in range(1, 21):
            # Count page changes
            self.page_change = 0

            # Locate body afresh on every pass: the reference obtained before
            # navigating away may be stale once the table page is shown again
            body = driver.find_element(By.CSS_SELECTOR, 'tbody[class="league-table__tbody isPL"]')

            # Locate and click tag for team's overview page
            self.locate_team_links(body, position)

            # Allow page to load
            time.sleep(5)

            # Locate navigation bar tabs
            li = self.locate_nav_tabs(driver)

            # Click 'Stats' tab
            self.click_stats_tab(li)

            # Allow page to load
            time.sleep(5)

            # Locate and open season filter dropdown
            dropdown = self.open_seasons_dropdown(driver)

            # Allow dropdown to load
            time.sleep(1)

            # Locate all season items in list
            seasons_li = self.locate_seasons(dropdown)

            # The dropdown was just opened; it has to be re-opened after a season is clicked
            dropdown_open = True

            # Loop through each season filter
            for season_li in seasons_li:
                if not dropdown_open:
                    # Locate and open season filter dropdown
                    self.open_seasons_dropdown(driver)
                    dropdown_open = True

                    # Allow dropdown to load
                    time.sleep(1)

                # Only click filter for individual seasons and not 'All Seasons'
                if season_li.get_attribute('data-option-name') == 'All Seasons':
                    continue

                # Click season and filter stats
                self.click_filter(season_li)
                dropdown_open = False

                # Allow stats to load
                time.sleep(4)

                dict_ = {}

                self.get_stats_data(driver, dict_, season_li)

                # Append data to 'team_stats' using a dictionary
                self.team_stats['team_stats'].append(dict_)

            # Allow page to load
            time.sleep(5)

            # Go back to league table to move onto next team
            driver.execute_script(f'window.history.go(-{self.page_change})')

            # Allow page to load
            time.sleep(1)

    # Scrape team stats
    def scrape_data(self, url):
        """Open ``url`` in Chrome, scrape every team's stats and return the data.

        Returns:
            ``self.team_stats`` (possibly partially filled if scraping failed
            midway), or ``None`` if the page could not be opened or prepared
            at all. Errors are printed rather than raised.
        """
        # Initialise Selenium webdriver
        driver = webdriver.Chrome()

        try:
            # Load the webpage
            driver.get(url)

            # Allow page load
            time.sleep(10)

            # Handle ad blockers
            self.handle_blockers(driver)

            # Allow page load
            time.sleep(5)

            try:
                # Get team stats
                self.get_team_stats(driver)
            except Exception as e:
                # Keep whatever was scraped so far and report the failure
                print("An error occurred get_team_stats:", str(e))

            # Return team stats data
            return self.team_stats

        except Exception as e:
            print("An error occurred:", str(e))
            return None

        finally:
            # Close the browser window
            driver.quit()

# Scrape Premier League Table For Each Season

In [333]:
# Create webscraper from class
# (the instance gets its own name so the class is not overwritten and the
# cell can be run again)
league_table_scraper = LeagueTableScraper()

# Set url
url = 'https://www.premierleague.com/tables'

# Scrape premier league data for all seasons
premier_league = league_table_scraper.scrape_data(url)

# scrape_data returns None when the page could not be opened
if premier_league is None:
    raise RuntimeError('Scraping the league table failed; see the error printed above')

# Save data to df
df_premier_league = pd.DataFrame(premier_league['premier_league'])

#### Export Data

In [299]:
import os

# Make sure the output folder exists, otherwise the export fails
os.makedirs('data', exist_ok=True)

# Export data to xlsx
df_premier_league.to_excel('data/premier_league_data.xlsx', index=False)

# Scrape Team Stats For All Seasons

In [75]:
# Create webscraper from class
# (the instance gets its own name so the class is not overwritten and the
# cell can be run again)
team_stats_scraper = TeamStatsScraper()

# Set url
url = 'https://www.premierleague.com/tables'

# Scrape team stats data for all seasons
team_stats = team_stats_scraper.scrape_data(url)

# scrape_data returns None when the page could not be opened
if team_stats is None:
    raise RuntimeError('Scraping the team stats failed; see the error printed above')

# Save data to df
df_team_stats = pd.DataFrame(team_stats['team_stats'])

innerHTML Liverpool
here 1
here 2
here 3
here 4
here 5
here 6
outerHTLM Goals
here 7
here 8
here 5
here 6
outerHTLM Goals per match
here 7
here 8
here 5
here 6
outerHTLM Shots
here 7
here 8
here 5
here 6
outerHTLM Shots on target
here 7
here 8
here 5
here 6
outerHTLM Shooting accuracy %
here 7
here 8
here 5
here 6
outerHTLM Penalties scored
here 7
here 8
here 5
here 6
outerHTLM Big Chances Created
here 7
here 8
here 5
here 6
outerHTLM Hit woodwork
here 7
here 8
here 3
here 4
here 5
here 6
outerHTLM Passes
here 7
here 8
here 5
here 6
outerHTLM Passes per match
here 7
here 8
here 5
here 6
outerHTLM Pass accuracy %
here 7
here 8
here 5
here 6
outerHTLM Crosses
here 7
here 8
here 5
here 6
outerHTLM Cross accuracy %
here 7
here 8
here 3
here 4
here 5
here 6
outerHTLM Clean sheets
here 7
here 8
here 5
here 6
outerHTLM Goals Conceded
here 7
here 8
here 5
here 6
outerHTLM Goals conceded per match
here 7
here 8
here 5
here 6
outerHTLM Saves
here 7
here 8
here 5
here 6
outerHTLM Tackles
here 7
h