In [70]:
import time
import uuid
import sys
import os
import json
import numpy as np
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException


In [103]:
def _load_and_accept_cookies(
    start_url="https://www.besoccer.com/competition/scores/premier_league/2021/round1",
    chrome_binary="/Applications/Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta",
    delay=10,
) -> webdriver.Chrome:
    '''
    Open besoccer.com site and accepts cookies
    Set driver to google Chrome Beta version due to bug in driver v 103

    Arguments
    ---------
    start_url: str
        page opened to trigger the cookie banner
    chrome_binary: str
        path of the Chrome executable to drive; pass None or "" to let
        Selenium use its default browser binary
    delay: int
        maximum number of seconds to wait for the cookie banner to appear

    Returns
    -------
    driver: webdriver.Chrome
        always returned, even when the banner never shows up or has no AGREE
        button, so the caller keeps a handle to the running browser

    '''
    chrome_options = Options()
    if chrome_binary:
        chrome_options.binary_location = chrome_binary

    driver = webdriver.Chrome(options=chrome_options)
    driver.get(start_url)
    try:
        button_container = WebDriverWait(driver, delay).until(EC.presence_of_element_located(
            (By.XPATH, '//*[@class="qc-cmp2-summary-buttons"]')))
        print("Frame Ready!")
        # The leading "." keeps the search inside the banner's button container;
        # a bare "//" would match the first AGREE text anywhere in the document.
        accept_cookies_button = button_container.find_element(By.XPATH, ".//*[text()='AGREE']")
        accept_cookies_button.click()
        # Short pause after the click before handing the driver back.
        time.sleep(1)
    except TimeoutException:
        print("Loading took too much time!")
    except NoSuchElementException:
        # Best-effort like the timeout branch: raising here would drop the only
        # reference to an already running browser process.
        print("Cookie banner loaded but no AGREE button was found!")

    return driver

def _get_fixture_list_per_round(driver)->list:
    '''_get_fixture_list_per_round 
    For each Matchday, collect a list of all fixtures on that match day

    Arguments:
        driver: webdriver.Chrome

    Returns:
        list
    '''
    match_list = []
    match_list_body = driver.find_element(By.XPATH,'//*[@class="panel-body p0 match-list-new"]')
    match_list_container = match_list_body.find_elements(By.TAG_NAME,'a')
    for row in match_list_container:
            link = row.get_attribute('href')
            match_list.append(link)
    return match_list

def _collect_all_rounds_for_season(driver, league, season)-> list:
    '''_collect_all_rounds_for_season 
    Function to append all fixture list p/round into collective list for each season

    Arguments:
        driver -- webdriver.Chrome
        league -- name of the league for collection
        season -- name of the season for collection

    Returns:
        list
    '''

    season_match_list = []
    for round in range(1, 39, 1):
        driver.get(f"https://www.besoccer.com/competition/scores/{league}/{season}/round{round}")
        match_list = _get_fixture_list_per_round(driver)
        round_dictionary = {f"{round}": match_list}
        season_match_list.append(round_dictionary)
        time.sleep(1)
    
    print(f"Match lists collected for Season : {season} | League : {league}")
    
    return season_match_list


def _collect_season_fixture_lists(driver, league, season_list)-> list:
    '''_collect_season_fixture_lists 
    Function to append each season fixture list into one list

    Arguments:
        driver -- webdriver.Chrome
        league -- name of the league for collection
        season_list -- list containing all fixture links per season

    Returns:
        list
    '''
    league_list = []
    for season in season_list:
        season_match_list = _collect_all_rounds_for_season(driver, league, season)
        season_dictionary = {f"{season}": season_match_list}
        league_list.append(season_dictionary)
    
    return league_list

def _collect_match_data(driver, link, league, season, round)-> dict:
    '''_collect_match_data 
    Function to collect required match data from each fixture page.
    Required data is:
    - home team name
    - away team name
    - home goals scored (if match taken place)
    - away goals scored (if match taken place)
    - home elo rating
    - away elo rating




    Arguments:
        driver -- webdriver.Chrome
        link -- link to fixture page
        league -- name of the league for fixture
        season -- name of season for fixture
        round -- name of round of fixture

    Returns:
        dictionary containing required data
    '''
    
    driver.get(link + "/analysis")
    home_team_div = driver.find_element(By.XPATH,'//*[@itemprop="homeTeam"]')
    home_team = home_team_div.find_element(By.TAG_NAME, 'a').text

    away_team_div = driver.find_element(By.XPATH,'//*[@itemprop="awayTeam"]')
    away_team = away_team_div.find_element(By.TAG_NAME, 'a').text

    try:
        home_ELO_rating = driver.find_element(By.XPATH,'//*[@class="team1-c"]').text
        away_ELO_rating = driver.find_element(By.XPATH,'//*[@class="team2-c"]').text
    except NoSuchElementException:
        home_ELO_rating = "blank"
        away_ELO_rating = "blank"


    try:
        home_goals = driver.find_element(By.XPATH,'//*[@class="r1"]').text
        away_goals = driver.find_element(By.XPATH,'//*[@class="r2"]').text
    except NoSuchElementException:
        home_goals = "blank"
        away_goals = "blank"
    
    match_dictionary = {"Home_Team" : home_team, "Away_Team" : away_team, "Season" : season, "Round" : round, "League" : league, "Home_Goals" : home_goals, "Away_Goals" : away_goals, "ELO_Home": home_ELO_rating, "ELO_Away": away_ELO_rating}

    return match_dictionary

def _create_dataframe_for_collected_data(league_fixture_list, driver, league)-> pd.DataFrame:
    '''_create_dataframe_for_collected_data 
    Function to collect match data p/round - season - league
    Once fixture information collected - append to master pandas dataframe

    Arguments:
        league_fixture_list -- master list of all fixtures where collection required for each league
        driver -- webdriver.Chrome
        league -- name of the league for collection

    Returns:
        pandas dataframe
    '''
    scraped_fixture_info_df = pd.DataFrame(columns=[
    'Home_Team', 'Away_Team','Season', 'Round', 'League', 'Home_Goals', 
    'Away_Goals', 'ELO_Home', 'ELO_Away'
    ])
    for season_fixtures in league_fixture_list:
        for season_id, round_list in season_fixtures.items():
            for round_dictionary in round_list:
                for round_id, fixture_link_list in round_dictionary.items():
                    for fixture_link in fixture_link_list:
                        match_dictionary = _collect_match_data(driver, fixture_link, league, season_id, round_id)
                        match_dictionary_df = pd.DataFrame([match_dictionary])
                        scraped_fixture_info_df = pd.concat([scraped_fixture_info_df, match_dictionary_df], ignore_index=True)

                print(f"Data collected for Round : {round_id} | Season : {season_id} | League: {league}")

            print(f"Data collected for Season : {season_id} | League: {league}")

    return scraped_fixture_info_df






In [None]:
# Scrape configuration: competition slug used in the site URLs and the seasons to collect.
league = "premier_league"
season_list = ["2021", "2022", "2023"]

# Start the browser session (cookie banner handled), then gather the fixture
# links for every round of every configured season.
driver = _load_and_accept_cookies()
league_fixture_list = _collect_season_fixture_lists(driver, league, season_list)



In [104]:
# Visit every collected fixture link and assemble the scraped match data into one dataframe.
scraped_fixture_info_df = _create_dataframe_for_collected_data(
    league_fixture_list=league_fixture_list,
    driver=driver,
    league=league,
)


Data collected for Round : 1 | Season : 2021 | League: premier_league
Data collected for Round : 2 | Season : 2021 | League: premier_league
Data collected for Round : 3 | Season : 2021 | League: premier_league
Data collected for Round : 4 | Season : 2021 | League: premier_league
Data collected for Round : 5 | Season : 2021 | League: premier_league
Data collected for Round : 6 | Season : 2021 | League: premier_league
Data collected for Round : 7 | Season : 2021 | League: premier_league
Data collected for Round : 8 | Season : 2021 | League: premier_league
Data collected for Round : 9 | Season : 2021 | League: premier_league
Data collected for Round : 10 | Season : 2021 | League: premier_league
Data collected for Round : 11 | Season : 2021 | League: premier_league
Data collected for Round : 12 | Season : 2021 | League: premier_league
Data collected for Round : 13 | Season : 2021 | League: premier_league
Data collected for Round : 14 | Season : 2021 | League: premier_league
Data collected 

In [105]:
# Show the scraped fixtures as the cell output (the rendered frame below is abbreviated with a "..." row).
scraped_fixture_info_df

Unnamed: 0,Home_Team,Away_Team,Season,Round,League,Home_Goals,Away_Goals,ELO_Home,ELO_Away
0,Fulham,Arsenal,2021,1,premier_league,0,3,69,90
1,Crystal Palace,Southampton,2021,1,premier_league,1,0,74,79
2,Liverpool,Leeds United,2021,1,premier_league,4,3,96,69
3,West Ham,Newcastle,2021,1,premier_league,0,2,76,74
4,West Bromwich Albion,Leicester,2021,1,premier_league,0,3,79,83
...,...,...,...,...,...,...,...,...,...
1135,Everton,AFC Bournemouth,2023,38,premier_league,blank,blank,blank,blank
1136,Leeds United,Tottenham Hotspur,2023,38,premier_league,blank,blank,blank,blank
1137,Leicester,West Ham,2023,38,premier_league,blank,blank,blank,blank
1138,Man. Utd,Fulham,2023,38,premier_league,blank,blank,blank,blank


In [106]:
#path = r'/Users/tom/Documents/Coding/AiCore/Projects/4. Football Match Outcome Predictor /Scraped Datasets/raw_data.csv'
#scraped_fixture_info_df.to_csv(path, index=False)