## Scraping Football Stats Data With Selenium and BeautifulSoup.

The goal of this project is to scrape football stats data that will be used for a KMeans clustering analysis. Web scraping is an essential data collection tool for cases where an API is not readily available.

#### Why Selenium
Selenium is an automated testing tool for web pages. Selenium is used here because the site to be scraped renders some of its pages with JavaScript, so using the requests library to download the HTML doesn't give the relevant data. The selenium library will be used to render the JavaScript in the browser before the HTML is downloaded.

The BeautifulSoup library will be used to parse the HTML data into a `pandas.DataFrame` object, and the stats file, as well as other files such as the club squads data, will be stored as CSV files. A data dictionary was scraped from the site too and stored as a CSV file.

The key reason for saving each HTML file before parsing it was to increase the interval between requests sent to the site's server.

In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import re
import time
import json

In [2]:
def get_stats(url: str, filename: str) -> None:
    """Render a stats page in Chrome and save its HTML to ``{filename}.html``.

    Args:
        url: Address of the page to load in the browser.
        filename: Output path without the ``.html`` extension.
    """
    output_path = f"{filename}.html"
    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight)"

    browser = webdriver.Chrome()
    with browser:
        browser.get(url)
        # Scroll to the end of the page, then pause briefly before
        # reading the rendered source.
        browser.execute_script(scroll_to_bottom)
        time.sleep(1)
        html = browser.page_source

    with open(output_path, "w+", encoding="utf-8") as html_file:
        html_file.write(html)
        

In [3]:
def parse_html(filename: str, id_: str) -> None:
    """Convert a downloaded stats HTML file into a CSV file.

    Reads ``{filename}.html``, picks the ``<table>`` whose HTML ``id`` is
    ``id_``, drops every ``<tr>`` with class ``thead`` and writes the
    remaining table to ``{filename}.csv`` (without the index column).

    Args:
        filename: Path of the HTML file without the ``.html`` extension. The
            CSV is written with the same stem.
        id_: Value of the ``id`` attribute of the table to extract.

    Raises:
        FileNotFoundError: If ``{filename}.html`` does not exist.
        ValueError: If the page has no table with the requested id.
    """
    from io import StringIO

    with open(f"{filename}.html", encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    player_table = soup.find("table", id=id_)
    if player_table is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f"no table with id {id_!r} found in {filename}.html")

    # Remove every <tr class="thead"> row so it does not end up in the data
    # (presumably header rows repeated inside the table body).
    for header_row in player_table.find_all("tr", class_="thead"):
        header_row.decompose()

    # read_html is given a file-like object: passing the literal HTML string
    # directly is deprecated in recent pandas releases.
    player_stats = pd.read_html(StringIO(str(player_table)))[0]

    # NOTE: any multi-level column header is written to the CSV as parsed;
    # no header level is dropped here.
    player_stats.to_csv(f"{filename}.csv", index=False)

In [4]:
def save_stats_data() -> None:
    """Download every Big-5 player stats page from FBref and convert it to CSV.

    For each stats page the rendered HTML is saved with ``get_stats`` and the
    stats table is then written to a CSV file of the same stem with
    ``parse_html``.
    """
    url_template = (
        "https://fbref.com/en/comps/Big5/{page}/players/Big-5-European-Leagues-Stats"
    )

    # (URL path segment, output file stem, HTML id of the stats table).
    # Keeping the three together avoids parallel lists drifting out of step.
    # The "gca" stem has no trailing dot, so the output is "gca.html" and
    # "gca.csv", matching the stems of the other pages.
    pages = [
        ("gca", "gca", "stats_gca"),
        ("shooting", "shooting", "stats_shooting"),
        ("passing", "passing", "stats_passing"),
        ("defense", "defense", "stats_defense"),
        ("keepers", "goalkeeping", "stats_keeper"),
    ]

    for page, filename, id_ in pages:
        get_stats(url_template.format(page=page), filename)
        parse_html(filename, id_)

In [6]:
# Download each FBref stats page and write its stats table to a CSV file.
save_stats_data()

In [7]:
def get_data_dict(filename: str) -> None:
    """Build a data dictionary CSV from a downloaded stats HTML file.

    Every ``<th class="poptip">`` element in ``{filename}.html`` yields one
    row holding its text (``column_name``), its ``aria-label`` attribute
    (``value``) and its ``data-tip`` attribute with the ``<br>`` and
    ``<strong>`` markup stripped (``details``). Duplicate rows are dropped
    and the result is written to ``{filename}_dict.csv``.

    Args:
        filename: Path of the HTML file without the ``.html`` extension.

    Raises:
        FileNotFoundError: If ``{filename}.html`` does not exist.
        KeyError: If a matched header cell has no ``aria-label`` attribute.
    """
    with open(f"{filename}.html", encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    rows = []
    for header in soup.find_all("th", class_="poptip"):
        # A missing data-tip attribute simply means there is no description;
        # only that case is tolerated, other errors are left to propagate.
        detail = (
            header.get("data-tip", "")
            .replace("<br>", " ")
            .replace("<strong>", "")
            .replace("</strong>", ".")
        )
        rows.append(
            {
                "column_name": header.text,
                "value": header["aria-label"],
                "details": detail,
            }
        )

    # Explicit columns keep the CSV header even when no cell matched.
    df = pd.DataFrame(rows, columns=["column_name", "value", "details"])
    df = df.drop_duplicates()
    df.to_csv(f"{filename}_dict.csv", index=False)

In [8]:
# File stems of the stats pages; get_data_dict reads "<stem>.html" and
# writes "<stem>_dict.csv" for each of them.
stats_list = ["gca", "shooting", "passing",
             "defense","goalkeeping"]
for stats in stats_list:
    get_data_dict(stats)

In [9]:
def get_laliga_squads() -> None:
    """Scrape the La Liga club pages and save all squads to ``laliga_squads.csv``.

    The clubs index page is fetched first to collect one squad-page link per
    club. Each squad page is then downloaded (one second apart) and the
    player name and position are read from the first two ``<p>`` elements of
    every player-name container. The CSV has the columns ``player_name``,
    ``position`` and ``club``.

    Raises:
        requests.HTTPError: If any page answers with an HTTP error status.
        requests.Timeout: If a request gets no response within the timeout.
        ValueError: If no club link is found on the index page.
    """
    base_url = "https://www.laliga.com"
    timeout = 30  # seconds; without a timeout requests can wait forever

    response = requests.get(f"{base_url}/en-GB/laliga-santander/clubs", timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract the link to the squad page of each club.
    club_links = []
    for container in soup.find_all("div", class_="styled__ItemContainer-fyva03-1"):
        href = container.find("a")["href"]
        club_links.append(f"{base_url}{href}/squad")

    # Build one DataFrame of squad data per club.
    laliga_squads = []
    for club_link in club_links:
        # The club name is derived from the URL slug, e.g. "real-madrid".
        club_name = (
            club_link.replace("https://www.laliga.com/en-GB/clubs/", "")
            .replace("/squad", "")
            .replace("-", " ")
            .title()
        )
        response = requests.get(club_link, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        player_names = []
        player_positions = []
        for element in soup.find_all("div", class_="styled__PlayerName-sc-148d0nz-4 bzHSBG"):
            # First <p> is read as the name, second as the position.
            paragraphs = element.find_all("p")
            player_names.append(paragraphs[0].text)
            player_positions.append(paragraphs[1].text)

        df = pd.DataFrame({"player_name": player_names, "position": player_positions, "club": club_name})
        laliga_squads.append(df)
        time.sleep(1)  # pause between requests to the site

    if not laliga_squads:
        raise ValueError("no La Liga club links found; the page layout may have changed")

    combined = pd.concat(laliga_squads)
    combined.to_csv("laliga_squads.csv", index=False)


In [10]:
def get_ligue1_squads() -> None:
    """Scrape the Ligue 1 club pages and save all squads to ``ligue1_squads.csv``.

    The club list page is fetched first to collect one squad-page link per
    club. Each squad page is then downloaded (one second apart) and the
    player name and position are read from every squad card. The CSV has the
    columns ``player_name``, ``position`` and ``club``.

    Raises:
        requests.HTTPError: If any page answers with an HTTP error status.
        requests.Timeout: If a request gets no response within the timeout.
        ValueError: If the club list container or its links are not found.
    """
    base_url = "https://www.ligue1.com"
    timeout = 30  # seconds; without a timeout requests can wait forever

    response = requests.get(f"{base_url}/clubs/List", timeout=timeout)
    response.raise_for_status()
    ligue1_soup = BeautifulSoup(response.text, "html.parser")

    club_list = ligue1_soup.find("div", class_="ClubListPage-list")
    if club_list is None:
        raise ValueError("club list container not found on the Ligue 1 clubs page")

    # "/squad" is inserted right after ".../clubs" (character 28 of the full
    # URL). The club-name extraction below expects the result to look like
    # "https://www.ligue1.com/clubs/squad?id=<club>".
    insert_at = len(f"{base_url}/clubs")
    ligue1_club_links = []
    for anchor in club_list.find_all("a", class_="ClubListPage-link"):
        href = anchor["href"]
        link = f"{base_url}{href}"
        ligue1_club_links.append(link[:insert_at] + "/squad" + link[insert_at:])

    # Build one DataFrame of squad data per club.
    ligue1_squads = []
    for link in ligue1_club_links:
        club_name = link.replace("https://www.ligue1.com/clubs/squad?id=", "").replace("-", " ").title()
        response = requests.get(link, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        player_names = []
        positions = []
        for player in soup.find_all("div", class_="SquadTeamTable-flip-card"):
            player_names.append(player.find("span", class_="SquadTeamTable-playerName").text)
            positions.append(player.find("span", class_="SquadTeamTable-position").text)

        df = pd.DataFrame({"player_name": player_names, "position": positions, "club": club_name})
        ligue1_squads.append(df)
        time.sleep(1)  # pause between requests to the site

    if not ligue1_squads:
        raise ValueError("no Ligue 1 club links found; the page layout may have changed")

    combined = pd.concat(ligue1_squads)
    combined.to_csv("ligue1_squads.csv", index=False)

In [11]:
def get_bundesliga_squad() -> None:
    """Scrape the Bundesliga player page and save all squads to ``bundesliga_squads.csv``.

    Every ``<mat-expansion-panel>`` on the page is treated as one club. The
    first four ``<div class="row">`` elements of a panel are read as the
    position groups (going by the original variable names: goalkeepers,
    defenders, midfielders, attackers). Player names are derived from the
    slug of each player link. The CSV has the columns ``players``,
    ``position`` and ``club``.

    Raises:
        requests.HTTPError: If the page answers with an HTTP error status.
        requests.Timeout: If the request gets no response within the timeout.
        IndexError: If a club panel has fewer than four position rows.
        ValueError: If no club panel is found on the page.
    """
    position_rows_per_club = 4
    player_url_prefix = "/en/bundesliga/player/"

    bund_content = requests.get(
        "https://www.bundesliga.com/en/bundesliga/player",
        timeout=30,  # seconds; without a timeout requests can wait forever
    )
    bund_content.raise_for_status()
    bund_soup = BeautifulSoup(bund_content.text, "html.parser")

    # One DataFrame per (club, position group), in page order.
    frames = []
    for club in bund_soup.find_all("mat-expansion-panel"):
        club_name = club.find("span").find("h2").text
        rows = club.find_all("div", class_="row")
        if len(rows) < position_rows_per_club:
            raise IndexError(
                f"expected {position_rows_per_club} position rows for {club_name}, "
                f"found {len(rows)}"
            )

        for row in rows[:position_rows_per_club]:
            position = row.find("div", class_="position").text
            # The name comes from the link slug, e.g. "manuel-neuer".
            names = [
                a["href"].replace(player_url_prefix, "").replace("-", " ").title()
                for a in row.find_all("a")
            ]
            # NOTE(review): this column is called "players" while the other
            # league scrapers in this file use "player_name"; it is kept
            # as-is so existing consumers of the CSV keep working.
            frames.append(pd.DataFrame({"players": names, "position": position, "club": club_name}))

    if not frames:
        raise ValueError("no Bundesliga club panels found; the page layout may have changed")

    bundesliga_squad = pd.concat(frames)
    bundesliga_squad.to_csv("bundesliga_squads.csv", index=False)
        

In [12]:
def get_pl_squads() -> None:
    """Scrape the Premier League club pages and save all squads to ``pl_squads.csv``.

    The clubs index page is fetched first to collect one link per club, with
    "overview" swapped for "squad". Each squad page is then downloaded (one
    second apart); player names are taken from the slug of each player card
    link and positions from the card's position element. The CSV has the
    columns ``player_name``, ``position`` and ``club``.

    Raises:
        requests.HTTPError: If any page answers with an HTTP error status.
        requests.Timeout: If a request gets no response within the timeout.
        ValueError: If a club link does not have the expected shape, or no
            club link is found on the index page.
    """
    timeout = 30  # seconds; without a timeout requests can wait forever
    # Raw string so "\d" and "\w" are regex classes, not string escapes.
    club_pattern = re.compile(r"https://www.premierleague.com/clubs/[\d]+/(\w+-?\w+.+)/squad")

    pl = requests.get("https://www.premierleague.com/clubs", timeout=timeout)
    pl.raise_for_status()
    # The parser is named explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    pl_soup = BeautifulSoup(pl.text, "html.parser")

    # Extract the link to the squad page of each Premier League club.
    pl_links = [
        f"https://www.premierleague.com{anchor['href']}".replace("overview", "squad")
        for anchor in pl_soup.find_all("a", class_="indexItem")
    ]

    # Build one DataFrame of squad data per club.
    pl_squads = []
    for link in pl_links:
        match = club_pattern.match(link)
        if match is None:
            raise ValueError(f"unexpected Premier League club link: {link}")
        club_name = match.group(1)

        content = requests.get(link, timeout=timeout)
        content.raise_for_status()
        soup = BeautifulSoup(content.text, "html.parser")

        # The fourth path segment of the player link is used as the name.
        player_names = [
            card["href"].split("/")[3].replace("-", " ")
            for card in soup.find_all("a", class_="stats-card__wrapper")
        ]
        positions = [
            position.text
            for position in soup.find_all("div", class_="stats-card__player-position")
        ]

        df = pd.DataFrame({"player_name": player_names, "position": positions, "club": club_name})
        pl_squads.append(df)
        time.sleep(1)  # pause between requests to the site

    if not pl_squads:
        raise ValueError("no Premier League club links found; the page layout may have changed")

    combined = pd.concat(pl_squads)
    combined.to_csv("pl_squads.csv", index=False)

In [13]:
def get_serie_a_squad() -> None:
    """Scrape the Serie A club pages and save all squads to ``serie_a_squads.csv``.

    The teams index page is fetched with ``requests`` to collect the club
    links. Each squad page is then rendered in Chrome through Selenium and
    the player name and position are read from the first two ``<p>``
    elements of every player card. Italian position names are translated to
    English. The CSV has the columns ``player_name``, ``position`` and
    ``club``.

    Raises:
        requests.HTTPError: If the index page answers with an HTTP error status.
        requests.Timeout: If the index request gets no response in time.
        ValueError: If no club link is found on the index page.
    """
    # Italian position name -> English equivalent.
    position_translations = {
        "Portiere": "Goalkeeper",
        "Difensore": "Defender",
        "Centrocampista": "Midfielder",
        "Attaccante": "Forward",
    }

    url = "https://www.legaseriea.it/it/serie-a/squadre"
    content = requests.get(url, timeout=30)  # seconds; avoids waiting forever
    content.raise_for_status()
    # The parser is named explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(content.text, "html.parser")

    # Extract the link to the squad page of each club.
    # NOTE(review): assumes the first 20 anchors on the page are the club
    # links - confirm against the current page layout.
    seriea_clublinks = [
        f"https://www.legaseriea.it{a['href']}/squadra" for a in soup.find_all("a")[0:20]
    ]

    # Selenium is used because most of the page is rendered with JavaScript.
    # NOTE(review): a fresh browser is started for every club; reusing one
    # session would be faster if the pages do not depend on a clean state.
    serie_a_squads = []
    for link in seriea_clublinks:
        with webdriver.Chrome() as driver:
            driver.get(link)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            driver.maximize_window()
            time.sleep(3)  # give the page time to render before reading it
            data = driver.page_source

        club_name = link.replace("https://www.legaseriea.it/it/team/", "").replace("/squadra", "").title()
        soup = BeautifulSoup(data, "html.parser")

        player_names = []
        positions = []
        for player in soup.find_all("div", class_="hm-card-body"):
            # First <p> is read as the name, second as the position.
            paragraphs = player.find_all("p")
            player_names.append(paragraphs[0].text)
            positions.append(paragraphs[1].text)

        df = pd.DataFrame({"player_name": player_names, "position": positions, "club": club_name})
        # The words are plain literals, so no regex matching is needed.
        for italian, english in position_translations.items():
            df["position"] = df["position"].str.replace(italian, english, regex=False)

        serie_a_squads.append(df)
        time.sleep(1)  # pause between requests to the site

    if not serie_a_squads:
        raise ValueError("no Serie A club links found; the page layout may have changed")

    combined = pd.concat(serie_a_squads)
    combined.to_csv("serie_a_squads.csv", index=False)

In [14]:
def save_squads() -> None:
    """Run the squad scraper of every league, one after the other."""
    league_scrapers = (
        get_laliga_squads,
        get_ligue1_squads,
        get_bundesliga_squad,
        get_pl_squads,
        get_serie_a_squad,
    )
    for scrape_league in league_scrapers:
        scrape_league()

In [15]:
# Scrape the squad lists of all five leagues and write one CSV per league.
save_squads()