In [4]:
import time
from typing import Dict, List, Optional, Tuple, Union

import requests
from bs4 import BeautifulSoup

class PremiereLeague:
  """
  Builds team page URLs on fbref.com for a list of seasons and caches the downloaded HTML.

  It's important to note a few things:
    1. Team names / base URLs are read once, at construction, from the first standings
       table of the league landing page; the same club list is reused for every season.
    2. Seasons are keyed by strings of the form 'YYYY-YYYY' (e.g. '2023-2024'). Methods
       taking a season also accept the starting year as an int (2023 -> '2023-2024').
    3. Per the original author's note, the downloaded team pages are filtered to league
       data except for the `scores and fixtures` table, which covers all competitions.
  """

  def __init__(self, years: List[int]) -> None:
    """
    Args:
    years (List[int]): Starting year of every season to construct URLs for.
    """
    # Network request: the landing page lists the clubs the URLs are derived from.
    self.__main_page = requests.get("https://fbref.com/en/comps/9/Premiere-League-Stats")
    self.__team_urls: Dict[Tuple[str, str], str] = {}
    self.__team_data: Dict[Tuple[str, str], str] = {}
    self.__years = years
    self.__construct_urls_for_all_years()

  @staticmethod
  def __season_key(season: Union[str, int]) -> str:
    """
    Normalizes a season to the 'YYYY-YYYY' string used as dictionary key.
    An int is interpreted as the starting year of the season.
    """
    if isinstance(season, int):
      return f'{season}-{season + 1}'
    return season

  def get_team_url(self, team_name: str, season: Union[str, int]) -> Optional[str]:
    """
    Args:
    team_name (str): The name of the team.
    season (str | int): The season to select ('2023-2024' or its starting year 2023).

    Returns:
    Optional[str]: The URL for the specified team and season, or None when unknown.
    """
    return self.__team_urls.get((team_name, self.__season_key(season)), None)

  def get_all_seasons(self) -> List[str]:
    """
    Returns:
        List[str]: A list of all seasons available.
    """
    return [f'{year}-{year+1}' for year in self.__years]

  def get_all_team_names(self) -> List[str]:
    """
    Returns:
        List[str]: A sorted list of all distinct team names available.
    """
    return sorted(set(name for name, _ in self.__team_urls.keys()))

  def get_all_urls(self) -> List[str]:
    """
    Returns:
        List[str]: A list of all team urls available.
    """
    return [url for _, _, url in self.get_all_teams()]

  def get_html(self, team_name: str, season: Union[str, int]) -> Optional[str]:
    """
    Args:
    team_name (str): The name of the team.
    season (str | int): The season to select.

    Returns:
    Optional[str]: The cached HTML of the team page, or None when it was never fetched.
    """
    return self.__team_data.get((team_name, self.__season_key(season)), None)

  def fetch_html(self, team_name: str, season: Union[str, int], url=None):
    """
    Downloads the page of one team / season and caches it in memory.

    Nothing happens when the page is already cached or when no URL is known for it.
    On a non-200 response the (team, season) entry is removed from the known URLs.

    Args:
    team_name (str): The name of the team.
    season (str | int): The season to fetch.
    url (str, optional): URL to download; looked up via `get_team_url` when omitted.
    """
    season = self.__season_key(season)
    key = (team_name, season)

    # Already downloaded: no request needed.
    if key in self.__team_data:
      return

    if url is None:
      url = self.get_team_url(team_name, season)
    if url is None:
      return

    response = requests.get(url)
    if response.status_code == 200:
      self.__save_html(team_name, season, response.text)
    else:
      # Remove the team from list of teams / year as we don't have the data.
      # pop() instead of del: the key may be absent when an explicit url was passed.
      self.__team_urls.pop(key, None)

  def fetch_all_html(self, throttle=5):
    """
    Downloads every known team page that is not cached yet.

    Args:
    throttle (int | float): Seconds to sleep before each request.
    """
    # get_all_teams() returns a list copy, so fetch_html may drop entries while we loop.
    for (team, season, url) in self.get_all_teams():
      if (team, season) in self.__team_data:
        continue  # cached pages cost neither a request nor a sleep
      time.sleep(throttle)
      self.fetch_html(team, season, url)

  def get_all_teams(self) -> List[Tuple[str, str, str]]:
    """
    Returns:
        List[Tuple[str, str, str]]: A list tuple of (team name, season, url).
    """
    return [(club_name, season, url) for (club_name, season), url in self.__team_urls.items()]

  def __save_html(self, team_name: str, season: str, html: str) -> None:
    """
    Saves the fetched data in memory in case data (html page) needs to be accessed again.
    """
    self.__team_data[(team_name, season)] = html

  def __add_team(self, team_name: str, season: str, url: str) -> None:
    """
    Args:
    team_name (str): The name of the team.
    season (str): The season to add.
    url (str): The URL for the team and year.
    """
    self.__team_urls[(team_name, season)] = url

  def __fetch_teams(self) -> List[Tuple[str, str]]:
    """
    Reads the clubs and their page links from the first standings table of the landing page.

    Returns:
        List[Tuple[str, str]]: A list of (club name, absolute club page url).

    Raises:
        RuntimeError: When the landing page contains no `table.stats_table`.
    """
    # Explicit parser: avoids bs4's "no parser specified" warning and keeps the result
    # independent of which optional parsers happen to be installed.
    soup = BeautifulSoup(self.__main_page.text, "html.parser")
    teams_standing_table = soup.select_one('table.stats_table')
    if teams_standing_table is None:
      raise RuntimeError(
        f"No standings table found on the league page (HTTP status {self.__main_page.status_code})."
      )

    teams = []
    # Name and link are taken from the same row so they can never get out of step.
    for row in teams_standing_table.find_all('tr'):
      name_cell = row.find('td')
      if name_cell is None:
        continue  # header rows have no data cells
      squad_href = next(
        (tag.get('href') for tag in row.find_all('a') if 'squads' in (tag.get('href') or '')),
        None,
      )
      if squad_href is None:
        continue
      teams.append((name_cell.text.strip(), f"https://fbref.com{squad_href}"))
    return teams

  def __construct_urls_for_all_years(self) -> None:
    """
    URLs will be constructed for all years provided and for all teams fetched in the recent year.
    The season is inserted as the second-to-last path segment of each club link.
    """
    teams = self.__fetch_teams()
    for year in self.__years:
      season = f'{year}-{year+1}'
      for name, link in teams:
        base, page_slug = link.rsplit('/', 1)
        self.__add_team(name, season, f'{base}/{season}/{page_slug}')

In [5]:
from abc import ABC, abstractmethod
import pandas as pd
from typing import List, Dict

class DataProcessor(ABC):
  """
  Base class for processors that turn the HTML pages held by a `PremiereLeague`
  instance into a DataFrame and optionally persist it as CSV.
  """

  # The annotation is quoted so this class can be defined before `PremiereLeague` exists.
  def __init__(self, pl: "PremiereLeague") -> None:
    """
    Args:
    pl (PremiereLeague): Source of the team list and of the cached HTML pages.
    """
    self.pl = pl

  @abstractmethod
  def prepare(self, team: str, season: str, html: str) -> pd.DataFrame:
    """
    Converts the HTML page of one team / season into a DataFrame.

    Args:
    team (str): The name of the team the page belongs to.
    season (str): The season the page belongs to.
    html (str): The HTML of the team page.
    """
    pass

  @abstractmethod
  def process(self) -> pd.DataFrame:
    """
    Builds the final data set of the processor.
    """
    pass

  @staticmethod
  def save(df: pd.DataFrame, file_name: str):
    """
    Writes the data frame as CSV (without the index) to `data/<file_name>`.
    The target directory is created when it does not exist yet.

    Args:
    df (pd.DataFrame): The data to write.
    file_name (str): File name relative to the `data` directory.
    """
    from pathlib import Path

    target = Path("data") / file_name
    # to_csv does not create directories; without this a missing `data/` raises OSError.
    target.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(target, index=False)
  

In [8]:
import pandas as pd
import numpy as np
from io import StringIO

class MatchResultsProcessor(DataProcessor):
  """
  Builds one row per Premier League match (date, season, home / away team, goals, winner)
  from the "Scores & Fixtures" table of every cached team page.
  """

  def prepare(self, team: str, season: str, html: str) -> pd.DataFrame:
    """
    Extracts the Premier League rows of the "Scores & Fixtures" table of one team page.

    Args:
    team (str): The name of the team the page belongs to.
    season (str): The season the page belongs to.
    html (str): The HTML of the team page.

    Returns:
    pd.DataFrame: The league fixtures with lower-cased column names plus `season`,
    `team` and (NaN-filled when the page has none) `xg` / `xga` columns.

    Raises:
    ValueError: Raised by `pd.read_html` when no matching table is found.
    """
    # Load scores and fixtures table
    df = pd.read_html(StringIO(html), match="Scores & Fixtures")[0]
    df.columns = df.columns.str.lower()

    # Filter Premier League competitions only. reset_index returns a new frame, so the
    # inserts below never write into a slice of the parsed table.
    df = df[df['comp'] == 'Premier League'].reset_index(drop=True)

    # Add additional context on the table rows
    df.insert(1, 'season', season)
    df.insert(2, 'team', team)
    for position, column in ((12, 'xg'), (13, 'xga')):
      if column not in df.columns:
        # Clamp the position: DataFrame.insert rejects a loc beyond the last column.
        df.insert(min(position, len(df.columns)), column, np.nan)
    return df

  def process(self) -> pd.DataFrame:
    """
    Returns:
    pd.DataFrame: One row per played match with the columns date, season, home team,
    away team, home goals, away goals and winner. Pages that were never fetched or
    that contain no "Scores & Fixtures" table are skipped.
    """
    frames = []

    for (team, season, _) in self.pl.get_all_teams():
      html = self.pl.get_html(team, season)
      if html is None:
        continue  # page was never downloaded
      try:
        frames.append(self.prepare(team, season, html))
      except ValueError:
        continue  # no matching table on this page

    # Concatenate once instead of growing a frame inside the loop.
    full_data_set = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    full_data_set = self.__construct_match_results(full_data_set)
    full_data_set = self.__remove_duplicate_matches(full_data_set)
    return full_data_set

  def __construct_match_results(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Turns team-centric fixture rows into home / away oriented match rows.
    Fixtures without a numeric score (e.g. not played yet) are left out.
    """
    columns = ['date', 'season', 'home team', 'away team', 'home goals', 'away goals', 'winner']
    if df.empty:
      return pd.DataFrame(columns=columns)

    # Scores can arrive as text or as floats next to NaN; normalise to integers so that
    # goals are compared numerically and written consistently.
    played = df.assign(
      gf=pd.to_numeric(df['gf'], errors='coerce'),
      ga=pd.to_numeric(df['ga'], errors='coerce'),
    ).dropna(subset=['gf', 'ga'])
    played = played.astype({'gf': int, 'ga': int})

    match_results = [MatchResultsProcessor.__create_entry(row) for _, row in played.iterrows()]
    return pd.DataFrame(match_results, columns=columns)

  @staticmethod
  def __create_entry(row: pd.Series) -> dict:
    """
    Maps one fixture row (seen from `team`'s perspective) to a match record.
    """
    home_team = row['team'] if row['venue'] == 'Home' else row['opponent']
    away_team = row['team'] if row['venue'] == 'Away' else row['opponent']
    home_goals = row['gf'] if row['venue'] == 'Home' else row['ga']
    away_goals = row['gf'] if row['venue'] == 'Away' else row['ga']
    winner = MatchResultsProcessor.__determine_winner(home_goals, away_goals)

    return {
      "date": row['date'],
      "season": row['season'],
      "home team": home_team,
      "away team": away_team,
      "home goals": home_goals,
      "away goals": away_goals,
      "winner": winner
    }

  @staticmethod
  def __determine_winner(home_goals, away_goals) -> str:
    """
    Returns:
    str: 'Home', 'Away' or 'Draw'.

    Raises:
    ValueError: When a score is missing, since no winner can be derived from it.
    """
    if pd.isna(home_goals) or pd.isna(away_goals):
      raise ValueError("Cannot determine the winner of a match without a score.")
    if home_goals > away_goals:
      return 'Home'
    if home_goals < away_goals:
      return 'Away'
    return 'Draw'

  @staticmethod
  def __remove_duplicate_matches(df: pd.DataFrame) -> pd.DataFrame:
    """
    Every match is listed on the page of both teams; keeps the first occurrence of each
    (date, unordered team pair). Returns a new frame and also works on an empty one.
    """
    team_pairs = ['-'.join(sorted(pair)) for pair in zip(df['home team'], df['away team'])]
    return (
      df.assign(**{'sorted teams': team_pairs})
        .drop_duplicates(subset=['date', 'sorted teams'], keep='first')
        .drop(columns=['sorted teams'])
        .reset_index(drop=True)
    )

In [9]:
import pandas as pd
import numpy as np
from io import StringIO

class TeamPerformanceProcessor(DataProcessor):
  """
  Aggregates the "Scores & Fixtures" table of every cached team page into one row per
  (season, team): points, wins / draws / losses and goals scored / conceded.
  """

  # Points awarded per result code found in the `result` column.
  points_system = {'W': 3, 'D': 1, 'L': 0}

  def prepare(self, team: str, season: str, html: str) -> pd.DataFrame:
    """
    Extracts the played Premier League fixtures of one team page.

    Args:
    team (str): The name of the team the page belongs to.
    season (str): The season the page belongs to.
    html (str): The HTML of the team page.

    Returns:
    pd.DataFrame: The league fixtures with lower-cased column names, `season` and `team`
    columns and integer `gf` / `ga`. Fixtures without a numeric score are dropped.

    Raises:
    ValueError: Raised by `pd.read_html` when no matching table is found.
    """
    # Load scores and fixtures table
    df = pd.read_html(StringIO(html), match="Scores & Fixtures")[0]
    df.columns = df.columns.str.lower()

    # Filter Premier League competitions only (reset_index returns a new frame).
    df = df[df['comp'] == 'Premier League'].reset_index(drop=True)

    # Add additional context on the table rows
    df.insert(1, 'season', season)
    df.insert(2, 'team', team)

    # A plain astype(int) fails on fixtures without a score, and process() swallows that
    # ValueError, dropping the whole team / season. Drop just the score-less fixtures.
    df['gf'] = pd.to_numeric(df['gf'], errors='coerce')
    df['ga'] = pd.to_numeric(df['ga'], errors='coerce')
    df = df.dropna(subset=['gf', 'ga']).reset_index(drop=True)
    df['gf'] = df['gf'].astype(int)
    df['ga'] = df['ga'].astype(int)
    return df

  def process(self) -> pd.DataFrame:
    """
    Returns:
    pd.DataFrame: One row per (season, team) with points, result counts and goals.
    Pages that were never fetched or contain no "Scores & Fixtures" table are skipped.
    """
    frames = []

    for (team, season, _) in self.pl.get_all_teams():
      html = self.pl.get_html(team, season)
      if html is None:
        continue  # page was never downloaded
      try:
        frames.append(self.prepare(team, season, html))
      except ValueError:
        continue  # no matching table on this page

    # Concatenate once instead of growing a frame inside the loop.
    full_data_set = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    return self.__construct_team_performance(full_data_set)

  def __construct_team_performance(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregates fixture rows to season totals per team. The input frame is not modified.
    """
    if df.empty:
      return pd.DataFrame(columns=[
        'season', 'team', 'points', 'draws', 'losses', 'wins',
        'goals scored', 'goals conceded', 'goal difference',
      ])

    scored = df.assign(points=df['result'].map(self.points_system))
    points_df = scored.groupby(['season', 'team'])['points'].sum().reset_index()

    # Create a DataFrame to count wins, draws, and losses
    counts_df = scored.pivot_table(index=['season', 'team'], columns='result', aggfunc='size', fill_value=0)
    # Guarantee a column for every outcome even when it never occurred in the data;
    # sorting keeps the alphabetical column order pivot_table produces.
    outcome_columns = sorted(set(counts_df.columns) | set(self.points_system))
    counts_df = counts_df.reindex(columns=outcome_columns, fill_value=0).reset_index()

    # Rename the columns for clarity
    counts_df.columns.name = None
    counts_df = counts_df.rename(columns={'W': 'wins', 'D': 'draws', 'L': 'losses'})

    # Group by 'season' and 'team' to sum goals scored (gf) and goals conceded (ga)
    goals_df = scored.groupby(['season', 'team'])[['gf', 'ga']].sum().reset_index()

    # Calculate goal difference (gd)
    goals_df['gd'] = goals_df['gf'] - goals_df['ga']
    goals_df = goals_df.rename(columns={'gf': 'goals scored', 'ga': 'goals conceded', 'gd': 'goal difference'})

    # Merge data frames
    final_df = pd.merge(points_df, counts_df, on=['season', 'team'])
    final_df = pd.merge(final_df, goals_df, on=['season', 'team'])
    return final_df


In [585]:
# Initialize Premier League data / html pages
# Season starting years from 2023 down to 2000 (inclusive), newest first.
years = list(range(2023, 1999, -1))
# Constructing the object requests the league landing page and builds one URL per (team, season).
pl = PremiereLeague(years)
# Downloads every team page; sleeps `throttle` seconds (default 5) between requests.
pl.fetch_all_html()

In [10]:
# Construct Match Results CSV
# Requires `pl` from the initialization cell: run that cell first in a fresh kernel.
match_results = MatchResultsProcessor(pl)
df = match_results.process()
# Written below the `data` directory (see DataProcessor.save).
match_results.save(df, "match_results.csv")

NameError: name 'pl' is not defined

In [612]:
# Construct Team Performance CSV
# Requires `pl` from the initialization cell: run that cell first in a fresh kernel.
team_perf = TeamPerformanceProcessor(pl)
# NOTE: rebinds the notebook-global `df` that the match results cell also assigns.
df = team_perf.process()
# Written below the `data` directory (see DataProcessor.save).
team_perf.save(df, "team_performance_by_season.csv")