In [2]:
import pandas as pd
import numpy as np
import requests 
from bs4 import BeautifulSoup

# Function for scraping the weekly Premier League standings

In [3]:
def scrape_standing(season, gw, timeout=30):
    """Scrape the league table shown on worldfootball.net for one gameweek.

    Parameters
    ----------
    season : str
        Season slug exactly as it appears in the site URL, e.g. ``"2022-2023"``.
    gw : int
        Gameweek number; inserted into the URL and stored in the ``GW`` column.
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` gives up
        (default 30). Without a timeout a stalled connection would block forever.

    Returns
    -------
    pandas.DataFrame
        Columns ``GW``, ``Team`` and ``Standing``, one row per table row.
        ``Standing`` is the rank text as scraped (not converted to a number).

    Raises
    ------
    ValueError
        If the HTTP status is not 200, or the page holds fewer than two
        ``standard_tabelle`` tables.
    """
    url = f"https://www.worldfootball.net/schedule/eng-premier-league-{season}-spieltag/{gw}/"
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise ValueError(f"status_code = {response.status_code}")

    soup = BeautifulSoup(response.content, 'html.parser')
    tables = soup.find_all("table", class_="standard_tabelle")
    # The standings are read from the second matching table; fail with a clear
    # message instead of a bare IndexError when the page layout is not as expected.
    if len(tables) < 2:
        raise ValueError(
            f"expected at least 2 'standard_tabelle' tables at {url}, found {len(tables)}"
        )
    table = tables[1]

    # Header labels, in page order.
    columns = [th.text.strip() for th in table.find_all('th')]

    # Body rows: a row without any <td> cell (e.g. the header row) is skipped.
    data = []
    for row in table.find_all('tr'):
        row_data = [td.text.strip() for td in row.find_all('td')]
        if row_data:
            data.append(row_data)

    # Column 1 is dropped so the remaining cells line up with the header labels
    # (presumably it is the club-crest cell, which has no header of its own -- confirm).
    df = pd.DataFrame(data).drop(columns=1)
    df.columns = columns

    # A blank rank cell inherits the rank above it (the site appears to leave the
    # rank empty for teams tied with the row above). DataFrame.ffill() replaces the
    # deprecated fillna(method="ffill").
    df["#"] = df["#"].replace("", np.nan)
    df = df.ffill()

    df = df.rename(columns={"#": "Standing"})
    df["GW"] = gw
    return df[["GW", "Team", "Standing"]]

# Scrape the weekly standings for a whole season

In [4]:
# Season slug in the form scrape_standing() inserts into the URL.
season = "2022-2023"

# One standings frame per gameweek, for gameweeks 1 through 38.
standing_list = [scrape_standing(season, gw) for gw in range(1, 39)]

# Stack the weekly frames into a single long table (each frame keeps its own row index).
df_standing_by_wk = pd.concat(standing_list)

In [5]:
# Display the concatenated weekly standings as a visual check.
df_standing_by_wk

Unnamed: 0,GW,Team,Standing
0,1,Tottenham Hotspur,1
1,1,AFC Bournemouth,2
2,1,Arsenal FC,2
3,1,Manchester City,2
4,1,Newcastle United,2
...,...,...,...
15,38,Nottingham Forest,16
16,38,Everton FC,17
17,38,Leicester City,18
18,38,Leeds United,19


# Create the team ID mapping

In [21]:
# Lookup table of team name -> id. The distinct names are sorted alphabetically
# and the hand-written id list is attached position by position, so the list
# must stay in step with that sorted order.
# NOTE(review): the ids look like an external numbering of the clubs -- confirm the source.
df_teams = (
    pd.DataFrame({"Team": df_standing_by_wk["Team"].unique()})
    .sort_values("Team")
    .assign(id=[3, 1, 2, 4, 5, 6, 7, 8, 9, 11, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20])
)
df_teams

Unnamed: 0,Team,id
1,AFC Bournemouth,3
2,Arsenal FC,1
15,Aston Villa,2
8,Brentford FC,4
5,Brighton & Hove Albion,5
7,Chelsea FC,6
16,Crystal Palace,7
14,Everton FC,8
9,Fulham FC,9
6,Leeds United,11


# Map the team IDs onto the weekly standings

In [23]:
# Attach each team's id to every weekly row; the left join keeps all standings
# rows even if a team name has no match in the lookup.
df_standing_mapped = df_standing_by_wk.merge(df_teams, how="left", on="Team")
df_standing_mapped

Unnamed: 0,GW,Team,Standing,id
0,1,Tottenham Hotspur,1,18
1,1,AFC Bournemouth,2,3
2,1,Arsenal FC,2,1
3,1,Manchester City,2,13
4,1,Newcastle United,2,15
...,...,...,...,...
755,38,Nottingham Forest,16,16
756,38,Everton FC,17,8
757,38,Leicester City,18,10
758,38,Leeds United,19,11


In [24]:
# Write the id-mapped standings to CSV, leaving out the DataFrame row index.
df_standing_mapped.to_csv(
    path_or_buf="../../data/raw_data/2022-23/standing.csv",
    index=False,
)