In [353]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

# URL of the page
url_df = 'https://fbref.com/en/comps/9/2023-2024/schedule/2023-2024-Premier-League-Scores-and-Fixtures'

# Request page content and parse with BeautifulSoup
response = requests.get(url_df)
soup = BeautifulSoup(response.content, 'html.parser')

# Locate the table and read it into pandas
table = soup.find('table')
df = pd.read_html(str(table))[0]

# Flatten multi-level columns and clean them up
df.columns = [' '.join(col).strip() for col in df.columns]
df = df.reset_index(drop=True)

# Remove spaces in column names and drop fully empty rows
df.columns = [col.replace(" ", "") for col in df.columns]
df = df.dropna(how='all')

# Extract hyperlinks in "Match Report" column, if available
match_report_links = []
for row in table.find_all('tr')[1:]:  # Skip header row
    link = row.find('td', {'data-stat': 'match_report'})
    if link and link.find('a'):
        match_report_links.append('https://fbref.com' + link.find('a')['href'])
# Add hyperlinks as a new column
df['MatchReportURL'] = match_report_links
df['id'] = range(0, len(df))
# Display the first few rows to check data
df.head()


  df = pd.read_html(str(table))[0]


Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee,MatchReport,Notes,MatchReportURL,id
0,1.0,Fri,2023-08-11,20:00,Burnley,0.3,0–3,1.9,Manchester City,21572.0,Turf Moor,Craig Pawson,Match Report,,https://fbref.com/en/matches/3a6836b4/Burnley-...,0
1,1.0,Sat,2023-08-12,12:30,Arsenal,0.8,2–1,1.2,Nott'ham Forest,59984.0,Emirates Stadium,Michael Oliver,Match Report,,https://fbref.com/en/matches/26a7f90c/Arsenal-...,1
2,1.0,Sat,2023-08-12,15:00,Everton,2.7,0–1,1.5,Fulham,39940.0,Goodison Park,Stuart Attwell,Match Report,,https://fbref.com/en/matches/15addfc7/Everton-...,2
3,1.0,Sat,2023-08-12,15:00,Sheffield Utd,0.5,0–1,1.9,Crystal Palace,31194.0,Bramall Lane,John Brooks,Match Report,,https://fbref.com/en/matches/55fd92c7/Sheffiel...,3
4,1.0,Sat,2023-08-12,15:00,Brighton,4.0,4–1,1.5,Luton Town,31872.0,The American Express Community Stadium,David Coote,Match Report,,https://fbref.com/en/matches/56a137f7/Brighton...,4


In [350]:
df = df.drop('MatchReport', axis=1)
df = df.drop('Notes', axis=1)

df['Split Score'] = df['Score'].str.split('–')
df['Home Score'] = df['Split Score'].str[0]
df['Away Score'] = df['Split Score'].str[1]
df = df.drop('Split Score', axis=1)

def get_winner(row):
    # Replace any non-standard dash with a standard hyphen and split scores
    home_score = row['Home Score']
    away_score = row['Away Score']    
    
    # Determine the winner
    if home_score > away_score:
        return 'Home'
    elif home_score < away_score:
        return 'Away'
    else:
        return "Draw"
    
# Apply the function to each row and create a new column 'Winner'
df["Winner"] = df.apply(get_winner, axis=1)

In [351]:
df

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee,MatchReportURL,id,Home Score,Away Score,Winner
0,1.0,Fri,2023-08-11,20:00,Burnley,0.3,0–3,1.9,Manchester City,21572.0,Turf Moor,Craig Pawson,https://fbref.com/en/matches/3a6836b4/Burnley-...,1,0,3,Away
1,1.0,Sat,2023-08-12,12:30,Arsenal,0.8,2–1,1.2,Nott'ham Forest,59984.0,Emirates Stadium,Michael Oliver,https://fbref.com/en/matches/26a7f90c/Arsenal-...,2,2,1,Home
2,1.0,Sat,2023-08-12,15:00,Everton,2.7,0–1,1.5,Fulham,39940.0,Goodison Park,Stuart Attwell,https://fbref.com/en/matches/15addfc7/Everton-...,3,0,1,Away
3,1.0,Sat,2023-08-12,15:00,Sheffield Utd,0.5,0–1,1.9,Crystal Palace,31194.0,Bramall Lane,John Brooks,https://fbref.com/en/matches/55fd92c7/Sheffiel...,4,0,1,Away
4,1.0,Sat,2023-08-12,15:00,Brighton,4.0,4–1,1.5,Luton Town,31872.0,The American Express Community Stadium,David Coote,https://fbref.com/en/matches/56a137f7/Brighton...,5,4,1,Home
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418,38.0,Sun,2024-05-19,16:00,Brentford,1.1,2–4,3.4,Newcastle Utd,17124.0,Gtech Community Stadium,Simon Hooper,https://fbref.com/en/matches/546e1a3d/Brentfor...,376,2,4,Away
419,38.0,Sun,2024-05-19,16:00,Chelsea,1.1,2–1,2.2,Bournemouth,39724.0,Stamford Bridge,Anthony Taylor,https://fbref.com/en/matches/7c034003/Chelsea-...,377,2,1,Home
420,38.0,Sun,2024-05-19,16:00,Crystal Palace,2.5,5–0,0.9,Aston Villa,25191.0,Selhurst Park,Darren Bond,https://fbref.com/en/matches/c975c7a6/Crystal-...,378,5,0,Home
421,38.0,Sun,2024-05-19,16:00,Liverpool,4.5,2–0,0.5,Wolves,60059.0,Anfield,Chris Kavanagh,https://fbref.com/en/matches/d4823ed5/Liverpoo...,379,2,0,Home


In [348]:
df.loc[0]['MatchReportURL']

'https://fbref.com/en/matches/3a6836b4/Burnley-Manchester-City-August-11-2023-Premier-League'

In [None]:
import time


for match_id in range(1,11):
    time.sleep(60)
    url = df.loc[match_id]['MatchReportURL']

    response = requests.get(url)
    print(response)
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table')
    data = []
    num_tables = len(tables)

    headers_list = []
    footer_list = []

    for i, table in enumerate(tables):
        
        # Find the thead, tbody, and tfoot (single occurrences, so use find instead of find_all)
        head = table.find('thead')
        footer = table.find('tfoot')

        # Extract and print the headers
        if head:
            headers = [header.text.strip() for header in head.find_all('th') if header.text.strip() != '']

            # Add "H" to the headers for the first half of the tables
            if i < num_tables // 2:
                headers = ['H' + header for header in headers]
            # Add "A" to the headers for the second half of the tables
            else:
                headers = ['A' + header for header in headers]    
        # Extract and print the footer (if present)
        if footer:
            footer_data = [footer_cell.text.strip() for footer_cell in footer.find_all('td') if footer_cell.text.strip() != '']

            headers_list += headers[-len(footer_data):][::1]
            footer_list += footer_data

    sub_df = pd.DataFrame([footer_list], columns=headers_list)
    sub_df['id'] = match_id

    sub_df

In [361]:
merged_df = pd.merge(df, sub_df, on='id', how='inner')

In [364]:
merged_df

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,...,ACrs,AInt,ATklW,APKwon,APKcon,AOG,ARecov,AWon,ALost,AWon%
0,1.0,Sat,2023-08-12,15:00,Everton,2.7,0–1,1.5,Fulham,39940.0,...,11,4,4,0,0,0,43,14,9,60.9


In [249]:
table = soup.find('table', id='keeper_stats_943e8050')

# Extract and print headers (from the first <thead>)
headers = [header.text.strip() for header in table.find_all('th')]
print("Headers:", headers)

# Extract and print subheaders (from the second <thead>)
subheaders = []
second_header = table.find_all('thead')[0]  # Access the second <thead>
for th in second_header.find_all('th'):
    subheaders.append(th.text.strip())

print("Subheaders:", subheaders)


Headers: ['', 'Shot Stopping', 'Launched', 'Passes', 'Goal Kicks', 'Crosses', 'Sweeper', 'Player', 'Nation', 'Age', 'Min', 'SoTA', 'GA', 'Saves', 'Save%', 'PSxG', 'Cmp', 'Att', 'Cmp%', 'Att (GK)', 'Thr', 'Launch%', 'AvgLen', 'Att', 'Launch%', 'AvgLen', 'Opp', 'Stp', 'Stp%', '#OPA', 'AvgDist', 'James Trafford']
Subheaders: ['', 'Shot Stopping', 'Launched', 'Passes', 'Goal Kicks', 'Crosses', 'Sweeper', 'Player', 'Nation', 'Age', 'Min', 'SoTA', 'GA', 'Saves', 'Save%', 'PSxG', 'Cmp', 'Att', 'Cmp%', 'Att (GK)', 'Thr', 'Launch%', 'AvgLen', 'Att', 'Launch%', 'AvgLen', 'Opp', 'Stp', 'Stp%', '#OPA', 'AvgDist']
