In [1]:
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from pathlib import Path
import pandas as pd
from io import StringIO


In [2]:
url_df = 'https://fbref.com/en/comps/9/2022-2023/schedule/2022-2023-Premier-League-Scores-and-Fixtures'

In [3]:
# Request page content and parse with BeautifulSoup
response = requests.get(url_df)
if response.status_code != 200:
    raise Exception(f"Failed to fetch data from {url_df}. HTTP Status Code: {response.status_code}")
    
soup = BeautifulSoup(response.content, 'html.parser')

# Locate the table and read it into pandas
table = soup.find('table')
if table is None:
    raise Exception("No table found on the webpage.")

# Convert the table to a string and read it using pandas
df = pd.read_html(str(table))[0]

# Flatten multi-level columns and clean them up
df.columns = [' '.join(col).strip() if isinstance(col, tuple) else col.strip() for col in df.columns]
df = df.reset_index(drop=True)

# Remove spaces in column names and drop fully empty rows
df.columns = [col.replace(" ", "_") for col in df.columns]
df = df.dropna(how='all')

# Extract hyperlinks in the "Match Report" column, if available
match_report_links = []
for row in table.find_all('tr')[1:]:  # Skip header row
    link = row.find('td', {'data-stat': 'match_report'})
    if link and link.find('a'):
        match_report_links.append('https://fbref.com' + link.find('a')['href'])

# Add hyperlinks as a new column
df['MatchReportURL'] = match_report_links

# Assign unique Match_IDs
df['Match_ID'] = range(len(df))

# Drop unnecessary columns
df = df.drop(columns=['Match_Report', 'Notes','Referee','Attendance','Venue'], errors='ignore')

# Split the score into Home and Away scores
df['Split_Score'] = df['Score'].str.split('–')  # Ensure dash is a standard hyphen
df['Home_Score'] = df['Split_Score'].str[0].astype(int)
df['Away_Score'] = df['Split_Score'].str[1].astype(int)
df = df.drop(columns=['Split_Score'], errors='ignore')

# Function to determine the winner
def get_winner(row):
    home_score = row['Home_Score']
    away_score = row['Away_Score']
    if home_score > away_score:
        return 'Home'
    elif home_score < away_score:
        return 'Away'
    else:
        return 'Draw'

# Apply the function to determine winners
df['Winner'] = df.apply(get_winner, axis=1)

# Display the DataFrame
df = df.rename(columns={'xG': 'HxG', 
                        'xG.1': 'AxG'})

df = df.reset_index(drop=True)


  df = pd.read_html(str(table))[0]


In [4]:
df

Unnamed: 0,Wk,Day,Date,Time,Home,HxG,Score,AxG,Away,MatchReportURL,Match_ID,Home_Score,Away_Score,Winner
0,1.0,Fri,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,https://fbref.com/en/matches/e62f6e78/Crystal-...,0,0,2,Away
1,1.0,Sat,2022-08-06,12:30,Fulham,1.2,2–2,1.2,Liverpool,https://fbref.com/en/matches/6713c1dc/Fulham-L...,1,2,2,Draw
2,1.0,Sat,2022-08-06,15:00,Tottenham,1.5,4–1,0.5,Southampton,https://fbref.com/en/matches/09d8a999/Tottenha...,2,4,1,Home
3,1.0,Sat,2022-08-06,15:00,Newcastle Utd,1.7,2–0,0.3,Nott'ham Forest,https://fbref.com/en/matches/1ac96eb4/Newcastl...,3,2,0,Home
4,1.0,Sat,2022-08-06,15:00,Leeds United,0.8,2–1,1.3,Wolves,https://fbref.com/en/matches/82702941/Leeds-Un...,4,2,1,Home
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,38.0,Sun,2023-05-28,16:30,Everton,1.0,1–0,0.5,Bournemouth,https://fbref.com/en/matches/94de848f/Everton-...,375,1,0,Home
376,38.0,Sun,2023-05-28,16:30,Leicester City,1.4,2–1,1.4,West Ham,https://fbref.com/en/matches/a96c9915/Leiceste...,376,2,1,Home
377,38.0,Sun,2023-05-28,16:30,Aston Villa,2.8,2–1,1.4,Brighton,https://fbref.com/en/matches/ac0e65e2/Aston-Vi...,377,2,1,Home
378,38.0,Sun,2023-05-28,16:30,Leeds United,1.5,1–4,2.2,Tottenham,https://fbref.com/en/matches/c9c73ddd/Leeds-Un...,378,1,4,Away


In [5]:
# Initialize an empty DataFrame for aggregated results
agg_df = pd.DataFrame()

# Example MatchReportURL and match_id (replace with your actual data)
url = df.loc[0, 'MatchReportURL']

# Iterate through each match in the DataFrame
for link in tqdm(range(0, 380), desc="Processing Match IDs"):

    url = df.loc[link, 'MatchReportURL']
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve data for match ID {link}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table')
    num_tables = len(tables)

    headers_list = []
    footer_list = []

    # Process each table
    for i, table in enumerate(tables):
        head = table.find('thead')
        footer = table.find('tfoot')
        
        # Determine home or away team based on table index
        team_side = "H" if i < num_tables // 2 else "A"

        # Extract super-header and sub-header
        if head:
            rows = head.find_all('tr')
            if len(rows) >= 2:
                super_headers = [th.text.strip() for th in rows[0].find_all('th') if th.text.strip()]
                sub_headers = [th.text.strip() for th in rows[1].find_all('th') if th.text.strip()]
                
                # Expand super-headers based on colspan
                expanded_super_headers = []
                for th in rows[0].find_all('th'):
                    colspan = int(th.get("colspan", 1))
                    expanded_super_headers.extend([th.text.strip()] * colspan)
                
                # Combine super-header and sub-header with team side
                combined_headers = [f"{team_side}_{super_}_{sub}" for super_, sub in zip(expanded_super_headers, sub_headers)]
            else:
                combined_headers = [f"{team_side}_{th.text.strip()}" for th in rows[0].find_all('th') if th.text.strip()]
        
        # Extract footer data
        if footer:
            footer_data = [cell.text.strip() for cell in footer.find_all('td') if cell.text.strip()]
            if len(combined_headers) >= len(footer_data):
                headers_list += combined_headers[-len(footer_data):]  # Match last headers to footer data
                footer_list += footer_data

    # Create a sub DataFrame
    if headers_list and footer_list:
        sub_df = pd.DataFrame([footer_list], columns=headers_list)
        sub_df['Match_ID'] = link
        
        # Concatenate to the aggregate DataFrame
        agg_df = pd.concat([agg_df, sub_df], ignore_index=True)

    # Add a delay to avoid overwhelming the server
    time.sleep(30)


# Display aggregated DataFrame
# Define a dictionary to map old column names to new ones
column_renames = {
    'H__Tkl+Int': 'H_Defensive_Tkl+Int',
    'A__Tkl+Int': 'A_Defensive_Tkl+Int',
    'H__xA': 'H_Passing_xA',
    'A__xA': 'A_Passing_xA',
    'H__Clr': 'H_Defensive_Clr',
    'A__Clr': 'A_Defensive_Clr',
    'H__PPA': 'H_Passing_PPA',
    'A__PPA': 'A_Passing_PPA',
    'H__KP': 'H_Passing_KP',
    'A__KP': 'A_Passing_KP',
    'H__1/3': 'H_Passing_1/3',
    'A__1/3': 'A_Passing_1/3',
    'H__Min': 'H_Min'
    # Add more column renaming rules as needed
}



Processing Match IDs:  53%|█████▎    | 203/380 [1:47:05<1:24:44, 28.73s/it]

Failed to retrieve data for match ID 202


Processing Match IDs:  54%|█████▎    | 204/380 [1:47:40<1:29:44, 30.59s/it]

Failed to retrieve data for match ID 203


Processing Match IDs:  54%|█████▍    | 205/380 [1:48:12<1:30:57, 31.18s/it]

Failed to retrieve data for match ID 204


Processing Match IDs:  54%|█████▍    | 206/380 [1:48:47<1:33:58, 32.41s/it]

Failed to retrieve data for match ID 205


Processing Match IDs: 100%|██████████| 380/380 [3:20:56<00:00, 31.73s/it]  


In [6]:
filepath = Path('final_2023.csv')  
agg_df.to_csv(filepath, index=False)

In [19]:
agg_df = pd.read_csv('final_2023.csv')

In [9]:
# Initialize an empty DataFrame for aggregated results
missing_df = pd.DataFrame()

# Example MatchReportURL and match_id (replace with your actual data)
url = df.loc[0, 'MatchReportURL']

# Iterate through each match in the DataFrame
for link in tqdm([202,203,204,205], desc="Processing Match IDs"):

    url = df.loc[link, 'MatchReportURL']
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve data for match ID {link}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table')
    num_tables = len(tables)

    headers_list = []
    footer_list = []

    # Process each table
    for i, table in enumerate(tables):
        head = table.find('thead')
        footer = table.find('tfoot')
        
        # Determine home or away team based on table index
        team_side = "H" if i < num_tables // 2 else "A"

        # Extract super-header and sub-header
        if head:
            rows = head.find_all('tr')
            if len(rows) >= 2:
                super_headers = [th.text.strip() for th in rows[0].find_all('th') if th.text.strip()]
                sub_headers = [th.text.strip() for th in rows[1].find_all('th') if th.text.strip()]
                
                # Expand super-headers based on colspan
                expanded_super_headers = []
                for th in rows[0].find_all('th'):
                    colspan = int(th.get("colspan", 1))
                    expanded_super_headers.extend([th.text.strip()] * colspan)
                
                # Combine super-header and sub-header with team side
                combined_headers = [f"{team_side}_{super_}_{sub}" for super_, sub in zip(expanded_super_headers, sub_headers)]
            else:
                combined_headers = [f"{team_side}_{th.text.strip()}" for th in rows[0].find_all('th') if th.text.strip()]
        
        # Extract footer data
        if footer:
            footer_data = [cell.text.strip() for cell in footer.find_all('td') if cell.text.strip()]
            if len(combined_headers) >= len(footer_data):
                headers_list += combined_headers[-len(footer_data):]  # Match last headers to footer data
                footer_list += footer_data

    # Create a sub DataFrame
    if headers_list and footer_list:
        sub_df = pd.DataFrame([footer_list], columns=headers_list)
        sub_df['Match_ID'] = link
        
        # Concatenate to the aggregate DataFrame
        missing_df = pd.concat([missing_df, sub_df], ignore_index=True)

    # Add a delay to avoid overwhelming the server
    time.sleep(30)


# Display aggregated DataFrame
# Define a dictionary to map old column names to new ones
column_renames = {
    'H__Tkl+Int': 'H_Defensive_Tkl+Int',
    'A__Tkl+Int': 'A_Defensive_Tkl+Int',
    'H__xA': 'H_Passing_xA',
    'A__xA': 'A_Passing_xA',
    'H__Clr': 'H_Defensive_Clr',
    'A__Clr': 'A_Defensive_Clr',
    'H__PPA': 'H_Passing_PPA',
    'A__PPA': 'A_Passing_PPA',
    'H__KP': 'H_Passing_KP',
    'A__KP': 'A_Passing_KP',
    'H__1/3': 'H_Passing_1/3',
    'A__1/3': 'A_Passing_1/3',
    'H__Min': 'H_Min'
    # Add more column renaming rules as needed
}

# Apply the renaming
missing_df = missing_df.rename(columns=column_renames)

Processing Match IDs: 100%|██████████| 4/4 [02:07<00:00, 31.84s/it]


In [22]:
merged_df = df.merge(missing_df)

In [33]:
missing_df

Unnamed: 0,H_Min,H_Performance_Gls,H_Performance_Ast,H_Performance_PK,H_Performance_PKatt,H_Performance_Sh,H_Performance_SoT,H_Performance_CrdY,H_Performance_CrdR,H_Performance_Touches,...,A_Performance_Int,A_Performance_TklW,A_Performance_PKwon,A_Performance_PKcon,A_Performance_OG,A_Performance_Recov,A_Aerial Duels_Won,A_Aerial Duels_Lost,A_Aerial Duels_Won%,Match_ID
0,990,2,1,0,0,12,5,1,0,558,...,10,7,0,0,1,59,16,14,53.3,202
1,969,2,0,1,1,13,4,1,1,648,...,13,9,0,1,0,51,8,11,42.1,203
2,990,1,1,0,0,20,6,3,0,824,...,9,12,0,0,0,43,16,7,69.6,204
3,990,3,3,0,0,14,3,1,0,554,...,17,10,0,0,0,66,25,20,55.6,205


In [40]:
missing_df

Unnamed: 0,H_Min,H_Performance_Gls,H_Performance_Ast,H_Performance_PK,H_Performance_PKatt,H_Performance_Sh,H_Performance_SoT,H_Performance_CrdY,H_Performance_CrdR,H_Performance_Touches,...,A_Performance_Int,A_Performance_TklW,A_Performance_PKwon,A_Performance_PKcon,A_Performance_OG,A_Performance_Recov,A_Aerial Duels_Won,A_Aerial Duels_Lost,A_Aerial Duels_Won%,Match_ID
0,990,2,1,0,0,12,5,1,0,558,...,10,7,0,0,1,59,16,14,53.3,202
1,969,2,0,1,1,13,4,1,1,648,...,13,9,0,1,0,51,8,11,42.1,203
2,990,1,1,0,0,20,6,3,0,824,...,9,12,0,0,0,43,16,7,69.6,204
3,990,3,3,0,0,14,3,1,0,554,...,17,10,0,0,0,66,25,20,55.6,205


In [41]:
agg_df

Unnamed: 0,H__Min,H_Performance_Gls,H_Performance_Ast,H_Performance_PK,H_Performance_PKatt,H_Performance_Sh,H_Performance_SoT,H_Performance_CrdY,H_Performance_CrdR,H_Performance_Touches,...,A_Performance_Int.1,A_Performance_TklW,A_Performance_PKwon,A_Performance_PKcon,A_Performance_OG,A_Performance_Recov,A_Aerial Duels_Won,A_Aerial Duels_Lost,A_Aerial Duels_Won%,Match_ID
0,990,0,0,0,0,10,2,1,0,726,...,9,16,0,0,0,55,14,10,58.3,0
1,990,2,1,1,1,8,2,2,0,474,...,10,9,0,1,0,63,13,23,36.1,1
2,990,3,3,0,0,18,8,3,0,709,...,13,6,0,0,1,56,11,13,45.8,2
3,990,2,1,0,0,23,9,0,0,634,...,10,11,0,0,0,52,16,12,57.1,3
4,990,1,1,0,0,12,4,2,0,515,...,14,11,0,0,1,65,7,9,43.8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,990,1,0,0,0,13,6,1,0,469,...,6,7,0,0,0,53,18,21,46.2,375
372,990,2,2,0,0,13,4,1,0,676,...,3,10,0,0,0,53,5,11,31.3,376
373,990,2,2,0,0,12,5,4,0,413,...,3,2,0,0,0,46,11,12,47.8,377
374,990,1,1,0,0,19,2,3,0,636,...,12,12,0,0,0,67,25,30,45.5,378


In [39]:
agg_df.reset_index(drop=True)

Unnamed: 0,H__Min,H_Performance_Gls,H_Performance_Ast,H_Performance_PK,H_Performance_PKatt,H_Performance_Sh,H_Performance_SoT,H_Performance_CrdY,H_Performance_CrdR,H_Performance_Touches,...,A_Performance_Int.1,A_Performance_TklW,A_Performance_PKwon,A_Performance_PKcon,A_Performance_OG,A_Performance_Recov,A_Aerial Duels_Won,A_Aerial Duels_Lost,A_Aerial Duels_Won%,Match_ID
0,990,0,0,0,0,10,2,1,0,726,...,9,16,0,0,0,55,14,10,58.3,0
1,990,2,1,1,1,8,2,2,0,474,...,10,9,0,1,0,63,13,23,36.1,1
2,990,3,3,0,0,18,8,3,0,709,...,13,6,0,0,1,56,11,13,45.8,2
3,990,2,1,0,0,23,9,0,0,634,...,10,11,0,0,0,52,16,12,57.1,3
4,990,1,1,0,0,12,4,2,0,515,...,14,11,0,0,1,65,7,9,43.8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,990,1,0,0,0,13,6,1,0,469,...,6,7,0,0,0,53,18,21,46.2,375
372,990,2,2,0,0,13,4,1,0,676,...,3,10,0,0,0,53,5,11,31.3,376
373,990,2,2,0,0,12,5,4,0,413,...,3,2,0,0,0,46,11,12,47.8,377
374,990,1,1,0,0,19,2,3,0,636,...,12,12,0,0,0,67,25,30,45.5,378


In [44]:
result = pd.concat([agg_df.reset_index(drop=True), missing_df.reset_index(drop=True)], ignore_index=True)

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [26]:
agg_df + missing_df

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [18]:
agg_df

Unnamed: 0,H__Min,H_Performance_Gls,H_Performance_Ast,H_Performance_PK,H_Performance_PKatt,H_Performance_Sh,H_Performance_SoT,H_Performance_CrdY,H_Performance_CrdR,H_Performance_Touches,...,A_Performance_Int.1,A_Performance_TklW,A_Performance_PKwon,A_Performance_PKcon,A_Performance_OG,A_Performance_Recov,A_Aerial Duels_Won,A_Aerial Duels_Lost,A_Aerial Duels_Won%,Match_ID
0,990,0,0,0,0,10,2,1,0,726,...,9,16,0,0,0,55,14,10,58.3,0
1,990,2,1,1,1,8,2,2,0,474,...,10,9,0,1,0,63,13,23,36.1,1
2,990,3,3,0,0,18,8,3,0,709,...,13,6,0,0,1,56,11,13,45.8,2
3,990,2,1,0,0,23,9,0,0,634,...,10,11,0,0,0,52,16,12,57.1,3
4,990,1,1,0,0,12,4,2,0,515,...,14,11,0,0,1,65,7,9,43.8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,990,1,0,0,0,13,6,1,0,469,...,6,7,0,0,0,53,18,21,46.2,375
372,990,2,2,0,0,13,4,1,0,676,...,3,10,0,0,0,53,5,11,31.3,376
373,990,2,2,0,0,12,5,4,0,413,...,3,2,0,0,0,46,11,12,47.8,377
374,990,1,1,0,0,19,2,3,0,636,...,12,12,0,0,0,67,25,30,45.5,378


In [16]:
agg_df2 = pd.concat([agg_df, merged_df])

In [17]:
agg_df2

Unnamed: 0,H__Min,H_Performance_Gls,H_Performance_Ast,H_Performance_PK,H_Performance_PKatt,H_Performance_Sh,H_Performance_SoT,H_Performance_CrdY,H_Performance_CrdR,H_Performance_Touches,...,Time,Home,HxG,Score,AxG,Away,MatchReportURL,Home_Score,Away_Score,Winner
0,990,0,0,0,0,10,2,1,0,726,...,,,,,,,,,,
1,990,2,1,1,1,8,2,2,0,474,...,,,,,,,,,,
2,990,3,3,0,0,18,8,3,0,709,...,,,,,,,,,,
3,990,2,1,0,0,23,9,0,0,634,...,,,,,,,,,,
4,990,1,1,0,0,12,4,2,0,515,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,990,1,0,0,0,13,6,1,0,469,...,16:30,Everton,1.0,1–0,0.5,Bournemouth,https://fbref.com/en/matches/94de848f/Everton-...,1.0,0.0,Home
372,990,2,2,0,0,13,4,1,0,676,...,16:30,Leicester City,1.4,2–1,1.4,West Ham,https://fbref.com/en/matches/a96c9915/Leiceste...,2.0,1.0,Home
373,990,2,2,0,0,12,5,4,0,413,...,16:30,Aston Villa,2.8,2–1,1.4,Brighton,https://fbref.com/en/matches/ac0e65e2/Aston-Vi...,2.0,1.0,Home
374,990,1,1,0,0,19,2,3,0,636,...,16:30,Leeds United,1.5,1–4,2.2,Tottenham,https://fbref.com/en/matches/c9c73ddd/Leeds-Un...,1.0,4.0,Away


In [14]:
merged_df = df.merge(agg_df)

In [15]:
merged_df

Unnamed: 0,Wk,Day,Date,Time,Home,HxG,Score,AxG,Away,MatchReportURL,...,A_Performance_Crs,A_Performance_Int.1,A_Performance_TklW,A_Performance_PKwon,A_Performance_PKcon,A_Performance_OG,A_Performance_Recov,A_Aerial Duels_Won,A_Aerial Duels_Lost,A_Aerial Duels_Won%
0,1.0,Fri,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,https://fbref.com/en/matches/e62f6e78/Crystal-...,...,11,9,16,0,0,0,55,14,10,58.3
1,1.0,Sat,2022-08-06,12:30,Fulham,1.2,2–2,1.2,Liverpool,https://fbref.com/en/matches/6713c1dc/Fulham-L...,...,18,10,9,0,1,0,63,13,23,36.1
2,1.0,Sat,2022-08-06,15:00,Tottenham,1.5,4–1,0.5,Southampton,https://fbref.com/en/matches/09d8a999/Tottenha...,...,18,13,6,0,0,1,56,11,13,45.8
3,1.0,Sat,2022-08-06,15:00,Newcastle Utd,1.7,2–0,0.3,Nott'ham Forest,https://fbref.com/en/matches/1ac96eb4/Newcastl...,...,9,10,11,0,0,0,52,16,12,57.1
4,1.0,Sat,2022-08-06,15:00,Leeds United,0.8,2–1,1.3,Wolves,https://fbref.com/en/matches/82702941/Leeds-Un...,...,20,14,11,0,0,1,65,7,9,43.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,38.0,Sun,2023-05-28,16:30,Everton,1.0,1–0,0.5,Bournemouth,https://fbref.com/en/matches/94de848f/Everton-...,...,15,6,7,0,0,0,53,18,21,46.2
372,38.0,Sun,2023-05-28,16:30,Leicester City,1.4,2–1,1.4,West Ham,https://fbref.com/en/matches/a96c9915/Leiceste...,...,27,3,10,0,0,0,53,5,11,31.3
373,38.0,Sun,2023-05-28,16:30,Aston Villa,2.8,2–1,1.4,Brighton,https://fbref.com/en/matches/ac0e65e2/Aston-Vi...,...,7,3,2,0,0,0,46,11,12,47.8
374,38.0,Sun,2023-05-28,16:30,Leeds United,1.5,1–4,2.2,Tottenham,https://fbref.com/en/matches/c9c73ddd/Leeds-Un...,...,6,12,12,0,0,0,67,25,30,45.5


In [29]:
df_sorted = agg_df2.sort_values(by='Match_ID')

# If you want to reset the index after sorting
df_sorted = df_sorted.reset_index(drop=True)

In [30]:
df_sorted

Unnamed: 0,Wk,Day,Date,Time,Home,HxG,Score,AxG,Away,MatchReportURL,...,A_Carries_Dis,A_Receiving_Rec,A_Receiving_PrgR,A_Performance_2CrdY,A_Performance_Fls,A_Performance_Fld,A_Performance_PKwon,A_Performance_OG,A_Performance_Recov,A_Aerial Duels_Won%
0,1.0,Fri,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,https://fbref.com/en/matches/e62f6e78/Crystal-...,...,15,376,39,0,11,16,0,0,55,58.3
1,1.0,Sat,2022-08-06,12:30,Fulham,1.2,2–2,1.2,Liverpool,https://fbref.com/en/matches/6713c1dc/Fulham-L...,...,13,510,52,0,9,7,0,0,63,36.1
2,1.0,Sat,2022-08-06,15:00,Tottenham,1.5,4–1,0.5,Southampton,https://fbref.com/en/matches/09d8a999/Tottenha...,...,17,349,30,0,6,11,0,1,56,45.8
3,1.0,Sat,2022-08-06,15:00,Newcastle Utd,1.7,2–0,0.3,Nott'ham Forest,https://fbref.com/en/matches/1ac96eb4/Newcastl...,...,8,243,16,0,14,9,0,0,52,57.1
4,1.0,Sat,2022-08-06,15:00,Leeds United,0.8,2–1,1.3,Wolves,https://fbref.com/en/matches/82702941/Leeds-Un...,...,7,501,44,0,9,13,0,1,65,43.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,38.0,Sun,2023-05-28,16:30,Everton,1.0,1–0,0.5,Bournemouth,https://fbref.com/en/matches/94de848f/Everton-...,...,5,474,40,0,12,11,0,0,53,46.2
376,38.0,Sun,2023-05-28,16:30,Leicester City,1.4,2–1,1.4,West Ham,https://fbref.com/en/matches/a96c9915/Leiceste...,...,7,516,54,0,10,8,0,0,53,31.3
377,38.0,Sun,2023-05-28,16:30,Aston Villa,2.8,2–1,1.4,Brighton,https://fbref.com/en/matches/ac0e65e2/Aston-Vi...,...,9,365,33,0,16,15,0,0,46,47.8
378,38.0,Sun,2023-05-28,16:30,Leeds United,1.5,1–4,2.2,Tottenham,https://fbref.com/en/matches/c9c73ddd/Leeds-Un...,...,9,359,28,0,5,7,0,0,67,45.5


In [31]:
filepath = Path('final_2023.csv')  
df_sorted.to_csv(filepath, index=False)