In [1]:
import requests
from bs4 import BeautifulSoup

# URL of the page to scrape
url = "https://fbref.com/en/comps/676/2021/schedule/2021-European-Championship-Scores-and-Fixtures"

# Send a request to the page
response = requests.get(url)

# Parse the HTML content of the page
soup = BeautifulSoup(response.content, 'html.parser')

# Find the table with id 'sched_all'
table = soup.find('table', id='sched_all')

# List to store match report links
match_report_links = []

# Check if the table exists
if table:
    print("Table found!")
    # Find all rows in the table
    rows = table.find_all('tr')
    
    print(f"Found {len(rows)} rows")
    
    # Loop through each row and extract the match report link
    for row in rows:
        match_report_cell = row.find('td', attrs={'data-stat': 'match_report'})
        if match_report_cell:
            link = match_report_cell.find('a', href=True)
            if link:
                match_report_links.append("https://fbref.com" + link['href'])
else:
    print("Table not found!")

# Print all match report links
for link in match_report_links:
    print(link)

# Save the links to a file
with open('match_report_links_historical.txt', 'w') as f:
    for link in match_report_links:
        f.write(f"{link}\n")


Table found!
Found 55 rows
https://fbref.com/en/matches/95a9ebd1/Turkiye-Italy-June-11-2021-European-Championship
https://fbref.com/en/matches/d9eaa85c/Wales-Switzerland-June-12-2021-European-Championship
https://fbref.com/en/matches/c3c2ffa2/Denmark-Finland-June-12-2021-European-Championship
https://fbref.com/en/matches/e594174b/Belgium-Russia-June-12-2021-European-Championship
https://fbref.com/en/matches/764c27dc/England-Croatia-June-13-2021-European-Championship
https://fbref.com/en/matches/b47a0ea6/Austria-North-Macedonia-June-13-2021-European-Championship
https://fbref.com/en/matches/0e9919a5/Netherlands-Ukraine-June-13-2021-European-Championship
https://fbref.com/en/matches/6599f4ab/Scotland-Czechia-June-14-2021-European-Championship
https://fbref.com/en/matches/d35ad7a8/Poland-Slovakia-June-14-2021-European-Championship
https://fbref.com/en/matches/107fd412/Spain-Sweden-June-14-2021-European-Championship
https://fbref.com/en/matches/ba500d70/Hungary-Portugal-June-15-2021-Europe

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Read match report links from a file
with open('match_report_links.txt', 'r') as file:
    match_report_links = file.readlines()

# List to store data for all players
all_players_data = []

# Function to extract player statistics from a table
def extract_player_stats(table):
    players_data = []
    rows = table.find('tbody').find_all('tr')
    print(f"Found {len(rows)} rows in the table.")
    for row in rows:
        player_data = {}
        player_data['name'] = row.find('th', {'data-stat': 'player'}).text.strip()
        player_data['pos'] = row.find('td', {'data-stat': 'position'}).text.strip() if row.find('td', {'data-stat': 'position'}) else 'GK'
        
        def get_int_value(stat):
            try:
                return int(row.find('td', {'data-stat': stat}).text.strip())
            except (ValueError, AttributeError):
                return 0

        player_data['minutes'] = get_int_value('minutes')
        player_data['goals'] = get_int_value('goals')
        player_data['assists'] = get_int_value('assists')
        player_data['pens_made'] = get_int_value('pens_made')
        player_data['pens_att'] = get_int_value('pens_att')
        player_data['shots'] = get_int_value('shots')
        player_data['shots_on_target'] = get_int_value('shots_on_target')
        player_data['cards_yellow'] = get_int_value('cards_yellow')
        player_data['cards_red'] = get_int_value('cards_red')
        player_data['fouls'] = get_int_value('fouls')
        player_data['fouled'] = get_int_value('fouled')
        player_data['offsides'] = get_int_value('offsides')
        player_data['crosses'] = get_int_value('crosses')
        player_data['tackles_won'] = get_int_value('tackles_won')
        player_data['interceptions'] = get_int_value('interceptions')
        player_data['own_goals'] = get_int_value('own_goals')
        player_data['pens_won'] = get_int_value('pens_won')
        player_data['pens_conceded'] = get_int_value('pens_conceded')
        
        players_data.append(player_data)
        print(f"Extracted data for player: {player_data['name']}")
    return players_data

# Function to extract goalkeeper statistics from a table
def extract_goalkeeper_stats(table):
    goalkeepers_data = []
    rows = table.find('tbody').find_all('tr')
    print(f"Found {len(rows)} rows in the goalkeeper table.")
    for row in rows:
        gk_data = {}
        gk_data['name'] = row.find('th', {'data-stat': 'player'}).text.strip()
        gk_data['pos'] = 'GK'
        
        def get_int_value(stat):
            try:
                return int(row.find('td', {'data-stat': stat}).text.strip())
            except (ValueError, AttributeError):
                return 0
        
        gk_data['minutes'] = get_int_value('minutes')
        gk_data['shots_on_target_against'] = get_int_value('gk_shots_on_target_against')
        gk_data['goals_against'] = get_int_value('gk_goals_against')
        gk_data['saves'] = get_int_value('gk_saves')
        gk_data['save%'] = get_int_value('gk_save_pct')
        
        goalkeepers_data.append(gk_data)
        print(f"Extracted data for goalkeeper: {gk_data['name']}")
    return goalkeepers_data

# Function to merge player and goalkeeper statistics by aggregating data
def merge_stats(all_players_data, new_data):
    for new_player in new_data:
        existing_player = next((player for player in all_players_data if player['name'] == new_player['name']), None)
        if existing_player:
            for key, value in new_player.items():
                if key in existing_player and key not in ['name', 'pos']:  # Do not sum the 'name' and 'pos' fields
                    existing_player[key] += value
                else:
                    existing_player[key] = value
        else:
            all_players_data.append(new_player)

# Loop through each match report link
for match_report_link in match_report_links:
    match_report_link = match_report_link.strip()
    print(f"Processing match report: {match_report_link}")
    
    # Fetch the match report page
    response = requests.get(match_report_link)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all tables on the page
    tables = soup.find_all('table')

    # Extract player and goalkeeper statistics from each table
    for table in tables:
        if table.find('caption') and 'Player Stats Table' in table.find('caption').text:
            players_data = extract_player_stats(table)
            merge_stats(all_players_data, players_data)
        elif table.find('caption') and 'goalkeeper stats' in table.find('caption').text.lower():
            goalkeepers_data = extract_goalkeeper_stats(table)
            merge_stats(all_players_data, goalkeepers_data)
    
    # Delay before processing the next link
    time.sleep(3)

# Create DataFrame from the combined data
df_players = pd.DataFrame(all_players_data)

# Print column names for debugging
print("Column names in DataFrame:", df_players.columns.tolist())

# Clean up data (convert numerical columns from strings to integers)
numerical_columns = ['minutes', 'goals', 'assists', 'pens_made', 'pens_att', 'shots', 'shots_on_target', 'cards_yellow', 'cards_red', 'fouls', 'fouled', 'offsides', 'crosses', 'tackles_won', 'interceptions', 'own_goals', 'pens_won', 'pens_conceded', 'shots_on_target_against', 'goals_against', 'saves', 'save%']
for column in numerical_columns:
    if column in df_players.columns:
        df_players[column] = pd.to_numeric(df_players[column], errors='coerce').fillna(0).astype(int)
    else:
        print(f"Column {column} is missing in the DataFrame.")

# Save the DataFrame to a CSV file
df_players.to_csv('players_stats.csv', index=False)

# Display the first few rows of the DataFrame
print(df_players.head())


FileNotFoundError: [Errno 2] No such file or directory: 'match_report_links.txt'