# In [141]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Load match report links from file (one URL per line).
# Blank lines are skipped so that an empty string is never handed to
# requests.get() later on.
with open('match_report_links.txt', 'r') as file:
    match_report_links = [line.strip() for line in file if line.strip()]

# List to store data for all players
all_players_data = []

# Function to extract player statistics from a table
def extract_player_stats(table):
    """Extract one dict of raw stat strings per player row of a stats table.

    Args:
        table: A parsed ``<table>`` element exposing ``find``; its ``<tbody>``
            rows are searched for cells by their ``data-stat`` attribute.

    Returns:
        A list of dicts, one per player row, with the keys ``name``, ``pos``
        and the stat names listed below (in that order). Values are the
        stripped cell text; a stat cell that is absent from the row yields
        an empty string. Rows without a ``th`` player cell are skipped, and a
        table without a ``<tbody>`` yields an empty list.
    """
    # (output key, data-stat attribute) for every <td> cell that is read.
    stat_fields = (
        ('pos', 'position'),
        ('minutes', 'minutes'),
        ('goals', 'goals'),
        ('assists', 'assists'),
        ('pens_made', 'pens_made'),
        ('pens_att', 'pens_att'),
        ('shots', 'shots'),
        ('shots_on_target', 'shots_on_target'),
        ('cards_yellow', 'cards_yellow'),
        ('cards_red', 'cards_red'),
        ('fouls', 'fouls'),
        ('fouled', 'fouled'),
        ('offsides', 'offsides'),
        ('crosses', 'crosses'),
        ('tackles_won', 'tackles_won'),
        ('interceptions', 'interceptions'),
        ('own_goals', 'own_goals'),
        ('pens_won', 'pens_won'),
        ('pens_conceded', 'pens_conceded'),
    )

    # find() returns None when the element is absent; guard against it
    # instead of failing with AttributeError.
    body = table.find('tbody')
    rows = body.find_all('tr') if body is not None else []
    print(f"Found {len(rows)} rows in the table.")

    players_data = []
    for row in rows:
        name_cell = row.find('th', {'data-stat': 'player'})
        if name_cell is None:
            # Not a player row (no player header cell): nothing to record.
            continue

        player_data = {'name': name_cell.text.strip()}
        for key, data_stat in stat_fields:
            cell = row.find('td', {'data-stat': data_stat})
            # A missing cell becomes '' rather than aborting the whole scrape.
            player_data[key] = cell.text.strip() if cell is not None else ''

        players_data.append(player_data)
        print(f"Extracted data for player: {player_data['name']}")
    return players_data

# Iterate through each match report link
for link in match_report_links:
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(link, timeout=30)
        # Error responses (for example a rate-limit page) carry no stats
        # tables; report them instead of silently parsing them as "no data".
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"Skipping {link}: request failed ({error})")
        # Still pause before the next request to avoid hitting the rate limit
        time.sleep(3)
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # Print raw HTML content of the page for debugging
    print(f"Raw HTML content for {link}:\n")
    print(soup.prettify()[:2000])  # Print the first 2000 characters for inspection

    # Find all tables on the page
    tables = soup.find_all('table')

    # Print number of tables found for debugging
    print(f"Found {len(tables)} tables in the page.")

    # Extract player statistics from each table whose caption marks it as a
    # player stats table
    for table in tables:
        caption = table.find('caption')  # looked up once, reused below
        if caption is not None and 'Player Stats Table' in caption.text:
            all_players_data.extend(extract_player_stats(table))

    # Delay to avoid hitting the rate limit
    time.sleep(3)

# Build a single DataFrame out of every player record collected so far
df_players = pd.DataFrame(all_players_data)

# Show which columns were produced (debugging aid)
print("Column names in DataFrame:", df_players.columns.tolist())

# The stat columns hold text; convert each one to int, counting any value
# that cannot be parsed as a number as 0
numerical_columns = [
    'minutes', 'goals', 'assists', 'pens_made', 'pens_att', 'shots',
    'shots_on_target', 'cards_yellow', 'cards_red', 'fouls', 'fouled',
    'offsides', 'crosses', 'tackles_won', 'interceptions', 'own_goals',
    'pens_won', 'pens_conceded',
]
for column in numerical_columns:
    if column not in df_players.columns:
        print(f"Column {column} is missing in the DataFrame.")
        continue
    parsed_values = pd.to_numeric(df_players[column], errors='coerce')
    df_players[column] = parsed_values.fillna(0).astype(int)

# Write the cleaned table to disk without the index column
df_players.to_csv('players_stats.csv', index=False)

# Preview the first few rows
print(df_players.head())


# In [157]:
import pandas as pd

# Load the data into a pandas DataFrame.
# 'pos' is forced to text so it is never inferred as a numeric column.
data = pd.read_csv('players_stats.csv', dtype={'pos': str})

# A blank position must stay a string: filling it with the integer 0 would
# make the ','.join() below raise TypeError.
data['pos'] = data['pos'].fillna('')

# Fill the remaining missing values with 0
data = data.fillna(0)

# Aggregate the data by summing the numeric columns for each player
aggregated_data = data.groupby('name').sum(numeric_only=True).reset_index()


def _unique_positions(values):
    """Return the distinct position codes in *values* as one comma-joined string.

    Each value may itself be a comma-separated list of positions, so values
    are split before de-duplicating. Blank entries are dropped and the result
    is sorted so the output is identical from run to run.
    """
    codes = {code.strip() for value in values for code in value.split(',')}
    codes.discard('')
    return ','.join(sorted(codes))


# For positions, we'll take the unique positions each player has been listed as
positions = data.groupby('name')['pos'].apply(_unique_positions).reset_index()

# Merge the aggregated numeric data with the unique positions
aggregated_data = pd.merge(aggregated_data, positions, on='name')

# Reorder columns to place 'pos' after 'name'
cols = ['name', 'pos'] + [col for col in aggregated_data.columns if col not in ['name', 'pos']]
aggregated_data = aggregated_data[cols]

# Order by minutes
aggregated_data = aggregated_data.sort_values(by='minutes', ascending=False)

# Save the aggregated data to a new CSV file
aggregated_data.to_csv('aggregated_player_stats.csv', index=False)
