## Packages and configuration

In [1]:
from statsbombpy import sb
import pandas as pd
from mplsoccer import VerticalPitch,Pitch
from highlight_text import ax_text, fig_text
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.pyplot as plt
import matplotlib.patheffects as path_effects
import seaborn as sns
import pprint

## Load Competition, Match, and Event Data from statsbombpy

In [2]:
# Show every column and let pandas auto-detect the display width
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

# Fetch all free competitions from the statsbombpy API, then keep the women's ones
free_comps = sb.competitions()
women_comps = free_comps.query("competition_gender == 'female'")
women_comps

Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_gender,competition_youth,competition_international,season_name,match_updated,match_updated_360,match_available_360,match_available
25,37,90,England,FA Women's Super League,female,False,False,2020/2021,2025-04-23T14:16:46.924831,2021-06-13T16:17:31.694,,2025-04-23T14:16:46.924831
26,37,42,England,FA Women's Super League,female,False,False,2019/2020,2024-02-12T15:05:34.211400,2021-06-13T16:17:31.694,,2024-02-12T15:05:34.211400
27,37,4,England,FA Women's Super League,female,False,False,2018/2019,2024-08-07T17:22:40.334287,2021-06-13T16:17:31.694,,2024-08-07T17:22:40.334287
63,49,3,United States of America,NWSL,female,False,False,2018,2024-12-15T12:31:48.035735,2021-06-13T16:17:31.694,,2024-12-15T12:31:48.035735
71,53,315,Europe,UEFA Women's Euro,female,False,True,2025,2025-07-28T14:19:20.467348,2025-07-29T16:03:07.355174,2025-07-29T16:03:07.355174,2025-07-28T14:19:20.467348
72,53,106,Europe,UEFA Women's Euro,female,False,True,2022,2024-02-13T13:27:17.178263,2024-02-13T13:30:52.820588,2024-02-13T13:30:52.820588,2024-02-13T13:27:17.178263
73,72,107,International,Women's World Cup,female,False,True,2023,2025-07-14T10:07:06.620906,2025-07-14T10:10:27.224586,2025-07-14T10:10:27.224586,2025-07-14T10:07:06.620906
74,72,30,International,Women's World Cup,female,False,True,2019,2024-08-08T15:57:56.748740,2021-06-13T16:17:31.694,,2024-08-08T15:57:56.748740


## EUROS

In [3]:
# Show every column and let pandas auto-detect the display width
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

# competition_id 53 / season_id 315 is the "UEFA Women's Euro" 2025 row
# of the competitions table displayed earlier
matches_euros = sb.matches(season_id=315, competition_id=53)
matches_euros.head(n=2)

Unnamed: 0,match_id,match_date,kick_off,competition,season,home_team,away_team,home_score,away_score,match_status,match_status_360,last_updated,last_updated_360,match_week,competition_stage,stadium,referee,home_managers,away_managers,data_version,shot_fidelity_version,xy_fidelity_version
0,4020846,2025-07-27,16:00:00.000,Europe - UEFA Women's Euro,2025,England Women's,Spain Women's,1,1,available,available,2025-07-28T14:19:20.467348,2025-07-29T16:03:07.355174,6,Final,St. Jakob-Park,Stéphanie Frappart,Sarina Glotzbach-Wiegman,Montserrat Tomé Vázquez,1.1.0,2,2
1,4020077,2025-07-23,19:00:00.000,Europe - UEFA Women's Euro,2025,Germany Women's,Spain Women's,0,1,available,available,2025-07-24T19:44:48.774783,2025-07-25T15:22:27.432293,5,Semi-finals,Stadion Letzigrund,Edina Alves Batista,Christian Richard Wück,Montserrat Tomé Vázquez,1.1.0,2,2


### EUROS final - events

In [4]:
# Isolate the first row whose stage is 'Final' and keep its match id
final_match = matches_euros.loc[matches_euros['competition_stage'].eq('Final')].iloc[0]
final_match_id = final_match.loc['match_id']

# Download the event data for that match
events_df = sb.events(match_id=final_match_id)

In [5]:
# Keep only shot events; .copy() so that adding a column below works on an
# independent frame rather than on a slice of events_df
shots_df = events_df.loc[events_df['type'] == 'Shot'].copy()

# Mark goals as 1/0 with a vectorised comparison instead of a per-row lambda.
# A missing shot_outcome compares unequal to 'Goal' and is flagged 0, as before,
# and the column is int64 even when there are no shots at all.
shots_df['goal'] = shots_df['shot_outcome'].eq('Goal').astype('int64')

In [6]:
# LINEUP

# Take the first two 'Starting XI' events (presumably one per team — verify)
xi_rows = events_df[events_df['type'].eq('Starting XI')].head(2)

# Collects one expanded lineup frame per Starting XI event
lineups = []

for _, xi_event in xi_rows.iterrows():
    # One row per entry of the event's tactics['lineup'] list
    raw_lineup = pd.DataFrame(xi_event['tactics']['lineup'])

    lineups.append(
        raw_lineup.assign(
            # Unpack the nested player / position dicts into flat id + name columns
            player_id=raw_lineup['player'].map(lambda entry: entry['id']),
            player=raw_lineup['player'].map(lambda entry: entry['name']),
            position_id=raw_lineup['position'].map(lambda entry: entry['id']),
            position=raw_lineup['position'].map(lambda entry: entry['name']),
            # Tag every row with the team of this Starting XI event
            team=xi_event['team'],
        )
    )

# Stack the per-team frames into one DataFrame with a fresh index
lineup_df = pd.concat(lineups, ignore_index=True)

lineup_df_euros = lineup_df

In [7]:
# Exclude the penalty shoot-out by period, not by clock. StatsBomb's event
# spec numbers the shoot-out as period 5; the previous `minute < 120` filter
# also threw away open-play shots taken in stoppage time at the end of extra
# time (minute >= 120, period 4). In-game penalties are still included.
# The variable name is kept because the team-summary cell further down reads it.
shots_up_to_120 = shots_df[shots_df['period'] < 5]

# Per player: number of shots, total xG, average xG per shot and goals,
# ordered by total xG (highest first)
player_xg_summary = (
    shots_up_to_120
    .groupby(['player', 'team'])
    .agg(
        shots=('shot_statsbomb_xg', 'count'),
        total_xg=('shot_statsbomb_xg', 'sum'),
        xg_per_shot=('shot_statsbomb_xg', 'mean'),
        goals=('goal', 'sum'),
    )
    .sort_values('total_xg', ascending=False)
    .reset_index()
)

# Attach position and jersey number from the starting lineups. The left join
# keeps shooters that are not in lineup_df_euros (their lineup columns are NaN).
player_xg_summary = player_xg_summary.merge(
    lineup_df_euros[['player', 'team', 'position', 'jersey_number']],
    on=['player', 'team'],
    how='left',
)

# Shooters without a Starting XI entry are labelled 'Sub'; -1 is a placeholder
# jersey number so the column can be cast back to int
player_xg_summary['position'] = player_xg_summary['position'].fillna('Sub')
player_xg_summary['jersey_number'] = player_xg_summary['jersey_number'].fillna(-1).astype(int)

# Reorder columns for display
player_xg_summary = player_xg_summary[
    ['player', 'team', 'position', 'jersey_number', 'shots', 'total_xg', 'xg_per_shot', 'goals']
]

player_xg_summary_euros = player_xg_summary
player_xg_summary_euros

Unnamed: 0,player,team,position,jersey_number,shots,total_xg,xg_per_shot,goals
0,María Francesca Caldentey Oliver,Spain Women's,Left Wing,8,3,0.547321,0.18244,1
1,Salma Paralluelo Ayingono,Spain Women's,Sub,-1,3,0.441645,0.147215,0
2,Lauren Hemp,England Women's,Right Wing,11,1,0.33025,0.33025,0
3,Esther Gonzalez Rodríguez,Spain Women's,Center Forward,9,3,0.304451,0.101484,0
4,Alessia Russo,England Women's,Center Forward,23,2,0.277606,0.138803,1
5,Victoria López,Spain Women's,Sub,-1,3,0.249601,0.0832,0
6,Aitana Bonmati Conca,Spain Women's,Right Center Midfield,6,4,0.216761,0.05419,0
7,Claudia Pina Medina,Spain Women's,Sub,-1,2,0.138034,0.069017,0
8,Athenea del Castillo Belvide,Spain Women's,Right Wing,10,3,0.115528,0.038509,0
9,Chloe Kelly,England Women's,Sub,-1,2,0.108643,0.054321,0


In [8]:
# xG accumulated by players labelled 'Sub'
subs_xg = player_xg_summary_euros.query("position == 'Sub'")['total_xg'].sum()

# xG accumulated by every other position
starters_xg = player_xg_summary_euros.query("position != 'Sub'")['total_xg'].sum()

# Two-row frame so both totals can be read side by side
xg_split = pd.DataFrame(
    [('Subs', subs_xg), ('Starters/Other', starters_xg)],
    columns=['Category', 'Total_xG'],
)

xg_split

Unnamed: 0,Category,Total_xG
0,Subs,0.937922
1,Starters/Other,2.080566


In [9]:
# Per team: number of shots, total xG, goals and average xG per shot,
# ordered by total xG (highest first)
team_xg_summary = (
    shots_up_to_120
    .groupby('team')
    .agg(
        shots=('shot_statsbomb_xg', 'count'),
        total_xg=('shot_statsbomb_xg', 'sum'),
        goals=('goal', 'sum'),
        xg_per_shot=('shot_statsbomb_xg', 'mean'),
    )
    .sort_values(by='total_xg', ascending=False)
)

team_xg_summary

Unnamed: 0_level_0,shots,total_xg,goals,xg_per_shot
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Spain Women's,23,2.13598,1,0.092869
England Women's,8,0.882508,1,0.110313


## WORLDS

In [10]:
# Show every column and let pandas auto-detect the display width
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

# competition_id 72 / season_id 107 is the "Women's World Cup" 2023 row
# of the competitions table displayed earlier
matches_worlds = sb.matches(season_id=107, competition_id=72)
matches_worlds.head(n=2)

Unnamed: 0,match_id,match_date,kick_off,competition,season,home_team,away_team,home_score,away_score,match_status,match_status_360,last_updated,last_updated_360,match_week,competition_stage,stadium,referee,home_managers,away_managers,data_version,shot_fidelity_version,xy_fidelity_version
0,3904629,2023-08-16,13:00:00.000,International - Women's World Cup,2023,Australia Women's,England Women's,1,3,available,available,2023-08-30T11:15:11.306289,2023-08-30T11:17:47.551826,6,Semi-finals,Accor Stadium,Tori Penso,Tony Gustavsson,Sarina Glotzbach-Wiegman,1.1.0,2,2
1,3906390,2023-08-20,13:00:00.000,International - Women's World Cup,2023,Spain Women's,England Women's,1,0,available,available,2023-08-22T19:29:29.948278,2023-08-22T19:38:43.965521,7,Final,Accor Stadium,Tori Penso,Jorge Vilda,Sarina Glotzbach-Wiegman,1.1.0,2,2


In [11]:
# Isolate the first row whose stage is 'Final' and keep its match id
final_match = matches_worlds.loc[matches_worlds['competition_stage'].eq('Final')].iloc[0]
final_match_id = final_match.loc['match_id']

# Download the event data for that match
events_df = sb.events(match_id=final_match_id)

In [12]:
# Keep only shot events; .copy() so that adding a column below works on an
# independent frame rather than on a slice of events_df
shots_df = events_df.loc[events_df['type'] == 'Shot'].copy()

# Mark goals as 1/0 with a vectorised comparison instead of a per-row lambda.
# A missing shot_outcome compares unequal to 'Goal' and is flagged 0, as before,
# and the column is int64 even when there are no shots at all.
shots_df['goal'] = shots_df['shot_outcome'].eq('Goal').astype('int64')

In [13]:
# LINEUP

# Take the first two 'Starting XI' events (presumably one per team — verify)
xi_rows = events_df[events_df['type'].eq('Starting XI')].head(2)

# Collects one expanded lineup frame per Starting XI event
lineups = []

for _, xi_event in xi_rows.iterrows():
    # One row per entry of the event's tactics['lineup'] list
    raw_lineup = pd.DataFrame(xi_event['tactics']['lineup'])

    lineups.append(
        raw_lineup.assign(
            # Unpack the nested player / position dicts into flat id + name columns
            player_id=raw_lineup['player'].map(lambda entry: entry['id']),
            player=raw_lineup['player'].map(lambda entry: entry['name']),
            position_id=raw_lineup['position'].map(lambda entry: entry['id']),
            position=raw_lineup['position'].map(lambda entry: entry['name']),
            # Tag every row with the team of this Starting XI event
            team=xi_event['team'],
        )
    )

# Stack the per-team frames into one DataFrame with a fresh index
lineup_df = pd.concat(lineups, ignore_index=True)

lineup_df_worlds = lineup_df

In [14]:
# Exclude the penalty shoot-out by period, not by clock. StatsBomb's event
# spec numbers the shoot-out as period 5; the previous `minute < 120` filter
# would also throw away open-play shots taken in stoppage time at the end of
# extra time (minute >= 120, period 4). In-game penalties are still included.
# The variable name is kept because the team-summary cell further down reads it.
shots_up_to_120 = shots_df[shots_df['period'] < 5]

# Per player: number of shots, total xG, average xG per shot and goals,
# ordered by total xG (highest first)
player_xg_summary = (
    shots_up_to_120
    .groupby(['player', 'team'])
    .agg(
        shots=('shot_statsbomb_xg', 'count'),
        total_xg=('shot_statsbomb_xg', 'sum'),
        xg_per_shot=('shot_statsbomb_xg', 'mean'),
        goals=('goal', 'sum'),
    )
    .sort_values('total_xg', ascending=False)
    .reset_index()
)

# Attach position and jersey number from the starting lineups. The left join
# keeps shooters that are not in lineup_df_worlds (their lineup columns are NaN).
player_xg_summary = player_xg_summary.merge(
    lineup_df_worlds[['player', 'team', 'position', 'jersey_number']],
    on=['player', 'team'],
    how='left',
)

# Shooters without a Starting XI entry are labelled 'Sub'; -1 is a placeholder
# jersey number so the column can be cast back to int
player_xg_summary['position'] = player_xg_summary['position'].fillna('Sub')
player_xg_summary['jersey_number'] = player_xg_summary['jersey_number'].fillna(-1).astype(int)

# Reorder columns for display
player_xg_summary = player_xg_summary[
    ['player', 'team', 'position', 'jersey_number', 'shots', 'total_xg', 'xg_per_shot', 'goals']
]

player_xg_summary_worlds = player_xg_summary
player_xg_summary_worlds

Unnamed: 0,player,team,position,jersey_number,shots,total_xg,xg_per_shot,goals
0,Jennifer Hermoso Fuentes,Spain Women's,Center Attacking Midfield,10,2,0.843641,0.421821,0
1,Salma Paralluelo Ayingono,Spain Women's,Center Forward,18,3,0.503586,0.167862,0
2,Alba María Redondo Ferrer,Spain Women's,Right Wing,17,2,0.471368,0.235684,0
3,Lauren Hemp,England Women's,Left Center Forward,11,4,0.397953,0.099488,0
4,Alexia Putellas Segura,Spain Women's,Sub,-1,1,0.098619,0.098619,0
5,Aitana Bonmati Conca,Spain Women's,Right Center Midfield,6,2,0.091389,0.045694,0
6,Olga Carmona García,Spain Women's,Left Back,19,1,0.054861,0.054861,1
7,Millie Bright,England Women's,Center Back,6,2,0.0507,0.02535,0
8,Irene Paredes Hernandez,Spain Women's,Right Center Back,4,1,0.0498,0.0498,0
9,María Francesca Caldentey Oliver,Spain Women's,Left Wing,8,1,0.046465,0.046465,0


In [15]:
# xG accumulated by players labelled 'Sub'
subs_xg = player_xg_summary_worlds.query("position == 'Sub'")['total_xg'].sum()

# xG accumulated by every other position
starters_xg = player_xg_summary_worlds.query("position != 'Sub'")['total_xg'].sum()

# Two-row frame so both totals can be read side by side
xg_split = pd.DataFrame(
    [('Subs', subs_xg), ('Starters/Other', starters_xg)],
    columns=['Category', 'Total_xG'],
)

xg_split

Unnamed: 0,Category,Total_xG
0,Subs,0.112913
1,Starters/Other,2.550889


In [16]:
# Per team: number of shots, total xG, goals and average xG per shot,
# ordered by total xG (highest first)
team_xg_summary = (
    shots_up_to_120
    .groupby('team')
    .agg(
        shots=('shot_statsbomb_xg', 'count'),
        total_xg=('shot_statsbomb_xg', 'sum'),
        goals=('goal', 'sum'),
        xg_per_shot=('shot_statsbomb_xg', 'mean'),
    )
    .sort_values(by='total_xg', ascending=False)
)

team_xg_summary

Unnamed: 0_level_0,shots,total_xg,goals,xg_per_shot
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Spain Women's,14,2.194123,1,0.156723
England Women's,8,0.469679,0,0.05871
