## Packages and configuration

In [3]:
from statsbombpy import sb
import pandas as pd
from mplsoccer import VerticalPitch,Pitch
from highlight_text import ax_text, fig_text
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.pyplot as plt
import matplotlib.patheffects as path_effects
import seaborn as sns
import pprint
import numpy as np

## Load Competition, Match, and Event Data from statsbombpy

In [4]:
# Lift pandas' display limits on column count and line width so wide frames
# render in full.
pd.set_option("display.max_columns", None, "display.width", None)

# Pull the competition listing from statsbombpy, then keep only the rows
# whose competition_gender is "female".
free_comps = sb.competitions()
women_comps = free_comps.loc[free_comps["competition_gender"].eq("female")]
women_comps

Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_gender,competition_youth,competition_international,season_name,match_updated,match_updated_360,match_available_360,match_available
25,37,90,England,FA Women's Super League,female,False,False,2020/2021,2025-04-23T14:16:46.924831,2021-06-13T16:17:31.694,,2025-04-23T14:16:46.924831
26,37,42,England,FA Women's Super League,female,False,False,2019/2020,2024-02-12T15:05:34.211400,2021-06-13T16:17:31.694,,2024-02-12T15:05:34.211400
27,37,4,England,FA Women's Super League,female,False,False,2018/2019,2024-08-07T17:22:40.334287,2021-06-13T16:17:31.694,,2024-08-07T17:22:40.334287
63,49,3,United States of America,NWSL,female,False,False,2018,2024-12-15T12:31:48.035735,2021-06-13T16:17:31.694,,2024-12-15T12:31:48.035735
71,53,315,Europe,UEFA Women's Euro,female,False,True,2025,2025-07-28T14:19:20.467348,2025-07-29T16:03:07.355174,2025-07-29T16:03:07.355174,2025-07-28T14:19:20.467348
72,53,106,Europe,UEFA Women's Euro,female,False,True,2022,2024-02-13T13:27:17.178263,2024-02-13T13:30:52.820588,2024-02-13T13:30:52.820588,2024-02-13T13:27:17.178263
73,72,107,International,Women's World Cup,female,False,True,2023,2025-07-14T10:07:06.620906,2025-07-14T10:10:27.224586,2025-07-14T10:10:27.224586,2025-07-14T10:07:06.620906
74,72,30,International,Women's World Cup,female,False,True,2019,2024-08-08T15:57:56.748740,2021-06-13T16:17:31.694,,2024-08-08T15:57:56.748740


## All women's matches

In [5]:
# One sb.matches() request per (competition_id, season_id) row of women_comps.
all_matches = [
    sb.matches(competition_id=competition_id, season_id=season_id)
    for competition_id, season_id in zip(
        women_comps["competition_id"], women_comps["season_id"]
    )
]

# Stack the per-season frames into a single table with a fresh 0..n-1 index.
matches_df = pd.concat(all_matches, ignore_index=True)

In [10]:
matches_df.shape[0]  # row count of the combined matches table

540

## All events, passes only

In [None]:
# Fetch the event table of every match and keep only the rows typed "Pass".
all_passes = []
for match_id in matches_df["match_id"]:
    match_events = sb.events(match_id=match_id)
    is_pass = match_events["type"].eq("Pass")
    # .copy() so each stored frame is independent of the full event table.
    all_passes.append(match_events.loc[is_pass].copy())

# Combine the per-match pass frames under one continuous index.
passes_df = pd.concat(all_passes, ignore_index=True)


## Feature engineering

In [None]:
def _flag(frame, column):
    """Return a 0/1 integer Series that is 1 where ``frame[column]`` is True.

    Missing values count as 0. A column that is absent from ``frame``
    altogether yields all zeros instead of raising ``KeyError``, so the cell
    also runs on a sample of events in which a flag column never appears.
    """
    if column not in frame.columns:
        return pd.Series(0, index=frame.index, dtype=int)
    # Comparing against True maps NaN straight to False; this avoids calling
    # fillna(False) on an object-dtype column, which recent pandas versions
    # flag with a silent-downcasting FutureWarning.
    return frame[column].eq(True).astype(int)


# Work on a copy so passes_df stays untouched and this cell can be re-run.
df = passes_df.copy()

# target: a pass with no recorded pass_outcome is treated as completed (1)
df["completed"] = df["pass_outcome"].isna().astype(int)

# locations: first and second element of each coordinate pair
# (presumably [x, y] pitch coordinates -- confirm against the data spec)
df["start_x"] = df["location"].str[0]
df["start_y"] = df["location"].str[1]
df["end_x"] = df["pass_end_location"].str[0]
df["end_y"] = df["pass_end_location"].str[1]

# geometry
df["dx"] = df["end_x"] - df["start_x"]
df["dy"] = df["end_y"] - df["start_y"]
df["distance"] = np.sqrt(df["dx"]**2 + df["dy"]**2)
# forward = the pass ends at a larger x than it started; a NaN dx gives 0 here,
# but such rows also have a NaN distance and are dropped from df_model below.
df["forward"] = (df["dx"] > 0).astype(int)

# pressure & intent: 1 where the source flag is True, 0 otherwise
df["under_pressure"] = _flag(df, "under_pressure")
df["through_ball"] = _flag(df, "pass_through_ball")
df["cross"] = _flag(df, "pass_cross")
df["switch"] = _flag(df, "pass_switch")
df["high_pass"] = (df["pass_height"] == "High Pass").astype(int)

# Model inputs, in the order they are listed here.
features = [
    "distance",
    "forward",
    "under_pressure",
    "high_pass",
    "through_ball",
    "cross",
    "switch"
]

# Keep only rows where every feature and the target are present.
df_model = df.dropna(subset=features + ["completed"])

## Train global xP model