In [1]:
import pandas as pd
from pathlib import Path

# Folder holding the processed tables. The path is relative, so it resolves
# against the kernel's current working directory.
PROCESSED_DIR = Path("../data/processed")

# Load the fact_event and fact_shot tables from PROCESSED_DIR.
fact_event = pd.read_csv(PROCESSED_DIR.joinpath("fact_event.csv"))
fact_shot = pd.read_csv(PROCESSED_DIR.joinpath("fact_shot.csv"))


Build team_match aggregation table

In [4]:
# Per-(match, team) event totals: `events` counts non-null event_id values,
# the other columns sum the corresponding is_* flag columns.
team_match_keys = ["match_id", "team_id"]
team_event_aggs = {
    "events": ("event_id", "count"),
    "passes": ("is_pass", "sum"),
    "shots": ("is_shot", "sum"),
    "carries": ("is_carry", "sum"),
    "pressures": ("is_pressure", "sum"),
}
base_team_match = (
    fact_event.groupby(team_match_keys).agg(**team_event_aggs).reset_index()
)

base_team_match.head()



Unnamed: 0,match_id,team_id,events,passes,shots,carries,pressures
0,3753972,26,1698,455,12,294,236
1,3753972,28,1985,577,17,445,132
2,3753973,33,1976,586,22,394,154
3,3753973,40,1502,377,13,262,154
4,3753974,29,2058,581,20,482,165


In [6]:
# Boolean flag: True only where shot_outcome is exactly "Goal"
# (missing outcomes compare as False).
fact_shot["is_goal"] = fact_shot["shot_outcome"].eq("Goal")


In [7]:
# Attach match_id / team_id to every shot by looking up its event row.
# validate="many_to_one" makes pandas raise MergeError if event_id is
# duplicated in fact_event, instead of silently duplicating shot rows and
# inflating the goal / xg totals aggregated from this frame.
# how="left" keeps shots that have no matching event; their keys are NaN.
shots_with_team = fact_shot.merge(
    fact_event[["event_id", "match_id", "team_id"]],
    on="event_id",
    how="left",
    validate="many_to_one",
)


In [8]:
# Goals (sum of the is_goal flag) and summed shot_statsbomb_xg per
# (match, team), computed from the shot rows.
team_shot_groups = shots_with_team.groupby(["match_id", "team_id"])
shot_aggs = team_shot_groups.agg(
    goals=pd.NamedAgg(column="is_goal", aggfunc="sum"),
    xg=pd.NamedAgg(column="shot_statsbomb_xg", aggfunc="sum"),
).reset_index()


In [9]:
# Left join keeps every (match, team) row from the event totals; pairs with
# no shot rows get NaN for goals and xg.
fact_team_match = pd.merge(
    base_team_match,
    shot_aggs,
    how="left",
    on=["match_id", "team_id"],
)


In [10]:
# (match, team) pairs without any shot rows come out of the left join as NaN;
# treat them as zero goals / zero xg.
fact_team_match[["goals", "xg"]] = fact_team_match[["goals", "xg"]].fillna(0)
# A NaN introduced by the join turns `goals` into a float column, so whether
# it is stored as 2 or 2.0 would depend on the data. Cast it back so it is
# always an integer count.
fact_team_match["goals"] = fact_team_match["goals"].astype("int64")


In [15]:
# Write the team-match table into PROCESSED_DIR; the DataFrame index is not
# written to the file.
team_match_csv = PROCESSED_DIR / "fact_team_match_stats.csv"
fact_team_match.to_csv(team_match_csv, index=False)


Build player_match aggregation table

In [None]:
# Events that have no player_id are excluded before grouping.
player_events = fact_event.dropna(subset=["player_id"])

# Per-(match, player) event totals: `events` counts non-null event_id values,
# the other columns sum the corresponding is_* flag columns.
base_player_match = (
    player_events
    .groupby(["match_id", "player_id"])
    .agg(
        events=pd.NamedAgg(column="event_id", aggfunc="count"),
        passes=pd.NamedAgg(column="is_pass", aggfunc="sum"),
        shots=pd.NamedAgg(column="is_shot", aggfunc="sum"),
        carries=pd.NamedAgg(column="is_carry", aggfunc="sum"),
        pressures=pd.NamedAgg(column="is_pressure", aggfunc="sum"),
    )
    .reset_index()
)
base_player_match.shape


(10450, 7)

In [19]:
# Attach match_id / player_id to every shot by looking up its event row.
# validate="many_to_one" makes pandas raise MergeError if event_id is
# duplicated in fact_event, instead of silently duplicating shot rows and
# inflating the per-player goal / xg totals aggregated from this frame.
# how="left" keeps shots that have no matching event; their keys are NaN.
shots_with_player = fact_shot.merge(
    fact_event[["event_id", "match_id", "player_id"]],
    on="event_id",
    how="left",
    validate="many_to_one",
)


In [20]:
# Goals (sum of the is_goal flag) and summed shot_statsbomb_xg per
# (match, player), computed from the shot rows.
player_shot_specs = {
    "goals": ("is_goal", "sum"),
    "xg": ("shot_statsbomb_xg", "sum"),
}
player_shot_aggs = (
    shots_with_player
    .groupby(["match_id", "player_id"])
    .agg(**player_shot_specs)
    .reset_index()
)


In [21]:
# Left join keeps every (match, player) row from the event totals; players
# with no shot rows in a match get NaN for goals and xg.
fact_player_match = pd.merge(
    base_player_match,
    player_shot_aggs,
    how="left",
    on=["match_id", "player_id"],
)


In [22]:
# (match, player) pairs without any shot rows come out of the left join as
# NaN; treat them as zero goals / zero xg.
fact_player_match[["goals", "xg"]] = (
    fact_player_match[["goals", "xg"]].fillna(0)
)
# A NaN introduced by the join turns `goals` into a float column, so it would
# be written as 0.0 / 1.0. Cast it back so it is always an integer count.
fact_player_match["goals"] = fact_player_match["goals"].astype("int64")


In [26]:
# Write the player-match table into PROCESSED_DIR; the DataFrame index is not
# written to the file.
player_match_csv = PROCESSED_DIR / "fact_player_match_stats.csv"
fact_player_match.to_csv(player_match_csv, index=False)
