In [1]:
import pandas as pd
from pathlib import Path

RAW_DIR = Path("../data/raw")
PROCESSED_DIR = Path("../data/processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)


In [2]:
events = pd.read_csv(RAW_DIR / "events_epl_2015_16.csv")
events.shape


  events = pd.read_csv(RAW_DIR / "events_epl_2015_16.csv")


(1313783, 118)

In [3]:
raw_events = events.copy()

In [4]:
fact_event = events[
    [
        "id",
        "match_id",
        "team_id",
        "player_id",
        "minute",
        "second",
        "period",
        "type",
        "possession",
        "possession_team_id",
        "under_pressure",
        "location"
    ]
].copy()


In [5]:
fact_event = fact_event.rename(columns={"id": "event_id", "type": "event_type"})


In [9]:
fact_event["location"].head(10)
# fact_event["location"].isna().mean()


0             NaN
1             NaN
2             NaN
3             NaN
4             NaN
5             NaN
6    [61.0, 40.1]
7    [60.4, 43.6]
8    [48.0, 41.7]
9    [37.5, 74.6]
Name: location, dtype: object

Lets first transform location

In [10]:
import ast

def extract_xy(loc):
    if isinstance(loc, str):
        try:
            xy = ast.literal_eval(loc)
            if isinstance(xy, list) and len(xy) == 2:
                return xy[0], xy[1]
        except Exception:
            pass
    return None, None


In [11]:
xy = fact_event["location"].apply(extract_xy)

fact_event["x"] = xy.apply(lambda v: v[0])
fact_event["y"] = xy.apply(lambda v: v[1])


In [None]:
fact_event[["location", "x", "y"]].head(10)


In [13]:
fact_event.to_csv(
    PROCESSED_DIR / "fact_event_with_xy.csv",
    index=False
)


In [16]:
fact_event["under_pressure"] = fact_event["under_pressure"].fillna(False).astype(bool)


  fact_event["under_pressure"] = fact_event["under_pressure"].fillna(False).astype(bool)


In [17]:
fact_event[["minute", "second", "period"]].describe()


Unnamed: 0,minute,second,period
count,1313783.0,1313783.0,1313783.0
mean,45.004,29.2509,1.493956
std,27.07731,17.37765,0.4999637
min,0.0,0.0,1.0
25%,21.0,14.0,1.0
50%,45.0,29.0,1.0
75%,68.0,44.0,2.0
max,101.0,59.0,2.0


We classify the events into types, Performance, Admin and Contextual

In [18]:
PERFORMANCE_EVENTS = {
    "Pass",
    "Shot",
    "Carry",
    "Dribble",
    "Pressure",
    "Duel",
    "Interception",
    "Ball Recovery",
    "Clearance",
    "Block",
    "Miscontrol",
    "Dispossessed",
    "Dribbled Past",
    "50/50",
    "Foul Won",
    "Foul Committed"
}

ADMIN_EVENTS = {
    "Starting XI",
    "Substitution",
    "Player On",
    "Player Off",
    "Tactical Shift",
    "Half Start",
    "Half End",
    "Injury Stoppage"
}


In [19]:
def classify_event(event_type):
    if event_type in PERFORMANCE_EVENTS:
        return "performance"
    if event_type in ADMIN_EVENTS:
        return "admin"
    return "context"


In [20]:
fact_event["event_category"] = fact_event["event_type"].apply(classify_event)


In [21]:
fact_event["event_category"].value_counts()


event_category
performance    952496
context        353432
admin            7855
Name: count, dtype: int64

In [22]:
fact_event["is_pass"] = fact_event["event_type"] == "Pass"
fact_event["is_shot"] = fact_event["event_type"] == "Shot"
fact_event["is_carry"] = fact_event["event_type"] == "Carry"
fact_event["is_pressure"] = fact_event["event_type"] == "Pressure"


In [25]:
final_columns = [
    "event_id",
    "match_id",
    "team_id",
    "player_id",
    "period",
    "minute",
    "second",
    "event_type",
    "event_category",
    "possession",
    "possession_team_id",
    "under_pressure",
    "x",
    "y",
    "is_pass",
    "is_shot",
    "is_carry",
    "is_pressure",
]

fact_event_final = fact_event[final_columns].copy()


In [27]:
fact_event_final["period"] = fact_event_final["period"].astype("int8")
fact_event_final["minute"] = fact_event_final["minute"].astype("int16")
fact_event_final["second"] = fact_event_final["second"].astype("int8")

fact_event_final["is_pass"] = fact_event_final["is_pass"].astype(bool)
fact_event_final["is_shot"] = fact_event_final["is_shot"].astype(bool)
fact_event_final["is_carry"] = fact_event_final["is_carry"].astype(bool)
fact_event_final["is_pressure"] = fact_event_final["is_pressure"].astype(bool)


In [28]:
fact_event_final.to_csv(
    PROCESSED_DIR / "fact_event.csv",
    index=False
)


Fact pass

In [29]:
pass_events = events[events["type"] == "Pass"].copy()
pass_events.shape

(368619, 118)

In [30]:
pass_fact = pass_events[
    [
        "id",
        "pass_length",
        "pass_angle",
        "pass_height",
        "pass_outcome",
        "pass_end_location",
        "pass_recipient",
        "pass_type",
    ]
].copy()

pass_fact = pass_fact.rename(columns={"id": "event_id"})


In [31]:
pass_fact.head()


Unnamed: 0,event_id,pass_length,pass_angle,pass_height,pass_outcome,pass_end_location,pass_recipient,pass_type
6,2ca23eea-a984-47e4-8243-8f00880ad1c9,3.551056,1.740575,Ground Pass,,"[60.4, 43.6]",Joshua King,Kick Off
7,0fee7719-7e69-49c5-be81-3f2b77da604e,12.54472,-2.989549,Ground Pass,,"[48.0, 41.7]",Andrew Surman,
8,6362aa69-892f-4d11-8644-21a680ea7c66,35.96679,1.867047,Ground Pass,,"[37.5, 76.1]",Adam Smith,
9,56da36e4-8b0d-4596-ba46-1d944c3d3f04,19.3458,-2.120081,Ground Pass,,"[27.4, 58.1]",Simon Francis,
10,bcfea2e3-9736-4975-be28-ef2c9d693fa7,15.890248,1.06492,Ground Pass,,"[35.1, 77.8]",Adam Smith,


In [36]:
import ast

def extract_end_xy(loc):
    if isinstance(loc, str):
        try:
            xy = ast.literal_eval(loc)
            return xy[0], xy[1]
        except Exception:
            return None, None
    return None, None


In [37]:
end_xy = pass_fact["pass_end_location"].apply(extract_end_xy)

pass_fact["end_x"] = end_xy.apply(lambda v: v[0])
pass_fact["end_y"] = end_xy.apply(lambda v: v[1])


In [39]:
pass_fact = pass_fact.drop(columns=["pass_end_location"])


In [41]:
pass_fact

Unnamed: 0,event_id,pass_length,pass_angle,pass_height,pass_outcome,pass_recipient,pass_type,end_x,end_y
6,2ca23eea-a984-47e4-8243-8f00880ad1c9,3.551056,1.740575,Ground Pass,,Joshua King,Kick Off,60.4,43.6
7,0fee7719-7e69-49c5-be81-3f2b77da604e,12.544720,-2.989549,Ground Pass,,Andrew Surman,,48.0,41.7
8,6362aa69-892f-4d11-8644-21a680ea7c66,35.966790,1.867047,Ground Pass,,Adam Smith,,37.5,76.1
9,56da36e4-8b0d-4596-ba46-1d944c3d3f04,19.345800,-2.120081,Ground Pass,,Simon Francis,,27.4,58.1
10,bcfea2e3-9736-4975-be28-ef2c9d693fa7,15.890248,1.064920,Ground Pass,,Adam Smith,,35.1,77.8
...,...,...,...,...,...,...,...,...,...
1311364,153ee3b7-da4e-454e-a88e-e634ef372e8b,6.806614,-0.044089,Ground Pass,Incomplete,Gylfi Þór Sigurðsson,,72.2,20.8
1311365,86ea2a73-2206-40c5-a4c2-8752aa680db9,7.710382,1.518895,Ground Pass,,Radamel Falcao García Zárate,,53.9,65.6
1311366,3283bedc-834a-404b-8b45-3b7640f5983a,3.298485,-1.815775,Low Pass,Incomplete,,,53.9,63.8
1311367,88aafe86-2436-4bc4-b396-49bb0c928656,10.218122,-0.349607,Low Pass,,Éderzito António Macedo Lopes,,73.2,6.6


In [42]:
final_pass_columns = [
    "event_id",
    "end_x",
    "end_y",
    "pass_length",
    "pass_angle",
    "pass_height",
    "pass_type",
    "pass_outcome",
    "pass_recipient",
]

fact_pass_final = pass_fact[final_pass_columns].copy()


In [44]:
fact_pass_final["pass_length"] = fact_pass_final["pass_length"].astype(float)
fact_pass_final["pass_angle"] = fact_pass_final["pass_angle"].astype(float)


In [45]:
fact_pass_final.to_csv(
    PROCESSED_DIR / "fact_pass.csv",
    index=False
)


Fact Shots

In [46]:
shot_events = events[events["type"] == "Shot"].copy()
shot_events.shape


(9908, 118)

In [47]:
shot_fact = shot_events[
    [
        "id",
        "shot_statsbomb_xg",
        "shot_outcome",
        "shot_body_part",
        "shot_type",
        "shot_technique",
        "shot_first_time",
        "shot_one_on_one",
        "shot_end_location",
    ]
].copy()

shot_fact = shot_fact.rename(columns={"id": "event_id"})


In [54]:
import ast

def extract_end_xyz(loc):
    if isinstance(loc, str):
        try:
            xyz = ast.literal_eval(loc)
            if isinstance(xyz, list):
                if len(xyz) == 3:
                    return xyz[0], xyz[1], xyz[2]
                if len(xyz) == 2:
                    return xyz[0], xyz[1], None
        except Exception:
            pass
    return None, None, None


In [55]:
end_xyz = shot_fact["shot_end_location"].apply(extract_end_xyz)

shot_fact["end_x"] = end_xyz.apply(lambda v: v[0])
shot_fact["end_y"] = end_xyz.apply(lambda v: v[1])
shot_fact["end_z"] = end_xyz.apply(lambda v: v[2])


In [58]:
shot_fact = shot_fact.drop(columns=["shot_end_location"])


In [59]:
final_shot_columns = [
    "event_id",
    "shot_statsbomb_xg",
    "shot_outcome",
    "shot_body_part",
    "shot_type",
    "shot_technique",
    "shot_first_time",
    "shot_one_on_one",
    "end_x",
    "end_y",
    "end_z",
]

fact_shot_final = shot_fact[final_shot_columns].copy()


In [61]:
fact_shot_final["shot_statsbomb_xg"] = fact_shot_final["shot_statsbomb_xg"].astype(float)

fact_shot_final["shot_first_time"] = (
    fact_shot_final["shot_first_time"].fillna(False).astype(bool)
)
fact_shot_final["shot_one_on_one"] = (
    fact_shot_final["shot_one_on_one"].fillna(False).astype(bool)
)


  fact_shot_final["shot_first_time"].fillna(False).astype(bool)
  fact_shot_final["shot_one_on_one"].fillna(False).astype(bool)


In [62]:
fact_shot_final.to_csv(
    PROCESSED_DIR / "fact_shot.csv",
    index=False
)
