In [None]:
#| default_exp  Feature.pressurelines

# Dynamic Pressure Lines Feature

In this module, we compute the dynamic pressure lines feature described in the paper [A framework for the fine-grained evaluation of the instantaneous expected value of soccer possessions](https://link.springer.com/article/10.1007/s10994-021-05989-6).

In [None]:
#| export 

import pandas as pd
import numpy as np
import collections
import math 
from sklearn.cluster import KMeans

In [None]:
# Location of the tracking CSV used by the example cells below.
# NOTE(review): this is an absolute, machine-specific path — point it at your own copy of the data.
tracking_path = "/home/islam/Downloads/data/skillCorner_tracking_df.csv"

In [None]:
# Load the whole tracking table; low_memory=False reads the file in a single pass
# instead of chunk-wise dtype inference.
tracking_df = pd.read_csv(tracking_path,low_memory=False)

In [None]:
# Example input: the row(s) of the tracking table whose frameId is 741.
frame = tracking_df[(tracking_df["frameId"] == 741)]

# Cluster

To compute this feature, we use KMeans clustering. For each frame, we compute the vertical and horizontal pressure lines of the opponent team. 

To do so, we identify for each frame:
+ opponent team (`home` or `away`)
+ ball coordinates (`x`,`y`)
+ `cluster input`

The `cluster input` has two elements:
+ the coordinates (`x`,`y`) of the opponent team's players
+ the ids of the opponent team's players

After that, we apply the clustering and sort the resulting clusters with these steps:
+ compute the centroid of each cluster
+ compute the distance between each centroid and the ball
+ sort the clusters from the centroid nearest to the ball to the farthest

In [None]:
#| export 

def _cluster(frame, n_clusters=3, random_state=0):
    """Cluster the out-of-possession team's players into pressure lines.

    KMeans is run twice on the same set of players: once on the x coordinate
    only (vertical pressure lines) and once on the y coordinate only
    (horizontal pressure lines). In both runs the resulting clusters are
    ordered by the distance between the cluster centroid (computed from the
    real x/y positions) and the ball, nearest first.

    Parameters
    ----------
    frame : pd.DataFrame
        Tracking data of a single frame. Only the first row is read. It must
        provide ``possessionSide``, ``ball_x``, ``ball_y`` and, for every
        ``<team>..._x`` column, the matching ``..._y`` and ``..._player_id``
        columns.
    n_clusters : int, default 3
        Number of pressure lines to extract in each direction.
    random_state : int or None, default 0
        Seed forwarded to ``KMeans`` so that repeated calls on the same frame
        give the same lines. Pass ``None`` for non-deterministic
        initialisation.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(vertical_lines, horizontal_lines)``. Each list holds one
        comma-separated string of player ids per cluster, ordered from the
        cluster nearest to the ball to the farthest.

    Raises
    ------
    KeyError
        If a required column is missing from ``frame``.
    ValueError
        Raised by ``KMeans`` when fewer players than ``n_clusters`` have
        usable coordinates.
    """

    def _input(row, columns, jerseyn_goal="1"):
        """Return (coords, player_ids) for the team that is NOT in possession."""
        opponent_team = "away" if row["possessionSide"] == "home" else "home"
        # The player wearing `jerseyn_goal` is left out of the clustering.
        # NOTE(review): presumably this is the goalkeeper — confirm that the
        # goalkeeper always wears this jersey number in the data.
        excluded_x = opponent_team + "_player_" + jerseyn_goal + "_x"

        coords, player_ids = [], []
        for x_col in columns:
            if not (x_col.startswith(opponent_team) and x_col.endswith("_x")):
                continue
            if x_col == excluded_x:
                continue

            # Derive sibling column names from the stem so that only the
            # trailing "_x" is swapped, never an "x" elsewhere in the name.
            stem = x_col[:-len("_x")]
            x_val, y_val = row[x_col], row[stem + "_y"]

            # Players without a tracked position in this frame are skipped;
            # pairing x and y by name keeps coordinates and ids aligned.
            if pd.isna(x_val) or pd.isna(y_val):
                continue

            coords.append((x_val, y_val))
            player_ids.append(row[stem + "_player_id"])

        return np.array(coords, dtype=float).reshape(-1, 2), player_ids

    def _sorted_lines(labels, coords, player_ids, ball_coord):
        """Group players by cluster label and order clusters by ball distance."""
        members = collections.defaultdict(list)
        for idx, label in enumerate(labels):
            members[label].append(idx)

        def _ball_dist(label):
            # Centroid uses the true 2-D positions, not the projected ones.
            cx, cy = coords[members[label]].mean(axis=0)
            return math.hypot(ball_coord[0] - cx, ball_coord[1] - cy)

        ordered_labels = sorted(members, key=_ball_dist)

        # str() keeps the join working when ids are not stored as strings.
        return [",".join(str(player_ids[idx]) for idx in members[label])
                for label in ordered_labels]

    def _pressure_lines(coords, player_ids, ball_coord, Vpl=True):
        """Run KMeans on one axis of `coords` and return the sorted lines."""
        X = np.array(coords, copy=True)
        if Vpl:
            # vertical pressure lines: cluster on x only
            X[:, 1] = 0
        else:
            # horizontal pressure lines: cluster on y only
            X[:, 0] = 0

        # n_init is pinned because its default differs between scikit-learn
        # versions; random_state makes the result reproducible.
        km = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
        labels = km.fit_predict(X)

        return _sorted_lines(labels, coords, player_ids, ball_coord)

    first_row = frame.iloc[0]
    ball_coord = [first_row["ball_x"], first_row["ball_y"]]

    cluster_input, player_ids = _input(first_row, frame.columns)

    vpl_cluster = _pressure_lines(cluster_input, player_ids, ball_coord)
    hpl_cluster = _pressure_lines(cluster_input, player_ids, ball_coord, Vpl=False)

    return vpl_cluster, hpl_cluster

In [None]:
#| export 

def Pressurelines_feature(frame: pd.DataFrame) -> pd.DataFrame:
    """Build the pressure-lines feature table for one tracking frame.

    Parameters
    ----------
    frame : pd.DataFrame
        Tracking data of a single frame; the first row supplies ``frameId``
        and ``possessionSide``.

    Returns
    -------
    pd.DataFrame
        One row with ``frameId`` and the three vertical / three horizontal
        pressure lines (comma-separated player ids, nearest line to the ball
        first). When ``frame`` is empty or its possession side is
        ``"unknown"``, an empty DataFrame with the same columns is returned.
    """
    columns = [
        "frameId",
        "vPressureline1", "vPressureline2", "vPressureline3",
        "hPressureline1", "hPressureline2", "hPressureline3",
    ]

    # Without data, or without knowing who has the ball, the opponent team
    # cannot be determined: return an empty table instead of failing on an
    # unassigned `row`.
    if frame.empty or frame.iloc[0]["possessionSide"] == "unknown":
        return pd.DataFrame(columns=columns)

    vpl, hpl = _cluster(frame)

    row = dict(frameId=frame.iloc[0]["frameId"],
               vPressureline1=vpl[0],
               vPressureline2=vpl[1],
               vPressureline3=vpl[2],
               hPressureline1=hpl[0],
               hPressureline2=hpl[1],
               hPressureline3=hpl[2])

    return pd.DataFrame(row, index=[0])


In [None]:
# Worked example: pressure lines of the frame selected above (rendered as the cell output).
Pressurelines_feature(frame)

Unnamed: 0,frameId,vPressureline1,vPressureline2,vPressureline3,hPressureline1,hPressureline2,hPressureline3
0,741,d852020e305114abbdef2c6680626fd9743629a930f194...,4fdd6d44f27fccd6926b99b66b96ac31acb58d78aa3c5a...,27a873ab8064d003025d1809775a404ea6d66f3ce24ea8...,957e96878335293e1b2d97a906ccf182c725390214158f...,27a873ab8064d003025d1809775a404ea6d66f3ce24ea8...,4fdd6d44f27fccd6926b99b66b96ac31acb58d78aa3c5a...
