# Software Dependencies

- pandas: `pip install pandas`
- numpy: `pip install numpy`
- matplotlib: `pip install matplotlib`
- graph-tool: `conda install -c conda-forge graph-tool` ([doc](https://git.skewed.de/count0/graph-tool/-/wikis/installation-instructions))

# Initialization

In [2]:
import numpy as np
import pandas as pd
from typing import List

# Locations of the three CSV inputs, relative to the working directory.
data_matches_path, data_passings_path, data_events_path = (
    'data/matches.csv',
    'data/passingevents.csv',
    'data/fullevents.csv',
)

# Read each CSV into its own DataFrame (matches, passing events, all events).
matches_df, passings_df, events_df = (
    pd.read_csv(csv_path)
    for csv_path in (data_matches_path, data_passings_path, data_events_path)
)


# convert 'win', 'tie' and 'lose' to int
def outcome_int_map(x: str):
    """Encode a textual match outcome as an integer.

    Args:
        x: Outcome label.

    Returns:
        1 for 'win', 0 for 'tie', and -1 for every other value.
    """
    if x == 'win':
        return 1
    return 0 if x == 'tie' else -1


# Replace the textual outcome by its integer code (1 / 0 / -1).
matches_df['Outcome'] = matches_df['Outcome'].apply(outcome_int_map)

# Attach each match's scores and outcome to every event of that match.
# `DataFrame.join(other, on=...)` matches the caller's column against the
# INDEX of `other`, so `matches_df` is indexed by 'MatchID' first. Joining
# against its positional index would only line up if every MatchID happened
# to equal the row position of its match.
all_events = events_df.join(
    matches_df.set_index('MatchID')[['OwnScore', 'OpponentScore', 'Outcome']],
    on='MatchID',
    how='outer')
# Huskies passes only.
huskies_passes = all_events[(all_events['TeamID'] == 'Huskies')
                            & (all_events['EventType'] == 'Pass')]
# Every Huskies event, whatever its type.
huskies_events = all_events[all_events['TeamID'] == 'Huskies']

# Sorted, de-duplicated IDs of every player appearing as origin or destination
# of a Huskies event. Missing IDs are filtered out before de-duplication, so
# this works whether or not any ID is missing (removing a literal 'nan' entry
# afterwards would raise ValueError when none is).
huskies_player_ids: List[str] = np.unique([
    str(player_id)
    for player_id in (huskies_events['OriginPlayerID'].tolist() +
                      huskies_events['DestinationPlayerID'].tolist())
    if pd.notna(player_id)
]).tolist()

# Sorted unique match identifiers.
match_ids = np.unique(matches_df['MatchID'])

# Attack Index

In [1]:
# Locations of the three CSV inputs, relative to the working directory.
data_matches_path, data_passings_path, data_events_path = (
    'data/matches.csv',
    'data/passingevents.csv',
    'data/fullevents.csv',
)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

import matplotlib as mpl
# Set the default resolution of every figure to 120 dpi.
mpl.rcParams.update({'figure.dpi': 120})

In [3]:
# Read each CSV into its own DataFrame (matches, passing events, all events).
matches_df, passings_df, events_df = (
    pd.read_csv(csv_path)
    for csv_path in (data_matches_path, data_passings_path, data_events_path)
)


# convert 'win', 'tie' and 'lose' to int
def outcome_int_map(x: str):
    """Encode a textual match outcome as an integer.

    Args:
        x: Outcome label.

    Returns:
        1 for 'win', 0 for 'tie', and -1 for every other value.
    """
    if x == 'win':
        return 1
    return 0 if x == 'tie' else -1


# Replace the textual outcome by its integer code (1 / 0 / -1).
matches_df['Outcome'] = matches_df['Outcome'].apply(outcome_int_map)

# Attach each match's scores and outcome to every event of that match.
# `DataFrame.join(other, on=...)` matches the caller's column against the
# INDEX of `other`, so `matches_df` is indexed by 'MatchID' first. Joining
# against its positional index would only line up if every MatchID happened
# to equal the row position of its match.
all_events = events_df.join(
    matches_df.set_index('MatchID')[['OwnScore', 'OpponentScore', 'Outcome']],
    on='MatchID',
    how='outer')
# Huskies passes only.
huskies_passes = all_events[(all_events['TeamID'] == 'Huskies')
                            & (all_events['EventType'] == 'Pass')]
# Every Huskies event, whatever its type.
huskies_events = all_events[all_events['TeamID'] == 'Huskies']

In [4]:
# Sorted, de-duplicated IDs of every player appearing as origin or destination
# of a Huskies event. Missing IDs are filtered out before de-duplication, so
# this works whether or not any ID is missing (removing a literal 'nan' entry
# afterwards would raise ValueError when none is).
huskies_player_ids = np.unique([
    str(player_id)
    for player_id in (huskies_events['OriginPlayerID'].tolist() +
                      huskies_events['DestinationPlayerID'].tolist())
    if pd.notna(player_id)
]).tolist()

# Sorted unique match identifiers.
match_ids = np.unique(matches_df['MatchID'])

# Compute metrics for a network

*Defining a historic football team: Using NetworkScience to analyze Guardiola’s F.C. Barcelona*

1. Number of passes

In [8]:
def get_n_passes(match_id: int):
    """Count the rows of `passings_df` recorded for the Huskies in one match.

    Args:
        match_id: Value compared against the 'MatchID' column.

    Returns:
        Number of rows whose 'MatchID' equals `match_id` and whose 'TeamID'
        is 'Huskies'.
    """
    in_match = passings_df['MatchID'] == match_id
    by_huskies = passings_df['TeamID'] == 'Huskies'
    return len(passings_df[in_match & by_huskies])

2. Number of shots to goal

In [9]:
def get_n_shots(match_id: int):
    """Count the Huskies' 'Shot' events of one match in `events_df`.

    Args:
        match_id: Value compared against the 'MatchID' column.

    Returns:
        Number of rows with 'EventType' equal to 'Shot', 'MatchID' equal to
        `match_id` and 'TeamID' equal to 'Huskies'.
    """
    is_shot = events_df['EventType'] == 'Shot'
    in_match = events_df['MatchID'] == match_id
    by_huskies = events_df['TeamID'] == 'Huskies'
    return len(events_df[is_shot & in_match & by_huskies])

3. Number of goals

In [10]:
def get_score(match_id: int):
    """Return the 'OwnScore' value of a match.

    Args:
        match_id: Value compared against the 'MatchID' column of `matches_df`.

    Returns:
        'OwnScore' of the first row whose 'MatchID' equals `match_id`.

    Raises:
        IndexError: If no row of `matches_df` has that 'MatchID'.
    """
    own_scores = matches_df.loc[matches_df['MatchID'] == match_id, 'OwnScore']
    return own_scores.tolist()[0]


# The "number of goals" metric is the same quantity under another name.
get_n_goals = get_score

4. Number of points at the end of season

NONE

The $X$ and $Y$ centroid coordinates are the average position of all passes of the network. Specifically, we **only consider the position from which each pass is sent**.

5. X-coordinate of the network centroid $X$
6. Y-coordinate of the network centroid $Y$

In [11]:
from typing import Tuple


def get_centroids(match_id: int) -> Tuple[float, float]:
    """Return the mean origin coordinates of the Huskies' passes in one match.

    Args:
        match_id: Value compared against the 'MatchID' column of `passings_df`.

    Returns:
        `(mean of 'EventOrigin_x', mean of 'EventOrigin_y')` over the rows of
        `passings_df` with that 'MatchID' and 'TeamID' equal to 'Huskies'.
    """
    selected = ((passings_df['MatchID'] == match_id)
                & (passings_df['TeamID'] == 'Huskies'))
    origins = passings_df.loc[selected, ['EventOrigin_x', 'EventOrigin_y']]
    x_mean = origins['EventOrigin_x'].mean()
    y_mean = origins['EventOrigin_y'].mean()
    return x_mean, y_mean


def get_x_centroid(match_id: int):
    """Return the first component of `get_centroids(match_id)`."""
    x_mean, _ = get_centroids(match_id)
    return x_mean


def get_y_centroid(match_id: int):
    """Return the second component of `get_centroids(match_id)`."""
    _, y_mean = get_centroids(match_id)
    return y_mean

7. Dispersion of the position of the players around the network centroid $NC_\text{disp}$

The centroid dispersion $NC_\text{disp}$ is the standard deviation of the players' distances to the position of the network centroid.

**FIXME: how to calculate the position of a player? Assuming the coordinates when initiating a pass**

In [12]:
def get_dispersion(match_id: int):
    """Return the spread of pass origins around their centroid for one match.

    The centroid is the mean ('EventOrigin_x', 'EventOrigin_y') over the
    Huskies rows of `passings_df` for the match. The result is the standard
    deviation (`np.std`, i.e. ddof=0) of the Euclidean distances from each
    pass origin to that centroid.

    Args:
        match_id: Value compared against the 'MatchID' column of `passings_df`.

    Returns:
        Standard deviation of the origin-to-centroid distances.
    """
    data = passings_df[(passings_df['MatchID'] == match_id)
                       & (passings_df['TeamID'] == 'Huskies')]
    # Column-wise arithmetic replaces a per-row `iterrows` loop; the formula
    # is unchanged.
    dx = data['EventOrigin_x'] - data['EventOrigin_x'].mean()
    dy = data['EventOrigin_y'] - data['EventOrigin_y'].mean()
    dists = np.sqrt(np.asarray(dx**2 + dy**2))
    return np.std(dists)

8. Average ratio between the passing distance parallel and perpendicular to the opponent’s goal $\Delta y/\Delta x$

In [13]:
def get_dx_div_dy(match_id: int):
    """Return the mean ratio of y to x displacement of the Huskies' passes.

    Despite its name, this function computes dy / dx for each pass:
    ('EventDestination_y' - 'EventOrigin_y') divided by
    ('EventDestination_x' - 'EventOrigin_x' + 1e-5), then averages the ratios.
    The displacements are signed. A pass with zero x displacement contributes
    dy * 1e5 to the average because of the offset in the denominator.

    Args:
        match_id: Value compared against the 'MatchID' column of `passings_df`.

    Returns:
        Mean of the per-pass ratios over the Huskies rows of that match.
    """
    data = passings_df[(passings_df['MatchID'] == match_id)
                       & (passings_df['TeamID'] == 'Huskies')]
    # Column-wise arithmetic replaces a per-row `iterrows` loop; the formula
    # is unchanged.
    dy = data['EventDestination_y'] - data['EventOrigin_y']
    # The small offset prevents division by zero when a pass has no x displacement.
    dx = data['EventDestination_x'] - data['EventOrigin_x'] + 10**-5
    return np.mean(np.asarray(dy / dx))

## My metrics

In [14]:
def get_n_attempt_shots(match_id: int):
    """Return `get_n_shots(match_id)` minus `get_n_goals(match_id)`."""
    shots = get_n_shots(match_id)
    goals = get_n_goals(match_id)
    return shots - goals

## Evaluation methods

In [15]:
def get_score_diff(match_id: int):
    """Return 'OwnScore' minus 'OpponentScore' for a match.

    Args:
        match_id: Value compared against the 'MatchID' column of `matches_df`.

    Returns:
        The score difference taken from the first matching row.

    Raises:
        IndexError: If no row of `matches_df` has that 'MatchID'.
    """
    match_rows = matches_df[matches_df['MatchID'] == match_id]
    own = match_rows['OwnScore'].tolist()[0]
    opponent = match_rows['OpponentScore'].tolist()[0]
    return own - opponent


def get_outcome(match_id: int):
    """Return the 'Outcome' value of a match.

    Args:
        match_id: Value compared against the 'MatchID' column of `matches_df`.

    Returns:
        'Outcome' of the first row whose 'MatchID' equals `match_id`.

    Raises:
        IndexError: If no row of `matches_df` has that 'MatchID'.
    """
    outcomes = matches_df.loc[matches_df['MatchID'] == match_id, 'Outcome']
    return outcomes.tolist()[0]

# Plot metrics against evaluation

In [16]:
from typing import List


def plot_metrics_against(y_func, y_name: str):
    """Scatter-plot each of the eight metrics against an evaluation value.

    One subplot is drawn per metric on a 4x2 grid, filled row by row. Each
    point is one entry of `match_ids`.

    Args:
        y_func: Callable taking a match ID and returning the y value.
        y_name: Label for the y axes, also inserted into the figure title.
    """
    labelled_metrics = [
        (get_n_passes, 'Number of passes'),
        (get_n_shots, 'Number of shots'),
        (get_n_goals, 'Number of goals'),
        (get_x_centroid, r'$X$ centroid'),
        (get_y_centroid, r'$Y$ centroid'),
        (get_dispersion, 'Dispersion'),
        (get_dx_div_dy, r'$\frac{\Delta y}{\Delta x}$'),
        (get_n_attempt_shots, 'Attempted Shots'),
    ]
    fig, axs = plt.subplots(4, 2)
    fig.set_size_inches(18, 15)
    y_values = [y_func(mi) for mi in match_ids]
    # `axs.flat` walks the grid row by row, matching the metric order above.
    for ax, (metric, label) in zip(axs.flat, labelled_metrics):
        x_values = [metric(mi) for mi in match_ids]
        ax.scatter(x_values, y_values)
        ax.set_xlabel(label)
        ax.set_ylabel(y_name)
    fig.tight_layout()
    fig.suptitle("Metrics vs. {0} in each match".format(y_name), size=16)
    # Leave room at the top for the figure title.
    fig.subplots_adjust(top=0.95)
    plt.show()

In [17]:
# plot_metrics_against(get_score_diff, 'Score differences')

In [18]:
# plot_metrics_against(get_outcome, 'Outcomes')

In [19]:
from typing import List


def get_metrics_eval_corr(y_funcs, y_names: List[str]):
    """Correlate the eight metrics with the given evaluation values.

    Every function is evaluated on each entry of `match_ids`. The results form
    one DataFrame column per name, with evaluation columns first and metric
    columns after.

    Args:
        y_funcs: Callables taking a match ID, one per evaluation value.
        y_names: Column names paired positionally with `y_funcs`.

    Returns:
        The result of `DataFrame.corr()` on the assembled columns.
    """
    metric_pairs = [
        (get_n_passes, 'Number of passes'),
        (get_n_shots, 'Number of shots'),
        (get_n_goals, 'Number of goals'),
        (get_x_centroid, 'X centroid'),
        (get_y_centroid, 'Y centroid'),
        (get_dispersion, 'Dispersion'),
        (get_dx_div_dy, 'dy/dx'),
        (get_n_attempt_shots, 'Attempted Shots'),
    ]
    all_pairs = list(zip(y_funcs, y_names)) + metric_pairs
    columns = {
        label: [func(mi) for mi in match_ids]
        for func, label in all_pairs
    }
    return pd.DataFrame(columns).corr()

In [20]:
# get_metrics_eval_corr([get_score_diff, get_outcome],
#                       ['Score differences', 'Outcomes'])

# Metrics from paper

*The harsh rule of the goals: Data-driven performance indicators for football teams*

1. $\mu$: the average amount of passes managed by players (in and out) in the team during the game
2. $\sigma$: the variance of the amount of passes managed by players in the team during the game (note: the code below computes the standard deviation, `np.std`).

In [21]:
def get_player_passes_metrics():
    """Compute per-match statistics of each player's pass involvement.

    For every entry of `match_ids` and every ID in `huskies_player_ids`, count
    the rows of `huskies_passes` in that match where the player is the
    'OriginPlayerID' or the 'DestinationPlayerID'. A pass is therefore counted
    once for each listed player involved in it.

    Returns:
        Three arrays with one entry per match: the sum, the mean and the
        standard deviation of the per-player counts.
    """
    per_match_counts = []
    for mi in match_ids:
        in_match = huskies_passes['MatchID'] == mi
        player_counts = []
        for hpi in huskies_player_ids:
            involved = ((huskies_passes['OriginPlayerID'] == hpi)
                        | (huskies_passes['DestinationPlayerID'] == hpi))
            player_counts.append(huskies_passes[in_match & involved].shape[0])
        per_match_counts.append(player_counts)
    passes = np.asarray(per_match_counts)
    totals = np.sum(passes, axis=1)
    means = np.mean(passes, axis=1)
    stds = np.std(passes, axis=1)
    return totals, means, stds

In [22]:
# Per-match sum, mean and standard deviation of the per-player pass counts.
w, mu, sigma = get_player_passes_metrics()
# NOTE(review): this is the plain sum of the three reciprocals. If `h` is meant
# to be a harmonic mean it would be 3 / (this sum) - confirm against the paper.
h = 1 / w + 1 / mu + 1 / sigma
# The join is index-on-index: row i (the i-th entry of `match_ids`) is paired
# with row i of `matches_df`. This assumes `matches_df` is in that order - TODO confirm.
metrics_df = (pd.DataFrame({'w': w, 'mu': mu, 'sigma': sigma})
              .assign(h=h)
              .join(matches_df[['Outcome', 'OwnScore']]))
# Pairwise correlations between the indicators, the outcome and the own score.
metrics_df.corr()

Unnamed: 0,w,mu,sigma,h,Outcome,OwnScore
w,1.0,1.0,0.988331,-0.9201,0.131284,-0.072945
mu,1.0,1.0,0.988331,-0.9201,0.131284,-0.072945
sigma,0.988331,0.988331,1.0,-0.909304,0.183129,-0.048731
h,-0.9201,-0.9201,-0.909304,1.0,-0.143148,0.035912
Outcome,0.131284,0.131284,0.183129,-0.143148,1.0,0.676123
OwnScore,-0.072945,-0.072945,-0.048731,0.035912,0.676123,1.0
