In [None]:
from IPython.core.debugger import set_trace
from IPython.utils import traitlets as _traitlets

  from IPython.utils import traitlets as _traitlets


<h1><center> Analyzing Learner Output </center></h1>

In this notebook, we will investigate the model predictions and try to identify any potential biases. In particular, we will look for any pattern that could help us improve the model:
+ We will investigate the probability distribution by action type
+ We will investigate the spatial distribution of actions by type

In [None]:
# hide
import glob
import os
import pickle
import tempfile
import warnings
from pathlib import Path
from random import sample
from typing import List, Tuple

import numpy as np
import pandas as pd
from fastcore.basics import *
from fastcore.foundation import *
from fastcore.xtras import load_pickle, save_pickle
from progressbar import progressbar
from tsai.all import *

from footSeq.datastruct.core import *
from footSeq.model.learner import *
from footSeq.plots import *

# Data preparation

Let's start by loading the model we want to investigate:

In [None]:
# hide
# Identifier of the trained model; it doubles as the name of its folder under `models/`.
model_name = "LSTM_FCN_bidir-True_layers-2_no_goal_prop-2"
base_path = Path(".")

# Folder holding the exported dataloaders, model and learner files.
saved_model_dir = base_path / "models" / model_name

# Restore the learner on the CPU.
learn = load_all(
    path=saved_model_dir,
    dls_fname="dls",
    model_fname="model",
    learner_fname="learner",
    device="cpu",
    verbose=True,
)

FileNotFoundError: [Errno 2] No such file or directory: '/sequences_clean/1375911___1___30078___30121___no-goal.csv'

Next, we will select a number of files to test the model on. Ideally, these files should not be part of the training/validation set.

In [None]:
# hide
# NOTE(review): absolute, machine-specific location of the sequence CSV files;
# adjust it to wherever the data lives on the current machine.
data_path = Path("/sequences")
# Upper bound on the number of held-out files handed to `pick_files`.
max_test_files = 100000

# Files referenced by the learner's train/valid splits -- presumably the ones
# the model has already seen (TODO confirm against `footSeq`).
used_files = learn.dls.tfms.train_files + learn.dls.tfms.valid_files
all_files = data_path.ls(file_exts=".csv")

# A set has no stable iteration order (string/path hashes are randomised per
# process), so sort the held-out files before truncating. Otherwise the slice
# below could select a different subset on every kernel restart.
test_files = L(sorted(set(all_files) - set(used_files)))
train_goals, test_goals, no_goals = pick_files(test_files[:max_test_files])

In [None]:
# hide
# Number of no-goal sequences to keep for every goal sequence.
no_goal_prop = 2
goals_info = pd.concat([train_goals, test_goals], axis=0)
# `DataFrame.sample` draws without replacement and raises a ValueError when
# asked for more rows than exist, so cap the request at what is available.
n_no_goals = min(int(goals_info.shape[0] * no_goal_prop), len(no_goals))

files_info = (
    pd.concat([goals_info, no_goals.sample(n_no_goals)], axis=0)
    .sample(frac=1, ignore_index=True)
    # Keep a single row per (gameId, possNumber): the last one once the rows
    # are ordered by nSteps.
    .sort_values(["gameId", "possNumber", "nSteps"])
    .drop_duplicates(["gameId", "possNumber"], keep="last")
)

In [None]:
# hide
# Number of selected sequences for each value of `target`.
target_counts = files_info.groupby("target").size()
target_counts

Finally, we can now run some predictions that can be used in the analysis steps:

In [None]:
# hide
def _prob(file):
    """Return the learner's predictions for one sequence file.

    The CSV at `file` is read with pandas and passed to `learn.predict_poss`.
    Prediction is best-effort: if reading or predicting fails, a warning is
    issued and `None` is returned so the caller can skip the file.
    """
    try:
        return learn.predict_poss(pd.read_csv(file))
    except Exception as err:
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate, so a long prediction run can still be stopped.
        warnings.warn(f"Skipping {file}: {err!r}")
        return None


# Predictions are cached on disk next to the model so re-runs are cheap.
probs_file_path = Path("./models") / model_name / "sample_probs.csv"
if probs_file_path.is_file():
    probs = pd.read_csv(probs_file_path)
else:
    # Predict on at most 500 randomly drawn files.
    n_files = min(500, files_info.shape[0])
    file_probs = [_prob(file) for file in files_info.sample(n_files).file]
    # Drop the files that could not be processed before stacking the results.
    probs = pd.concat(
        [file_prob for file_prob in file_probs if file_prob is not None]
    ).reset_index(drop=True)
    # `index=False` keeps the cached frame identical to the freshly computed
    # one; otherwise reloading adds a spurious "Unnamed: 0" column.
    probs.to_csv(probs_file_path, index=False)

# Probability distribution by action

The first step in our analysis is to investigate the distribution of probability by action type. It should allow us to identify some obvious issues in our data-preparation or how we present the data to the network:

In [None]:
# hide
# Descriptive statistics of `proba_goal` for each (generic action, action type)
# pair, ordered by generic action and then by the median ("50%") probability.
proba_by_action = probs.groupby(["generic_action_type_name", "type_name"])["proba_goal"]
proba_summary = proba_by_action.describe()
proba_summary.sort_values(["generic_action_type_name", "50%"])

# Spatial distribution of probability by action

It is also useful to visualize the distribution of the probability mass over the pitch for a given action and see if it is in line with our intuition:

In [None]:
# hide
# Action type whose predictions are drawn on the pitch.
action_name = "Shot on target"

# Restrict the predictions to that action type before building the density table.
action_probs = probs[probs["type_name"] == action_name]
hm_tabl = dens_prob(action_probs)

# Styling of the pitch drawing.
heatmap_style = dict(fieldcolor="white", linecolor="black", cmap="Blues")
_ = plot_heatmap(dens_arr=hm_tabl, title=action_name, **heatmap_style)

In [None]:
## TODO: plot multiple actions on the same pitch