In [2]:
%matplotlib inline

import sys
sys.path.append("../")

from haxml.utils import (
    get_matches_metadata,
    get_stadiums,
    get_opposing_goalpost,
    load_match,
    is_target_stadium,
    is_scored_goal,
    total_scored_goals,
    total_kicks,
    goal_fraction,
    stadium_distance,
    angle_from_goal,
    train_test_split_matches_even_count
)

from haxml.viz import (
    plot_positions
)

import math
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

stadiums = get_stadiums("../data/stadiums.json")
metadata = get_matches_metadata("../data/matches_metadata.csv")

In [3]:
train, test = train_test_split_matches_even_count(metadata)

In [4]:
# add to utils
def get_positions_at_time(positions, t):
    """
    Return a list of positions (dicts) closest to, but before time t.
    """
    # Assume positions list is already sorted.
    # frame is a list of positions (dicts) that have the same timestamp.
    frame = []
    time = 0.0
    for pos in positions:
        if pos["time"] > t:
            break
        if pos["time"] == time:
            frame.append(pos)
        else:
            frame = []
            time = pos["time"]
    return frame

In [5]:
def defender_feature(match,kick,dist):
    """
    For a given kick, find the closests defender and the number of defenders within 200 dist
    """
    positions = get_positions_at_time(match["positions"], kick["time"])
    ret = [0,0]
    closest_defender = float('inf')
    defenders_pressuring = 0
    for person in positions:
        if person['team'] is not kick['fromTeam'] and person['type'] == "player": 
            defender_dist = ((kick['fromX'] - person['x'])**2 + (kick['fromY'] - person['y'])**2)**(1/2) # distance formula
            if defender_dist < closest_defender:
                closest_defender = defender_dist
                ret[0] = closest_defender
            if defender_dist <= dist:
                defenders_pressuring = defenders_pressuring + 1
                ret[1] = defenders_pressuring
    return ret

In [6]:
def is_in_range(person,goal_low,goal_high,fromX,goal_x, kick_team):
    is_x = False
    is_y = False
    if kick_team == "red":
        if(person['x']>=fromX and person['x']<=goal_x):
            is_x = True
    else:
        if(person['x']>=goal_x and person['x']<=fromX):
            is_x = True
    
    if(person['y']>=goal_low and person['y']<=goal_high):
        is_y = True
        
    return is_x and is_y

def defender_box(match,stadium,kick):
    #is_there_players =
    #height,width =
    #area = 
    count = 0
    gp = get_opposing_goalpost(stadium,kick["fromTeam"])
    gp_y_high = max([p["y"] for p in gp["posts"]])
    gp_y_low = min([p["y"] for p in gp["posts"]])
    goal_x = gp["posts"][0]["x"]
    positions = get_positions_at_time(match["positions"], kick["time"])
    kicker = None
    for person in positions:
        if person["playerId"] == kick["fromId"]:
            kicker = person
            break
    if kicker is None:
        return 0
    #print("positions time = ", positions[0]["time"])
    for person in positions:
        if person["type"] == "ball" or person["playerId"] == kicker["playerId"]:
            continue
        if is_in_range(person,gp_y_low,gp_y_high,kicker['x'],goal_x, kicker["team"]):
            count = count + 1
    return count

In [7]:
def generate_rows_demo(match, stadium):
    """
    Generates target and features for each kick in the match.
    Produces two features for demo classifiers:
        goal_distance: Distance from where  ball was kicked to goal midpoint.
        goal_angle: Angle (in radians) between straight shot from where ball was
            kicked to goal midpoint.
    Args:
        match: Inflated match data (dict).
        stadium: Stadium data (dict).
    Returns:
        Generator of dicts with values for each kick in the given match.
        Includes prediction target "ag" (actual goals) which is 1 for a scored
        goal (goal or error) and 0 otherwise, "index" which is the index of the
        kick in the match kick list, and all the other features needed for
        prediction and explanation.
    """
    for i, kick in enumerate(match["kicks"]):
        gp = get_opposing_goalpost(stadium, kick["fromTeam"])
        x = kick["fromX"]
        y = kick["fromY"]
        gx = gp["mid"]["x"]
        gy = gp["mid"]["y"]
        dist = stadium_distance(x, y, gx, gy)
        angle = angle_from_goal(x, y, gx, gy)
        closest_defender,defender_within = defender_feature(match,kick,100)
        defenders_box = defender_box(match,stadium,kick)
        row = {
            "ag": 1 if is_scored_goal(kick) else 0,
            "index": i,
            "time": kick["time"],
            "x": x,
            "y": y,
            "goal_x": gx,
            "goal_y": gy,
            "goal_distance": dist,
            "goal_angle": angle,
            "team": kick["fromTeam"],
            "stadium": match["stadium"],
            "closest_defender": closest_defender,
            "defender_within": defender_within,
            "defenders_box": defenders_box
        }
        yield row

In [8]:
def make_df(metadata, callback, progress=False):
    """
    Transforms match metadata into a DataFrame of records for
    each kick, including target label and features.
    Args:
        metadata: Match metadata (list of dicts).
        callback: Method to run on each match to extract kicks.
        progress: Whether or not to show progress bar (boolean).
    Returns:
        DataFrame where each row is a kick record.
    """
    rows = []
    bar = tqdm(metadata) if progress else metadata
    for meta in bar:
        key = meta["match_id"]
        infile = "../data/packed_matches/{}.json".format(key)
        try:
            s = stadiums[meta["stadium"]]
            row_gen = load_match(infile, lambda m: callback(m, s))
            for row in row_gen:
                row["match"] = key
                rows.append(row)
        except FileNotFoundError:
            pass
    return pd.DataFrame(rows)

In [9]:
d_train = make_df(train, generate_rows_demo, progress=True)
d_test = make_df(test, generate_rows_demo, progress=True)

100%|██████████| 394/394 [00:32<00:00, 12.00it/s]
100%|██████████| 393/393 [00:32<00:00, 11.98it/s]


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    roc_auc_score
)

In [14]:
def summarize_model(yt, yp):
    """
    Helper method to summarize some prediction metrics.
    Args:
        yt: Array of true scored goal values.
        yp: Array of predicted scored goal values.
    """
    print("Accuracy = {:.3f}".format(accuracy_score(yt, yp)))
    print("Precision = {:.3f}".format(precision_score(yt, yp)))
    print("Recall    = {:.3f}".format(recall_score(yt, yp)))
    print("ROC AUC   = {:.3f}".format(roc_auc_score(yt, yp)))

In [15]:
def model_features(features,classifier,kwargs):
    X_train = d_train[features]
    y_train = d_train["ag"]
    X_test = d_test[features]
    y_test = d_test["ag"]
    clf = classifier(**kwargs)
    clf.fit(X_train, y_train)
    #print("Train Scores:")
    #summarize_model(y_train, clf.predict(X_train))
    #print()
    print("Test Scores:")
    summarize_model(y_test, clf.predict(X_test))
    return clf

In [16]:
#best model so far
from sklearn.ensemble import GradientBoostingClassifier
features = ["goal_distance","goal_angle","defenders_box"]
clf = model_features(features, GradientBoostingClassifier, {"n_estimators":100, "learning_rate":1.0,"max_depth":1, "random_state":0})
clf

Test Scores:
Accuracy = 0.966
Precision = 0.602
Recall    = 0.048
ROC AUC   = 0.523


GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=0)

In [18]:
p_test = clf.predict_proba(d_test[features])[:,1]
df_results = pd.DataFrame(d_test)
df_results["xg"] = p_test
df_results.groupby(["match", "team"])[["ag", "xg"]].sum().head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,ag,xg
match,team,Unnamed: 2_level_1,Unnamed: 3_level_1
-MOTVkwbfE_IKa15MVn9,blue,1,0.092756
-MOTVkwbfE_IKa15MVn9,red,2,0.338985
-MOy-f6_nveB6alhv7BD,red,1,0.107252
-MOy0mtEf9VvJLYeb9g_,red,1,0.09501
-MOy1YNMaXX-VaR3ROtD,red,1,0.00229
-MOy_8TXdyiIYo9ty5Zu,blue,0,0.885856
-MOy_8TXdyiIYo9ty5Zu,red,2,1.764216
-MOy_aJlP-DUs6MyaR1N,blue,2,0.966248
-MOy_aJlP-DUs6MyaR1N,red,3,0.820188
-MOybwFHeoLse3Kyjf3h,red,1,0.381549


In [19]:
import joblib

joblib.dump(clf, "../models/gradientBoost.pkl")

['../models/gradientBoost.pkl']

In [20]:
def predict_xg_demo(match, stadium, generate_rows, clf):
    """
    Augments match data with XG predictions.
    Args:
        match: Inflated match data (dict).
        stadium: Stadium data (dict).
        generate_rows: function(match, stadium) to generate kick records.
        clf: Classifier following scikit-learn interface.
    Returns:
        Inflated match data with "xg" field added to each kick (dict).
    """
    features = ["goal_distance", "goal_angle"]
    d_kicks = pd.DataFrame(generate_rows(match, stadium))
    d_kicks["xg"] = clf.predict_proba(d_kicks[features])[:,1]
    for kick in d_kicks.to_dict(orient="records"):
        match["kicks"][kick["index"]]["xg"] = kick["xg"]
    return match

In [21]:
test_meta = test[45]
s = stadiums[test_meta["stadium"]]
demo_clf = joblib.load("../models/demo_logistic_regression.pkl")
test_match = load_match(
    "../data/packed_matches/{}.json".format(test_meta["match_id"]),
    lambda m: predict_xg_demo(m, s, generate_rows_demo, demo_clf)
)
test_meta

{'match_id': '-MP_GIvX9ZA_GRX_hhUp',
 'stadium': 'NAFL 1v1/2v2 Map v1',
 'time': 81.1,
 'kicks_red': 12,
 'kicks_blue': 9,
 'score_red': 0,
 'score_blue': 3,
 'scored_goals_red': 0,
 'scored_goals_blue': 1}

In [22]:
pd.DataFrame(test_match["kicks"]).query("type == 'goal'").head()

Unnamed: 0,time,type,fromId,fromX,fromY,fromName,fromTeam,toId,toX,toY,toName,toTeam,xg
8,36.4,goal,3,197.0,-106.0,Player 183,blue,,,,,,0.023527
