In [1]:
import sys
sys.path.append("../")

from haxml.utils import (
    get_stadiums,
    get_matches_metadata,
    train_test_split_matches_even_count,
    get_opposing_goalpost,
    stadium_distance,
    angle_from_goal,
    is_scored_goal,
    # my features
    get_positions_at_time,
    defender_feature, # closest defender, defenders within dist
    defender_box, # count defender within goal and shot in rect
    defender_cone, # count defenders between shot and goal in cone
    speed_ball
)
from haxml.evaluation import (
    summarize_split,
    style_columns,
    make_df,
    score_model,
    run_models,
    blank_plot,
    plot_errors_by_kicks,
    plot_errors_by_goals,
    plot_xg_histogram
)
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

In [2]:
stadiums = get_stadiums("../data/stadiums.json")
metadata = get_matches_metadata("../data/matches_metadata.csv")
train, test = train_test_split_matches_even_count(metadata)

In [3]:
print("Train Data:")
summarize_split(train)
print()
print("Test Data:")
summarize_split(test)

Train Data:
Matches: 1,158
Goals: 3,227
Kicks: 94,478
E(XG): 0.034

Test Data:
Matches: 1,158
Goals: 3,233
Kicks: 95,110
E(XG): 0.034


In [4]:
def generate_rows_compare(match, stadium):
    """
    Generates target and features for each kick in the match.
    Produces many features for model comparison.
    Args:
        match: Inflated match data (dict).
        stadium: Stadium data (dict).
    Returns:
        Generator of dicts with values for each kick in the given match.
        Includes prediction target "ag" (actual goals) which is 1 for a scored
        goal (goal or error) and 0 otherwise, "index" which is the index of the
        kick in the match kick list, and all the other features needed for
        prediction and explanation.
    """
    for i, kick in enumerate(match["kicks"]):
        gp = get_opposing_goalpost(stadium, kick["fromTeam"])
        x = kick["fromX"]
        y = kick["fromY"]
        gx = gp["mid"]["x"]
        gy = gp["mid"]["y"]
        dist = stadium_distance(x, y, gx, gy)
        angle = angle_from_goal(x, y, gx, gy)
        defender_dist,closest_defender = defender_feature(match,kick,100)
        defenders_within_box,in_box = defender_box(match,stadium,kick)
        defenders_within_shot,in_shot = defender_cone(match,stadium,kick,1)
        ball_speed=speed_ball(match,kick,1)
        row = {
            "ag": 1 if is_scored_goal(kick) else 0,
            "index": i,
            "time": kick["time"],
            "x": x,
            "y": y,
            "goal_x": gx,
            "goal_y": gy,
            "goal_distance": dist,
            "goal_angle": angle,
            "team": kick["fromTeam"],
            "stadium": match["stadium"],
            "defender_dist": defender_dist, # numerical , numbers within a range
            "closest_defender": closest_defender, # numerical, distance of the closest 
            "defenders_within_box": defenders_within_box, # numerical, number of player(defenders) within goal and kick(ball)
            "in_box": in_box, # Boolean, Is there defenders in this boxb bteween goal and kick
            "defenders_within_shot": defenders_within_shot, # numerical, how many players(defenders) are in this cone
            "in_shot": in_shot, # Boolean, Is there defenders in this cone
            "ball_speed": ball_speed # numerical, speeds ball within a given time range
        }
        yield row

In [5]:
d_train = make_df(train, stadiums, generate_rows_compare, progress=True)
d_test = make_df(test, stadiums, generate_rows_compare, progress=True)

100%|██████████| 1158/1158 [04:55<00:00,  3.92it/s]
100%|██████████| 1158/1158 [04:47<00:00,  4.03it/s]


In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [18]:
feature_sets = [
    ["goal_distance","goal_angle","defender_dist","closest_defender","defenders_within_box","in_box","in_shot","ball_speed"] # best model
    #["goal_distance","goal_angle"]
]

In [14]:
model_params = [
    (RandomForestClassifier, {"max_depth":12, "random_state":0}),
    (LogisticRegression, {"random_state": 0}),
    (DecisionTreeClassifier, {"max_depth": 5}),
    (KNeighborsClassifier, {"n_neighbors": 7}),
    (GradientBoostingClassifier, {"n_estimators":100, "learning_rate":1.0,"max_depth":1, "random_state":0}),
    (AdaBoostClassifier, {"n_estimators":100})
]

In [19]:
df_scored = run_models(d_train, d_test, score_model, "ag", feature_sets, model_params)

Model: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=12, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
Model: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=12, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_stat

In [20]:
score_color = "94, 156, 255"
style_config = {
    "accuracy": {"rgb": score_color, "low": 0, "high": 1},
    "precision": {"rgb": score_color, "low": 0, "high": 1},
    "recall": {"rgb": score_color, "low": 0, "high": 1},
    "roc_auc": {"rgb": score_color, "low": 0.5, "high": 1},
    "match_mae": {"rgb": score_color, "low": 3, "high": 0},
    "match_rmse": {"rgb": score_color, "low": 3, "high": 0},
    "xg_mean": {"rgb": score_color, "low": 0, "high": 0.07},
    "xg_std": {"rgb": score_color, "low": 0, "high": 0.5},
}

In [21]:
df_ranked = df_scored \
    .sort_values(by=["precision", "roc_auc"], ascending=[False, False]) \
    .reset_index()
df_ranked \
    [["clf", "features"] + list(style_config.keys())] \
    .style.apply(style_columns(style_config)) \
    .set_precision(3)

Unnamed: 0,clf,features,accuracy,precision,recall,roc_auc,match_mae,match_rmse,xg_mean,xg_std
0,"RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,  criterion='gini', max_depth=12, max_features='auto',  max_leaf_nodes=None, max_samples=None,  min_impurity_decrease=0.0, min_impurity_split=None,  min_samples_leaf=1, min_samples_split=2,  min_weight_fraction_leaf=0.0, n_estimators=100,  n_jobs=None, oob_score=False, random_state=0, verbose=0,  warm_start=False)","['goal_distance', 'goal_angle', 'defender_dist', 'closest_defender', 'defenders_within_box', 'in_box', 'in_shot', 'ball_speed']",0.97,0.706,0.179,0.588,1.238,1.565,0.034,0.09
1,"DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',  max_depth=5, max_features=None, max_leaf_nodes=None,  min_impurity_decrease=0.0, min_impurity_split=None,  min_samples_leaf=1, min_samples_split=2,  min_weight_fraction_leaf=0.0, presort='deprecated',  random_state=None, splitter='best')","['goal_distance', 'goal_angle', 'defender_dist', 'closest_defender', 'defenders_within_box', 'in_box', 'in_shot', 'ball_speed']",0.967,0.53,0.229,0.611,1.434,1.814,0.035,0.09
2,"KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',  metric_params=None, n_jobs=None, n_neighbors=7, p=2,  weights='uniform')","['goal_distance', 'goal_angle', 'defender_dist', 'closest_defender', 'defenders_within_box', 'in_box', 'in_shot', 'ball_speed']",0.966,0.498,0.196,0.595,1.384,1.754,0.033,0.109
3,"AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,  n_estimators=100, random_state=None)","['goal_distance', 'goal_angle', 'defender_dist', 'closest_defender', 'defenders_within_box', 'in_box', 'in_shot', 'ball_speed']",0.966,0.486,0.154,0.574,35.729,42.602,0.469,0.031
4,"LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,  intercept_scaling=1, l1_ratio=None, max_iter=100,  multi_class='auto', n_jobs=None, penalty='l2',  random_state=0, solver='lbfgs', tol=0.0001, verbose=0,  warm_start=False)","['goal_distance', 'goal_angle', 'defender_dist', 'closest_defender', 'defenders_within_box', 'in_box', 'in_shot', 'ball_speed']",0.965,0.406,0.067,0.532,1.312,1.659,0.034,0.08
5,"GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,  learning_rate=1.0, loss='deviance', max_depth=1,  max_features=None, max_leaf_nodes=None,  min_impurity_decrease=0.0, min_impurity_split=None,  min_samples_leaf=1, min_samples_split=2,  min_weight_fraction_leaf=0.0, n_estimators=100,  n_iter_no_change=None, presort='deprecated',  random_state=0, subsample=1.0, tol=0.0001,  validation_fraction=0.1, verbose=0,  warm_start=False)","['goal_distance', 'goal_angle', 'defender_dist', 'closest_defender', 'defenders_within_box', 'in_box', 'in_shot', 'ball_speed']",0.949,0.252,0.251,0.612,2.699,3.712,0.059,0.176
