In [1]:
import datetime as dt
import functools
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
from scipy import stats


In [2]:
# Configuration

raw_data_dir = Path("../../raw_data")
reports_dir = Path("../../reports")
season_id = 25
week_date = dt.datetime(2019, 10, 31)
season_raw_data_dir = raw_data_dir / f"season{season_id}"
season_reports_dir = reports_dir / f"season{season_id}"
season_box_scores_dir = raw_data_dir / f"season{season_id}" / "box_scores"

pd.options.display.float_format = "{:,.2f}".format

In [3]:
possessions = pd.read_csv(season_raw_data_dir / "possessions.csv")

In [4]:
player_logs_raw = pd.read_csv(season_raw_data_dir / "player_logs.csv")
player_logs = player_logs_raw.groupby(["game_id", "team_id", "player_id"])

# player_logs.get_group(("20191017-1845-hotshots-rockets", "hotshots", "hotshots-harrison"))["stl"]

In [5]:
summary_rows = []
for (game_id, team_id), group in possessions.groupby(["game_id", "team_id"]):
    is_after_tov = group["is_after_turnover"] == True
    is_tov = group["resolution_is_turnover"] == True
    
    opp_num_tov = len(group[is_after_tov].index)
    num_tov = len(group[is_tov].index)

    row = {
        "game_id": game_id,
        "team_id": team_id,
        "poss": group["game_id"].size,
        "pts": group["points_scored"].mean(),
        "2nd_chance_points": group["second_chance_points"].mean(),
        "pts_per_opp_tov": group["points_scored"][is_after_tov].sum() / opp_num_tov if opp_num_tov else 0,
        "opp_pts_per_tov": group["opp_points_scored_next"][is_tov].sum() / num_tov if num_tov else 0,
        "passes": group["passes"].mean(),
        "players_inv": group["players_involved"].mean(),
        "tov": num_tov,
        "opp_tov": opp_num_tov, 
        "pts_total": int(group["points_scored"].sum()),
        "2nd_chance_points_total": int(group["second_chance_points"].sum()),
        "opp_pts_after_tov_total": int(group["opp_points_scored_next"][is_tov].sum()),
        "oreb": group["oreb"].mean(),
        "fta": group["fta"].mean(),
    }
    summary_rows.append(row)

summary = pd.DataFrame(summary_rows)
summary


Unnamed: 0,game_id,team_id,poss,pts,2nd_chance_points,pts_per_opp_tov,opp_pts_per_tov,passes,players_inv,tov,opp_tov,pts_total,2nd_chance_points_total,opp_pts_after_tov_total,oreb,fta
0,20191031-1845-hotshots-camdenhells,camdenhells,53,1.09,0.28,1.0,0.75,2.49,2.85,4,12,58,15,3,0.43,0.36
1,20191031-1845-hotshots-camdenhells,hotshots,54,0.69,0.07,0.75,1.0,1.35,2.15,12,4,37,4,12,0.09,0.22
2,20191031-1930-greenmachine-burritos,burritos,59,0.8,0.15,0.83,0.75,2.54,2.88,12,23,47,9,9,0.22,0.22
3,20191031-1930-greenmachine-burritos,greenmachine,60,0.45,0.18,0.75,0.83,2.32,2.67,23,12,27,11,19,0.17,0.05
4,20191031-2015-lostangels-northlondonlions,lostangels,61,0.9,0.25,0.79,1.0,3.31,3.13,13,14,55,15,13,0.25,0.15
5,20191031-2015-lostangels-northlondonlions,northlondonlions,61,0.97,0.21,1.0,0.79,1.7,2.33,14,13,59,13,11,0.18,0.13
6,20191031-2100-lions-rockets,lions,54,0.7,0.09,0.81,0.62,2.28,2.7,16,16,38,5,10,0.09,0.15
7,20191031-2100-lions-rockets,rockets,54,0.91,0.31,0.62,0.81,1.63,2.3,16,16,49,17,13,0.24,0.43


In [6]:
# Possession finisher / decision maker stat.

display(Markdown("""

### Possession Resolver / Decision Maker

At first, the idea was to calculate how many points on average are scored in possessions 
that are ended by a particular player. If a player shoots and makes the shot,
or shoots and the defense gets the rebound, or turns the ball over, they have effectively
ended their team's possession. Without any weighting added, this stat is pretty interesting itself.
It penalises turnovers and missed shots that end with defensive rebounds, but does not penalise
missed shots that end with offensive rebounds which is usually deemed a good shooting decision.
You can see this stat in the **points_scored** column.

Then, we thought, well, if a player steals the ball and creates possessions, they should not 
be penalised for all turnovers. For example, if a player has 4 steals and 1 turnover then that
one possession ending with a turnover should not count against the player. If, however, the player
has 4 steals and no turnovers, they don't get any discounts. If the player has 1 steal and 4 turnovers
then we can only discount one of their turnovers. This adjustment boosted stats for those who collect
many steals, but since we were now rewarding steals, why not additionaly penalise those who turn the ball
over based on how many points their turnovers cost the team? So we did.

**points_conceded_after_tov** tells how many points did the opposition score in total across all 
possessions following the player's turnover.

This is still not perfect. We are thinking about weighing the steals and turnovers by the number of possessions.

For example 11 ended possessions out of which 4 were turnovers is probably way worse than 4 turnovers 
with 26 ended possessions.

"""))

filter_no_tovs = possessions["resolution_is_turnover"] == False

summary_rows = []
for (game_id, team_id, last_player_id), group in possessions[filter_no_tovs].groupby(["game_id", "team_id", "last_player_id"]):
    player_game_stats = player_logs.get_group((game_id, team_id, last_player_id))

#     # If a player has 6 steals and 2 turnovers then we can discount 2 turnovers.
#     # If a player has 2 steals and 6 turnovers then we can discount 2 turnovers.
#     discounted_tov_possessions = min(player_game_stats["tov"].sum(), player_game_stats["stl"].sum())
    
#     points_conceded_after_tov = group[group["resolution_is_turnover"]]["opp_points_scored_next"].sum()
    
    row = {
        "game_id": game_id,
        "team_id": team_id,
        "last_player_id": last_player_id,
        "ended_total": group["game_id"].size,
#         "ended_with_tov": group[tovs].count(),
#         "ended_with_tov": group[tovs]["group_id"].size,
#         "points_scored_weighted": (
#             (group["points_scored"].sum() - points_conceded_after_tov)
#             /
#             max(1.0, group["game_id"].size - discounted_tov_possessions)
#         ),
        "points_scored": group["points_scored"].mean(),
        "stl": player_game_stats["stl"].sum(),
        "tov": player_game_stats["tov"].sum(),
#         "discounted_tov_possessions": discounted_tov_possessions,
        "points_conceded_after_tov": points_conceded_after_tov,
    }
    summary_rows.append(row)

summary = pd.DataFrame(summary_rows)
# summary_sorted = summary.sort_values(["points_scored_weighted"], ascending=[False])
summary_sorted = summary.sort_values(["points_scored"], ascending=[False])
summary_sorted = summary_sorted.reset_index(drop=True)
summary_sorted.index += 1
summary_sorted




### Possession Resolver / Decision Maker

At first, the idea was to calculate how many points on average are scored in possessions 
that are ended by a particular player. If a player shoots and makes the shot,
or shoots and the defense gets the rebound, or turns the ball over, they have effectively
ended their team's possession. Without any weighting added, this stat is pretty interesting itself.
It penalises turnovers and missed shots that end with defensive rebounds, but does not penalise
missed shots that end with offensive rebounds which is usually deemed a good shooting decision.
You can see this stat in the **points_scored** column.

Then, we thought, well, if a player steals the ball and creates possessions, they should not 
be penalised for all turnovers. For example, if a player has 4 steals and 1 turnover then that
one possession ending with a turnover should not count against the player. If, however, the player
has 4 steals and no turnovers, they don't get any discounts. If the player has 1 steal and 4 turnovers
then we can only discount one of their turnovers. This adjustment boosted stats for those who collect
many steals, but since we were now rewarding steals, why not additionaly penalise those who turn the ball
over based on how many points their turnovers cost the team? So we did.

**points_conceded_after_tov** tells how many points did the opposition score in total across all 
possessions following the player's turnover.

This is still not perfect. We are thinking about weighing the steals and turnovers by the number of possessions.

For example 11 ended possessions out of which 4 were turnovers is probably way worse than 4 turnovers 
with 26 ended possessions.



NameError: name 'points_conceded_after_tov' is not defined

In [None]:
# # TODO: Points scored in the possessions starting with your steals
# # TODO HeldBall not handled properly...
# is_after_steal = possessions["first_event_type"] == "Steal"
# summary_rows = []
# for (game_id, team_id, player_id), group in possessions[is_after_steal].groupby(["game_id", "team_id", "first_on_court_player_id"]):
#     player_game_stats = player_logs.get_group((game_id, team_id, player_id))
#     row = {
#         "player_id": player_id,
#         "team_id": team_id,
#         "stl": player_game_stats["stl"].sum(),
#         "points_scored_total": group["points_scored"].sum(),
#         "points_scored_avg": group["points_scored"].mean(),
#         "tov": player_game_stats["tov"].sum(),
#     }
#     summary_rows.append(row)

# summary = pd.DataFrame(summary_rows)
# min_requirements = summary["stl"] > 0

# summary_sorted = summary[min_requirements].sort_values(["points_scored_avg", "points_scored_total"], ascending=[False, False])
# summary_sorted.reset_index(drop=True, inplace=True)
# summary_sorted.index += 1
# summary_sorted

In [None]:
# # TODO: Points scored in the possessions starting with your steals
# # TODO HeldBall not handled properly...
# is_after_steal = possessions["first_event_type"] == "Steal"
# summary_rows = []
# for (game_id, team_id, player_id), group in possessions[is_after_steal].groupby(["game_id", "team_id", "first_on_court_player_id"]):
#     player_game_stats = player_logs.get_group((game_id, team_id, player_id))
#     row = {
#         "player_id": player_id,
#         "team_id": team_id,
#         "stl": player_game_stats["stl"].sum(),
#         "points_scored_total": group["points_scored"].sum(),
#         "points_scored_avg": group["points_scored"].mean(),
#         "tov": player_game_stats["tov"].sum(),
#     }
#     summary_rows.append(row)

# summary = pd.DataFrame(summary_rows)
# min_requirements = summary["stl"] > 0

# summary_sorted = summary[min_requirements].sort_values(["points_scored_avg", "points_scored_total"], ascending=[False, False])
# summary_sorted.reset_index(drop=True, inplace=True)
# summary_sorted.index += 1
# summary_sorted