In [1]:
import pandas as pd

pbp = pd.read_csv("pbp_18-19.csv")
print(pbp.shape)

(568577, 27)


In [2]:
is_home_play = pbp["HomeEvent"].notnull()
pbp["Event"] = pbp["HomeEvent"].fillna("") + pbp["AwayEvent"].fillna("")

pbp["Offensive_rebound"] = pbp["Event"].str.contains("Offensive rebound")
pbp["Defensive_rebound"] = pbp["Event"].str.contains("Defensive rebound")
pbp["Steal"] = pbp["Event"].str.contains("steal")
pbp["Block"] = pbp["Event"].str.contains("block")

# Compute points scored as the difference in score between the next play and this play
# for the team who made the play
pbp["Points"] = is_home_play * (pbp["HomeScore"].shift(-1) - pbp["HomeScore"]) + \
                (1 - is_home_play) * (pbp["AwayScore"].shift(-1) - pbp["AwayScore"])

# Set the points scored by the last play of each game to 0
pbp.loc[pbp["GameID"] != pbp["GameID"].shift(-1), "Points"] = 0

pbp["Points"].describe()

count    568577.000000
mean          0.258482
std           0.675543
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           3.000000
Name: Points, dtype: float64

In [3]:
# Exclude jump balls, timeouts, and start/end of quarters
print("Pre-filtering: " + str(len(pbp)))
bad_plays = (pbp["AwayEvent"].str.contains("Jump ball") | pbp["HomeEvent"].str.contains("Jump ball")
                | pbp["AwayEvent"].str.contains("End of") | pbp["HomeEvent"].str.contains("End of")
                | pbp["AwayEvent"].str.contains("Start of") | pbp["HomeEvent"].str.contains("Start of"))
pbp = pbp[~bad_plays]
print("Post-filtering: " + str(len(pbp)))

Pre-filtering: 568577
Post-filtering: 557943


In [4]:
print("Offensive rebound play count:", pbp["Offensive_rebound"].sum())
print("Defensive rebound play count:", pbp["Defensive_rebound"].sum())
print("Steal play count:", pbp["Steal"].sum())
print("Block play count:", pbp["Block"].sum())

Offensive rebound play count: 39841
Defensive rebound play count: 89525
Steal play count: 18432
Block play count: 11989


In [6]:
# Compute shot clock time remaining
by_possession = pbp[["GameID", "Possession", "Time", "Offensive_rebound"]].copy()
by_possession["Time"] = by_possession["Time"].str.split(":").apply(lambda x: float(x[0]) * 60 + float(x[1]))
by_possession["Offensive_rebound_no"] = by_possession.groupby(["GameID", "Possession"])[["Offensive_rebound"]].cumsum()
by_possession["Initial_shot_clock"] = 24 * (by_possession["Offensive_rebound_no"] == 0) + 14 * (by_possession["Offensive_rebound_no"] > 0)

In [40]:
shot_clock = by_possession["Time"].copy()
idx = 0
while idx < len(shot_clock):

    game_id = by_possession.iloc[idx]["GameID"]
    possession = by_possession.iloc[idx]["Possession"]
    begin_time = by_possession.iloc[idx]["Time"]
    was_reset = False

    while idx < len(shot_clock) and \
            by_possession.iloc[idx]["GameID"] == game_id and \
            by_possession.iloc[idx]["Possession"] == possession:

        print(idx, begin_time, by_possession.iloc[idx]["Time"])
        if by_possession.iloc[idx]["Offensive_rebound"] and begin_time - by_possession.iloc[idx]["Time"] >= 10:
            print("reset {}".format(idx))
            begin_time = by_possession.iloc[idx]["Time"]
            was_reset = True

        if was_reset:
            shot_clock.iloc[idx] = 14 - (begin_time - by_possession.iloc[idx]["Time"])
        else:
            shot_clock.iloc[idx] = 24 - (begin_time - by_possession.iloc[idx]["Time"])
        idx += 1

0 700.0 700.0
1 697.0 697.0
2 697.0 678.0
3 674.0 674.0
4 674.0 662.0
5 653.0 653.0
6 653.0 653.0
7 653.0 653.0
8 645.0 645.0
9 642.0 642.0
10 642.0 637.0
11 624.0 624.0
12 604.0 604.0
13 600.0 600.0
14 600.0 591.0
15 579.0 579.0
16 579.0 576.0
17 579.0 575.0
18 562.0 562.0
19 562.0 562.0
20 562.0 556.0
21 562.0 554.0
22 562.0 554.0
23 562.0 550.0
24 562.0 536.0
25 535.0 535.0
26 535.0 524.0
27 520.0 520.0
28 508.0 508.0
29 504.0 504.0
30 489.0 489.0
