In [1]:
import numpy as np
import pandas as pd 
from itertools import izip

Offensive Rating is defined as the team points scored per 100 possessions while the player is on the court.

Defensive Rating is defined as the number of points per 100 possessions that the team allows while that individual player is on the court.

A possession is ended by:
 1. A made field goal attempt
 2. A made final free throw attempt
 3. A missed final free throw attempt that results in a defensive rebound
 4. A missed field goal attempt that results in a defensive rebound
 5. A turnover
 6. The end of a time period

In [2]:
# Load the three tab-separated input tables from the working directory.
EventCodes, GameLineup, PlayByPlay = (
    pd.read_csv(table_path, sep='\t')
    for table_path in ('Event_Codes.txt', 'Game_Lineup.txt', 'Play_by_Play.txt')
)

# The event descriptions carry stray leading/trailing whitespace; trim it.
EventCodes['Event_Msg_Type_Description'] = EventCodes['Event_Msg_Type_Description'].map(str.strip)

In [3]:
from collections import defaultdict

Create accessory objects defining how certain events affect computing ratings. 

In [4]:
# Three independent lookup tables, each indexed as table[event_type][action_type]:
#   EventScoringValues - whether the play can score points for the offensive team
#   EventPossChange    - whether the play could constitute a change of possession
#   EventSub           - whether the play can contain a substitution
EventScoringValues, EventPossChange, EventSub = (defaultdict(dict) for unused in range(3))

In [5]:
# Seed every (event type, action type) pair listed in the event-code table with
# False in all three lookup tables; the following cells switch selected entries on.
for row_label, code_row in EventCodes.iterrows():
    e_type = code_row['Event_Msg_Type']
    a_type = code_row['Action_Type']
    for lookup in (EventScoringValues, EventPossChange, EventSub):
        lookup[e_type][a_type] = False

In [6]:
# Hard-code which event types can score: type 1 (made shots) and type 3 (free throws).
# Only a True/False flag is stored here; the point value itself is read from the
# play's Option1 field in Game.score_event.
# TODO did we fix 3's? Are all free throws worth 1 (technical)?
for scoring_type in (1, 3):
    for action in EventScoringValues[scoring_type]:
        EventScoringValues[scoring_type][action] = True

# every other event type keeps its initial False (worth nothing)

In [7]:
# Event types that may end a possession: 1 (made shots), 3 (free throws),
# 4 (rebounds - each one has to be checked) and 5 (turnovers).
# Free-throw actions 11, 13 and 14 are left False; presumably these are the
# non-final attempts of a multi-shot trip - TODO confirm against Event_Codes.
for event_type in (1, 3, 4, 5):
    for action in EventPossChange[event_type]:
        if event_type == 3 and action in (11, 13, 14):
            continue
        EventPossChange[event_type][action] = True

In [8]:
# Substitutions (event type 8) and starts of periods (event type 12) are the
# events that can change who is on the court.
# Iterate EventSub itself for both types: the original looped over
# EventPossChange[12] while writing to EventSub[12], which only worked because
# both tables were seeded with identical keys.
for sub_type in (8, 12):
    for action in EventSub[sub_type]:
        EventSub[sub_type][action] = True

### Order of operations
For Game in Games:

    For possession in Game:
        for player in possession:
            update scoring
    normalize by count

In [102]:
class Game(object):
    
    def __init__(self, game_id, game_lineup, play_by_play):
        """
        An object that computes relevant statistics for one game from the 
        play-by-play data
        game_id: string identifying the id of the game we are interested in 
        game_lineup: the Lineup of each game. The object will select the 
                     relvant portions from it
        play_by_play: play by play info, similarly used as game_lineup
        """
        self.game_id = game_id
        self.max_period = np.max(play_by_play[play_by_play['Game_id'] == game_id]['Period'])
        self.play_by_play = self.chunk_pbp(play_by_play[play_by_play['Game_id'] == game_id])
        
        # tuple of lists of each player on the team
        self.teams = self.make_teams(game_lineup[game_lineup['Game_id']==game_id])
        self.poss_team_0 = None
        self.poss_change = None
        

        
        self.lineup_by_period =\
        [game_lineup[np.logical_and(game_lineup['Game_id']==game_id, \
                                    game_lineup['Period'] == i)] for i in xrange(1,self.max_period+1)]
        
    
    def chunk_pbp(self, game_pbp):
        """
        Divide the play-by-play of the game into events at the same
        Play clock time. This is the optimal way to break up events
        to score posessions and compute ratings
        
        game_pbp: the full play-by-play of the game. 
        """
        sorted_pbp = self.sort_pbp(game_pbp)
        # unique time for each play
        real_time = sorted_pbp['PC_Time'].as_matrix()+  (self.max_period-1.001*sorted_pbp['Period'].as_matrix())*7200
        _, unique_idxs = np.unique(real_time, return_index=True)
        split_pbp = np.split(sorted_pbp, indices_or_sections=unique_idxs[::-1] )[1:] # remove the first one thats empty
        return split_pbp
        
    def sort_pbp(self, game_pbp):
        """
        Sort the full play-by-play by increasing period, decreasing
        play clock time, increasing arena time, and increasing event number.
        
        game_pbp: the full play-by-play of the game.         
        """
        return game_pbp.sort_values(['Period', 'PC_Time', 'WC_Time', 'Event_Num'],\
                                    ascending=[True, False, True,True])
    
    def make_teams(self, game_lineup):
        """
        Given the lineup of the game, initialize the team dictionary
        game_lineup: Lineup with the given game selected
        """
        team1_id, team2_id = np.unique(game_lineup['Team_id'])
        team1, team2 = {},{}
        for team_id, team_arr in izip((team1_id, team2_id), (team1, team2)):
            players_on_team = game_lineup[game_lineup['Team_id']==team_id]['Person_id'].unique()
            for player in players_on_team:

                player_data = game_lineup[game_lineup['Person_id'] == player] #ignore possessions
                active_to_start = 1 in set(player_data['Period'])
                team_arr[player] = Player(player_data.iloc[0], active_to_start)
                
        return team1, team2
    
    def score_event(self, idx, event):
        """
        Given an individual event at index idx, apply the relevant scoring
        to active players
        idx:   the index of the event in the play_by_play
        event: the event (list of unique actions at one play clock time) at 
               index idx
        """
        for play in event.itertuples():
            # retrieve the value of the play
            play_e, play_a  = getattr(play, 'Event_Msg_Type'), getattr(play, 'Action_Type') #play[2], play[6]
            is_scoring_play = EventScoringValues[play_e][play_a]
            play_value = getattr(play, 'Option1')
            
            if not is_scoring_play:
                continue
            
            # unfortunately due to edge cases with technical fouls, some
            # subs have to occur here
            if (getattr(play, 'Person1') in self.teams[0]) != self.poss_team_0:
                self._add_possessions()
                self.incr_poss()
                self.poss_team_0 = not self.poss_team_0
                
            if (play_e == 3) and (play_value != 1):
                #On free throws, only a made ft if play_value ('Option1' = 1), it is sometimes 2 or another non-zero number
                continue
            #assign the score to active players
            for team, offense_or_defense in zip(self.teams, (self.poss_team_0, not self.poss_team_0)):
                for player in team.itervalues():
                    if player.active:
                        if offense_or_defense: #team 0 on offense?
                            player.off_points+=play_value
                        else:
                            player.def_points+=play_value
                        
                        
    def possession_change(self, idx,event):
        """
        Decide whether the event chunk at idx ends a possession and, if so,
        mark the active players so that the possession gets counted.

        idx:   the index of the event in the play_by_play
        event: the event (all plays at one play clock time) at index idx

        Side effects: may set self.poss_change, flip or reset
        self.poss_team_0, and flag players through _add_possessions. The
        counters themselves are only bumped later by incr_poss.
        """
        # An end of period always ends the possession. Possession becomes
        # unknown (None) until compute_ratings re-establishes it.
        if self._EoP_in_event(event):
            self._add_possessions()
            self.poss_team_0 = None
            return
        
        for play in event.itertuples():
            play_e, play_a = getattr(play, 'Event_Msg_Type'), getattr(play, 'Action_Type')
            # NOTE(review): play_value is read but never used in this method
            play_value = getattr(play, 'Option1')

            # only plays flagged in EventPossChange can end a possession
            poss_change = EventPossChange[play_e][play_a]
            if poss_change:
                # remembered so compute_ratings refreshes poss_team_0 after this event
                self.poss_change = poss_change
                # Rebounds (type 4) can share a clock time with a following foul and
                # free throws, so possession is checked within the current chunk: if
                # it already differs, count the possession and flip immediately.
                if play_e == 4 and (self.poss_team_0 != self._get_curr_poss(play, idx)):
                    self._add_possessions()
                    self.poss_team_0 = not self.poss_team_0
                next_poss_team_0 = self._get_next_poss(idx)
                # If a later chunk shows the other team with the ball, the current
                # possession is over: flag the active players. A None answer means
                # no later play establishes possession, so nothing is flagged.
                if self.poss_team_0 != next_poss_team_0 and next_poss_team_0 !=None:
                    self._add_possessions()
        
    def substitution(self,idx, event):
        """
        Perform substitutions if event requires it
        idx:   the index of the event in the play_by_play
        event: the event (list of unique actions at one play clock time) at 
               index idx
        """
        for play in event.itertuples():
            play_e, play_a = getattr(play, 'Event_Msg_Type'), getattr(play, 'Action_Type') #play[2], play[6]
            sub = EventSub[play_e][play_a]
            
            if sub:
                if play_e == 8: #Garden variety substitution
                    outgoing_player, ingoing_player = getattr(play, 'Person1'), getattr(play, 'Person2') #play[11], play[12]
                    
                    # a bit of repeated logic here
                    # but make sure all possesion and team combinations are covered
                    if outgoing_player in self.teams[0]:
                        self.teams[0][outgoing_player].active = False
                        self.teams[0][ingoing_player].active = True
                        
                        #an outgoing player's posession ends, unless it would otherwise be counted
                        if self.poss_team_0:
                            self.teams[0][outgoing_player].incr_off_poss =True
                        else:
                            self.teams[0][outgoing_player].incr_def_poss =True

                    else:
                        self.teams[1][outgoing_player].active = False
                        self.teams[1][ingoing_player].active = True
                        
                        if self.poss_team_0:
                            self.teams[1][outgoing_player].incr_def_poss =True
                        else:
                            self.teams[1][outgoing_player].incr_off_poss =True

                        
                else: #start of period
                    period = getattr(play, 'Period')-1 #play[3]-1
                    if period == 0:
                        continue # we handle this on startup
                        
                    for team in self.teams:
                        for player in team.itervalues():
                            player.active = False # turn everyone off
                    period_lineup = self.lineup_by_period[period]
                    for _, row in period_lineup.iterrows():
                        player_id = row[2]
                        
                        active = row[4] == 'A'
                        if player_id in self.teams[0]:
                            self.teams[0][player_id].active = active
                        else:
                            self.teams[1][player_id].active = active

            #hacky way to fix rebound-ft-sub-missed ft- time elapses - def rebound
            #previously the first rebound would cause poss change to trigger before subs done, and NBA counts being subbed in
            #missed ft and defensive rebound as a possession for that sub (although I don't necessarily agree)
            play_value = getattr(play, 'Option1')
            if play_e == 3 and (play_a in (10, 12, 15)) and (play_value != 1):
                next_poss_team_0 = self._get_next_poss(idx)
                #get_next_poss would return None if it can't find a valid next possession
                if self.poss_team_0 != next_poss_team_0 and next_poss_team_0 !=None:
                    self._add_possessions()
                
    def _EoP_in_event(self, event):
        """
        Helper function to determine if an End of Period is in the event
        chunk. 
        event: the event (list of unique actions at one play clock time) at 
               index idx
        """
        return np.any([getattr(p, 'Event_Msg_Type') == 13\
                              for p in event.itertuples()])
    
    def _get_curr_poss(self, curr_play, idx):
        """
        Returns the current possession (same PC_Time as play), 
        but after play (later WC_Time or Event_Num). 
        Certain events create ambiguous possession scenarios, and we have
        to peek ahead to find out how they were resolved. 
        curr_play: the play whose possession we seek to establish
        idx: the idex of the play
        """
        event_num = getattr(curr_play, 'Event_Num')
        
        df_series = (self.play_by_play[idx]['Event_Num'] == event_num)
        df_index = np.where(df_series.values == True)[0][0]
        for play in self.play_by_play[idx][df_index:].itertuples():
            play_e, play_a = getattr(play, 'Event_Msg_Type'), getattr(play, 'Action_Type') #play[2], play[6]
            person1 = getattr(play, 'Person1')
            #Only made shot, missed shot, free throw, turnover, and end of period establishes posession
            if play_e in (1, 2, 3, 5, 13):
                    #may need to check on free throw (event 3) that it doesn't correspond to No Shot (action type 0)
                    
                    #if next time step has end of period, return opposite of current possession so add_possessions gets run
                    #Change -- trust that end of period team is correct, should make sure this is true
                    #Reason for change is that made shot -> end of period would work correctly, 
                    #but missed shot -> off rebound -> end of period mistakenly adds two possessions instead of one
                if play_e == 13:
                        #return not self.poss_team_0
                        #Check whether end of period team is the same as a player on team 0's team ID
                    return getattr(play, 'Team_id') == self.teams[0].values()[0].team_id
                if (play_e == 3) and (play_a not in (10, 11, 12, 13, 14, 15)):
                        #Only consider normal free throws (tech/flagrant fts can happen regardless of possession)
                    continue
                return person1 in self.teams[0] #play[11] in self.teams[0]
            #Rebounds have a lot of weird bugs associated with them. Normally wouldn't use rebounds to establish possession, 
            #but for cases such as
            #rebound - foul - subs at same time, the rebound and foul establish a new possession whereas something like
            #rebound - subs does not establish a new possession. The difference being the second must be an out of bounds
            #'team rebound'. Will only trust rebound if Player1 is an actual player
            
            #The len check is due to a sequence ft- sub- miss ft- rebound at same time. Rebound causes poss switch before sub occurs
            #See substitution method where also have to introduce if last ft occurs to change poss as well
            if play_e == 4:
                if ((person1 in self.teams[0]) or (person1 in self.teams[1])) and (df_index+1 < len(self.play_by_play[idx])):
                    return person1 in self.teams[0]
                
        #If no possession defining plays in current time step just return current possession to continue as normal
        #07/02 
        return self.poss_team_0
       
    def _get_next_poss(self, idx):
        """
        I dentifies who has possession next after an ambiguous play. 
        idx: the idex of the play
        """
        if self._EoP_in_event(self.play_by_play[idx]):
            return not self.poss_team_0
        
        for event in self.play_by_play[idx+1:]:
            for play in event.itertuples():
                # gonna have to check that this makes sense...
                play_e, play_a = getattr(play, 'Event_Msg_Type'), getattr(play, 'Action_Type') #play[2], play[6]

                #Only made shot, missed shot, free throw, turnover, and end of period establishes posession
                if play_e in (1, 2, 3, 5, 13):
                    #Reason for change is that made shot -> end of period would work correctly, 
                    #but missed shot -> off rebound -> end of period mistakenly adds two possessions instead of one
                    if play_e == 13:
                        #return not self.poss_team_0
                        #Check whether end of period team is the same as a player on team 0's team ID
                        return getattr(play, 'Team_id') == self.teams[0].values()[0].team_id
                    if (play_e == 3) and (play_a not in (10, 11, 12, 13, 14, 15)):
                        #Only consider normal free throws (tech/flagrant fts can happen regardless of possession)
                        continue
                    
                    #Needed for cases like a shot clock turnover where I've seen the player not correspond to a real person
                    #Alternative is to trust the next play only on turnovers
                    return self._get_team(getattr(play, 'Person1'), getattr(play, 'Team_id'))
                    
    def _get_team(self, playerID, teamID):
        """
        Returns the team of the playerID if a valid player, and otherwise returns teamID. 
            Needed for cases like a shot clock turnover where the player may
            not correspond to a real person
        playerID: ID of the player
        teamID: ID of the default team
        """
        if (playerID in self.teams[0]) or (playerID in self.teams[1]):
            return playerID in self.teams[0]
        else:
            return teamID == self.teams[0].values()[0].team_id
            
    def _add_possessions(self):
        """
        Increment possessions. Doesn't actually increment them here,
        as they are incremented at the end of each event. Instead
        just marks them to be incremented.
        """
        for team, offense_or_defense in zip(self.teams, (self.poss_team_0, not self.poss_team_0)):
            for player in team.itervalues():
                #Check that player is both active and has not already had a possession added in this time step
                if player.active:# and not (player.incr_def_poss or player.incr_off_poss):
                    if offense_or_defense: #team 0 on offense?
                        player.incr_off_poss=True
                    else:
                        player.incr_def_poss=True
                            
    def incr_poss(self):
        """
        Increment possession counters and reset them
        """
        for team in self.teams:
            for player in team.itervalues(): 
                if player.incr_off_poss:
                    player.off_poss+=1
                if player.incr_def_poss:
                    player.def_poss+=1
                player.incr_off_poss=False
                player.incr_def_poss=False
    
    def compute_ratings(self, up_to_period = -1):
        """
        Using all the above methods, compute the offensive and defensive
        ratings of each player for the game
        up_to_period (optional): Compute ratings up to a certian period,
        useful for debugging/
        """
        if up_to_period != -1:
            assert int(up_to_period) == up_to_period
            assert 1<=up_to_period<=self.max_period
        
        for idx, event in enumerate(self.play_by_play):
            if self.poss_team_0 == None:
                #At start of periods, don't know who has possession so use get_next_poss method to establish
                self.poss_team_0 = self._get_next_poss(idx)
            self.score_event(idx, event)
            self.possession_change(idx, event)
            self.substitution(idx, event)
            self.incr_poss()
            #Moved the following from possession_change to here. This at least worked for Q1, but may break other stuff.
            #Reason is when subbed out on a ft, the possession change function would flip possession and then substitution method
            #would add possession to the opposite of possession change (offense/defense). Moving substitution before possession_change
            #not possible because want treat players on court during fts before subbing them.
            #Normally this is only done when poss_change == True, but might be valid to do on every iteration:
            #get_next_poss essentially tells you who has the ball except in the cases where the current time step is one of the
            #poss_change == True events (made, missed shot, ft, turnover). 
            #Now need to check that poss_change == True because of moving poss change check from missed shot to rebounds so don't want
            #poss to flip before we iterate through rebound play
            if self.poss_change:
                self.poss_team_0 = self._get_next_poss(idx)
                self.poss_change = None
 
            if self._EoP_in_event(event):
                if event.iloc[0,3] == up_to_period: #event[0][3] == up_to_period:
                    #print 'End Of Period', idx
                    break
            for teamno, team in enumerate(self.teams):
                active_players = sum(int(player.active) for player in team.itervalues())
                assert active_players == 5, "At index number %d: Team %d has %d players active"%(idx,teamno+1, active_players)
        
        gameDict = {}
        for team in self.teams:
            for player in team.itervalues():
                player.finalize_ratings()

                gameDict[player.player_id] = np.array([player.off_points, player.def_points, 
                                                       player.off_poss, player.def_poss, 
                                                       round(player.off_rating, 1), round(player.def_rating,1)])
        return gameDict

In [19]:
class Player(object):
    """
    Per-game statistics of a single player: points scored and allowed while on
    the court, possessions played, and the ratings derived from them.
    """
    def __init__(self, player_data, active_to_start):
        """
        Create the player with all counters at zero.
        player_data: lineup row of this player; 'Person_id' and 'Team_id' are read
        active_to_start: whether the player is on the court in the initial lineup
        """
        self.player_id, self.team_id = player_data['Person_id'], player_data['Team_id']
        self.active = active_to_start

        # points scored by / against the player's team while the player is on the court
        self.off_points = self.def_points = 0
        # completed offensive / defensive possessions
        self.off_poss = self.def_poss = 0
        # pending "count one possession" flags
        self.incr_off_poss = self.incr_def_poss = False
        # ratings stay NaN until finalize_ratings finds at least one possession
        self.off_rating = self.def_rating = np.nan

    def finalize_ratings(self):
        """
        Convert the accumulated points and possessions into points per 100
        possessions. A rating is left untouched when the player has no
        possession of that kind.
        """
        if self.off_poss > 0:
            self.off_rating = 100.0 * self.off_points / self.off_poss
        if self.def_poss > 0:
            self.def_rating = 100.0 * self.def_points / self.def_poss

In [106]:
# Compute the ratings of every game and export one row per player who played
# at least one possession in that game.
dataRows = []

for i, gameID in enumerate(np.unique(GameLineup['Game_id'])):
    print(i)  # progress indicator
    # The play-by-play is pre-filtered per game: passing the entire PlayByPlay was
    # observed to trigger the "not 5 players on team" assertion for i = 41.
    game = Game(gameID, GameLineup, PlayByPlay[PlayByPlay['Game_id'] == gameID])
    game.compute_ratings()
    for team in game.teams:
        # .values() works on Python 2 and 3 (itervalues is Python 2 only)
        for player in team.values():
            # skip players who never played a possession
            if (player.off_poss + player.def_poss) == 0:
                continue
            dataRows.append({'Game_ID': gameID,
                             'Player_ID': player.player_id,
                             'Team_ID': player.team_id,
                             'OffRtg': round(player.off_rating, 1),
                             'DefRtg': round(player.def_rating, 1),
                             'OffPossessions': player.off_poss,
                             'DefPossessions': player.def_poss,
                             'PointsFor': player.off_points,
                             'PointsAgainst': player.def_points})

# Passing columns= fixes the column order and also yields a valid (empty) frame
# when no rows were collected, where selecting the columns afterwards would raise.
df = pd.DataFrame(dataRows, columns=['Game_ID', 'Player_ID', 'Team_ID', 'OffRtg', 'DefRtg',
                                     'OffPossessions', 'DefPossessions', 'PointsFor', 'PointsAgainst'])
df.to_csv("GameRatings_Compare4.csv", index=False)

0




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81


In [105]:
# Spot-check one game: index 19 of the unique Game_id values. Unlike the batch
# loop, this passes the full PlayByPlay table and lets Game filter it by id.
# The ratings dict returned by compute_ratings() is shown as the cell output.
game = Game(np.unique(GameLineup['Game_id'])[19], GameLineup, PlayByPlay)
game.compute_ratings()



{'0e6ad08f07aa1d0f125692af405249b9': array([ 82. ,  98. ,  80. ,  81. , 102.5, 121. ]),
 '11f83d15ab55554432254788bba1e58f': array([ 62. ,  69. ,  61. ,  62. , 101.6, 111.3]),
 '1d457397b925e5819869db6e806c9967': array([  4. ,   7. ,   6. ,   6. ,  66.7, 116.7]),
 '1ddd9c3d32c6f8245158a15b58b38992': array([ 0.,  0.,  0.,  0., nan, nan]),
 '2fd8063b8cb4a66e2658b3a33dfa4850': array([ 57. ,  49. ,  49. ,  50. , 116.3,  98. ]),
 '3007e387a7d41a704390cb5878271b9a': array([ 80. ,  77. ,  72. ,  71. , 111.1, 108.5]),
 '324b2f11b1cd08156cceb7206ef69820': array([ 33. ,  34. ,  33. ,  32. , 100. , 106.3]),
 '3773aefed22b2274d2178f90bb6a858a': array([ 90. ,  90. ,  80. ,  81. , 112.5, 111.1]),
 '387ed8d45a1786fa72e95732df2dac4f': array([ 39. ,  28. ,  28. ,  30. , 139.3,  93.3]),
 '3c4073c5a42291d8db41f245f22940c7': array([  4. ,  26. ,  15. ,  18. ,  26.7, 144.4]),
 '4cf82b670bd014fbd6b99ca1ffa25a4f': array([  2.,   0.,   2.,   2., 100.,   0.]),
 '769cf4fe28af6cafc5887879e7f7f40f': array([ 0.,  