In [91]:
import numpy as np
import pandas as pd
from IPython.display import display, Markdown
import matplotlib.pyplot as plt

In [101]:
# Lineup table; stats are expressed per 100 possessions.
# NOTE(review): all four paths are absolute and machine-specific, and later
# cells overwrite alllineup_stats.csv and player_100stats.csv in place, so what
# is loaded here depends on which cells have already been run.
lineup_stats = pd.read_csv("/Users/sydnee/Downloads/16th Grade/Data Science/alllineup_stats.csv")
# Individual player table; stats are expressed per 100 possessions.
player_stats = pd.read_csv("/Users/sydnee/Downloads/16th Grade/Data Science/player_100stats.csv")
# Team and opponent table; NOT per 100 possessions (converted further down).
team_stats = pd.read_csv("/Users/sydnee/Downloads/16th Grade/Data Science/team_and_opponent.csv")
# Miscellaneous team table (the 'Pace' column is read from it); NOT per 100 possessions.
team_misc_stats = pd.read_csv("/Users/sydnee/Downloads/16th Grade/Data Science/team_misc.csv")

In [57]:
# Keep independent copies of the freshly loaded tables (presumably as a restore
# point while experimenting). Both names are rebound again in later cells.
player_stats_orig = player_stats.copy()
lineup_stats_orig = lineup_stats.copy()

In [33]:
# format names to "first initial. last name"
def format_name(full_name):
    """Abbreviate a full name to ``"<first initial>. <second word>"``.

    Parameters
    ----------
    full_name : str
        A name such as ``"Dejounte Murray"``. Words are separated on any
        whitespace.

    Returns
    -------
    The abbreviated name (``"D. Murray"``) when the input has at least two
    words; only the first two words are used, so ``"Kelly Oubre Jr."``
    becomes ``"K. Oubre"``. Single-word and empty names are returned
    unchanged. Non-string values (e.g. the ``NaN`` pandas uses for a blank
    CSV cell) are also returned unchanged instead of raising.
    """
    # Blank cells come back from read_csv as float NaN, which has no .split().
    if not isinstance(full_name, str):
        return full_name

    name_parts = full_name.split()
    if len(name_parts) > 1:
        return f"{name_parts[0][0]}. {name_parts[1]}"
    return full_name
# Abbreviate every player name, then write the table back over the CSV it was
# loaded from (the index is not written).
short_names = player_stats['Name'].map(format_name)
player_stats['Name'] = short_names
player_stats.to_csv(
    "/Users/sydnee/Downloads/16th Grade/Data Science/player_100stats.csv",
    index=False,
)

In [None]:
# Format lineup stats: normalise non-breaking spaces, drop the 'Team Average'
# summary rows and split the combined 'Lineup' string into one column per player.
#
# The cleaned table is written back over the CSV it was loaded from, so once
# this cell has run the reloaded file no longer has a 'Lineup' column. The guard
# makes the cell safe to re-run in that state instead of raising KeyError.
if 'Lineup' in lineup_stats.columns:
    # Replace non-breaking spaces in every text column.
    lineup_stats = lineup_stats.apply(
        lambda col: col.str.replace(u'\xa0', u' ') if col.dtype == "object" else col
    )
    # .copy() so the column assignment below targets a real frame rather than a
    # filtered view (avoids SettingWithCopyWarning).
    lineup_stats = lineup_stats[lineup_stats['Lineup'] != 'Team Average'].copy()
    lineup_stats[['Player 1', 'Player 2']] = lineup_stats['Lineup'].str.split(' , ', expand=True)
    lineup_stats = lineup_stats.drop(columns=['Lineup'])

    lineup_stats.to_csv("/Users/sydnee/Downloads/16th Grade/Data Science/alllineup_stats.csv", index=False)

In [34]:
# Team nickname (lower case, no spaces) -> three-letter team abbreviation.
# The abbreviations are matched against the 'Team' column of the team tables.
teams_dict = dict([
    ('hawks', 'ATL'),
    ('celtics', 'BOS'),
    ('nets', 'BRK'),
    ('hornets', 'CHO'),
    ('bulls', 'CHI'),
    ('cavaliers', 'CLE'),
    ('mavericks', 'DAL'),
    ('nuggets', 'DEN'),
    ('pistons', 'DET'),
    ('warriors', 'GSW'),
    ('rockets', 'HOU'),
    ('pacers', 'IND'),
    ('clippers', 'LAC'),
    ('lakers', 'LAL'),
    ('grizzlies', 'MEM'),
    ('heat', 'MIA'),
    ('bucks', 'MIL'),
    ('timberwolves', 'MIN'),
    ('pelicans', 'NOP'),
    ('knicks', 'NYK'),
    ('thunder', 'OKC'),
    ('magic', 'ORL'),
    ('76ers', 'PHI'),
    ('suns', 'PHO'),
    ('trailblazers', 'POR'),
    ('kings', 'SAC'),
    ('spurs', 'SAS'),
    ('raptors', 'TOR'),
    ('jazz', 'UTA'),
    ('wizards', 'WAS'),
])

In [35]:
# Keep only the rows whose 'Team' value is one of the known team abbreviations
# (the team-total rows). Filtering + reset_index replaces the previous
# "concat with an empty header frame" trick, which is deprecated in recent
# pandas and forced every column to a generic dtype.
team_abbrs = list(teams_dict.values())
match = team_stats[team_stats['Team'].isin(team_abbrs)]
TM_stats = match.reset_index(drop=True)

# Convert season totals to per-game values.
GAMES_PER_SEASON = 82  # regular-season games per team
cols = ['G', 'MP', 'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']
# NOTE(review): 'G' is in `cols`, so the games column itself is divided by 82
# as well; kept as-is to preserve the existing output.
TM_stats[cols] = TM_stats[cols].astype(int).div(GAMES_PER_SEASON)
# Team minutes are summed over the five players on court; dividing by 5 gives
# game minutes.
TM_stats['MP'] = TM_stats['MP'].div(5)

TM_stats.to_csv('team_stats.csv')

In [36]:
# Keep only the rows whose 'Team' value is one of the known team abbreviations.
# Filtering + reset_index replaces the previous "concat with an empty header
# frame" trick, which is deprecated in recent pandas and forced every column
# (including 'Pace') to a generic dtype.
team_abbrs = list(teams_dict.values())
match = team_misc_stats[team_misc_stats['Team'].isin(team_abbrs)]
TM_misc_stats = match.reset_index(drop=True)

# Pace per team; used below to convert team stats to per-100-possession values.
TM_pace = TM_misc_stats[['Team', 'Pace']]
print(TM_pace)

   Team   Pace
0   ATL  100.1
1   BOS   97.2
2   BRK   96.9
3   CHO   97.3
4   CHI   96.3
5   CLE   97.2
6   DAL  100.1
7   DEN   96.8
8   DET   99.8
9   GSW   99.2
10  HOU   99.0
11  IND  101.7
12  LAC   97.2
13  LAL  100.9
14  MEM   98.2
15  MIA   96.2
16  MIL   99.9
17  MIN   97.1
18  NOP   97.9
19  NYK   95.2
20  OKC   99.8
21  ORL   96.9
22  PHI   97.5
23  PHO   98.3
24  POR   97.2
25  SAC   98.8
26  SAS  101.1
27  TOR   99.4
28  UTA   99.5
29  WAS  102.7


**Calculating 2-Player Net Ratings**

$$
\text{Net Rating} = \text{Offensive Rating} - \text{Defensive Rating}
$$

Individual offensive and defensive ratings were introduced by Dean Oliver in his book *Basketball on Paper*. Here, I define my own way to calculate offensive and defensive ratings for two-player lineups: compute each player's individual offensive and defensive ratings, then take the average of the two players' ratings, weighted by their usage rates.

$$
ORtg_{2 player} = \frac{ORtg_{1}W_{1} + ORtg_{2}W_{2}}{W_1+W_2}
$$

where $ORtg_{1}$ and $ORtg_{2}$ correspond to the offensive rating of players 1 and 2 respectively, and $W_{1}$ and $W_{2}$ correspond to the usage rate of each player.


Usage rate is an estimate of the percentage of team plays used by a player while he is on the floor. To calculate usage rate, the following equation is given by Basketball Reference:

$$
W = 100 \times \frac{(FGA + 0.44 \times FTA + TOV) \times (TMMIN / 5)}{MP \times (TMFGA + 0.44 \times TMFTA + TMTOV)}
$$

In [37]:
# Attach each team's pace to its per-game stats (inner join on 'Team'), and keep
# an untouched copy of the merged table before it is rescaled.
TM_per100 = TM_stats.merge(TM_pace, on='Team')
TM_per100_orig = TM_per100.copy()

$$ 
\frac{FGA}{MP}*48\text{ min}*\frac{100\text{ poss}}{\text{pace}}
$$

In [38]:
# Convert team and opponent stats to per-100-possession values:
#     stat / MP * 48 min * (100 poss / pace)
#
# Always start from the untouched merge result (TM_per100_orig). The previous
# version rescaled TM_per100 in place, so running the cell a second time
# rescaled already-converted values using an already-converted 'MP'.
TM_per100 = TM_per100_orig.copy()
for stat in ['FGA', 'FTA', 'TOV']:
    TM_per100[stat] = (TM_per100_orig[stat] / TM_per100_orig['MP']) * 48 * (100 / TM_per100_orig['Pace'])
# MP / MP is 1, so minutes per 100 possessions reduces to 48 * 100 / pace.
TM_per100['MP'] = 48 * (100 / TM_per100_orig['Pace'])

In [39]:
# Refresh the snapshot of player_stats (this rebinds the earlier copy).
player_stats_orig = player_stats.copy()
# Dump the per-100 team table for manual inspection. NOTE(review): the next
# cell writes a different frame to the same "test.csv", overwriting this one.
TM_per100.to_csv("test.csv")

In [40]:
# Join every player row with the per-100 stats of his team. Overlapping column
# names get the suffix _x for the player side and _y for the team side.
merged = player_stats.merge(TM_per100, on="Team", how="left")
merged.to_csv("test.csv")

# Usage rate:
#   100 * (player plays * team minutes) / (player minutes per game * team plays)
# where "plays" = FGA + 0.44 * FTA + TOV.
# NOTE(review): the player and team stats are both already per 100 possessions
# while the minutes terms are not on the same basis (team minutes per 100
# possessions vs. player minutes per game) - confirm this does not inflate the
# usage of low-minute players.
player_plays = merged['FGA_x'] + 0.44 * merged['FTA_x'] + merged['TOV_x']
team_plays = merged['FGA_y'] + 0.44 * merged['FTA_y'] + merged['TOV_y']
player_minutes_per_game = merged['MP_x'] / merged['G_x']
USG = 100 * (player_plays * merged['MP_y']) / (player_minutes_per_game * team_plays)

# Assignment aligns on the index; players whose team has no match in TM_per100
# end up with NaN.
player_stats['USG%'] = USG
player_stats.to_csv('/Users/sydnee/Downloads/16th Grade/Data Science/player_100stats.csv', index=False)

In [41]:
# Refresh the snapshot so it now includes the 'USG%' column added above.
player_stats_orig = player_stats.copy()

In [44]:
# Keep only the players that appear in at least one lineup, in either slot,
# and renumber the rows from 0.
in_first_slot = player_stats['Name'].isin(lineup_stats['Player 1'])
in_second_slot = player_stats['Name'].isin(lineup_stats['Player 2'])
player_in_lineup = in_first_slot | in_second_slot
player_stats = player_stats[player_in_lineup]
player_stats.reset_index(drop=True, inplace=True)

In [102]:
# Sanity check: every player named in a lineup should also have a row in
# player_stats. Prints both unique-name counts and the list of lineup players
# that are missing from player_stats (expected to be empty).
unique_lineup_players = pd.unique(pd.concat([lineup_stats['Player 1'], lineup_stats['Player 2']]))
player_stats_names = player_stats['Name'].unique()
print(unique_lineup_players.shape)
print(player_stats_names.shape)

is_missing = ~pd.Series(unique_lineup_players).isin(player_stats_names)
players_not_in_stats = unique_lineup_players[is_missing.to_numpy()]
players_not_in_stats_list = players_not_in_stats.tolist()
print(players_not_in_stats_list)

(268,)
(268,)
[]


In [56]:
# Names that occur more than once in player_stats. A duplicate is either
#   (a) one player who was traded mid-season and has a row per team, or
#   (b) two different players who share a first initial and last name.
# Each name was checked by hand: for case (b) the names were changed to the
# players' full names; traded players that do not appear in lineup_stats had
# their rows deleted from player_stats.
# If name matching misbehaves later on, start looking at these names.
name_counts = player_stats['Name'].value_counts()
duplicate_names = name_counts.loc[name_counts.gt(1)]
print(list(duplicate_names.index))

['B. Bogdanovic', 'J. Green', "R. O'Neale", 'K. Olynyk', 'G. Williams', 'T. Mann', 'D. Schroder', 'G. Hayward', 'P. Achiuwa', 'S. Dinwiddie', 'D. Gafford', 'C. Martin', 'M. Bridges', 'I. Quickley', 'T. Jones', 'T. Rozier', 'J. Smith', 'R. Barrett', 'P. Beverley', 'B. Hield', 'P. Siakam', 'K. Murray', 'A. Thompson', 'A. Wiggins', 'B. Brown', 'D. Mitchell', 'P. Washington']


In [60]:
# Duplicates that remain after the manual clean-up: players traded mid-season
# who played enough for both teams to appear in lineup_stats for each of them.
name_counts = player_stats['Name'].value_counts()
duplicate_names = name_counts.loc[name_counts.gt(1)]
print(list(duplicate_names.index))

['P. Siakam', 'G. Williams', 'I. Quickley', 'K. Olynyk', "R. O'Neale", 'D. Schroder', 'T. Rozier', 'P. Washington', 'B. Hield']


In [45]:
# player_stats holds all players; keep only those with at least MIN_GAMES games
# played and at least MIN_MINUTES_PER_GAME minutes per game.
MIN_GAMES = 25
MIN_MINUTES_PER_GAME = 15

# Compare as floats rather than casting to int first: the integer cast raises
# on NaN/inf (e.g. a player with G == 0), while the float comparison simply
# evaluates to False for those rows. For finite values the result is identical
# to truncating and then comparing.
games_played = pd.to_numeric(player_stats['G'], errors='coerce')
minutes_per_game = pd.to_numeric(player_stats['MP'], errors='coerce') / games_played
is_rotation_player = (games_played >= MIN_GAMES) & (minutes_per_game >= MIN_MINUTES_PER_GAME)

player_stats = player_stats[is_rotation_player].reset_index(drop=True)
# NOTE(review): this overwrites the source CSV with the filtered table, so the
# dropped players are gone the next time the file is loaded.
player_stats.to_csv('/Users/sydnee/Downloads/16th Grade/Data Science/player_100stats.csv', index=False)

In [66]:
# Snapshot the lineup table before the rating columns are added to it.
lineup_stats_orig = lineup_stats.copy()

In [68]:
print(lineup_stats[['Player 1', 'Player 2']])

def calculate_metric(player_1, player_2, team=None):
    """Usage-weighted offensive and defensive rating of a two-player lineup.

    Each rating is the average of the two players' individual ratings,
    weighted by their usage rates:
    ``(R1 * USG1 + R2 * USG2) / (USG1 + USG2)``.

    Parameters
    ----------
    player_1, player_2 : str
        Player names as they appear in ``player_stats['Name']``.
    team : str, optional
        Team abbreviation of the lineup. Players traded mid-season have one
        row per team in ``player_stats``; when ``team`` is given and a row for
        that team exists, that row is used. Otherwise the first row with a
        matching name is used (the behaviour when ``team`` is omitted).

    Returns
    -------
    tuple
        ``(ortg_12, drtg_12)`` for the pair.

    Raises
    ------
    IndexError
        If a player has no row in ``player_stats``.
    """
    def _player_row(name):
        # All rows for this name, narrowed to the lineup's team when possible.
        rows = player_stats[player_stats['Name'] == name]
        if team is not None and 'Team' in rows.columns:
            rows_for_team = rows[rows['Team'] == team]
            if not rows_for_team.empty:
                rows = rows_for_team
        if rows.empty:
            raise IndexError(f"no row in player_stats for player {name!r}")
        return rows.iloc[0]

    stats_1 = _player_row(player_1)
    ortg_1 = stats_1['ORTG']
    drtg_1 = stats_1['DRTG']
    usg_1 = stats_1['USG%']
    stats_2 = _player_row(player_2)
    ortg_2 = stats_2['ORTG']
    drtg_2 = stats_2['DRTG']
    usg_2 = stats_2['USG%']

    ortg_12 = (ortg_1 * usg_1 + ortg_2 * usg_2) / (usg_1 + usg_2)
    drtg_12 = (drtg_1 * usg_1 + drtg_2 * usg_2) / (usg_1 + usg_2)

    return ortg_12, drtg_12

# Compute both ratings for every lineup, passing the lineup's team so traded
# players are matched to the stats they put up for that team.
ortg2 = []
drtg2 = []
for player_1, player_2, team in zip(lineup_stats['Player 1'], lineup_stats['Player 2'], lineup_stats['Team']):
    ortg_12, drtg_12 = calculate_metric(player_1, player_2, team)
    ortg2.append(ortg_12)
    drtg2.append(drtg_12)

lineup_stats['ORTG2'] = ortg2
lineup_stats['DRTG2'] = drtg2
# index=False: this file is read back in the loading cell, and writing the
# index would add a new 'Unnamed: 0' column on every save/load round trip.
lineup_stats.to_csv('/Users/sydnee/Downloads/16th Grade/Data Science/alllineup_stats.csv', index=False)

              Player 1      Player 2
0            C. Capela     D. Murray
1               S. Bey     D. Murray
2    Bogdan Bogdanovic     D. Murray
3            D. Hunter     D. Murray
4           J. Johnson     D. Murray
..                 ...           ...
591          D. Avdija    D. Gafford
592       B. Coulibaly      J. Poole
593          D. Avdija  B. Coulibaly
594         Tyus Jones    C. Kispert
595         C. Kispert     L. Shamet

[596 rows x 2 columns]


In [88]:
# The 20 lineups with the highest two-player offensive rating, ranked from 1.
offense_table_cols = ['Player 1', 'Player 2', 'Team', 'MP', 'ORTG2', 'DRTG2']
top_20_o = (
    lineup_stats[offense_table_cols]
    .nlargest(20, 'ORTG2')
    .reset_index(drop=True)
)
# After the reset the row position is 0-based, so rank = position + 1.
top_20_o['Rank'] = top_20_o.index + 1
top_20_o = top_20_o[['Rank'] + offense_table_cols]
display(Markdown("## Top 20 Offensive Ratings"))
display(top_20_o)

## Top 20 Offensive Ratings

Unnamed: 0,Rank,Player 1,Player 2,Team,MP,ORTG2,DRTG2
0,1,L. Kornet,P. Pritchard,BOS,726:42:00,142.34544,112.616886
1,2,D. Gafford,Tyus Jones,WAS,1073:37:00,131.320259,117.614381
2,3,A. Horford,D. White,BOS,1095:11:00,129.973703,111.558477
3,4,M. Conley,R. Gobert,MIN,1805:05:00,129.895385,107.866152
4,5,Terance Mann,I. Zubac,LAC,1104:42:00,129.662039,115.805554
5,6,S. Bey,O. Okongwu,ATL,890:52:00,129.141365,119.476439
6,7,T. Haliburton,O. Toppin,IND,882:34:00,129.0,119.0
7,8,K. Irving,D. Lively,DAL,611:46:00,128.842862,114.389915
8,9,A. Gordon,N. Jokic,DEN,1990:12:00,128.589801,110.410199
9,10,J. Harden,I. Zubac,LAC,1365:52:00,128.570497,114.469787


In [89]:
# The 20 lineups with the lowest (i.e. best) two-player defensive rating,
# ranked from 1.
defense_table_cols = ['Player 1', 'Player 2', 'Team', 'MP', 'ORTG2', 'DRTG2']
top_20_d = (
    lineup_stats[defense_table_cols]
    .nsmallest(20, 'DRTG2')
    .reset_index(drop=True)
)
# After the reset the row position is 0-based, so rank = position + 1.
top_20_d['Rank'] = top_20_d.index + 1
top_20_d = top_20_d[['Rank'] + defense_table_cols]
display(Markdown("## Top 20 Defensive Ratings"))
display(top_20_d)

## Top 20 Defensive Ratings

Unnamed: 0,Rank,Player 1,Player 2,Team,MP,ORTG2,DRTG2
0,1,R. Gobert,N. Reid,MIN,918:35:00,117.413553,106.037967
1,2,K. Anderson,R. Gobert,MIN,865:48:00,121.008022,106.938817
2,3,R. Gobert,K. Towns,MIN,1279:35:00,120.587085,107.254036
3,4,K. Anderson,N. Reid,MIN,1117:45:00,112.20687,107.80458
4,5,M. Conley,R. Gobert,MIN,1805:05:00,129.895385,107.866152
5,6,C. Anthony,J. Isaac,ORL,713:36:00,117.020712,107.991124
6,7,A. Edwards,R. Gobert,MIN,1922:53:00,118.23923,108.029717
7,8,N. Alexander-Walker,R. Gobert,MIN,1064:49:00,121.653446,108.089117
8,9,R. Gobert,J. McDaniels,MIN,1577:35:00,118.48866,108.35177
9,10,G. Bitadze,J. Suggs,ORL,655:44:00,121.116333,108.376733
