In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from collections import Counter

In [2]:
# Show every column when a DataFrame is rendered (None lifts the column cap).
pd.options.display.max_columns = None

In [3]:
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation / chained-assignment
# warnings raised by later cells; consider narrowing the filter by category.
warnings.simplefilter('ignore')

In [4]:
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from IPython.core.pylabtools import figsize

In [5]:
# Apply seaborn's plot styling; called with no arguments, so the library
# defaults are used.
sns.set()

## Data Prep

In [6]:
# Load nba_cleaned.csv from the working directory; it supplies the
# Player / Season / Team / player_cluster columns selected in the next cell.
df = pd.read_csv(filepath_or_buffer='nba_cleaned.csv')
# Display (rows, columns) as a quick check on the load.
df.shape

(1420, 116)

In [145]:
# Columns that identify a player-season-team row and carry its cluster label.
label_cols = ['Player', 'Season', 'Team', 'player_cluster']

# Lower-case the Season/Team headers so they share names with the
# `season` / `team` columns of the lineup table.
lowercase_headers = {name: name.lower() for name in ('Season', 'Team')}
label_df = df.loc[:, label_cols].rename(columns=lowercase_headers)
label_df.head()

Unnamed: 0,Player,season,team,player_cluster
0,Marc Gasol,2017,Memphis Grizzlies,2
1,Tyreke Evans,2017,Memphis Grizzlies,1
2,Dillon Brooks,2017,Memphis Grizzlies,1
3,Jarell Martin,2017,Memphis Grizzlies,0
4,Mario Chalmers,2017,Memphis Grizzlies,1


In [8]:
# Read the lineup-level stats, discard every row from season 2016 and
# renumber the remaining rows from 0.
lineup_df = (
    pd.read_csv('ctg_nba_lineup_stats.csv')
      .loc[lambda frame: frame['season'].ne(2016)]
      .reset_index(drop=True)
)
lineup_df.shape

(67655, 30)

In [9]:
# Keep the five position slots, the join keys, possessions, and every
# offensive metric (any column whose name contains 'off_').
offense_cols = list(filter(lambda name: 'off_' in name, lineup_df.columns))
lineup_cols = ['PG', 'SG', 'SF', 'PF', 'C', 'season', 'team', 'Poss'] + offense_cols

# NOTE: this rebinds `lineup_df` to the narrowed frame, so the dropped columns
# are only recoverable by re-running the load cell.
lineup_df = lineup_df[lineup_cols]
lineup_df.head()

Unnamed: 0,PG,SG,SF,PF,C,season,team,Poss,off_Pts/Poss_rank,off_Pts/Poss,off_eFG%_rank,off_eFG%,off_TOV%_rank,off_TOV%,off_OREB%_rank,off_OREB%,off_FTr_rank,off_FTr
0,Dennis Schroder,Kent Bazemore,Taurean Prince,Ersan Ilyasova,Miles Plumlee,2017,Atlanta Hawks,733,18.0,99.9,41.0,51.5%,12.0,17.7%,59.0,25.9%,37.0,15.0
1,Dennis Schroder,Kent Bazemore,Taurean Prince,John Collins,Dewayne Dedmon,2017,Atlanta Hawks,245,66.0,112.7,81.0,57.9%,15.0,17.1%,37.0,23.1%,41.0,15.7
2,Dennis Schroder,Kent Bazemore,Taurean Prince,Ersan Ilyasova,Dewayne Dedmon,2017,Atlanta Hawks,193,22.0,101.0,18.0,48.8%,78.0,12.4%,22.0,20.4%,73.0,22.1
3,Dennis Schroder,Kent Bazemore,Taurean Prince,Luke Babbitt,Dewayne Dedmon,2017,Atlanta Hawks,142,62.0,112.0,91.0,59.7%,7.0,19.0%,44.0,23.8%,5.0,8.9
4,Isaiah Taylor,Taurean Prince,Damion Lee,John Collins,Dewayne Dedmon,2017,Atlanta Hawks,140,36.0,105.7,41.0,51.5%,62.0,13.6%,36.0,23.1%,4.0,8.3


## Feature Engineering

### Get League/Team Offensive Rating for each Season

In [10]:
# Work on a separate frame so `lineup_df` itself is left untouched.
# `off_Pts` = lineup rating x possessions, i.e. the possession-weighted
# numerator used by the season/team aggregates below.
# NOTE(review): the sample ratings (~100) suggest 'off_Pts/Poss' is per 100
# possessions, which would put `off_Pts` on a x100 scale - TODO confirm. The
# scale cancels once the sum is divided by summed `Poss`.
tm_lineup_df = lineup_df.assign(
    off_Pts=lineup_df['off_Pts/Poss'] * lineup_df['Poss']
)

In [11]:
# League-wide offensive rating per season: total weighted points divided by
# total possessions across every lineup in that season.
# NOTE(review): a lineup with a missing rating but non-zero `Poss` would still
# add to the denominator (sum() skips the NaN numerator only) - verify the
# source has no such rows.
season_off_ratings = (
    tm_lineup_df.groupby('season')[['off_Pts', 'Poss']]
                .sum()
                .assign(**{'off_Pts/Poss': lambda totals: totals['off_Pts'] / totals['Poss']})
)
season_off_ratings.reset_index()[['season', 'off_Pts/Poss']]

Unnamed: 0,season,off_Pts/Poss
0,2017,108.329988
1,2018,110.480204
2,2019,110.793898
3,2020,112.795304
4,2021,112.255389


In [12]:
# Same possession-weighted rating as the league-level cell, but computed per
# (team, season) pair.
tm_off_ratings = (
    tm_lineup_df.groupby(['team', 'season'])[['off_Pts', 'Poss']].agg('sum')
)
tm_off_ratings['off_Pts/Poss'] = tm_off_ratings['off_Pts'].div(tm_off_ratings['Poss'])
tm_off_ratings.reset_index()[['team', 'season', 'off_Pts/Poss']]

Unnamed: 0,team,season,off_Pts/Poss
0,Atlanta Hawks,2017,104.452049
1,Atlanta Hawks,2018,107.673913
2,Atlanta Hawks,2019,107.546858
3,Atlanta Hawks,2020,115.375155
4,Atlanta Hawks,2021,116.125381
...,...,...,...
145,Washington Wizards,2017,108.547292
146,Washington Wizards,2018,110.997148
147,Washington Wizards,2019,110.553583
148,Washington Wizards,2020,111.587056


### Merge Player Clusters onto Lineups

In [146]:
# Build the join key used to match players against the lineup table:
# lower-case the name, then strip spaces, apostrophes and periods.
# Vectorised `.str` methods replace the three element-wise `.apply(lambda)`
# passes; they yield the same key for every string and propagate missing
# names as NaN instead of raising AttributeError.
# `regex=True` is explicit so that the character class is interpreted the same
# way on every pandas version (inside [...] the '.' is a literal period).
label_df['player_alt'] = (
    label_df['Player']
    .str.lower()
    .str.replace(r"[ '.]", '', regex=True)
)

# The display name is no longer needed once the key exists.
# NOTE: this makes the cell single-shot - re-running it without first
# re-creating `label_df` raises KeyError because 'Player' is already gone.
label_df = label_df.drop(columns='Player')
label_df.head()

Unnamed: 0,season,team,player_cluster,player_alt
0,2017,Memphis Grizzlies,2,marcgasol
1,2017,Memphis Grizzlies,1,tyrekeevans
2,2017,Memphis Grizzlies,1,dillonbrooks
3,2017,Memphis Grizzlies,0,jarellmartin
4,2017,Memphis Grizzlies,1,mariochalmers


In [181]:
# Spot-check: label rows whose normalised key still contains 'iii'
# (e.g. a generational suffix that was not mapped away).
has_iii = label_df['player_alt'].str.contains('iii')
label_df.loc[has_iii]

Unnamed: 0,season,team,player_cluster,player_alt
675,2019,Golden State Warriors,5,glennrobinsoniii


In [183]:
# Disabled lineup-side counterpart of the 'iii' check above. `temp` is only
# created by the merge cell further down, so this cannot run at this point in
# a top-to-bottom execution.
# temp[temp['PF_alt'].str.contains('iii')]

In [175]:
# Manual overrides: current `player_alt` key in `label_df` -> replacement key.
# Most entries drop a generational suffix (jr / sr / iii / iv); the remainder
# change the spelling of the name.
synergy_name_dict = {
    'kevinporterjr': 'kevinporter',
    'patrickbeverly': 'patrickbeverley',
    'terryrozieriii': 'terryrozier',
    'kellyoubrejr': 'kellyoubre',
    'lonniewalkeriv': 'lonniewalker',
    'ottoporterjr': 'ottoporter',
    'robertwilliamsiii': 'robertwilliams',
    'wendellcarterjr': 'wendellcarter',
    'alfarouqaminu': 'al-farouqaminu',
    'jarenjacksonjr': 'jarenjackson',
    'marvinbagleyiii': 'marvinbagley',
    'marcusmorrissr': 'marcusmorris',
    'wesleyiwundu': 'wesiwundu',
    'troybrownjr': 'troybrown',
}

# Swap in the override where one exists; every other key passes through
# unchanged.
label_df['player_alt'] = label_df['player_alt'].map(
    lambda name: synergy_name_dict.get(name, name)
)

In [176]:
# The five lineup slots; each gets a normalised `<slot>_alt` join key.
positions = ['PG', 'SG', 'SF', 'PF', 'C']

for pos in positions:
    # Same normalisation as the label side: lower-case, then strip spaces,
    # apostrophes and periods. The vectorised `.str` pipeline replaces four
    # element-wise `.apply(lambda)` passes, produces identical keys for string
    # values, and leaves a missing slot as NaN instead of raising
    # AttributeError. `regex=True` is explicit so the character class behaves
    # the same on every pandas version ('.' is literal inside [...]).
    lineup_df[f'{pos}_alt'] = (
        lineup_df[pos]
        .str.lower()
        .str.replace(r"[ '.]", '', regex=True)
    )
lineup_df.head()

Unnamed: 0,PG,SG,SF,PF,C,season,team,Poss,off_Pts/Poss_rank,off_Pts/Poss,off_eFG%_rank,off_eFG%,off_TOV%_rank,off_TOV%,off_OREB%_rank,off_OREB%,off_FTr_rank,off_FTr,PG_alt,SG_alt,SF_alt,PF_alt,C_alt
0,Dennis Schroder,Kent Bazemore,Taurean Prince,Ersan Ilyasova,Miles Plumlee,2017,Atlanta Hawks,733,18.0,99.9,41.0,51.5%,12.0,17.7%,59.0,25.9%,37.0,15.0,dennisschroder,kentbazemore,taureanprince,ersanilyasova,milesplumlee
1,Dennis Schroder,Kent Bazemore,Taurean Prince,John Collins,Dewayne Dedmon,2017,Atlanta Hawks,245,66.0,112.7,81.0,57.9%,15.0,17.1%,37.0,23.1%,41.0,15.7,dennisschroder,kentbazemore,taureanprince,johncollins,dewaynededmon
2,Dennis Schroder,Kent Bazemore,Taurean Prince,Ersan Ilyasova,Dewayne Dedmon,2017,Atlanta Hawks,193,22.0,101.0,18.0,48.8%,78.0,12.4%,22.0,20.4%,73.0,22.1,dennisschroder,kentbazemore,taureanprince,ersanilyasova,dewaynededmon
3,Dennis Schroder,Kent Bazemore,Taurean Prince,Luke Babbitt,Dewayne Dedmon,2017,Atlanta Hawks,142,62.0,112.0,91.0,59.7%,7.0,19.0%,44.0,23.8%,5.0,8.9,dennisschroder,kentbazemore,taureanprince,lukebabbitt,dewaynededmon
4,Isaiah Taylor,Taurean Prince,Damion Lee,John Collins,Dewayne Dedmon,2017,Atlanta Hawks,140,36.0,105.7,41.0,51.5%,62.0,13.6%,36.0,23.1%,4.0,8.3,isaiahtaylor,taureanprince,damionlee,johncollins,dewaynededmon


In [177]:
# Attach each slot's player cluster to the lineup rows, one left-join per
# position, starting from a fresh copy so `lineup_df` is not modified.
temp = lineup_df.copy()
for pos in positions:
    # Present the label table under this slot's column names so the join key
    # and the resulting cluster column are already slot-specific.
    slot_labels = label_df.rename(columns={'player_alt': f'{pos}_alt',
                                           'player_cluster': f'{pos}_cluster'})
    # The keys are spelled out; they are exactly the columns the two frames
    # have in common, so this matches the implicit-key merge.
    # NOTE(review): if `label_df` ever holds two rows for the same
    # (season, team, player) the left join will duplicate lineup rows -
    # consider `validate='many_to_one'` once that is confirmed not to happen.
    temp = temp.merge(slot_labels,
                      how='left',
                      on=['season', 'team', f'{pos}_alt'])

In [178]:
# Ten most-used lineups (at least 200 possessions) whose SF slot found no
# cluster label in the merge.
sf_unmatched = temp['SF_cluster'].isna() & temp['Poss'].ge(200)
temp.loc[sf_unmatched].sort_values('Poss', ascending=False).head(10)

Unnamed: 0,PG,SG,SF,PF,C,season,team,Poss,off_Pts/Poss_rank,off_Pts/Poss,off_eFG%_rank,off_eFG%,off_TOV%_rank,off_TOV%,off_OREB%_rank,off_OREB%,off_FTr_rank,off_FTr,PG_alt,SG_alt,SF_alt,PF_alt,C_alt,PG_cluster,SG_cluster,SF_cluster,PF_cluster,C_cluster
47243,Chris Paul,Shai Gilgeous-Alexander,Terrance Ferguson,Danilo Gallinari,Steven Adams,2019,Oklahoma City Thunder,681,61.0,114.5,75.0,57.0%,49.0,14.0%,39.0,23.4%,76.0,23.3,chrispaul,shaigilgeous-alexander,terranceferguson,danilogallinari,stevenadams,1.0,1.0,,,2.0
27962,Shai Gilgeous-Alexander,Patrick Beverley,Landry Shamet,Danilo Gallinari,Ivica Zubac,2018,Los Angeles Clippers,538,49.0,111.9,69.0,56.1%,39.0,14.5%,41.0,24.5%,59.0,19.1,shaigilgeous-alexander,patrickbeverley,landryshamet,danilogallinari,ivicazubac,1.0,1.0,,4.0,
55940,Damian Lillard,CJ McCollum,Trevor Ariza,Carmelo Anthony,Hassan Whiteside,2019,Portland Trail Blazers,483,87.0,123.2,85.0,59.3%,64.0,13.0%,70.0,27.4%,57.0,19.5,damianlillard,cjmccollum,trevorariza,carmeloanthony,hassanwhiteside,1.0,1.0,,2.0,3.0
40234,Jeff Teague,Andrew Wiggins,Robert Covington,Taj Gibson,Karl-Anthony Towns,2018,Minnesota Timberwolves,468,30.0,107.1,33.0,51.8%,13.0,17.3%,92.0,34.0%,43.0,16.8,jeffteague,andrewwiggins,robertcovington,tajgibson,karl-anthonytowns,1.0,1.0,,2.0,2.0
47244,Chris Paul,Shai Gilgeous-Alexander,Luguentz Dort,Danilo Gallinari,Steven Adams,2019,Oklahoma City Thunder,438,50.0,111.6,43.0,53.5%,83.0,11.2%,35.0,22.9%,63.0,20.6,chrispaul,shaigilgeous-alexander,luguentzdort,danilogallinari,stevenadams,1.0,1.0,,,2.0
10029,Kris Dunn,Zach LaVine,Otto Porter,Lauri Markkanen,Robin Lopez,2018,Chicago Bulls,431,50.0,112.1,45.0,53.5%,60.0,13.0%,50.0,25.7%,51.0,18.4,krisdunn,zachlavine,ottoporter,laurimarkkanen,robinlopez,1.0,1.0,,2.0,2.0
23638,Chris Paul,James Harden,James Ennis,PJ Tucker,Clint Capela,2018,Houston Rockets,430,18.0,102.6,50.0,54.0%,2.0,20.2%,78.0,29.7%,51.0,18.2,chrispaul,jamesharden,jamesennis,pjtucker,clintcapela,4.0,4.0,,0.0,3.0
17703,Jamal Murray,Will Barton,Michael Porter,Paul Millsap,Nikola Jokic,2020,Denver Nuggets,430,88.0,127.7,89.0,61.7%,69.0,12.3%,80.0,30.4%,21.0,13.7,jamalmurray,willbarton,michaelporter,paulmillsap,nikolajokic,1.0,0.0,,2.0,2.0
42128,Jrue Holiday,E'Twaun Moore,Dante Cunningham,Anthony Davis,DeMarcus Cousins,2017,New Orleans Pelicans,394,35.0,105.6,61.0,54.1%,24.0,16.2%,38.0,23.3%,38.0,15.0,jrueholiday,etwaunmoore,dantecunningham,anthonydavis,demarcuscousins,1.0,5.0,,2.0,4.0
55632,Evan Turner,Seth Curry,Nik Stauskas,Zach Collins,Meyers Leonard,2018,Portland Trail Blazers,376,42.0,109.6,58.0,54.9%,67.0,12.5%,22.0,21.2%,18.0,13.1,evanturner,sethcurry,nikstauskas,zachcollins,meyersleonard,2.0,5.0,,2.0,0.0


In [184]:
# Per slot: shape of the lineups with at least 20 possessions whose player
# has no cluster label (the first number is the unmatched row count).
for pos in positions:
    unmatched = temp[f'{pos}_cluster'].isna() & temp['Poss'].ge(20)
    print(temp.loc[unmatched].shape)

(1603, 28)
(1926, 28)
(2554, 28)
(2293, 28)
(2452, 28)


In [191]:
# Lineups with at least 100 possessions that are missing a cluster in ANY
# slot. A lineup missing several slots is collected once per slot, hence the
# de-duplication before ranking by possessions.
unmatched_frames = []
for pos in positions:
    missing = temp[f'{pos}_cluster'].isna() & temp['Poss'].ge(100)
    unmatched_frames.append(temp.loc[missing])

(pd.concat(unmatched_frames)
   .drop_duplicates()
   .sort_values('Poss', ascending=False)
   .head(20))

Unnamed: 0,PG,SG,SF,PF,C,season,team,Poss,off_Pts/Poss_rank,off_Pts/Poss,off_eFG%_rank,off_eFG%,off_TOV%_rank,off_TOV%,off_OREB%_rank,off_OREB%,off_FTr_rank,off_FTr,PG_alt,SG_alt,SF_alt,PF_alt,C_alt,PG_cluster,SG_cluster,SF_cluster,PF_cluster,C_cluster
46621,Russell Westbrook,Andre Roberson,Paul George,Carmelo Anthony,Steven Adams,2017,Oklahoma City Thunder,1086,56.0,110.7,52.0,53.0%,36.0,15.2%,97.0,36.0%,44.0,16.2,russellwestbrook,andreroberson,paulgeorge,carmeloanthony,stevenadams,4.0,,5.0,4.0,3.0
35905,Kendrick Nunn,Duncan Robinson,Jimmy Butler,Bam Adebayo,Meyers Leonard,2019,Miami Heat,977,66.0,115.6,85.0,59.1%,53.0,13.7%,27.0,21.3%,39.0,16.8,kendricknunn,duncanrobinson,jimmybutler,bamadebayo,meyersleonard,1.0,6.0,1.0,3.0,
46033,Elfrid Payton,Reggie Bullock,RJ Barrett,Julius Randle,Mitchell Robinson,2020,New York Knicks,781,23.0,105.6,21.0,50.7%,40.0,14.3%,74.0,29.4%,23.0,14.3,elfridpayton,reggiebullock,rjbarrett,juliusrandle,mitchellrobinson,1.0,0.0,1.0,4.0,
0,Dennis Schroder,Kent Bazemore,Taurean Prince,Ersan Ilyasova,Miles Plumlee,2017,Atlanta Hawks,733,18.0,99.9,41.0,51.5%,12.0,17.7%,59.0,25.9%,37.0,15.0,dennisschroder,kentbazemore,taureanprince,ersanilyasova,milesplumlee,1.0,5.0,5.0,0.0,
46622,Russell Westbrook,Corey Brewer,Paul George,Carmelo Anthony,Steven Adams,2017,Oklahoma City Thunder,732,57.0,110.8,38.0,51.1%,39.0,15.0%,88.0,31.3%,76.0,22.6,russellwestbrook,coreybrewer,paulgeorge,carmeloanthony,stevenadams,4.0,,5.0,4.0,3.0
27961,Shai Gilgeous-Alexander,Avery Bradley,Tobias Harris,Danilo Gallinari,Marcin Gortat,2018,Los Angeles Clippers,718,31.0,107.2,71.0,56.3%,22.0,16.2%,15.0,20.0%,41.0,16.5,shaigilgeous-alexander,averybradley,tobiasharris,danilogallinari,marcingortat,1.0,1.0,4.0,4.0,
11892,Jose Calderon,JR Smith,LeBron James,Jae Crowder,Kevin Love,2017,Cleveland Cavaliers,706,73.0,114.3,86.0,58.6%,65.0,13.5%,10.0,17.4%,49.0,16.9,josecalderon,jrsmith,lebronjames,jaecrowder,kevinlove,,5.0,4.0,0.0,2.0
47243,Chris Paul,Shai Gilgeous-Alexander,Terrance Ferguson,Danilo Gallinari,Steven Adams,2019,Oklahoma City Thunder,681,61.0,114.5,75.0,57.0%,49.0,14.0%,39.0,23.4%,76.0,23.3,chrispaul,shaigilgeous-alexander,terranceferguson,danilogallinari,stevenadams,1.0,1.0,,,2.0
25855,Darren Collison,Wesley Matthews,Bojan Bogdanovic,Thaddeus Young,Myles Turner,2018,Indiana Pacers,649,45.0,110.6,55.0,54.5%,36.0,14.8%,20.0,21.0%,69.0,21.3,darrencollison,wesleymatthews,bojanbogdanovic,thaddeusyoung,mylesturner,1.0,,5.0,2.0,2.0
51248,Ben Simmons,JJ Redick,Jimmy Butler,Wilson Chandler,Joel Embiid,2018,Philadelphia 76ers,628,69.0,116.9,74.0,56.6%,17.0,16.6%,82.0,30.6%,88.0,26.1,bensimmons,jjredick,jimmybutler,wilsonchandler,joelembiid,2.0,6.0,1.0,,2.0


In [190]:
# Size of the full set of lineups with at least 100 possessions, for
# comparison with the unmatched subset above.
temp.loc[temp['Poss'].ge(100)].shape

(1521, 28)