In [1]:
# General tools
import pandas as pd
import numpy as np

# Packages to perform dimensionality reduction
import sklearn.datasets
import sklearn.decomposition
import sklearn.manifold

# Plotting modules
import bokeh.io
import bokeh.plotting as bk
bokeh.io.output_notebook()

In [2]:
df = pd.read_csv('NBA_stats_RPM.csv').iloc[:,1:].drop('Tm', axis=1)

# Let's just take a look at the box score stats to start with
aux_cols = ['Name', 'Year', 'Exp', 'Age', 'Pos', 'G', 'Gs', 'BPM']
df_stats = df.drop(aux_cols, axis=1).fillna(1).values

# Now let's see if we can reduce the dimensionality of our data using PCA

# First we need to standardize the data such that the variances are similar
# between each dimension
df_stats_std = sklearn.preprocessing.StandardScaler().fit_transform(df_stats)

# Now implement PCA
n_components = 5
pca = sklearn.decomposition.PCA(n_components=n_components, svd_solver='full')

# Train model and fit data
pca.fit(df_stats_std)
stats_fitted = pca.transform(df_stats)

# Record our transformed axes and percentage of variance explained by each
axes = pca.components_
var = pca.explained_variance_

# Relabel our columns and add back our auxillary information
df_stats_pca = pd.DataFrame(stats_fitted,
                            columns=['PC' + str(x) for x in range(1, n_components+1)])
df_stats_pca[aux_cols] = df[aux_cols]
df_stats_pca.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,Name,Year,Exp,Age,Pos,G,Gs,BPM
0,10.915541,-3.222678,0.773308,3.204905,0.391906,Alex Abrines,2017,0,23,2,68,6,-2.5
1,9.335846,0.334158,-0.404112,1.689268,-0.361877,Quincy Acy,2013,0,22,4,29,0,0.7
2,8.959057,0.733712,-0.447586,2.701992,0.188396,Quincy Acy,2014,1,23,3,63,0,-1.1
3,14.422009,-0.189102,-1.171733,3.116202,-0.026591,Quincy Acy,2015,2,24,4,68,22,-3.1
4,11.339161,-0.16801,-0.429034,2.427266,-0.136313,Quincy Acy,2016,3,25,4,59,29,0.2


In [43]:
# Choose what displays when points are hovered over
tools = [('Name', '@Name'),
         ('Position', '@Pos'),
         ('BPM', '@BPM')]

hover = bokeh.models.HoverTool(tooltips=tools)

# Make figures
plots = []
for index1, axes1 in enumerate(['PC' + str(x) for x in range(1, n_components)]):
    for index2, axes2 in enumerate(['PC' + str(x) for x in range(1, n_components)]):
        if index2 > index1:
            source = bokeh.models.ColumnDataSource(df_stats_pca)
            
            p = bk.figure(plot_width=500, 
                          plot_height=400,
                          x_axis_label=axes1,
                          y_axis_label=axes2)
            p.add_tools(hover)
            
            p.circle(x=axes1, y=axes2, source=source)
            plots.append(p)
            
bokeh.io.show(bokeh.layouts.gridplot(plots, ncols=2))

So this looks pretty cool! We can see that principle components 1, 2, and 3 all maintain a good portion of the original data's variance which is good. Another quick observation we can make just by hovering over the graphs is that they all do a good job of separating players based on position which is to be expected. This can be illustrated in the next set of plots where we separate position by color:

In [40]:
# Define colors in a dictionary to access them
keys = df_stats_pca.Pos.dropna().unique()
color_dict = {k: bokeh.palettes.brewer['Spectral'][5][i] 
                      for i, k in enumerate(sorted(keys))}

# Make figures
plots = []
for index1, axes1 in enumerate(['PC' + str(x) for x in range(1, n_components)]):
    for index2, axes2 in enumerate(['PC' + str(x) for x in range(1, n_components)]):
        if index2 > index1:
            
            p = bk.figure(plot_width=500, 
                              plot_height=400,
                              x_axis_label=axes1,
                              y_axis_label=axes2)
            p.background_fill_color = "black"
            p.background_fill_alpha = 0.3
            p.add_tools(hover)
                
            for key, group in df_stats_pca.groupby('Pos'):
                source = bokeh.models.ColumnDataSource(group)
                
                # player_name = 'Stephen Curry'
                
                # for name, group2 in group.groupby('Name'):
                #     source2 = bokeh.models.ColumnDataSource(group2)
                #     if name == player_name:
                #         p.diamond_cross(x=axes1, y=axes2, source=source2,
                #                         color='red', line_width=2, size=20,
                #                         legend=player_name)

                p.circle(x=axes1, y=axes2, source=source,
                         color=color_dict[key], legend=str(key))
            
            plots.append(p)
                
bokeh.io.show(bokeh.layouts.gridplot(plots, ncols=2))

We can pretty clearly see from the first two charts that the new axes clearly separate players based on their positions and we can determine players positions almost solely based on the first two transformed dimensions. One really interesting point is Manu Ginobli (black point at around 5, 2.5) who under the first metric played a very similar game to Andrew Bogut which is pretty mind-boggling. Now let's see how well our metric predicts BPM:

In [41]:
# Identify cutoffs for BPM tiers
bins = [-7, -5, -3, -1.5, 0, 1, 2, 3.5, 5, 8]               
names = ['< ' + str(bins[0])]
names += [str(bins[i]) + ' to ' + str(bins[i+1]) for i in range(len(bins)-1)]
names += ['> ' + str(bins[-1])]

# Add tiers to dataframe
df_stats_pca['binned BPM'] = np.digitize(df_stats_pca['BPM'].values, bins)

# Define colors in a dictionary to access them
color_dict = {i: bokeh.palettes.brewer['Spectral'][len(bins)+1][i] 
                      for i in range(len(bins)+1)}

# Make figures
plots = []
for index1, axes1 in enumerate(['PC' + str(x) for x in range(1, n_components)]):
    for index2, axes2 in enumerate(['PC' + str(x) for x in range(1, n_components)]):
        if index2 > index1:
            
            p = bk.figure(plot_width=500, 
                              plot_height=400,
                              x_axis_label=axes1,
                              y_axis_label=axes2)
            p.background_fill_color = "black"
            p.background_fill_alpha = 0.3
            p.add_tools(hover)
                
            for key, group in df_stats_pca.groupby('binned BPM'):
                source = bokeh.models.ColumnDataSource(group)
                
                p.circle(x=axes1, y=axes2, source=source,
                         color=color_dict[key], legend=names[key])
            
            plots.append(p)
                
bokeh.io.show(bokeh.layouts.gridplot(plots, ncols=2))

In [31]:
df_stats_pca.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,Name,Year,Exp,Age,Pos,G,Gs,BPM,binned BPM
0,10.915541,-3.222678,0.773308,3.204905,0.391906,Alex Abrines,2017,0,23,2,68,6,-2.5,3
1,9.335846,0.334158,-0.404112,1.689268,-0.361877,Quincy Acy,2013,0,22,4,29,0,0.7,6
2,8.959057,0.733712,-0.447586,2.701992,0.188396,Quincy Acy,2014,1,23,3,63,0,-1.1,4
3,14.422009,-0.189102,-1.171733,3.116202,-0.026591,Quincy Acy,2015,2,24,4,68,22,-3.1,3
4,11.339161,-0.16801,-0.429034,2.427266,-0.136313,Quincy Acy,2016,3,25,4,59,29,0.2,6
