# PSTAT134 Assignment
## Andrew Zhang

### Data Preparation
Creating the pickle file allpatterns2016-17.pkl

In [None]:
import pickle
import pandas as pd
import numpy as np
import sklearn.decomposition as skld
import matplotlib.pyplot as plt
import seaborn as sns

import helper_basketball as h
# `imp` is deprecated and was removed in Python 3.12; importlib.reload is the
# supported way to pick up edits to helper_basketball without restarting the kernel.
import importlib
importlib.reload(h);

In [None]:
# Load the pickled 2016-17 shot log. The context manager closes the file
# handle as soon as the object has been unpickled.
# NOTE: unpickling can execute arbitrary code -- only load files you created.
with open('allshots2016-17.pkl', 'rb') as shots_file:
    allshots = pickle.load(shots_file)

# `allmade` is only an alias: no filtering is applied here, so it refers to
# the very same object as `allshots`.
allmade = allshots

In [None]:
## players info: the distinct PlayerID values in the shot data, and how many there are
player_ids = allmade['PlayerID'].unique()
num_players = len(player_ids)
 
## bin edge definitions in inches (the linspace endpoints are scaled by 12)
## np.float was removed in NumPy 1.24; the builtin float is the same float64 dtype
xedges = np.linspace(start=-25, stop=25, num=151, dtype=float) * 12
yedges = np.linspace(start=-4, stop=31, num=106, dtype=float) * 12

## number of bins is one less than number of edges
nx = xedges.size - 1
ny = yedges.size - 1

## per-player containers for binned counts and smoothed binned counts,
## keyed by PlayerID; every value is a flattened (1-d) histogram
all_counts = {}
all_smooth = {}

## groupby yields (PlayerID, that player's rows) pairs, so unpack them
## directly in the loop header -- no running index is needed
for pid, pdf in allmade.groupby('PlayerID'):

    ## bin this player's shots on the shared grid, twice:
    ##   tmp1 - density=True, sigma=2 (presumably a normalised, smoothed
    ##          histogram; confirm against helper_basketball.bin_shots)
    ##   tmp2 - density=False (presumably raw counts per bin; confirm likewise)
    ## the edges returned by the helper are re-bound to xedges/yedges
    tmp1, xedges, yedges = h.bin_shots(pdf, bin_edges=(xedges, yedges), density=True, sigma=2)
    tmp2, xedges, yedges = h.bin_shots(pdf, bin_edges=(xedges, yedges), density=False)

    ## vectorize and store into dictionary
    all_smooth[pid] = tmp1.reshape(-1)
    all_counts[pid] = tmp2.reshape(-1)

In [None]:
## np.stack needs a real sequence; a dict view is rejected by current NumPy.
## Each stacked row is one player's flattened smoothed histogram, so after .T
## every column of X belongs to one player.
X = np.stack(list(all_smooth.values())).T

In [None]:
## cache the data matrix on disk; the context manager flushes and closes the
## file so the pickle is complete before anything reads it back
with open('allpatterns2016-17.pkl', 'wb') as patterns_file:
    pickle.dump(X, patterns_file)

### Import Data

In [None]:
## reload the cached data matrix; the context manager closes the file handle
## NOTE: unpickling can execute arbitrary code -- only load files you created
with open('allpatterns2016-17.pkl', 'rb') as patterns_file:
    X = pickle.load(patterns_file)

In [None]:
%%bash
# Print the helper module's source so the functions used via `h.` can be read inline.
cat helper_basketball.py

In [None]:
## teams: fetch the 'commonTeamYears' table, index it by TEAM_ID and keep the
## ids of the teams whose MAX_YEAR is '2017'
params = {'LeagueID': '00', 'Season': '2016-17'}
teams = h.get_nba_data('commonTeamYears', params)
teams = teams.set_index('TEAM_ID')
is_current_team = teams['MAX_YEAR'] == '2017'
allteams = teams.loc[is_current_team].index.values

## players: fetch the 'commonallplayers' table, index it by PERSON_ID and keep
## the ids of the players whose TEAM_ID is one of the teams selected above
params = {'LeagueID': '00', 'Season': '2016-17', 'IsOnlyCurrentSeason': '0'}
players = h.get_nba_data('commonallplayers', params)
players = players.set_index('PERSON_ID')
on_current_team = players['TEAM_ID'].isin(allteams)
allplyrs = players.loc[on_current_team].index.values

## Problem 1

`n_components` = 10

In [None]:
## rank-10 non-negative factorisation X ~ W_10 @ H_10
## (W_10 has one row per row of X; H_10 has one column per column of X)
model_10 = skld.NMF(
    n_components=10,
    init='nndsvda',
    max_iter=500,
    random_state=0,
)
W_10 = model_10.fit_transform(X)
H_10 = model_10.components_

In [None]:
## bin edges used to draw the shot charts (linspace endpoints scaled by 12)
## np.float was removed in NumPy 1.24; the builtin float is the same float64 dtype
xedges = np.linspace(start=-25, stop=25, num=151, dtype=float) * 12
yedges = np.linspace(start=-4, stop=31, num=106, dtype=float) * 12

In [None]:
## one panel per column of W_10, on a 5 x 2 grid of axes
fig, ax = plt.subplots(nrows=5, ncols=2, figsize=(20, 40))

for comp, panel in enumerate(ax.ravel()):
    basis = W_10[:, comp]
    h.plot_shotchart(basis, xedges, yedges, ax=panel)
    panel.set_title('NMF component ' + str(comp))

`n_components` = 7

In [None]:
## rank-7 non-negative factorisation X ~ W_7 @ H_7
## (W_7 has one row per row of X; H_7 has one column per column of X)
model_7 = skld.NMF(
    n_components=7,
    init='nndsvda',
    max_iter=500,
    random_state=0,
)
W_7 = model_7.fit_transform(X)
H_7 = model_7.components_

In [None]:
## one panel per column of W_7, stacked in a single column of 7 axes
fig, ax = plt.subplots(nrows=7, ncols=1, figsize=(20, 40))

for comp, panel in enumerate(ax.ravel()):
    basis = W_7[:, comp]
    h.plot_shotchart(basis, xedges, yedges, ax=panel)
    panel.set_title('NMF component ' + str(comp))

`n_components` = 6

In [None]:
## rank-6 non-negative factorisation X ~ W_6 @ H_6
## (W_6 has one row per row of X; H_6 has one column per column of X)
model_6 = skld.NMF(
    n_components=6,
    init='nndsvda',
    max_iter=500,
    random_state=0,
)
W_6 = model_6.fit_transform(X)
H_6 = model_6.components_

In [None]:
## one panel per column of W_6, on a 3 x 2 grid of axes
fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(20, 40))

for comp, panel in enumerate(ax.ravel()):
    basis = W_6[:, comp]
    h.plot_shotchart(basis, xedges, yedges, ax=panel)
    panel.set_title('NMF component ' + str(comp))

We can see that as we lower `r`, the visualized shot-location bases become more and more generalized, as demonstrated by component `6` when `r`=7. When `r`=6, none of the visualizations shows a heatmap for mid-range shots. Thus, we can conclude that `r`=6 is too small.

`n_components` = 15

In [None]:
## rank-15 non-negative factorisation X ~ W_15 @ H_15
## (W_15 has one row per row of X; H_15 has one column per column of X)
model_15 = skld.NMF(
    n_components=15,
    init='nndsvda',
    max_iter=500,
    random_state=0,
)
W_15 = model_15.fit_transform(X)
H_15 = model_15.components_

In [None]:
## one panel per column of W_15, on a 5 x 3 grid of axes
fig, ax = plt.subplots(nrows=5, ncols=3, figsize=(20, 40))

for comp, panel in enumerate(ax.ravel()):
    basis = W_15[:, comp]
    h.plot_shotchart(basis, xedges, yedges, ax=panel)
    panel.set_title('NMF component ' + str(comp))

After testing values of `n_components` greater than 10, we can see that with `n_components`=15 there are repeated bases for corner three-pointers and for shots left of the rim. Therefore, we can conclude that `r`=15 is too large.

## Problem 2

In [None]:
#Create a list of players to create correlation matrix
#(the '|' separators make the string a regex alternation for str.contains)
playerslist = 'Carmelo Anthony|Tyson Chandler|Paul George|Tristan Thompson|LaMarcus Aldridge|LeBron James|Kawhi Leonard|Rajon Rondo'
# Keep the rows of `players` whose display name matches, then align them to the
# ids present in the shot data. reindex yields all-NaN rows for ids without a
# match (removed by dropna); .loc[player_ids] would raise KeyError on those
# missing labels in pandas >= 1.0.
name_matches = players.DISPLAY_FIRST_LAST.str.contains(playerslist)
playerslist_ids = players[name_matches].reindex(player_ids).dropna()

In [None]:
## label the columns of H_10 with the player ids used as keys of all_smooth
playersHd = pd.DataFrame(data=H_10, columns=all_smooth.keys())

## keep only the selected players' columns and show names instead of ids
players_coeff = playersHd[playerslist_ids.index.values]
players_coeff.columns = playerslist_ids['DISPLAY_FIRST_LAST']

In [None]:
## rescale every player's column so that its coefficients sum to 1
column_totals = players_coeff.sum(axis=0)
players_coeff = players_coeff.div(column_totals, axis='columns')
players_coeff.T

In [None]:
## pairwise Pearson correlation between the players' coefficient columns
player_corr = players_coeff.corr(method='pearson')
player_corr

In [None]:
## draw the player-by-player correlation matrix as a heatmap
sns.heatmap(data=player_corr)

#### Dimensions of R
The correlation matrix `R` has dimensions `n`x`n`, where `n` is the number of players being compared. For example, if we are comparing LeBron James and Stephen Curry, we will get a matrix with dimension 2x2. If we are comparing Tristan Thompson, Brook Lopez, and Al Horford, we will get a matrix of dimension 3x3.

#### Player Correlation
From the correlation plot, we can see that Tristan Thompson and LeBron James have a strong correlation while Paul George and Rajon Rondo have a negative correlation. We can plot their shot selection below.

In [None]:
# The '|' separators make the string a regex alternation for str.contains.
corrlist = 'Paul George|Tristan Thompson|LeBron James|Rajon Rondo'
# Keep the rows of `players` whose display name matches, then align them to the
# ids present in the shot data. reindex yields all-NaN rows for ids without a
# match (removed by dropna); .loc[player_ids] would raise KeyError on those
# missing labels in pandas >= 1.0.
name_matches = players.DISPLAY_FIRST_LAST.str.contains(corrlist)
corrlist_ids = players[name_matches].reindex(player_ids).dropna()

In [None]:
## label the columns of H_10 with the player ids used as keys of all_smooth
corrHd = pd.DataFrame(data=H_10, columns=all_smooth.keys())

## keep only the four compared players' columns and show names instead of ids
corr_coeff = corrHd[corrlist_ids.index.values]
corr_coeff.columns = corrlist_ids['DISPLAY_FIRST_LAST']

In [None]:
## rescale every player's column so that its coefficients sum to 1
column_totals = corr_coeff.sum(axis=0)
corr_coeff = corr_coeff.div(column_totals, axis='columns')
corr_coeff.T

In [None]:
#Plot positive correlation shooting pattern
coeff_range = range(0,10)
shares_by_player = corr_coeff.T
# Rows 1 and 3 are the pair named in the title. Each line is labelled with its
# own row label so plt.legend() has entries to show, and so the plotted
# players can be checked against the title.
for row in (1, 3):
    plt.plot(coeff_range, shares_by_player.iloc[row, :], label=shares_by_player.index[row])
plt.legend()
plt.title('James vs. Thompson Shooting Patterns')
plt.show()

In [None]:
#Plot negative correlation shooting pattern
coeff_range = range(0,10)
shares_by_player = corr_coeff.T
# Rows 0 and 2 are the pair named in the title. Each line is labelled with its
# own row label so plt.legend() has entries to show, and so the plotted
# players can be checked against the title.
for row in (0, 2):
    plt.plot(coeff_range, shares_by_player.iloc[row, :], label=shares_by_player.index[row])
plt.legend()
plt.title('George vs. Rondo Shooting Patterns')
plt.show()

In [None]:
## pairwise Pearson correlation of the four players' coefficient columns, as a heatmap
corr_corr = corr_coeff.corr(method='pearson')
sns.heatmap(data=corr_corr)

After plotting the shooting patterns, we can see an overall similarity in terms of the shooting pattern for James and Thompson. This is expected as they have a positive correlation in their shooting. However, we can see a significant difference in the plots of George and Rondo. This also makes sense since they display a negative correlation. 

## Problem 3

If we look at the visualized bases, we can see the location of the shots for each of the coefficients. For example, `0` represents shots immediately around the hoop, `1` represents shots on the wings of the three-point line, and `7` represents mid-range shots. From this, we can see that the coefficients may represent shots made or attempted from these regions. For instance, when we look at the coefficients scaled to sum to 1, we see that Tyson Chandler has nonzero values for coefficients `0`, `2`, `3`, and `6`, potentially signaling that he takes his shots close to the rim. However, if we look at Stephen Curry, he has nonzero values in all coefficient categories, particularly in coefficients `1` and `8`. Depending on the coefficients, we can assume that if a player takes most of their shots at or around the rim, they may play the center or power forward position. If they shoot from all over the court, they may play a forward or guard position, depending on their emphasis on three-point shots. Let us look at the correlation plots for a select number of bigs and guards.

In [None]:
#select bigs
#(the '|' separators make the string a regex alternation for str.contains)
bigs = 'Brook Lopez|Tyson Chandler|DeAndre Jordan|Tristan Thompson|LaMarcus Aldridge|Anthony Davis|Al Horford|Dwight Howard|Andre Drummond'
# Keep the rows of `players` whose display name matches, then align them to the
# ids present in the shot data. reindex yields all-NaN rows for ids without a
# match (removed by dropna); .loc[player_ids] would raise KeyError on those
# missing labels in pandas >= 1.0.
name_matches = players.DISPLAY_FIRST_LAST.str.contains(bigs)
bigsids = players[name_matches].reindex(player_ids).dropna()

In [None]:
## label the columns of H_10 with the player ids used as keys of all_smooth
bigHd = pd.DataFrame(data=H_10, columns=all_smooth.keys())

## keep only the selected bigs' columns and show names instead of ids
bigs_coeff = bigHd[bigsids.index.values]
bigs_coeff.columns = bigsids['DISPLAY_FIRST_LAST']
bigs_coeff.T

In [None]:
## rescale every player's column so that its coefficients sum to 1
column_totals = bigs_coeff.sum(axis=0)
bigs_coeff = bigs_coeff.div(column_totals, axis='columns')
bigs_coeff.T

In [None]:
## pairwise Pearson correlation of the bigs' coefficient columns, as a heatmap
bigs_corr = bigs_coeff.corr(method='pearson')
sns.heatmap(data=bigs_corr)

In [None]:
#select guards
#(the '|' separators make the string a regex alternation for str.contains)
# 'Damian Lillard' was previously misspelled 'Damien', which can never match
# the correctly spelled name, so he was silently left out of the selection.
guards = 'Stephen Curry|Kyrie Irving|Chris Paul|Kemba Walker|James Harden|Russell Westbrook|Damian Lillard|Dwyane Wade'
# Keep the rows of `players` whose display name matches, then align them to the
# ids present in the shot data. reindex yields all-NaN rows for ids without a
# match (removed by dropna); .loc[player_ids] would raise KeyError on those
# missing labels in pandas >= 1.0.
name_matches = players.DISPLAY_FIRST_LAST.str.contains(guards)
guardsids = players[name_matches].reindex(player_ids).dropna()

In [None]:
## label the columns of H_10 with the player ids used as keys of all_smooth
guardHd = pd.DataFrame(data=H_10, columns=all_smooth.keys())

## keep only the selected guards' columns and show names instead of ids
guards_coeff = guardHd[guardsids.index.values]
guards_coeff.columns = guardsids['DISPLAY_FIRST_LAST']
guards_coeff.T

In [None]:
## rescale every player's column so that its coefficients sum to 1
column_totals = guards_coeff.sum(axis=0)
guards_coeff = guards_coeff.div(column_totals, axis='columns')
guards_coeff.T

In [None]:
## pairwise Pearson correlation of the guards' coefficient columns, as a heatmap
guards_corr = guards_coeff.corr(method='pearson')
sns.heatmap(data=guards_corr)

We can see similarity amongst the guards and bigs as displayed by the correlation plots for each of the respective groups.

## Problem 4

Many teams have issues with players being too one-dimensional or not expanding their game to other important aspects. Although this analysis is limited to shot patterns, given this data, it is possible to help players work on improving their shot selection. For example, LeBron James is, overall, the most well-rounded player, as he is able to shoot three-pointers and two-pointers and score in the paint as he pleases. This helps his teammates get open, as opposing defenders have a difficult time covering all options. On the other hand, players like Rajon Rondo, who is a guard not known for his jump shooting, are easier for defenders to guard, as they can assume he will try to take a shot close to the rim. This makes it harder for teammates to find open opportunities, as Rondo does not pose as much of a threat on the court as James does.

However, this information is only related to shot patterns, so it cannot account for other factors. Although James creates more of an offensive threat than Rondo in terms of his shot selection, Rondo provides a huge threat on the floor through his ability to pass the ball and find openings in the defense. Another important question to address would be offensive patterns. Will a player be more likely to shoot the ball, pass to a teammate, or turn the ball over?