# K-nearest neighbors in Python: NBA stats (2013)

In [11]:
import math
import pandas

# Read the player statistics CSV into a DataFrame.
csv_path = "../data/nba_2013.csv"
with open(csv_path, 'r') as handle:
    nba = pandas.read_csv(handle)

# Preview the first rows of the loaded frame.
nba.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,1,Quincy Acy,PF,22,TOR,29,342,15.9,0.632,0.027,...,15.6,14.7,0.7,0.4,1.1,0.157,-0.6,1.0,0.5,0.2
1,2,Jeff Adrien,PF,26,CHA,52,713,13.4,0.493,0.012,...,13.1,15.6,0.5,0.4,1.0,0.064,-2.9,-0.4,-3.3,-0.2
2,3,Arron Afflalo,SF,27,ORL,64,2307,13.0,0.527,0.265,...,12.1,22.5,1.5,0.5,2.0,0.042,-0.4,-1.9,-2.3,-0.2
3,4,Josh Akognon,PG,26,DAL,3,9,15.3,0.625,0.5,...,0.0,20.3,0.0,0.0,0.0,0.196,4.3,-4.9,-0.6,0.0
4,5,Cole Aldrich,C,24,TOT,45,388,11.1,0.563,0.0,...,20.6,12.7,0.1,0.4,0.6,0.07,-4.4,0.4,-3.9,-0.2


In [7]:
# Print the column labels of the loaded frame as an array.
print(nba.columns.values)

['Rk' 'Player' 'Pos' 'Age' 'Tm' 'G' 'MP' 'PER' 'TS%' '3PAr' 'FTr' 'ORB%'
 'DRB%' 'TRB%' 'AST%' 'STL%' 'BLK%' 'TOV%' 'USG%' 'OWS' 'DWS' 'WS' 'WS/48'
 'OBPM' 'DBPM' 'BPM' 'VORP']


In [9]:
# Summarise each column's dtype and non-null count; this shows which columns have missing values.
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 573 entries, 0 to 572
Data columns (total 27 columns):
Rk        573 non-null int64
Player    573 non-null object
Pos       573 non-null object
Age       573 non-null int64
Tm        573 non-null object
G         573 non-null int64
MP        573 non-null int64
PER       572 non-null float64
TS%       569 non-null float64
3PAr      569 non-null float64
FTr       569 non-null float64
ORB%      572 non-null float64
DRB%      572 non-null float64
TRB%      572 non-null float64
AST%      572 non-null float64
STL%      572 non-null float64
BLK%      572 non-null float64
TOV%      570 non-null float64
USG%      572 non-null float64
OWS       573 non-null float64
DWS       573 non-null float64
WS        573 non-null float64
WS/48     572 non-null float64
OBPM      573 non-null float64
DBPM      573 non-null float64
BPM       573 non-null float64
VORP      573 non-null float64
dtypes: float64(20), int64(4), object(3)
memory usage: 125.3+ KB


In [3]:
# Pick out LeBron James: build a boolean mask on the Player column and
# keep the first matching row as a Series.
is_lebron = nba["Player"] == "LeBron James"
selected_player = nba[is_lebron].iloc[0]
selected_player

Rk                 212
Player    LeBron James
Pos                 PF
Age                 28
Tm                 MIA
G                   76
MP                2877
PER               31.6
TS%               0.64
3PAr             0.188
FTr              0.395
ORB%               4.4
DRB%              20.8
TRB%              13.1
AST%              36.4
STL%               2.4
BLK%               1.9
TOV%              12.4
USG%              30.2
OWS               14.6
DWS                4.7
WS                19.3
WS/48            0.322
OBPM                 9
DBPM               1.8
BPM               10.8
VORP               9.2
Name: 254, dtype: object

In [14]:
# Choose only the numeric columns (we'll use these to compute euclidean distance)
distance_columns = ['Age', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%','TRB%','AST%', 'STL%', 'BLK%',
                    'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']

def euclidean_distance(row, target=None, columns=None):
    """
    Euclidean distance between ``row`` and a reference row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Values for one player, indexable by column name.
    target : mapping or pandas.Series, optional
        The reference row to measure against. Defaults to the module-level
        ``selected_player`` so the function can still be passed directly to
        ``DataFrame.apply``.
    columns : iterable of str, optional
        Names of the fields that take part in the distance. Defaults to the
        module-level ``distance_columns``.

    Returns
    -------
    float
        Square root of the summed squared differences. A NaN in any of the
        compared fields propagates, so the result is NaN in that case.
    """
    # Resolve the defaults at call time so the globals may be (re)defined
    # after this function is.
    if target is None:
        target = selected_player
    if columns is None:
        columns = distance_columns
    squared_total = sum((row[k] - target[k]) ** 2 for k in columns)
    return math.sqrt(squared_total)

# Find the distance from each player in the dataset to LeBron by calling
# euclidean_distance once per row (axis=1 passes rows, not columns).
# In notebook order the raw frame still contains missing values here, so a
# row with a NaN in any distance column gets a NaN distance.
lebron_distance = nba.apply(euclidean_distance, axis=1)


## Normalizing columns

In [19]:
# Restrict the frame to the numeric distance columns.
nba_numeric = nba[distance_columns]

# Standardise every column: subtract its mean, then divide by its standard deviation.
column_means = nba_numeric.mean()
column_stds = nba_numeric.std()
nba_normalized = (nba_numeric - column_means) / column_stds
nba_normalized.head()

Unnamed: 0,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,-1.07209,-0.779918,-0.877945,0.624448,1.327789,-0.983686,0.937551,0.830651,0.330493,0.633034,...,0.118567,-0.769908,-0.247152,-0.613711,-0.423094,0.93837,0.329792,0.748417,0.638,-0.272843
1,-0.126638,0.107234,-0.455442,0.21058,-0.063135,-1.054071,1.297836,0.884715,0.982459,1.058268,...,-0.184333,-0.596823,-0.348743,-0.613711,-0.458475,0.03124,-0.313918,0.046546,-0.237626,-0.611055
2,0.109725,0.570095,1.359842,0.144362,0.277091,0.133093,-0.13102,-0.773247,-0.562418,-0.772085,...,-0.305494,0.730167,0.159213,-0.527202,-0.104662,-0.183349,0.385767,-0.705458,-0.007198,-0.611055
3,-0.126638,-1.782785,-1.257173,0.52512,1.257742,1.235795,-1.138179,-1.025546,-0.3073,-0.698132,...,-1.771531,0.307069,-0.602722,-0.959744,-0.812288,1.318779,1.701174,-2.209466,0.384529,-0.441949
4,-0.599364,-0.162769,-0.825559,-0.170178,0.63733,-1.11038,-0.114643,0.54231,1.761984,1.428036,...,0.724368,-1.154543,-0.551926,-0.613711,-0.6,0.089765,-0.733729,0.447615,-0.375883,-0.611055


## Finding the nearest neighbor

In [24]:
from scipy.spatial import distance

# Fill in NA values in nba_normalized. After normalization 0 is each
# column's mean, so a missing value contributes nothing extra to a distance.
nba_normalized = nba_normalized.fillna(0)

# Find the normalized row(s) for lebron james (a one-row DataFrame).
lebron_normalized = nba_normalized[nba["Player"] == "LeBron James"]

# distance.euclidean is documented for 1-D vectors, so pass a single row as a
# Series rather than the 2-D one-row DataFrame selected above.
lebron_vector = lebron_normalized.iloc[0]

# Find the distance between lebron james and everyone else.
euclidean_distances = nba_normalized.apply(
    lambda row: distance.euclidean(row, lebron_vector), axis=1
)

In [26]:
# Create a new dataframe with distances, ordered from nearest to farthest.
# "idx" keeps each row's original index label so the player can be looked up later.
# DataFrame.sort has been removed from pandas; sort_values is its replacement.
distance_frame = pandas.DataFrame(
    data={"dist": euclidean_distances, "idx": euclidean_distances.index}
).sort_values("dist")
distance_frame.head()

Unnamed: 0,dist,idx
254,0.0,254
148,3.230224,148
409,5.843237,409
207,6.135291,207
551,6.620225,551


In [27]:
# Find the most similar player to LeBron. The frame is sorted by distance, so
# position 0 is LeBron himself (distance 0) and position 1 is the closest
# other player.
runner_up = distance_frame.iloc[1]
second_smallest = runner_up["idx"]
# The row was read as a float Series, so cast the label back to int for the lookup.
most_similar_to_lebron = nba.loc[int(second_smallest)]["Player"]

In [28]:
# Display the name of the closest non-LeBron player.
most_similar_to_lebron

'Kevin Durant'

# Generating training and testing sets

In [37]:
import random
from numpy.random import permutation

nba.fillna(0, inplace=True)

# Randomly shuffle the index of nba.
# NOTE: no random seed is set, so the split (and any metric computed from it)
# differs from run to run.
random_indices = permutation(nba_normalized.index)
# Set a cutoff for how many items we want in the test set (in this case 1/3 of the items).
# Floor division keeps the cutoff an int, which a slice bound requires.
test_cutoff = len(nba_normalized) // 3
# Generate the test set by taking the first 1/3 of the randomly shuffled indices.
# The slice starts at 0: starting at 1 would leave the first shuffled row out of both sets.
test = nba_normalized.loc[random_indices[:test_cutoff]]
# Generate the train set with the rest of the data.
train = nba_normalized.loc[random_indices[test_cutoff:]]

# Every row must land in exactly one of the two sets.
assert len(test) + len(train) == len(nba_normalized), "train/test split lost or duplicated rows"

# Using sklearn for k nearest neighbors

In [38]:
# The column that we want to predict.
y_column = ["MP"]

# Every other distance column is used as a predictor.
x_columns = [name for name in distance_columns if name not in y_column]

In [39]:
from sklearn.neighbors import KNeighborsRegressor

# Build a k-nearest-neighbours regressor that averages over the five closest points.
knn = KNeighborsRegressor(n_neighbors=5)

# Train on the predictor columns and the target column of the training set.
train_features = train[x_columns]
train_target = train[y_column]
knn.fit(train_features, train_target)

# Predict the target for each row of the test set.
test_features = test[x_columns]
predictions = knn.predict(test_features)

## Computing Error

In [40]:
# Ground-truth target values for the test rows.
actual = test[y_column]

# Mean squared error: average of the squared prediction errors.
squared_errors = (predictions - actual) ** 2
mse = squared_errors.sum() / len(predictions)

print(mse)

MP    0.150107
dtype: float64
