In [1]:
# 'In this assignment, students will be using the K-nearest neighbors algorithm to predict
# how many points NBA players scored in the 2013-2014 season.

from __future__ import print_function

# ###1: A look at the data

#A look at the data
#Before we dive into the algorithm, letâ€TMs take a look at our data. Each row in the data
#contains information on how a player performed in the 2013-2014 NBA season.
#Download 'nba_2013.csv' file from this link:
#https://www.dropbox.com/s/b3nv38jjo5dxcl6/nba_2013.csv?dl=0
#Here are some selected columns from the data:
#
# 
# - player -- name of the player
# - pos -- the position of the player
# - g -- number of games the player was in
# - gs -- number of games the player started
# - pts -- total points the player scored
# 
# See <a href = "http://www.databasebasketball.com/about/aboutstats.htm">this site</a> for an explanation of the rest of them.


import pandas
with open("C:\\Users\\zabiulla.khan\\Downloads\\nba_2013.csv", "r") as csvfile:
    nba_raw = pandas.read_csv(csvfile)

# Replace NaN values with zeros.
nba = nba_raw.fillna(0)

# Convert strings to NaN and drop.
nba = nba.convert_objects(convert_numeric=True).dropna()
    
# The names of the columns in the data.
print("nba.columns.values:", nba.columns.values)

nba.head(5)

nba.columns.values: ['player' 'pos' 'age' 'bref_team_id' 'g' 'gs' 'mp' 'fg' 'fga' 'fg.' 'x3p'
 'x3pa' 'x3p.' 'x2p' 'x2pa' 'x2p.' 'efg.' 'ft' 'fta' 'ft.' 'orb' 'drb'
 'trb' 'ast' 'stl' 'blk' 'tov' 'pf' 'pts' 'season' 'season_end']


For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.


Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,183,277,40,23,46,63,187,328,2013-2014,2013


In [2]:
# ###2: KNN overview

# The k-nearest neighbors is based around the simple idea of predicting unknown values by matching them with the most similar known values.
# 
# Let's say that we have 3 different types of cars:
# 
#     car,horsepower,racing_stripes,is_fast
#     Honda Accord,180,False,False
#     Yugo,500,True,True
#     Delorean DMC-12,200,True,True
# 
# Let's say that we now have another car:
# 
#     Chevrolet Camaro,400,True,Unknown
# 
# We don't know whether or not this car is fast. In order to predict if it is, we find the most similar known car. In this case, we would compare the horsepower and racing_stripes values to find the most similar car, which is the Yugo. Since the Yugo is fast, we would predict that the Camaro is also fast. This is an example of 1-nearest neighbors -- we only looked at the most similar car.
# 
# If we performed a 2-nearest neighbors, we would end up with 2 True values (for the Delorean and the Yugo), which would average out to True.
# 
# If we did 3-nearest neighbors, we would end up with 2 True values and a False value, which would average out to True.

# ###3: Euclidean distance

# Before we can predict using KNN, we need to find some way to figure out which data rows are "closest" to the row we're trying to predict on.
# 
# A simple way to do this is to use Euclidean distance. The formula is 
# $\sqrt{(q_1-p_1)^2 + (q_2-p_2)^2 + \cdots + (q_n-p_n)^2}$
# 
# Let's say we have these two rows (True/False has been converted to 1/0), and we want to find the distance between them:
# 
#     Honda Accord,180,0
#     Chevrolet Camaro,400,1
# 
# We would first only select the numeric columns. Then the distance becomes 
# $\sqrt{(180-400)^2 + (0-1)^2}$, which is about equal to 220.

# ####Instructions

# Make a function for calculating the euclidean distance between two pandas series. Use the function to find the euclidean distance between selected_player and each row in nba. Use the .apply(func, axis=1) method on dataframes to apply function func to each row. The function should take row as its first argument. Only use the columns in distance_columns to compute the distance. <a href= "http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.apply.html">Here's</a> more on the method.
# 
# Assign the resulting pandas series to lebron_distance.

import math

#We can use the principle of euclidean distance to find the most similar NBA players to Lebron James.
    
# Select Lebron James from our dataset
selected_player = nba[nba["player"] == "LeBron James"].iloc[0]

# Choose only the numeric columns (we'll use these to compute euclidean distance)
distance_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts']

def euclidean_distance(row):
    """
    A simple euclidean distance function
    """
    inner_value = 0
    for k in distance_columns:
        inner_value += (row[k] - selected_player[k]) ** 2
    return math.sqrt(inner_value)

# Find the distance from each player in the dataset to lebron.
lebron_distance = nba.apply(euclidean_distance, axis=1)
print("lebron_distance[:5]:\n", lebron_distance[:5])

lebron_distance[:5]:
 0    3475.792868
1    3148.395020
2    3161.567361
3    1189.554979
4    3216.773098
dtype: float64


In [3]:

# ###4: Normalizing columns

# Variables which are much larger in absolute terms have the potential to have a larger impact on distance. This can be bad, because a variable having larger values doesn't necessarily make it better at predicting what rows are similar.
# 
# A simple way to deal with this is to normalize all the columns to have a mean of 0, and a standard deviation of 1. This will ensure that no single column has a dominant impact on the euclidean distance calculations.
# 
# To set the mean to 0, we have to find the mean of a column, then subtract the mean from every value in the column. To set the standard deviation to 1, we divide every value in the column by the standard deviation. The formula is $x=\frac{x-\mu}{\sigma}$.

# ####Instructions

# Normalize the columns in nba_numeric. Using .mean() on a dataframe will return the mean of each column. Using .std() will return the standard deviation of each column.

# Select only the numeric columns from the NBA dataset
nba_numeric = nba[distance_columns]
nba_numeric.head(5)

# Normalize all of the numeric columns
nba_normalized = (nba_numeric - nba_numeric.mean()) / nba_numeric.std()
nba_normalized.head(5)

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,-0.835906,0.384886,-0.862207,-0.435088,-0.738401,-0.768505,0.325957,-0.700282,-0.716608,0.12052,...,-0.151926,0.26069,-0.129462,-0.013116,-0.64522,-0.468056,0.06141,-0.66765,0.226515,-0.734621
1,-1.550487,1.095711,-0.187863,-0.045011,-0.581271,-0.649215,0.667749,-0.778936,-0.829601,-1.390497,...,-0.522588,1.387883,0.18702,0.565852,-0.530733,0.02068,1.065446,-0.01376,1.363938,-0.534801
2,0.116868,-0.010016,-0.4576,-0.308035,-0.290291,-0.405214,0.833763,-0.778936,-0.829601,-1.390497,...,-0.250457,0.743773,0.28334,0.436083,-0.568895,-0.439307,0.385292,-0.524113,0.029924,-0.328603
3,0.355062,0.779789,1.599148,1.465144,1.577804,1.590172,0.238067,1.737992,1.430256,1.02713,...,0.57532,-0.38342,0.462221,0.216475,1.033919,-0.123066,-0.68352,1.18238,0.423107,1.729123
4,-0.359519,0.108454,0.149309,-0.31918,-0.331028,-0.475703,1.087666,-0.778936,-0.822068,-1.390497,...,0.673851,0.614951,0.138859,0.291341,-0.55363,-0.468056,0.709175,-0.141348,1.139262,-0.400878


In [4]:

# ###5: Finding the nearest neighbor

# We now know enough to find the nearest neighbor of a given row. We can use the distance.euclidean function from scipy.spatial, a much faster way to calculate euclidean distance.

# ####Instructions

# Find the player most similar to LeBron James by our distance metric. You can do this by finding the second lowest value in the euclidean_distances series (the lowest value will correspond to lebron, as he is most similar to himself), and then cross-referencing the nba dataframe with the same index.
# 
# Assign the name of the player to most_similar_to_lebron.

from scipy.spatial import distance

# Fill in NA values in nba_normalized.
nba_normalized.fillna(0, inplace=True)

# Find the normalized vector for lebron james.
lebron_normalized = nba_normalized[nba["player"] == "LeBron James"]

# Find the distance between lebron james and everyone else.
euclidean_distances = nba_normalized.apply(lambda row: distance.euclidean(row, lebron_normalized), axis=1)
distance_frame = pandas.DataFrame(data={"dist": euclidean_distances, "idx": euclidean_distances.index})
#distance_frame.sort("dist", inplace=True)

second_smallest = distance_frame.iloc[1]["idx"]

most_similar_to_lebron = nba.loc[int(second_smallest)]["player"]
print("most_similar_to_lebron:", most_similar_to_lebron)

most_similar_to_lebron: Steven Adams


In [5]:

# ###6: Generating training and testing sets

# Now that we know how to find the nearest neighbors, we can make predictions on a test set.
# 
# First, we have to generate test and train sets. In order to do this, we'll use random sampling. We'll randomly shuffle the index of the nba dataframe, and then pick rows using the randomly shuffled values.
# 
# If we didn't do this, we'd end up predicting and training on the same data set, which would overfit. We could do cross validation also, which would be slightly better, but slightly more complex.

import random
from numpy.random import permutation

# Randomly shuffle the index of nba.
random_indices = permutation(nba.index)

# Set a cutoff for how many items we want in the test set (in this case 1/3 of the items).
test_cutoff = math.floor(len(nba)/3)

# Generate the test set by taking the first 1/3 of the randomly shuffled indices.
test = nba.loc[random_indices[1:test_cutoff]]

# Generate the train set with the rest of the data.
train = nba.loc[random_indices[test_cutoff:]]

In [6]:
train.isnull().sum()

player          0
pos             0
age             0
bref_team_id    0
g               0
gs              0
mp              0
fg              0
fga             0
fg.             0
x3p             0
x3pa            0
x3p.            0
x2p             0
x2pa            0
x2p.            0
efg.            0
ft              0
fta             0
ft.             0
orb             0
drb             0
trb             0
ast             0
stl             0
blk             0
tov             0
pf              0
pts             0
season          0
season_end      0
dtype: int64

In [7]:
final_train=train.fillna(0)

In [8]:
final_train.isnull().sum()

player          0
pos             0
age             0
bref_team_id    0
g               0
gs              0
mp              0
fg              0
fga             0
fg.             0
x3p             0
x3pa            0
x3p.            0
x2p             0
x2pa            0
x2p.            0
efg.            0
ft              0
fta             0
ft.             0
orb             0
drb             0
trb             0
ast             0
stl             0
blk             0
tov             0
pf              0
pts             0
season          0
season_end      0
dtype: int64

In [9]:
test.isnull().sum()

player          0
pos             0
age             0
bref_team_id    0
g               0
gs              0
mp              0
fg              0
fga             0
fg.             0
x3p             0
x3pa            0
x3p.            0
x2p             0
x2pa            0
x2p.            0
efg.            0
ft              0
fta             0
ft.             0
orb             0
drb             0
trb             0
ast             0
stl             0
blk             0
tov             0
pf              0
pts             0
season          0
season_end      0
dtype: int64

In [10]:
final_test=train.fillna(0)

In [11]:
final_test.isnull().sum()

player          0
pos             0
age             0
bref_team_id    0
g               0
gs              0
mp              0
fg              0
fga             0
fg.             0
x3p             0
x3pa            0
x3p.            0
x2p             0
x2pa            0
x2p.            0
efg.            0
ft              0
fta             0
ft.             0
orb             0
drb             0
trb             0
ast             0
stl             0
blk             0
tov             0
pf              0
pts             0
season          0
season_end      0
dtype: int64

In [12]:
# ###7: Using sklearn

# Instead of having to do it all ourselves, we can use the k-nearest neighbors implementation in scikit-learn. <a href = "http://scikit-learn.org/stable/modules/neighbors.html">Here's</a> the documentation. There's a regressor and a classifier available, but we'll be using the regressor, as we have continuous values to predict on.
# 
# Sklearn performs the normalization and distance finding automatically, and lets us specify how many neighbors we want to look at.

from sklearn.neighbors import KNeighborsRegressor

# The columns that we will be making predictions with.
x_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf']
# The column that we want to predict.
y_column = ["pts"]


# Create the knn model.
knn = KNeighborsRegressor(n_neighbors=5)

# Fit the model on the training data.
knn.fit(final_train[x_columns], final_train[y_column])

# Make predictions on the test set using the fit model.
predictions = knn.predict(final_test[x_columns])

print("predictions[:5]:\n", predictions[:5])

predictions[:5]:
 [[1048.2]
 [ 313. ]
 [ 325.2]
 [ 625.8]
 [ 147.6]]


In [13]:

# ###8: Computing error

# Now that we know our predictions, we can compute the error involved. We can compute <a href = "http://en.wikipedia.org/wiki/Mean_squared_error">mean squared error</a>. The formula is $\frac{1}{n}\sum_{i=1}^{n}(\hat{y_{i}} - y_{i})^{2}$.

# ####Instructions

# Compute the mean squared error between actual and predictions. Assign the result to mse.

actual = final_test[y_column]

mse = (((predictions - actual) ** 2).sum()) / len(predictions)

print("actual[:20]:\n", actual[:20])
print("mse:", mse)

actual[:20]:
       pts
141  1096
357   213
4     328
427   586
289   109
181   404
358    44
422    14
144   970
448  1028
335   429
376   879
459  1002
88    480
402  1264
383   626
152   273
58   1042
411   772
139   214
mse: pts    4836.777695
dtype: float64
