In [8]:
import pandas
with open("C:/Users/tulasi/Desktop/DS recordings/nba_2013.csv", 'r') as csvfile:
    nba_raw = pandas.read_csv(csvfile)

In [9]:
# Replace NaN values with zeros.
nba = nba_raw.fillna(0)

# Convert strings to NaN and drop.
nba = nba.convert_objects(convert_numeric=True).dropna()
    
# The names of the columns in the data.
print("nba.columns.values:", nba.columns.values)

nba.head(5)

nba.columns.values: ['player' 'pos' 'age' 'bref_team_id' 'g' 'gs' 'mp' 'fg' 'fga' 'fg.' 'x3p'
 'x3pa' 'x3p.' 'x2p' 'x2pa' 'x2p.' 'efg.' 'ft' 'fta' 'ft.' 'orb' 'drb'
 'trb' 'ast' 'stl' 'blk' 'tov' 'pf' 'pts' 'season' 'season_end']


For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  """


Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,183,277,40,23,46,63,187,328,2013-2014,2013


In [18]:
#Euclidean distance

# Before we can predict using KNN, we need to find some way to figure out which data rows are "closest" to the row we're trying to predict on.
# 
# A simple way to do this is to use Euclidean distance. The formula is 
# $\sqrt{(q_1-p_1)^2 + (q_2-p_2)^2 + \cdots + (q_n-p_n)^2}$
# 
# Let's say we have these two rows (True/False has been converted to 1/0), and we want to find the distance between them:
# 
#     Honda Accord,180,0
#     Chevrolet Camaro,400,1
# 
# We would first only select the numeric columns. Then the distance becomes 
# $\sqrt{(180-400)^2 + (0-1)^2}$, which is about equal to 220.

# ####Instructions

# Make a function for calculating the euclidean distance between two pandas series. Use the function to find the euclidean distance between selected_player and each row in nba. Use the .apply(func, axis=1) method on dataframes to apply function func to each row. The function should take row as its first argument. Only use the columns in distance_columns to compute the distance. <a href= "http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.apply.html">Here's</a> more on the method.
# 
# Assign the resulting pandas series to lebron_distance.



import math

selected_player = nba[nba["player"] == "LeBron James"].iloc[0]
distance_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa',
 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast',
 'stl', 'blk', 'tov', 'pf', 'pts']

def euclidean_distance(row):
    inner_value = 0
    for k in distance_columns:
        inner_value += (row[k] - selected_player[k]) ** 2
    return math.sqrt(inner_value)

lebron_distance = nba.apply(euclidean_distance, axis=1)
print("lebron_distance[:5]:\n", lebron_distance[:5])

lebron_distance[:5]:
 0    3475.792868
1    3148.395020
2    3161.567361
3    1189.554979
4    3216.773098
dtype: float64


In [20]:
# ###4: Normalizing columns

# Variables which are much larger in absolute terms have the potential to have a larger impact on distance. This can be bad, because a variable having larger values doesn't necessarily make it better at predicting what rows are similar.
# 
# A simple way to deal with this is to normalize all the columns to have a mean of 0, and a standard deviation of 1. This will ensure that no single column has a dominant impact on the euclidean distance calculations.
# 
# To set the mean to 0, we have to find the mean of a column, then subtract the mean from every value in the column. To set the standard deviation to 1, we divide every value in the column by the standard deviation. The formula is $x=\frac{x-\mu}{\sigma}$.

# ####Instructions

# Normalize the columns in nba_numeric. Using .mean() on a dataframe will return the mean of each column. Using .std() will return the standard deviation of each column.

# In[4]:

nba_numeric = nba[distance_columns]
nba_numeric.head(5)

nba_normalized = (nba_numeric - nba_numeric.mean()) / nba_numeric.std()
nba_normalized.head(5)


# ###5: Finding the nearest neighbor

# We now know enough to find the nearest neighbor of a given row. We can use the distance.euclidean function from scipy.spatial, a much faster way to calculate euclidean distance.

# ####Instructions

# Find the player most similar to LeBron James by our distance metric. You can do this by finding the second lowest value in the euclidean_distances series (the lowest value will correspond to lebron, as he is most similar to himself), and then cross-referencing the nba dataframe with the same index.
# 
# Assign the name of the player to most_similar_to_lebron.

# In[5]:

from scipy.spatial import distance

# Fill in NA values in nba_normalized.
nba_normalized.fillna(0, inplace=True)

# Find the normalized vector for lebron james.
lebron_normalized = nba_normalized[nba["player"] == "LeBron James"]

# Find the distance between lebron james and everyone else.
euclidean_distances = nba_normalized.apply(lambda row: distance.euclidean(row, lebron_normalized), axis=1)
distance_frame = pandas.DataFrame(data={"dist": euclidean_distances, "idx": euclidean_distances.index})
distance_frame.sort_values("dist", inplace=True)

second_smallest = distance_frame.iloc[1]["idx"]

most_similar_to_lebron = nba.loc[int(second_smallest)]["player"]
print("most_similar_to_lebron:", most_similar_to_lebron)


most_similar_to_lebron: Carmelo Anthony


In [21]:
# ###6: Generating training and testing sets

# Now that we know how to find the nearest neighbors, we can make predictions on a test set.
# 
# First, we have to generate test and train sets. In order to do this, we'll use random sampling. We'll randomly shuffle the index of the nba dataframe, and then pick rows using the randomly shuffled values.
# 
# If we didn't do this, we'd end up predicting and training on the same data set, which would overfit. We could do cross validation also, which would be slightly better, but slightly more complex.

# In[6]:

import random
from numpy.random import permutation

# Randomly shuffle the index of nba.
random_indices = permutation(nba.index)

# Set a cutoff for how many items we want in the test set (in this case 1/3 of the items).
test_cutoff = math.floor(len(nba)/3)

# Generate the test set by taking the first 1/3 of the randomly shuffled indices.
test = nba.loc[random_indices[1:test_cutoff]]

# Generate the train set with the rest of the data.
train = nba.loc[random_indices[test_cutoff:]]

In [22]:
# ###7: Using sklearn

# Instead of having to do it all ourselves, we can use the k-nearest neighbors implementation in scikit-learn. 
# 
# Sklearn performs the normalization and distance finding automatically, and lets us specify how many neighbors we want to look at.

# In[10]:

from sklearn.neighbors import KNeighborsRegressor

# The columns that we will be making predictions with.
x_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa',
 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast',
 'stl', 'blk', 'tov', 'pf', 'pts']
# The column that we want to predict.
y_column = ['pts']

# Create the knn model.
knn = KNeighborsRegressor(n_neighbors=5)

# Fit the model on the training data.
knn.fit(train[x_columns], train[y_column])

# Make predictions on the test set using the fit model.
predictions = knn.predict(test[x_columns])

print("predictions[:5]:\n", predictions[:5])




predictions[:5]:
 [[213.4]
 [427. ]
 [  1.6]
 [526.4]
 [757.8]]


In [23]:
# ###8: Computing error

# Now that we know our predictions, we can compute the error involved. 

# ####Instructions

# Compute the mean squared error between actual and predictions. Assign the result to mse.

# In[9]:

actual = test[y_column]

mse = (((predictions - actual) ** 2).sum()) / len(predictions)

print("actual[:20]:\n", actual[:20])
print("mse:", mse)

actual[:20]:
       pts
391   250
405   508
460     0
130   511
264   761
293   280
478  1417
6    1603
268    98
65    159
279   159
315   636
474    48
149   638
357   213
257    15
59   1281
102    16
389    29
145    19
mse: pts    1853.979371
dtype: float64
