In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score

In [2]:
# Import datafile
data = pd.read_csv("nba_2013.csv")
data.head()

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,183,277,40,23,46,63,187,328,2013-2014,2013


In [3]:
data.shape

(481, 31)

In [4]:
data.describe(include='all')

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
count,481,481,481.0,481,481.0,481.0,481.0,481.0,481.0,479.0,...,481.0,481.0,481.0,481.0,481.0,481.0,481.0,481.0,481,481.0
unique,481,7,,31,,,,,,,...,,,,,,,,,1,
top,Rashard Lewis,SG,,TOT,,,,,,,...,,,,,,,,,2013-2014,
freq,1,109,,63,,,,,,,...,,,,,,,,,481,
mean,,,26.509356,,53.253638,25.571726,1237.386694,192.881497,424.463617,0.436436,...,162.817048,218.627859,112.536383,39.280665,24.10395,71.862786,105.869023,516.582121,,2013.0
std,,,4.198265,,25.322711,29.658465,897.25884,171.832793,368.850833,0.098672,...,145.348116,200.356507,131.019557,34.78359,30.875381,62.70169,71.213627,470.422228,,0.0
min,,,19.0,,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,2013.0
25%,,,23.0,,32.0,0.0,388.0,47.0,110.0,0.4005,...,43.0,55.0,20.0,9.0,4.0,21.0,44.0,115.0,,2013.0
50%,,,26.0,,61.0,10.0,1141.0,146.0,332.0,0.438,...,135.0,168.0,65.0,32.0,14.0,58.0,104.0,401.0,,2013.0
75%,,,29.0,,76.0,54.0,2016.0,307.0,672.0,0.4795,...,230.0,310.0,152.0,60.0,32.0,108.0,158.0,821.0,,2013.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 31 columns):
player          481 non-null object
pos             481 non-null object
age             481 non-null int64
bref_team_id    481 non-null object
g               481 non-null int64
gs              481 non-null int64
mp              481 non-null int64
fg              481 non-null int64
fga             481 non-null int64
fg.             479 non-null float64
x3p             481 non-null int64
x3pa            481 non-null int64
x3p.            414 non-null float64
x2p             481 non-null int64
x2pa            481 non-null int64
x2p.            478 non-null float64
efg.            479 non-null float64
ft              481 non-null int64
fta             481 non-null int64
ft.             461 non-null float64
orb             481 non-null int64
drb             481 non-null int64
trb             481 non-null int64
ast             481 non-null int64
stl             481 non-null int64
blk    

In [6]:
data.isna().sum()

player           0
pos              0
age              0
bref_team_id     0
g                0
gs               0
mp               0
fg               0
fga              0
fg.              2
x3p              0
x3pa             0
x3p.            67
x2p              0
x2pa             0
x2p.             3
efg.             2
ft               0
fta              0
ft.             20
orb              0
drb              0
trb              0
ast              0
stl              0
blk              0
tov              0
pf               0
pts              0
season           0
season_end       0
dtype: int64

In [7]:
data.duplicated().any()

False

In [8]:
data.drop('player', axis=1, inplace=True)

In [9]:
data.drop('bref_team_id', axis=1, inplace=True)

In [10]:
data.drop('season', axis=1, inplace=True)

In [11]:
data.drop('season_end', axis=1, inplace=True)

In [12]:
data.columns

Index(['pos', 'age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa',
       'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb',
       'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts'],
      dtype='object')

In [13]:
data['pos'].value_counts()

SG    109
SF     99
PF     96
C      90
PG     85
F       1
G       1
Name: pos, dtype: int64

In [14]:
pos_new = pd.get_dummies(data['pos'], prefix='pos', drop_first=True)
pos_new.head()

Unnamed: 0,pos_F,pos_G,pos_PF,pos_PG,pos_SF,pos_SG
0,0,0,0,0,1,0
1,0,0,0,0,0,0
2,0,0,1,0,0,0
3,0,0,0,0,0,1
4,0,0,0,0,0,0


In [15]:
data.drop('pos', axis=1, inplace=True)

In [16]:
data = pd.concat([data, pos_new], axis=1)

In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 32 columns):
age       481 non-null int64
g         481 non-null int64
gs        481 non-null int64
mp        481 non-null int64
fg        481 non-null int64
fga       481 non-null int64
fg.       479 non-null float64
x3p       481 non-null int64
x3pa      481 non-null int64
x3p.      414 non-null float64
x2p       481 non-null int64
x2pa      481 non-null int64
x2p.      478 non-null float64
efg.      479 non-null float64
ft        481 non-null int64
fta       481 non-null int64
ft.       461 non-null float64
orb       481 non-null int64
drb       481 non-null int64
trb       481 non-null int64
ast       481 non-null int64
stl       481 non-null int64
blk       481 non-null int64
tov       481 non-null int64
pf        481 non-null int64
pts       481 non-null int64
pos_F     481 non-null uint8
pos_G     481 non-null uint8
pos_PF    481 non-null uint8
pos_PG    481 non-null uint8
pos_SF    481 n

In [18]:
data.isna().sum()

age        0
g          0
gs         0
mp         0
fg         0
fga        0
fg.        2
x3p        0
x3pa       0
x3p.      67
x2p        0
x2pa       0
x2p.       3
efg.       2
ft         0
fta        0
ft.       20
orb        0
drb        0
trb        0
ast        0
stl        0
blk        0
tov        0
pf         0
pts        0
pos_F      0
pos_G      0
pos_PF     0
pos_PG     0
pos_SF     0
pos_SG     0
dtype: int64

In [19]:
data['fg.'].fillna(data['fg.'].median(),inplace=True)

In [20]:
data['x3p.'].fillna(data['x3p.'].median(),inplace=True)

In [21]:
data['x2p.'].fillna(data['x2p.'].median(),inplace=True)

In [22]:
data['efg.'].fillna(data['efg.'].median(),inplace=True)

In [23]:
data['ft.'].fillna(data['ft.'].median(),inplace=True)

In [24]:
data.isna().sum()

age       0
g         0
gs        0
mp        0
fg        0
fga       0
fg.       0
x3p       0
x3pa      0
x3p.      0
x2p       0
x2pa      0
x2p.      0
efg.      0
ft        0
fta       0
ft.       0
orb       0
drb       0
trb       0
ast       0
stl       0
blk       0
tov       0
pf        0
pts       0
pos_F     0
pos_G     0
pos_PF    0
pos_PG    0
pos_SF    0
pos_SG    0
dtype: int64

In [25]:
df_y = data['pts']
df_y.head()

0     171
1     265
2     362
3    1330
4     328
Name: pts, dtype: int64

In [26]:
df_x = data.drop(['pts'], 1)
df_x.head()

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,stl,blk,tov,pf,pos_F,pos_G,pos_PF,pos_PG,pos_SF,pos_SG
0,23,63,0,847,66,141,0.468,4,15,0.266667,...,23,26,30,122,0,0,0,0,1,0
1,20,81,20,1197,93,185,0.503,0,0,0.330976,...,40,57,71,203,0,0,0,0,0,0
2,27,53,12,961,143,275,0.52,0,0,0.330976,...,24,36,39,108,0,0,1,0,0,0
3,28,73,73,2552,464,1011,0.459,128,300,0.426667,...,35,3,146,136,0,0,0,0,0,1
4,25,56,30,951,136,249,0.546,0,1,0.0,...,23,46,63,187,0,0,0,0,0,0


In [27]:
from sklearn import preprocessing
normalized_nba_feature = preprocessing.normalize(df_x)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(normalized_nba_feature, df_y, train_size=0.75, test_size=0.25, random_state=100)

In [29]:
for K in range(25):
    K_value = K+1
    neigh = KNeighborsClassifier(n_neighbors = K_value, weights='distance', algorithm='auto', metric='minkowski')
    neigh.fit(X_train, y_train)
    y_pred = neigh.predict(X_test)
    print("Accuracy is", accuracy_score(y_test,y_pred)*1000,"% for K-value:", K_value)

Accuracy is 57.85123966942149 % for K-value: 1
Accuracy is 57.85123966942149 % for K-value: 2
Accuracy is 66.11570247933885 % for K-value: 3
Accuracy is 57.85123966942149 % for K-value: 4
Accuracy is 49.586776859504134 % for K-value: 5
Accuracy is 49.586776859504134 % for K-value: 6
Accuracy is 57.85123966942149 % for K-value: 7
Accuracy is 57.85123966942149 % for K-value: 8
Accuracy is 57.85123966942149 % for K-value: 9
Accuracy is 57.85123966942149 % for K-value: 10
Accuracy is 66.11570247933885 % for K-value: 11
Accuracy is 66.11570247933885 % for K-value: 12
Accuracy is 66.11570247933885 % for K-value: 13
Accuracy is 66.11570247933885 % for K-value: 14
Accuracy is 66.11570247933885 % for K-value: 15
Accuracy is 66.11570247933885 % for K-value: 16
Accuracy is 66.11570247933885 % for K-value: 17
Accuracy is 57.85123966942149 % for K-value: 18
Accuracy is 57.85123966942149 % for K-value: 19
Accuracy is 57.85123966942149 % for K-value: 20
Accuracy is 49.586776859504134 % for K-value: 2

It shows that we are getting 66.11% accuracy on K = 3,11,12,13,14,15,16,17. Choosing a large value of K will lead to greater amount of execution time & underfitting. Selecting the small value of K will lead to overfitting. There is no such guaranteed way to find the best value of K. So, to run it quickly we are considering K = 3 for this problem.