In [33]:
#Importing Libraries required for performance evaluation and KNNClassifier algorithm building

import numpy as np
import pandas as pd
import operator
from math import sqrt
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import confusion_matrix as cm

In [2]:
#loading data
nbaData = pd.read_csv("nba_logreg.csv",header = 0)
nbaData.head()

Unnamed: 0,Name,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,Brandon Ingram,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,...,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0.0
1,Andrew Harrison,35,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,...,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0.0
2,JaKarr Sampson,74,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,...,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0.0
3,Malik Sealy,58,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,...,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1.0
4,Matt Geiger,48,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,...,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1.0


In [3]:
nbaData.shape

(1340, 21)

In [4]:
#number of entity in each rows
nbaData.count(axis="rows")

Name           1340
GP             1340
MIN            1340
PTS            1340
FGM            1340
FGA            1340
FG%            1340
3P Made        1340
3PA            1340
3P%            1329
FTM            1340
FTA            1340
FT%            1340
OREB           1340
DREB           1340
REB            1340
AST            1340
STL            1340
BLK            1340
TOV            1340
TARGET_5Yrs    1340
dtype: int64

In [5]:
#Droping NA Value
nbaData = nbaData.dropna()

In [6]:
#Checking Entity again
nbaData.count(axis="rows")

Name           1329
GP             1329
MIN            1329
PTS            1329
FGM            1329
FGA            1329
FG%            1329
3P Made        1329
3PA            1329
3P%            1329
FTM            1329
FTA            1329
FT%            1329
OREB           1329
DREB           1329
REB            1329
AST            1329
STL            1329
BLK            1329
TOV            1329
TARGET_5Yrs    1329
dtype: int64

In [7]:
#Last 5 attributes
nbaData.tail()

Unnamed: 0,Name,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
1335,Chris Smith,80,15.8,4.3,1.6,3.6,43.3,0.0,0.2,14.3,...,1.5,79.2,0.4,0.8,1.2,2.5,0.6,0.2,0.8,0.0
1336,Brent Price,68,12.6,3.9,1.5,4.1,35.8,0.1,0.7,16.7,...,1.0,79.4,0.4,1.1,1.5,2.3,0.8,0.0,1.3,1.0
1337,Marlon Maxey,43,12.1,5.4,2.2,3.9,55.0,0.0,0.0,0.0,...,1.6,64.3,1.5,2.3,3.8,0.3,0.3,0.4,0.9,0.0
1338,Litterial Green,52,12.0,4.5,1.7,3.8,43.9,0.0,0.2,10.0,...,1.8,62.5,0.2,0.4,0.7,2.2,0.4,0.1,0.8,1.0
1339,Jon Barry,47,11.7,4.4,1.6,4.4,36.9,0.4,1.3,33.3,...,1.0,67.3,0.2,0.7,0.9,1.4,0.7,0.1,0.9,1.0


In [8]:
#Checking Shape of Data set
nbaData.shape

(1329, 21)

In [9]:
#Dividing train and test data for library value to predict the outcome
train_data = np.array(nbaData.iloc[0:1000,1:]).tolist()
test_data = np.array(nbaData.iloc[1000:,1:]).tolist()

In [24]:
#Euclidean Distance for two point
def euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1)-1):
        distance += (row1[i] - row2[i])**2
    return sqrt(distance)

#Get neighbors and get predicted value for particular test value. Train Data, test value and K value are parameter
def get_neighbors(train,test,neighbor):
    #Distances stored multiple value of distances based on train and test value
    distances = []
    for train_value in train:
        distance_func = euclidean_distance(test,train_value)
        distances.append((train_value,distance_func))
#Below function will sort the list in an ascending manner. 
    distances.sort(key=lambda tup:tup[1])
    neighbors = list()
#Getting neighbor with the help of k value and get predicted value 
    for i in range(neighbor):
        neighbors.append(distances[i][0])
    outputVal = [row[-1] for row in neighbors]
#max with count will return the mode value of K
    predictedval = max(set(outputVal), key= outputVal.count)
    return predictedval

#A function is defined to use when test rows are more than one. It will help to get predicted value for all test rows
def getAllneighbors(train,test,k):
    output = []
    neig = []
    for j in test:
        predictval = get_neighbors(train,j,k)
        #Neig help to check if we receive all neighbor for all test case or not
        #neig.append((j,predictval))
        output.append((predictval))
    return output

In [25]:
#Calculate KNN when k value is 5
kVal5 = getAllneighbors(train_data,test_data,5)

In [26]:
#Calculate KNN when k value is 25
kVal25 = getAllneighbors(train_data,test_data,25)

In [27]:
#Calculate KNN when k value is 51
kVal51 = getAllneighbors(train_data,test_data,51)

In [28]:
#Creating train and test with response and predictive data to use for Sklearn ML Library

#train response and predictive variable
train_response = nbaData.iloc[0:1000,1:-1]
train_predict = nbaData.iloc[0:1000,-1]

#test response and predictive varibale
test_response = nbaData.iloc [1000:,1:-1]
test_predidct = nbaData.iloc[1000:,-1].tolist()

In [37]:
#Training model with Sklearn library where K value is 5 and P value= 2 define euclidean distance
model5 = KNeighborsClassifier(n_neighbors=5, p=2)
#Training model with train data set
model5.fit(train_response,train_predict)
#predicting data set 
y_pred5 = model5.predict(test_response).tolist()

#Calculating accuracy of the model using Confusion matrix
cm(test_pred,y_pred5)

array([[ 59,  51],
       [ 61, 158]])

In [38]:
#Training model with Sklearn library where K value is 25 and P value= 2 define euclidean distance
model25 = KNeighborsClassifier(n_neighbors=25, p=2)
#Training model with train data set
model25.fit(train_response,train_predict)
#predicting data set 
y_pred25 = model25.predict(test_response).tolist()

#Calculating accuracy of the model using Confusion matrix
cm(test_pred,y_pred25)

array([[ 67,  43],
       [ 61, 158]])

In [39]:
#Training model with Sklearn library where K value is 51 and P value= 2 define euclidean distance
model51 = KNeighborsClassifier(n_neighbors=5, p=2)
#Training model with train data set
model51.fit(train_response,train_predict)
#predicting data set 
y_pred51 = model51.predict(test_response).tolist()

#Calculating accuracy of the model using Confusion matrix
cm(test_pred,y_pred51)

array([[ 59,  51],
       [ 61, 158]])

In [45]:
#Checking accuracy for all KValue fetched using Function
print("Accuracy when K value is 5:\n",cm(test_pred,kVal5))
print("\n")
print("Accuracy when K value is 25:\n",cm(test_pred,kVal25))
print("\n")
print("Accuracy when K value is 51:\n",cm(test_pred,kVal51))

Accuracy when K value is 5:
 [[ 57  53]
 [ 61 158]]


Accuracy when K value is 25:
 [[ 67  43]
 [ 61 158]]


Accuracy when K value is 51:
 [[ 66  44]
 [ 60 159]]


In [47]:
#Cross verifying accuracy with Sklearn library based k value and function based k value
print("Accuracy when K value is 5:\n",cm(y_pred5,kVal5))
print("\n")
print("Accuracy when K value is 25:\n",cm(y_pred25,kVal25))
print("\n")
print("Accuracy when K value is 51:\n",cm(y_pred51,kVal51))
print("\n")

Accuracy when K value is 5:
 [[118   2]
 [  0 209]]


Accuracy when K value is 25:
 [[128   0]
 [  0 201]]


Accuracy when K value is 51:
 [[ 94  26]
 [ 32 177]]


