In [63]:
import numpy as np 
import pandas as pd
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import shuffle
from sklearn import linear_model, preprocessing
import statistics as st

In [37]:
car_data = pd.read_csv('car.data', names=["buying","maintenance","door","persons","lug_boot","safety","class"]
                      ) #use names to add column label
car_data.head()

Unnamed: 0,buying,maintenance,door,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [40]:
#let's the preprocessing module to encode our text into integers that we can work with for kneighbors
#Encode target labels with value between 0 and n_classes-1.
encoder = preprocessing.LabelEncoder() #this object will encode our labels into appropriate values
buying = encoder.fit_transform(list(car_data["buying"])) #here we are encoding the buying label of course 
maint = encoder.fit_transform(list(car_data["maintenance"])) #Fit label encoder and return encoded labels, parameters are array like so convert to list in there
door = encoder.fit_transform(list(car_data["door"]))
persons = encoder.fit_transform(list(car_data["persons"]))
lug_boot = encoder.fit_transform(list(car_data["lug_boot"]))
safety = encoder.fit_transform(list(car_data["safety"]))
class1 = encoder.fit_transform(list(car_data["class"]))

#label we are predicting 
predict = "class"
print(buying) #without the list passed in it does not convert to dtype int 64, this is without the print
print(maint) #with the list passed in you can see the dtype int 64 array returned, this is without the print

[3 3 3 ... 1 1 1]
[3 3 3 ... 1 1 1]


In [48]:
# X will be our features and y will be our labels 
X = list(zip(buying, maint, door, persons, lug_boot, safety)) #zip method converts values into tuple objects with values corresponsing to list given
y = list(class1) #convert to list
#now let's split the data! 
train_x, test_x, train_y, test_y = sklearn.model_selection.train_test_split(X, y, test_size=0.33)
#print(train_x, "\n", train_y, "\n", test_x, "\n", test_y) #proportions look about right but I'm eyeballing this 
#lets create the model 
model = KNeighborsClassifier(n_neighbors=5) #play with the k number or neighbors to see what works better, this is the hyper-parameter
model.fit(train_x, train_y)
accuracy = model.score(test_x, test_y)
accuracy #not bad!

0.8931698774080561

In [81]:
#lets create the model testing different hyperparameter numbers of k
best_k = []
lists = []

hyper_params = [1,2,3,4,5,6,7,8,9] #test k=1... k=9
#we are going to test each k 1000 times then look at the mean, mode and max and possibly variance to see which one is best
for k in hyper_params:
    k_list = [] #create a list for each k to add the scores into
    best = 0 #calculate the best score for each k and put into separate list
    for i in range(20):
        train_x, test_x, train_y, test_y = sklearn.model_selection.train_test_split(X, y, test_size=0.33) #split the data
        model = KNeighborsClassifier(n_neighbors=k) #create a model for each k 
        model.fit(train_x, train_y)
        accuracy = model.score(test_x, test_y)
        k_list.append(accuracy) #add the accuracy score to the list for that k
        if accuracy > best: #track the best for each k
            best = accuracy
    best_k.append(best) #for the best score in each k in hyper_parameters, add them to a list 
    lists.append(k_list) #add the all the lists with all of the scores to a list of lists of all the scores
print("best in each k ",best_k) 
print("\n",lists)
means = []
for l in lists:
    mean = st.mean(l)
    print(mean)
    means.append(mean) #find the means and add them to a list to see which k has the highest mean
print("\n the means: ",means)
max_mean = max(means)
max_accuracy = max(float(n) for n in best_k)
the_max = max(lists)
max_k = max(best_k)

print("\n max mean", max_mean)
print("\n max accuracy", max_accuracy)
print("\n the max", the_max)
print("\n the max from best k", max_k) #looks like k=7 had the highest max but k=7

best in each k  [0.840630472854641, 0.8073555166374781, 0.8914185639229422, 0.8774080560420315, 0.9194395796847635, 0.9281961471103327, 0.9404553415061296, 0.9106830122591943, 0.9071803852889667]

 [[0.8003502626970228, 0.7845884413309983, 0.7968476357267951, 0.7845884413309983, 0.7670753064798599, 0.8038528896672504, 0.8126094570928196, 0.7495621716287215, 0.8388791593695272, 0.840630472854641, 0.7863397548161121, 0.7723292469352014, 0.7810858143607706, 0.7828371278458844, 0.7740805604203153, 0.8248686514886164, 0.7688266199649737, 0.8231173380035026, 0.7863397548161121, 0.7530647985989493], [0.7460595446584939, 0.7478108581436077, 0.7635726795096323, 0.7828371278458844, 0.7408056042031523, 0.7670753064798599, 0.8073555166374781, 0.7513134851138353, 0.7740805604203153, 0.8021015761821366, 0.7688266199649737, 0.8021015761821366, 0.7845884413309983, 0.7425569176882661, 0.7338003502626971, 0.7635726795096323, 0.7845884413309983, 0.7688266199649737, 0.7635726795096323, 0.7267950963222417]

In [84]:
#lets play with kneighbors
predicted = model.predict(test_x)
for x in range(len(predicted)):
    n = model.kneighbors([test_x[x]],9)#add the brackets bc if you're pass in one value it's still a 2d array since it only takes 2d arrays
    print("N: ", n) #ugly but shows what we want. gives us distance with 1 and another array after "array[...]" with indexes
#https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
"""Returns

    neigh_distarray, shape (n_queries, n_neighbors)

        Array representing the lengths to points, only present if return_distance=True
    neigh_indarray, shape (n_queries, n_neighbors)

        Indices of the nearest points in the population matrix."""

N:  (array([[1.        , 1.        , 1.        , 1.41421356, 1.41421356,
        1.41421356, 1.41421356, 1.41421356, 1.41421356]]), array([[ 586,  394,  611, 1048,  754,  625,  783,  747,  628]],
      dtype=int64))
N:  (array([[1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.41421356, 1.41421356, 1.41421356]]), array([[335, 880, 929, 665,  90, 786, 754, 872,   5]], dtype=int64))
N:  (array([[1.        , 1.        , 1.        , 1.        , 1.        ,
        1.41421356, 1.41421356, 1.41421356, 1.41421356]]), array([[569, 949, 503, 633, 983, 485, 810, 842, 693]], dtype=int64))
N:  (array([[1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.41421356, 1.41421356]]), array([[437, 528, 263, 491, 701,  53,  58, 933, 242]], dtype=int64))
N:  (array([[1.        , 1.        , 1.        , 1.        , 1.        ,
        1.41421356, 1.41421356, 1.41421356, 1.41421356]]), array([[ 754,  783, 1123,  988, 1048,  532,  394,

'Returns\n\n    neigh_distarray, shape (n_queries, n_neighbors)\n\n        Array representing the lengths to points, only present if return_distance=True\n    neigh_indarray, shape (n_queries, n_neighbors)\n\n        Indices of the nearest points in the population matrix.'