In [3]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

In [41]:
d = pd.read_csv("abalone.data")

4176

In [15]:
# Looks like there are no class labels
d.head()

Unnamed: 0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
3,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
4,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8


In [42]:
column_names = ["Sex","Length","Diameter","Height","Whole weight","Shucked weight","Viscera weight","Shell weight","Rings"]

In [43]:
d.columns = column_names

In [19]:
d.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
3,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
4,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8


In [47]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
d["Sex"] = ordinal_encoder.fit_transform(d[["Sex"]])

In [48]:
d.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,2.0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,0.0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,2.0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
3,1.0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
4,1.0,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8


In [49]:
d.describe()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,4176.0,4176.0,4176.0,4176.0,4176.0,4176.0,4176.0,4176.0,4176.0
mean,1.052682,0.524009,0.407892,0.139527,0.828818,0.3594,0.180613,0.238852,9.932471
std,0.822208,0.120103,0.09925,0.041826,0.490424,0.22198,0.10962,0.139213,3.223601
min,0.0,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.0,0.45,0.35,0.115,0.4415,0.186,0.093375,0.13,8.0
50%,1.0,0.545,0.425,0.14,0.79975,0.336,0.171,0.234,9.0
75%,2.0,0.615,0.48,0.165,1.15325,0.502,0.253,0.329,11.0
max,2.0,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [50]:
from sklearn.utils import shuffle

d_shuffled = shuffle(d)

In [51]:
data_labels = d_shuffled["Rings"]

In [52]:
data = d_shuffled.drop("Rings", axis=1)

In [53]:
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
2597,2.0,0.6,0.46,0.15,1.247,0.5335,0.2735,0.29
2194,1.0,0.26,0.215,0.08,0.099,0.037,0.0255,0.045
2124,2.0,0.47,0.375,0.115,0.4265,0.1685,0.0755,0.15
3168,2.0,0.56,0.45,0.155,0.9125,0.3595,0.271,0.35
685,0.0,0.535,0.4,0.15,0.8045,0.3345,0.2125,0.21


In [58]:
training_data = data.iloc[:3133]
training_labels = data_labels.iloc[:3133]

In [55]:
test_data = data.iloc[3133:]
test_labels = data_labels.iloc[3133:]

In [90]:
def EuclideanDistance(v1, v2):
    sum = 0.0
    for index in range(len(v1)):
        sum += (v1[index] - v2[index]) ** 2
    return sum ** 0.5

In [None]:
dists = []
for i in range(len(d) - 1):
    for j in range(i + 1, len(d)):
        dist = EuclideanDistance(d.iloc[i], d.iloc[j])
        dists.append(dist)
        
fig = plt.hist(dists, 100)

In [78]:
knn_cfr = KNeighborsClassifier(n_neighbors=5)

In [79]:
knn_cfr.fit(training_data, training_labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [80]:
# Test predictions on training data
knn_cfr.predict(training_data)

array([10,  5,  9, ..., 10,  8, 11])

In [81]:
from sklearn.metrics import mean_squared_error

training_predictions = knn_cfr.predict(training_data)

knn_mse = mean_squared_error(training_labels, training_predictions)
knn_rmse = np.sqrt(knn_mse)
knn_rmse

2.4819167247895924

In [77]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_neighbors': [2,3,4,5,6,7,8,9,10]}
]

grid_search = GridSearchCV(knn_cfr, param_grid, cv=3,
                           scoring='f1',
                           return_train_score=True)

In [82]:
#grid_search.fit(training_data, training_labels)

In [84]:
test_predictions = knn_cfr.predict(test_data)

knn_mse = mean_squared_error(test_labels, test_predictions)
knn_rmse = np.sqrt(knn_mse)
knn_rmse

2.6763765376054733

In [88]:
def knn_kiterator(k_val=3):
    knn_cfr = KNeighborsClassifier(k_val)
    knn_cfr.fit(training_data, training_labels)
    training_predictions = knn_cfr.predict(training_data)
    knn_mse = mean_squared_error(training_labels, training_predictions)
    knn_rmse = np.sqrt(knn_mse)
    return knn_rmse

for i in range(1,10,1):
    print("RMSE with k= ",i," is: ", knn_kiterator(i))

RMSE with k=  1  is:  0.0
RMSE with k=  2  is:  2.0537774745812536
RMSE with k=  3  is:  2.418212100005463
RMSE with k=  4  is:  2.4774759351518036
RMSE with k=  5  is:  2.4819167247895924
RMSE with k=  6  is:  2.464882623038243
RMSE with k=  7  is:  2.450662215004823
RMSE with k=  8  is:  2.432753415309486
RMSE with k=  9  is:  2.4996648354965645
