In [243]:
import math
def euclidean_dist(x1,y1,x2,y2):
    return math.sqrt((x1-x2)**2 + (y1-y2)**2)

In [244]:
def gaussian(dist):
    return 1./((math.sqrt(2.*math.pi))*math.exp(-dist/2))

In [251]:
import operator
class KNN(object):
    
    def __init__(self, x, y, k, weighted = False):
        assert(k <= len(x))
        self.x = x
        self.y = y
        self.k = k
        self.weighted = weighted
        
    def predict(self, test):
        predictions = []
        for i, j in test.values:
            distances = []
            
            for id_x, (l,m) in enumerate(self.x.values):
                dist = euclidean_dist(i,j,l,m)
                distances.append((self.y[id_x], dist))
            distances.sort(key=operator.itemgetter(1))
            v = 0
            total_weight = 0
            
            for i in range(self.k):
                if(self.weighted):
                    weight = gaussian(distances[i][1])
                    v += distances[i][0]*weight
                    total_weight += weight
                else:
                    v += distances[i][0]
                
            if(self.weighted):
                predictions.append(v/total_weight)
            else:
                predictions.append(v/self.k)
        return predictions

In [252]:
import pandas as pd

data = pd.read_csv("automobiles.csv")
data.head
#features = data.iloc[:, []]

<bound method NDFrame.head of       mpg  cyl  displ   hp  weight  accel          yr  origin  \
0    18.0    8  307.0  130    3504   12.0  1970-01-01      US   
1    15.0    8  350.0  165    3693   11.5  1970-01-01      US   
2    18.0    8  318.0  150    3436   11.0  1970-01-01      US   
3    16.0    8  304.0  150    3433   12.0  1970-01-01      US   
4    17.0    8  302.0  140    3449   10.5  1970-01-01      US   
..    ...  ...    ...  ...     ...    ...         ...     ...   
387  27.0    4  140.0   86    2790   15.6  1982-01-01      US   
388  44.0    4   97.0   52    2130   24.6  1982-01-01  Europe   
389  32.0    4  135.0   84    2295   11.6  1982-01-01      US   
390  28.0    4  120.0   79    2625   18.6  1982-01-01      US   
391  31.0    4  119.0   82    2720   19.4  1982-01-01      US   

                          name  
0    chevrolet chevelle malibu  
1            buick skylark 320  
2           plymouth satellite  
3                amc rebel sst  
4                  ford 

In [253]:
features = data.iloc[:, [2,3]]
features.head

<bound method NDFrame.head of      displ   hp
0    307.0  130
1    350.0  165
2    318.0  150
3    304.0  150
4    302.0  140
..     ...  ...
387  140.0   86
388   97.0   52
389  135.0   84
390  120.0   79
391  119.0   82

[392 rows x 2 columns]>

In [254]:
target = data.mpg.values

In [255]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(313, 2) (313,)
(79, 2) (79,)


In [256]:
from sklearn.metrics import mean_squared_error

k_list = [1,3,10,20]
for k in k_list:
    model = KNN(X_train,y_train,k)
    prediciton = model.predict(X_test)
    error = mean_squared_error(y_test, prediciton)
    print(error)

18.072151898734173
12.409015471167372
12.340588607594945
11.644805696202537
