## KNN
K近邻（KNN）是一种基本的分类与回归算法。用于分类时，输入为实例的特征向量（对应于特征空间中的点），输出为实例的类别。

In [1]:
import numpy as np
from sklearn import datasets

In [2]:
# Euclidean distance
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two equal-length vectors.

    Args:
        x1: Sequence of numeric coordinates (list, tuple or 1-D array).
        x2: Sequence of numeric coordinates with the same length as ``x1``.

    Returns:
        The distance as a Python float. Two empty vectors are at distance 0.0.

    Raises:
        ValueError: If ``x1`` and ``x2`` do not have the same length.
    """
    # Imported locally so the function works regardless of whether a
    # module-level ``import math`` has already been executed.
    import math

    if len(x1) != len(x2):
        raise ValueError(
            "x1 and x2 must have the same length, got %d and %d"
            % (len(x1), len(x2))
        )
    # Sum of squared per-coordinate differences, then the square root.
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(x1, x2)))

In [3]:
# Compute accuracy
def accuracy_score(y_true, y_pred):
    """Return the fraction of entries in ``y_pred`` that equal ``y_true``.

    Args:
        y_true: Ground-truth labels (array-like).
        y_pred: Predicted labels (array-like), same shape as ``y_true``.

    Returns:
        The number of element-wise matches (summed along axis 0) divided by
        ``len(y_true)``.
    """
    # Coerce to arrays so that ``==`` compares element-wise; on plain Python
    # lists it would yield a single bool and the score would be wrong.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.sum(y_true == y_pred, axis=0) / len(y_true)

In [4]:
import math

def normalize(X, axis=-1, order=2):
    """Divide ``X`` by its vector norm taken along ``axis``.

    Args:
        X: Array of values to scale.
        axis: Axis along which the norm is computed (default: last axis).
        order: Norm order passed to ``np.linalg.norm`` (default: 2).

    Returns:
        ``X`` divided by the norms; slices whose norm is zero are divided by
        1 and therefore returned unchanged.
    """
    # np.atleast_1d guarantees an indexable array even for a scalar norm.
    norms = np.atleast_1d(np.linalg.norm(X, ord=order, axis=axis))
    # Replace zero norms with 1 to avoid division by zero.
    zero_mask = norms == 0
    norms[zero_mask] = 1
    # Re-insert the reduced axis so the division broadcasts against X.
    divisor = np.expand_dims(norms, axis)
    return X / divisor

In [8]:
def shuffle_data(X, y, seed=None):
    """Return ``X`` and ``y`` shuffled with the same random permutation.

    Args:
        X: Array of samples, indexed along its first axis.
        y: Array of labels, one per row of ``X``.
        seed: Optional seed for NumPy's global random generator. Any value
            other than ``None`` (including ``0``) is applied.

    Returns:
        A tuple ``(X_shuffled, y_shuffled)``; the inputs are not modified.
    """
    # Compare against None explicitly: a truthiness test would skip seed 0.
    if seed is not None:
        np.random.seed(seed)
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)
    return X[idx], y[idx]

In [9]:
# Split the dataset into training and test parts
def train_test_split(X, y, test_size=0.5, shuffle=True, seed=None):
    """Split ``X`` and ``y`` into a leading training part and a trailing test part.

    Args:
        X: Samples, sliceable along the first axis.
        y: Labels, one per sample.
        test_size: Fraction of samples assigned to the test set, in [0, 1].
            The test set receives ``int(len(y) // (1 / test_size))`` samples;
            ``0`` yields an empty test set.
        shuffle: If true, shuffle the data with ``shuffle_data`` first.
        seed: Seed forwarded to ``shuffle_data``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``test_size`` is outside the range [0, 1].
    """
    if not 0 <= test_size <= 1:
        raise ValueError("test_size must be between 0 and 1, got %r" % (test_size,))

    if shuffle:
        X, y = shuffle_data(X, y, seed)  # shuffle before splitting

    # Guard test_size == 0, which would otherwise divide by zero below.
    n_test = int(len(y) // (1 / test_size)) if test_size else 0
    split_i = len(y) - n_test
    X_train, X_test = X[:split_i], X[split_i:]
    y_train, y_test = y[:split_i], y[split_i:]

    return X_train, X_test, y_train, y_test

In [10]:
# K-nearest-neighbours classifier
class KNN(object):
    """K-nearest-neighbours classifier using Euclidean distance and majority vote.

    Args:
        k: Number of nearest training samples that vote on each prediction.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """

    def __init__(self, k=5):
        if k < 1:
            raise ValueError("k must be at least 1, got %r" % (k,))
        # Honour the caller's choice of k instead of a fixed constant.
        self.k = k

    def _vote(self, neighbours):
        """Return the most common label among ``neighbours``.

        The leading underscore marks this as an internal helper.

        Args:
            neighbours: 2-D array whose second column holds the labels. Labels
                are cast to int and must be non-negative for ``np.bincount``.

        Returns:
            The label with the highest count; ``argmax`` picks the smallest
            label when counts are tied.
        """
        counts = np.bincount(neighbours[:, 1].astype('int'))
        return counts.argmax()

    def predict(self, X_test, X_train, y_train):
        """Predict a label for every row of ``X_test``.

        Args:
            X_test: 2-D array of samples to classify.
            X_train: 2-D array of training samples.
            y_train: Labels for the rows of ``X_train``.

        Returns:
            A float array with one predicted label per row of ``X_test``.
        """
        y_pred = np.empty(X_test.shape[0])
        # Loop over every test sample.
        for i, test in enumerate(X_test):
            # One (distance, label) row per training sample.
            neighbours = np.empty((X_train.shape[0], 2))
            for j, train in enumerate(X_train):
                neighbours[j] = [euclidean_distance(train, test), y_train[j]]
            # Sort by distance and keep the k closest training samples.
            order = neighbours[:, 0].argsort()
            k_nearest_neighbors = neighbours[order][:self.k]
            y_pred[i] = self._vote(k_nearest_neighbors)
        return y_pred

In [11]:
# Load the iris dataset bundled with scikit-learn.
data = datasets.load_iris()
# Scale the features with normalize()'s defaults (L2 norm along the last axis).
X = normalize(data.data)
y = data.target
# Hold out roughly a third of the samples for testing. No seed is passed, so
# the shuffled split (and the accuracy computed from it) varies between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [12]:
# Build the classifier; it has no separate fit step, so the training data is
# passed directly to predict().
clf = KNN(k=5)
y_pred = clf.predict(X_test, X_train, y_train)
# Fraction of test samples whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)

In [13]:
accuracy

0.97959183673469385