# K Nearest Neighbors

[Reference 1](https://scikit-learn.org/stable/modules/neighbors.html)  
[Reference 2](https://en.wikipedia.org/wiki/Nearest_neighbor_search)  
[Reference 3](https://booking.ai/k-nearest-neighbours-from-slow-to-fast-thanks-to-maths-bec682357ccd)

### Naive Approach  
The original dataset is not transformed (there is no training step).  
To predict, iterate through the original dataset and find the K nearest data points according to a given metric.  

One option is Euclidean distance:
$$\text{dist}(X_1, X_2) = \|X_1 - X_2\|$$

A scale-invariant alternative is cosine similarity:
$$\text{sim}(X_1, X_2) = \frac{X_1 \cdot X_2}{\|X_1\| \|X_2\|}$$
which computes the cosine of the angle between the two vectors:  
it is 1 when the angle is 0 and less than 1 for angles in $(0, \pi]$

In [1]:
include("../tools.jl")
import .JuTools

In [2]:
import Statistics
import Random
import LinearAlgebra

In [3]:
# Cosine similarity between two equally sized 1-D numeric vectors:
#     dot(X1, X2) / (norm(X1) * norm(X2))
# The result is 1 for vectors pointing in the same direction and smaller otherwise.
# If either vector has zero norm the angle is undefined; 0.0 is returned instead of
# the NaN that 0/0 would produce, because a NaN score sorts ahead of every real
# score under `sort!(...; rev=true)` and would masquerade as the best match.
# Any AbstractArray is accepted (e.g. views), not only Array.
# Fails an assertion when the inputs differ in size or are not 1-D.
function cosine_sim(X1::AbstractArray{<:Number}, X2::AbstractArray{<:Number})::AbstractFloat
    @assert size(X1) == size(X2)
    @assert ndims(X1) == ndims(X2) == 1
    # Product of the Euclidean (2-)norms: the denominator of the cosine.
    denom = LinearAlgebra.norm(X1, 2) * LinearAlgebra.norm(X2, 2)
    # Zero-length direction: report "no similarity" rather than NaN.
    iszero(denom) && return zero(denom)
    return LinearAlgebra.dot(X1, X2) / denom
end

cosine_sim (generic function with 1 method)

In [4]:
# Build the demo dataset with the JuTools helper, then print the shapes of the
# feature array and the label array.
X_data, Y_data = JuTools.data_generate_linear_2d()
println(size(X_data))
println(size(Y_data))

(1000, 2)
(1000,)


In [5]:
# Show the first two feature rows.
X_data[1:2, :]

2×2 Array{Float64,2}:
 75.3  91.3
  3.4  38.5

In [6]:
# Cosine similarity between feature rows 1 and 2.
cosine_sim(X_data[1, :], X_data[2, :])

0.8244479883848689

In [7]:
# Cosine similarity between feature rows 1 and 3.
cosine_sim(X_data[1, :], X_data[3, :])

0.8140504040179221

In [8]:
# Majority vote: return the most frequent value in the 1-D vote vector `y`.
# Ties are broken deterministically in favour of the tied value that appears
# first in `y` (for KNN votes ordered by closeness, that is the label of the
# nearest tied neighbour), instead of depending on Dict iteration order.
# Throws ArgumentError for an empty `y`; fails an assertion if `y` is not 1-D.
function majority_vote(y::AbstractArray{<:Number})::Number
    @assert ndims(y) == 1
    isempty(y) && throw(ArgumentError("majority_vote needs at least one vote"))
    # Count occurrences of each distinct value.
    tally = Dict{eltype(y), Int}()
    for vote in y
        tally[vote] = get(tally, vote, 0) + 1
    end
    # Scan in input order; only a strictly larger count replaces the current
    # winner, so among tied values the earliest one is kept.
    winner = first(y)
    winner_count = 0
    for vote in y
        n = tally[vote]
        if n > winner_count
            winner, winner_count = vote, n
        end
    end
    return winner
end

majority_vote (generic function with 1 method)

In [9]:
# Clear majority: 1 appears twice, 0 once.
majority_vote([1,1,0])

1

Note: when there is a tie, the returned label depends on ordering, not just on the vote counts

In [10]:
# Tie: both labels appear twice.
majority_vote([1,1,0,0])

1

In [11]:
# 0 now has three votes against two for 1.
majority_vote([1,1,0,0,0])

0

In [12]:
# Split features and labels into train/test parts with the JuTools helper
# (shuffle=true, ratio=0.3), then print the shape of each part.
X_train, X_test, Y_train, Y_test = JuTools.split_data(X_data, Y_data, shuffle=true, ratio=0.3)
println(size(X_train))
println(size(X_test))
println(size(Y_train))
println(size(Y_test))

(700, 2)
(300, 2)
(700,)
(300,)


In [17]:
# Naive K-nearest-neighbours classifier (no training step).
#
# For every row of `X_predict`, score it against every row of `X_data` with
# `metric`, pick the K best-scoring rows and return the majority label among
# their entries in `Y_data`.
#
# Arguments
#   X_predict : 2-D array with one sample per row, or a 1-D array holding a
#               single sample (it is reshaped to one row).
#   K         : number of neighbours, 1 <= K <= number of rows in X_data.
#   X_data    : 2-D reference samples, one per row.
#   Y_data    : 1-D labels, one per row of X_data.
# Keywords
#   metric    : function (query_row, data_row) -> score; defaults to cosine_sim.
#   rev       : true when a larger score means "closer" (similarity, the default),
#               false when a smaller score means "closer" (distance).
# Returns an Array{Number} with one predicted label per row of X_predict.
# Fails an assertion when the shapes are inconsistent or K is out of range.
function predict_naive(X_predict::Array{T} where T<:Number, K::Integer, X_data::Array{T} where T<:Number, Y_data::Array{T} where T<:Number; metric=cosine_sim, rev::Bool=true)::Array
    @assert ndims(X_data) == 2
    @assert ndims(Y_data) == 1
    @assert size(X_data)[1] == size(Y_data)[1]
    @assert 0 < ndims(X_predict) <= 2
    n_data = size(X_data)[1]
    # K == n_data is allowed: every reference sample then takes part in the vote.
    @assert 0 < K <= n_data
    if ndims(X_predict) < 2
        # A single sample given as a vector becomes a 1-row matrix.
        X_predict = reshape(X_predict, (1, size(X_predict)[1]))
    end
    @assert size(X_predict)[2] == size(X_data)[2]
    # Row slices copy, so extract the reference rows once instead of once per query.
    data_rows = [X_data[j, :] for j in 1:n_data]
    # NaN compares as larger than every real number when sorting, so with rev=true
    # an undefined score (e.g. cosine similarity against an all-zero row) would rank
    # first. Map it to -Inf so it ranks last; with rev=false NaN already sorts last.
    rank_key = rev ? (m -> isnan(m[2]) ? oftype(m[2], -Inf) : m[2]) : (m -> m[2])
    result = Array{Number}(undef, size(X_predict)[1])
    # Scratch buffer of (row index, score); fully overwritten for every query.
    sim = Array{Tuple{Int, AbstractFloat}}(undef, n_data)
    for i in 1:size(X_predict)[1]
        vec_predict = X_predict[i, :]
        for j in 1:n_data
            sim[j] = (j, metric(vec_predict, data_rows[j]))
        end
        sort!(sim, by=rank_key, rev=rev)
        # Labels of the K best-ranked rows, nearest first.
        K_nearest_votes = Y_data[[m[1] for m in sim[1:K]]]
        result[i] = majority_vote(K_nearest_votes)
    end
    return result
end

predict_naive (generic function with 1 method)

In [18]:
# Predict a label for every test row using K = 5 neighbours from the training split.
Y_predict = predict_naive(X_test, 5, X_train, Y_train)

300-element Array{Number,1}:
 1.0
 0.0
 1.0
 1.0
 0.0
 1.0
 0.0
 0.0
 1.0
 1.0
 0.0
 1.0
 1.0
 ⋮
 0.0
 0.0
 0.0
 1.0
 0.0
 1.0
 1.0
 1.0
 1.0
 1.0
 0.0
 1.0

In [19]:
# Score the predictions against the test labels with the JuTools helper.
JuTools.compute_accuracy(Y_predict, Y_test)

0.9433333333333334

In [20]:
# What about a distance-based measure?
# Euclidean distance between two equally sized 1-D numeric vectors
# (smaller means closer, unlike a similarity score).
# Any AbstractArray is accepted (e.g. views), not only Array.
# Fails an assertion when the inputs differ in size or are not 1-D.
function dist_sim(X1::AbstractArray{<:Number}, X2::AbstractArray{<:Number})::AbstractFloat
    @assert size(X1) == size(X2)
    @assert ndims(X1) == ndims(X2) == 1
    # Promote to floating point before subtracting: narrow integer element types
    # would otherwise wrap around silently in the difference or in the square.
    delta = float.(X1) .- float.(X2)
    # abs2 is the squared magnitude, which is also correct for complex entries.
    return sqrt(sum(abs2, delta))
end

dist_sim (generic function with 1 method)

In [21]:
# Euclidean distance between feature rows 1 and 2.
dist_sim(X_data[1, :], X_data[2, :])

89.20454024319613

In [22]:
# Euclidean distance between feature rows 2 and 3.
dist_sim(X_data[2, :], X_data[3, :])

55.791845282263246

Euclidean distance is strongly affected by the scale of the data!

In [23]:
# Naive K-nearest-neighbours classifier that, by default, ranks neighbours by
# Euclidean distance (smallest first).
#
# For every row of `X_predict`, score it against every row of `X_data` with
# `metric`, pick the K best-scoring rows and return the majority label among
# their entries in `Y_data`.
#
# Arguments
#   X_predict : 2-D array with one sample per row, or a 1-D array holding a
#               single sample (it is reshaped to one row).
#   K         : number of neighbours, 1 <= K <= number of rows in X_data.
#   X_data    : 2-D reference samples, one per row.
#   Y_data    : 1-D labels, one per row of X_data.
# Keywords
#   metric    : function (query_row, data_row) -> score; defaults to dist_sim.
#   rev       : false when a smaller score means "closer" (distance, the default),
#               true when a larger score means "closer" (similarity).
# Returns an Array{Number} with one predicted label per row of X_predict.
# Fails an assertion when the shapes are inconsistent or K is out of range.
function predict_naive_fun(X_predict::Array{T} where T<:Number, K::Integer, X_data::Array{T} where T<:Number, Y_data::Array{T} where T<:Number; metric=dist_sim, rev::Bool=false)::Array
    @assert ndims(X_data) == 2
    @assert ndims(Y_data) == 1
    @assert size(X_data)[1] == size(Y_data)[1]
    @assert 0 < ndims(X_predict) <= 2
    n_data = size(X_data)[1]
    # K == n_data is allowed: every reference sample then takes part in the vote.
    @assert 0 < K <= n_data
    if ndims(X_predict) < 2
        # A single sample given as a vector becomes a 1-row matrix.
        X_predict = reshape(X_predict, (1, size(X_predict)[1]))
    end
    @assert size(X_predict)[2] == size(X_data)[2]
    # Row slices copy, so extract the reference rows once instead of once per query.
    data_rows = [X_data[j, :] for j in 1:n_data]
    # NaN compares as larger than every real number when sorting: with rev=false it
    # already ranks last, but with rev=true it would rank first, so map it to -Inf.
    rank_key = rev ? (m -> isnan(m[2]) ? oftype(m[2], -Inf) : m[2]) : (m -> m[2])
    result = Array{Number}(undef, size(X_predict)[1])
    # Scratch buffer of (row index, score); fully overwritten for every query.
    sim = Array{Tuple{Int, AbstractFloat}}(undef, n_data)
    for i in 1:size(X_predict)[1]
        vec_predict = X_predict[i, :]
        for j in 1:n_data
            sim[j] = (j, metric(vec_predict, data_rows[j]))
        end
        sort!(sim, by=rank_key, rev=rev)
        # Labels of the K best-ranked rows, nearest first.
        K_nearest_votes = Y_data[[m[1] for m in sim[1:K]]]
        result[i] = majority_vote(K_nearest_votes)
    end
    return result
end

predict_naive_fun (generic function with 1 method)

In [24]:
# Accuracy of the distance-based predictor (K = 5) on the test split.
JuTools.compute_accuracy(predict_naive_fun(X_test, 5, X_train, Y_train), Y_test)

0.95

Euclidean distance scores slightly better here (0.95 vs. 0.943), probably because X_data has only 2 dimensions, a setting where Euclidean distance works well.  
Nevertheless, we will use cosine similarity in the final implementation.

Although `predict_naive` may be slow on large datasets (every prediction scans the whole dataset), it is easy to implement and works as expected.