Skip to content

Commit

Permalink
Merge 14b6978 into 6632035
Browse files Browse the repository at this point in the history
  • Loading branch information
zgornel committed Feb 20, 2020
2 parents 6632035 + 14b6978 commit 2ab711f
Show file tree
Hide file tree
Showing 15 changed files with 158 additions and 24 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Expand Up @@ -9,3 +9,7 @@ docs/build/
benchmarks/
scripts/
old/
src/data/loaders/custom
src/input/custom
src/search/rankers/custom
src/search/recommenders/custom
14 changes: 10 additions & 4 deletions Manifest.toml
Expand Up @@ -204,6 +204,12 @@ git-tree-sha1 = "5c49dab19938b119fe204fd7d7e8e174f4e9c68b"
uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3"
version = "0.8.8"

[[IVFADC]]
deps = ["Clustering", "DataStructures", "Distances", "HNSW", "QuantizedArrays"]
git-tree-sha1 = "dea090959d6e08236b22a719728f93cb76ee49f6"
uuid = "c28a5fd4-166f-4f83-b04d-195ccdb4ae51"
version = "0.1.0"

[[IndexedTables]]
deps = ["DataAPI", "DataValues", "Distributed", "IteratorInterfaceExtensions", "OnlineStatsBase", "PooledArrays", "SparseArrays", "Statistics", "StructArrays", "TableTraits", "TableTraitsUtils", "Tables", "WeakRefStrings"]
git-tree-sha1 = "e41ee5688e404b49795a85dcb1da2dafb4409645"
Expand Down Expand Up @@ -404,9 +410,9 @@ version = "1.2.0"

[[Quadmath]]
deps = ["Printf", "Random", "Requires"]
git-tree-sha1 = "7a11344be0bd27a98ce3038573c4df518f051a8f"
git-tree-sha1 = "5cbc75c9cd5edc72111681f0b30c0a6b1128c582"
uuid = "be4d8f0f-7fa4-5f49-b795-2f01399ab2dd"
version = "0.5.2"
version = "0.5.3"

[[QuantizedArrays]]
deps = ["Clustering", "Distances", "InteractiveUtils", "LinearAlgebra", "StatsBase"]
Expand Down Expand Up @@ -480,9 +486,9 @@ uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[[StatsBase]]
deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"]
git-tree-sha1 = "c53e809e63fe5cf5de13632090bc3520649c9950"
git-tree-sha1 = "be5c7d45daa449d12868f4466dbf5882242cf2d9"
uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
version = "0.32.0"
version = "0.32.1"

[[StrTables]]
deps = ["Dates"]
Expand Down
1 change: 1 addition & 0 deletions Project.toml
Expand Up @@ -15,6 +15,7 @@ EmbeddingsAnalysis = "827a7f00-4ef9-11e9-22c6-8dc9c6ee7bf4"
Glowe = "d04f5788-f3bf-50c4-8044-9206750af6c9"
HNSW = "540f64fa-c57e-11e8-081c-41422cda4629"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
IVFADC = "c28a5fd4-166f-4f83-b04d-195ccdb4ae51"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
JuliaDB = "a93385a2-3734-596a-9a66-3cfbb77141e6"
Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
Expand Down
17 changes: 12 additions & 5 deletions src/Garamond.jl
Expand Up @@ -20,7 +20,6 @@
#
module Garamond

# Using section
using Unicode
using Random
using Logging
Expand All @@ -46,35 +45,43 @@ module Garamond
using EmbeddingsAnalysis
using HNSW
using NearestNeighbors
using IVFADC
using Distances
using LightGraphs: Graph, pagerank
using JSON
using HTTP
using TSVD
using JuliaDB

# Import section (extendable methods)
import Base: size, length, show, keys, values, push!,
import Base: size, length, show, keys, values,
push!, pop!, pushfirst!, popfirst!,
delete!, getindex, names, convert, lowercase,
occursin, isempty, parse, sort
import StringAnalysis: id
import Word2Vec: WordVectors
import HNSW: knn_search

# Exports
export
search,
recommend,
rank,
indexfilter,

build_search_env,
parse_configuration,
parse_input,

AbstractEmbedder,

AbstractIndex,
NaiveIndex, BruteTreeIndex,
KDTreeIndex,
HNSWIndex, IVFIndex,

Searcher,
SearchConfig,
SearchResult,

id,
description,
isenabled, enable!, disable!,
Expand Down Expand Up @@ -125,7 +132,6 @@ module Garamond
end
end

# Include section
include("data/db.jl")
include("data/text.jl")
include("data/parse_and_eval.jl")
Expand All @@ -149,6 +155,7 @@ module Garamond
include("index/brutetree.jl")
include("index/kdtree.jl")
include("index/hnsw.jl")
include("index/ivfadc.jl")

include("searchable/config_parser.jl")
include("searchable/searcher.jl")
Expand Down
39 changes: 36 additions & 3 deletions src/index/abstractindex.jl
@@ -1,3 +1,13 @@
# Exceptions
"""
    IndexOperationException(op, type)

Exception describing a failed index operation. `op` is the name of the
operation that was called and `type` is the name of the index type it
was called on; both are stored as strings and used by `showerror`.
"""
struct IndexOperationException <: Exception
    op::String
    type::String
end

# Render the exception as a one-line, human readable message.
function Base.showerror(io::IO, e::IndexOperationException)
    print(io, "Failed call `", e.op, "` on `", e.type, "` index.")
end


# Abstract types
abstract type AbstractIndex end

Expand All @@ -13,8 +23,9 @@ present in `keep` are returned.
function knn_search(index::AbstractIndex,
                    point::AbstractVector,
                    k::Integer,
                    keep::AbstractVector;
                    kwargs...)
    # Generic fallback for index types that do not provide their own
    # `knn_search` method. Keyword arguments are accepted (and ignored) so
    # that index-specific options can be passed uniformly to any index.
    throw(IndexOperationException("knn_search", string(typeof(index))))
end


Expand All @@ -24,5 +35,27 @@ end
Returns the number of points indexed in `index`.
"""
function length(index::AbstractIndex)
    # Generic fallback: concrete index types are expected to define their
    # own `length`. Throws the same exception type as the other
    # unimplemented index operations.
    throw(IndexOperationException("length", string(typeof(index))))
end


# pop!, popfirst!, push!, pushfirst!
"""
    pop!(index::AbstractIndex)

Fallback method for `AbstractIndex` values; it always throws an
`IndexOperationException` naming `pop!` and the type of `index`.
"""
pop!(index::AbstractIndex) =
    throw(IndexOperationException("pop!", string(typeof(index))))

"""
    popfirst!(index::AbstractIndex)

Fallback method for `AbstractIndex` values; it always throws an
`IndexOperationException` naming `popfirst!` and the type of `index`.
"""
popfirst!(index::AbstractIndex) =
    throw(IndexOperationException("popfirst!", string(typeof(index))))

"""
    push!(index::AbstractIndex)
    push!(index::AbstractIndex, point)

Fallback methods for `AbstractIndex` values; both always throw an
`IndexOperationException` naming `push!` and the type of `index`.
"""
function push!(index::AbstractIndex)
    throw(IndexOperationException("push!", string(typeof(index))))
end

# The usual calling convention is `push!(collection, item)`; without this
# method such a call would raise a `MethodError` instead of the
# `IndexOperationException` used by the rest of the index interface.
function push!(index::AbstractIndex, point)
    throw(IndexOperationException("push!", string(typeof(index))))
end

"""
    pushfirst!(index::AbstractIndex)
    pushfirst!(index::AbstractIndex, point)

Fallback methods for `AbstractIndex` values; both always throw an
`IndexOperationException` naming `pushfirst!` and the type of `index`.
"""
function pushfirst!(index::AbstractIndex)
    throw(IndexOperationException("pushfirst!", string(typeof(index))))
end

# The usual calling convention is `pushfirst!(collection, item)`; without
# this method such a call would raise a `MethodError` instead of the
# `IndexOperationException` used by the rest of the index interface.
function pushfirst!(index::AbstractIndex, point)
    throw(IndexOperationException("pushfirst!", string(typeof(index))))
end

"""
    delete_from_index!(index::AbstractIndex, points)

Fallback method for `AbstractIndex` values; it always throws an
`IndexOperationException` naming `delete_from_index!` and the type of
`index`. The `points` argument is not inspected.
"""
delete_from_index!(index::AbstractIndex, points) =
    throw(IndexOperationException("delete_from_index!", string(typeof(index))))
5 changes: 3 additions & 2 deletions src/index/brutetree.jl
@@ -1,5 +1,5 @@
"""
BruteTree index type for storing text embeddings. It is a wrapper
BruteTree index type for storing vectors. It is a wrapper
around a `BruteTree` NN structure and performs brute search using
a distance-based similarity between vectors.
"""
Expand All @@ -17,7 +17,8 @@ BruteTreeIndex(data::SparseMatrixCSC{T,I}) where {T<:AbstractFloat, I<:Integer}
function knn_search(index::BruteTreeIndex{A,D},
point::AbstractVector,
k::Int,
keep::Vector{Int}=collect(1:length(index))
keep::Vector{Int}=collect(1:length(index));
kwargs...
) where {A<:AbstractArray, D<:Metric}
# Uses Euclidean distance by default
_k = min(k, length(keep))
Expand Down
9 changes: 4 additions & 5 deletions src/index/hnsw.jl
Expand Up @@ -2,15 +2,13 @@
# Search index structures and associated methods #
##################################################
"""
HNSW index type for storing text embeddings. It is a wrapper around a
HNSW index type for storing vectors. It is a wrapper around a
`HierarchicalNSW` (Hierarchical Navigable Small Worlds) NN graph
structure and performs a very efficient search using a distance-based
similarity between vectors.
# References
* [Y. A. Malkov, D.A. Yashunin "Efficient and robust approximate nearest
neighbor search using Hierarchical Navigable Small World graphs"]
(https://arxiv.org/abs/1603.09320)
* [Y. A. Malkov, D.A. Yashunin "Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs"](https://arxiv.org/abs/1603.09320)
"""
struct HNSWIndex{I,E,A,D} <: AbstractIndex
tree::HierarchicalNSW{I,E,A,D}
Expand All @@ -31,7 +29,8 @@ __build_hnsw_data(data::Matrix) = [data[:,i] for i in 1:size(data,2)]
function knn_search(index::HNSWIndex{I,E,A,D},
point::AbstractVector,
k::Int,
keep::Vector{Int}=collect(1:length(index))
keep::Vector{Int}=collect(1:length(index));
kwargs...
) where {I<:Unsigned, E<:Real, A<:AbstractArray, D<:Metric}
# Uses Euclidean distance by default
_idxs, scores = knn_search(index.tree, Vector(point), k)
Expand Down
46 changes: 46 additions & 0 deletions src/index/ivfadc.jl
@@ -0,0 +1,46 @@
##################################################
# Search index structures and associated methods #
##################################################
"""
    IVFIndex{U,I,Dc,Dr,T,Q} <: AbstractIndex

IVFADC index type for storing vectors. It is a wrapper around an
`IVFADCIndex` (inverted file system with asymmetric distance computation)
structure and performs a billion-scale search using a distance-based
similarity between vectors.

The single field, `index`, holds the wrapped `IVFADCIndex`; the type
parameters of `IVFIndex` are exactly those of the wrapped structure.

# References
  * [Jégou et al. "Product quantization for nearest neighbor search"](https://hal.inria.fr/file/index/docid/514462/filename/paper_hal.pdf)
  * [Baranchuk et al. "Revisiting the inverted indices for billion-scale approximate nearest neighbors"](http://openaccess.thecvf.com/content_ECCV_2018/papers/Dmitry_Baranchuk_Revisiting_the_Inverted_ECCV_2018_paper.pdf)
"""
struct IVFIndex{U,I,Dc,Dr,T,Q} <: AbstractIndex
index::IVFADCIndex{U,I,Dc,Dr,T,Q}
end

"""
    IVFIndex(data::AbstractMatrix; kwargs...)

Build an `IVFIndex` from `data` by constructing an `IVFADCIndex`; all
keyword arguments are forwarded to the `IVFADCIndex` constructor.
"""
function IVFIndex(data::AbstractMatrix; kwargs...)
    wrapped = IVFADCIndex(data; kwargs...)
    return IVFIndex(wrapped)
end

# Sparse input is converted to a dense `Matrix{T}` before the wrapped
# index is built.
function IVFIndex(data::SparseMatrixCSC{T,I}; kwargs...) where {T<:AbstractFloat, I<:Integer}
    dense = Matrix{T}(data)
    return IVFIndex(IVFADCIndex(dense; kwargs...))
end


# Nearest neighbor search method
"""
    knn_search(index::IVFIndex, point, k, keep=collect(1:length(index)); w=1, kwargs...)

Search the wrapped `IVFADCIndex` for the `k` nearest neighbors of `point`
and return a tuple `(idxs, scores)`.

# Arguments
  * `index` the index to search
  * `point` the query vector; its element type must match the parameter `T` of the index
  * `k` the number of neighbors requested from the wrapped index
  * `keep` the indices that may be returned; when its length differs from
    `length(index)`, results not present in `keep` are dropped, so fewer
    than `k` results may be returned

# Keywords
  * `w` forwarded as the `w` keyword of the wrapped index search
  * `kwargs...` any other keyword is accepted and ignored, so that the method
    can be called with the same keywords as the other index types
"""
function knn_search(index::IVFIndex{U,I,Dc,Dr,T,Q},
                    point::AbstractVector{T},
                    k::Int,
                    keep::Vector{Int}=collect(1:length(index));
                    w::Int=1,
                    kwargs...
                   ) where {U,I,Dc,Dr,T,Q}
    # Uses Euclidean distance by default
    _idxs, scores = knn_search(index.index, Vector(point), k; w=w)
    # Convert to `Int` and shift by one
    # NOTE(review): presumably the wrapped index returns zero-based ids - confirm
    idxs = Int.(_idxs) .+ 1
    if length(keep) == length(index)
        # all data points are valid
        return idxs, scores
    else
        # this bit is slow if 'keep' is large
        mask = map(idx->in(idx, keep), idxs)
        return idxs[mask], scores[mask]
    end
end


# Length method: delegates to the wrapped `IVFADCIndex`
function length(index::IVFIndex)
    return length(index.index)
end
6 changes: 4 additions & 2 deletions src/index/kdtree.jl
@@ -1,5 +1,6 @@
#TODO(Corneliu) **cc-indexing** - consider removing the KDTree structure - if not, throw exceptions around
"""
K-D Tree index type for storing text embeddings. It is a wrapper
K-D Tree index type for storing vectors. It is a wrapper
around a `KDTree` NN structure and performs a more efficient
search using a distance-based similarity between vectors.
"""
Expand All @@ -17,7 +18,8 @@ KDTreeIndex(data::SparseMatrixCSC{T,I}) where {T<:AbstractFloat, I<:Integer} =
function knn_search(index::KDTreeIndex{A,D},
point::AbstractVector,
k::Int,
keep::Vector{Int}=collect(1:length(index))
keep::Vector{Int}=collect(1:length(index));
kwargs...
) where {A<:AbstractArray, D<:Metric}
# Uses Euclidean distance by default
_k = min(k, length(keep))
Expand Down
5 changes: 3 additions & 2 deletions src/index/naive.jl
@@ -1,5 +1,5 @@
"""
Naive index type for storing text embeddings. It is a wrapper
Naive index type for storing vectors. It is a wrapper
around a vector of embeddings and performs brute search using
the cosine similarity between vectors.
"""
Expand All @@ -16,7 +16,8 @@ end
function knn_search(index::NaiveIndex{E},
point::AbstractVector,
k::Int,
keep::Vector{Int}=collect(1:length(index))
keep::Vector{Int}=collect(1:length(index));
kwargs...
) where {E<:AbstractFloat}
# Turn sparse vectors into dense ones
__densify(v::AbstractVector) = v
Expand Down
2 changes: 1 addition & 1 deletion src/searchable/config_parser.jl
Expand Up @@ -230,7 +230,7 @@ function parse_configuration(filename::AbstractString)
sconfig.vectors_eltype= DEFAULT_VECTORS_ELTYPE
end
# search_index
if !(sconfig.search_index in [:naive, :brutetree, :kdtree, :hnsw])
if !(sconfig.search_index in [:naive, :brutetree, :kdtree, :hnsw, :ivfadc])
@warn "$(sconfig.id) Defaulting search_index=$DEFAULT_SEARCH_INDEX."
sconfig.search_index = DEFAULT_SEARCH_INDEX
end
Expand Down
1 change: 1 addition & 0 deletions src/searchable/searcher.jl
Expand Up @@ -117,6 +117,7 @@ function __get_search_index_type(config::SearchConfig)
search_index == :brutetree && return BruteTreeIndex
search_index == :kdtree && return KDTreeIndex
search_index == :hnsw && return HNSWIndex
search_index == :ivfadc && return IVFIndex
end


Expand Down
2 changes: 2 additions & 0 deletions src/utils/show.jl
Expand Up @@ -95,6 +95,8 @@ show(io::IO, srcher::Searcher{T,E,I}) where {T,E,I} = begin
_index_type = "KDTree index"
elseif I <: HNSWIndex
_index_type = "HNSW index"
elseif I <: IVFIndex
_index_type = "IVFADC index"
else
_index_type = "<Unknown index>"
end
Expand Down
29 changes: 29 additions & 0 deletions test/index.jl
@@ -0,0 +1,29 @@
# Exercises the interface shared by all index types: construction from dense
# and sparse data, `knn_search`, `length` and the not-implemented operations.
@testset "Index: $IndexType" for IndexType in [NaiveIndex, BruteTreeIndex, KDTreeIndex, HNSWIndex, IVFIndex]
    data = eltype(1.0)[0 0 0 5 5 5; 0 1 2 10 11 12]
    spdata = sparse(data)
    point = eltype(data)[5.1, 10]
    true_length = size(data, 2)

    if IndexType === IVFIndex
        # IVFIndex is built with explicit keyword arguments for this tiny dataset
        _idxfunc = d->IVFIndex(d; kc=4, k=2, m=1)
        idx = _idxfunc(data)
        spidx = _idxfunc(spdata)
    else
        idx = IndexType(data)
        # Build from the sparse matrix so that the sparse constructor is
        # actually exercised (previously the dense `data` was reused here)
        spidx = IndexType(spdata)
    end
    @test idx isa IndexType
    @test spidx isa IndexType
    # `w` is passed to every index type, not only to IVFIndex
    idxs, scores = Garamond.knn_search(idx, point, 10; w=4)
    @test idxs isa Vector{Int} && all(i in idxs for i in 1:true_length)
    @test scores isa Vector{eltype(data)}

    @test length(idx) == length(spidx) == true_length

    # Test not implemented interface
    @test_throws Garamond.IndexOperationException pop!(idx)
    @test_throws Garamond.IndexOperationException push!(idx)
    @test_throws Garamond.IndexOperationException pushfirst!(idx)
    @test_throws Garamond.IndexOperationException popfirst!(idx)
    @test_throws Garamond.IndexOperationException Garamond.delete_from_index!(idx, [1,2])

end
2 changes: 2 additions & 0 deletions test/runtests.jl
Expand Up @@ -2,6 +2,7 @@ module GaramondTesting

using Test
using Random
using SparseArrays
using JuliaDB
using EmbeddingsAnalysis
using Garamond
Expand All @@ -12,6 +13,7 @@ include("data/datagenerator.jl")
include("configs/configgenerator.jl")
include("input_parsers.jl")
include("db.jl")
include("index.jl")
include("indexfilter.jl")

end

0 comments on commit 2ab711f

Please sign in to comment.