diff --git a/AnnService/CoreLibrary.vcxproj b/AnnService/CoreLibrary.vcxproj
index 1ff4f4c03..08921f244 100644
--- a/AnnService/CoreLibrary.vcxproj
+++ b/AnnService/CoreLibrary.vcxproj
@@ -164,6 +164,8 @@
+
+
@@ -178,6 +180,8 @@
+
+
diff --git a/AnnService/CoreLibrary.vcxproj.filters b/AnnService/CoreLibrary.vcxproj.filters
index 656682fea..94f27df3f 100644
--- a/AnnService/CoreLibrary.vcxproj.filters
+++ b/AnnService/CoreLibrary.vcxproj.filters
@@ -38,6 +38,12 @@
{8fb36afb-73ed-4c3d-8c9b-c3581d80c5d1}
+
+ {f7bc0bc7-1af5-4870-b8ee-fabdbabdb4c4}
+
+
+ {5c1449e0-38b7-4c82-976e-cbdc488d3fb5}
+
@@ -139,6 +145,12 @@
Header Files\Helper
+
+ Header Files\Helper\VectorSetReaders
+
+
+ Header Files\Helper
+
@@ -177,6 +189,12 @@
Source Files\Core\Common
+
+ Source Files\Helper\VectorSetReaders
+
+
+ Source Files\Helper
+
diff --git a/AnnService/IndexBuilder.vcxproj b/AnnService/IndexBuilder.vcxproj
index d3c9e0fcd..a5d05fb47 100644
--- a/AnnService/IndexBuilder.vcxproj
+++ b/AnnService/IndexBuilder.vcxproj
@@ -139,15 +139,11 @@
-
-
-
-
diff --git a/AnnService/IndexBuilder.vcxproj.filters b/AnnService/IndexBuilder.vcxproj.filters
index dcd29861c..0733fae1c 100644
--- a/AnnService/IndexBuilder.vcxproj.filters
+++ b/AnnService/IndexBuilder.vcxproj.filters
@@ -1,4 +1,4 @@
-
+
@@ -9,12 +9,6 @@
{93995380-89BD-4b04-88EB-625FBE52EBFB}
h;hh;hpp;hxx;hm;inl;inc;xsd
-
- {cf68b421-6a65-44f2-bf43-438b13940d7d}
-
-
- {41ac91f9-6b6d-4341-8791-12f672d6ad5c}
-
@@ -23,27 +17,15 @@
Header Files
-
- Header Files
-
-
- Header Files\VectorSetReaders
-
Source Files
-
- Source Files
-
-
+
Source Files
-
- Source Files\VectorSetReaders
-
-
+
Source Files
diff --git a/AnnService/inc/Core/Common/BKTree.h b/AnnService/inc/Core/Common/BKTree.h
index 4526d00a0..cce56dfc1 100644
--- a/AnnService/inc/Core/Common/BKTree.h
+++ b/AnnService/inc/Core/Common/BKTree.h
@@ -1,4 +1,4 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
+// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#ifndef _SPTAG_COMMON_BKTREE_H_
@@ -366,8 +366,7 @@ namespace SPTAG
int maxcluster = -1;
SizeType maxCount = 0;
for (int k = 0; k < m_iBKTKmeansK; k++) {
- void* currCenter = (void*)(args.centers + k * p_index->GetFeatureDim());
- if (args.newCounts[k] > maxCount && args.clusterDist[k] > p_index->ComputeDistance(currCenter, currCenter) + lambda*args.counts[k])
+ if (args.newCounts[k] > maxCount && DistanceUtils::ComputeL2Distance((T*)p_index->GetSample(args.clusterIdx[k]), args.centers + k * p_index->GetFeatureDim(), p_index->GetFeatureDim()) > 1e-6)
{
maxcluster = k;
maxCount = args.newCounts[k];
diff --git a/AnnService/inc/Core/VectorIndex.h b/AnnService/inc/Core/VectorIndex.h
index 3552d8eec..49475794d 100644
--- a/AnnService/inc/Core/VectorIndex.h
+++ b/AnnService/inc/Core/VectorIndex.h
@@ -61,6 +61,8 @@ class VectorIndex
virtual ErrorCode DeleteIndex(ByteArray p_meta);
+ virtual const void* GetSample(ByteArray p_meta);
+
virtual ErrorCode SearchIndex(const void* p_vector, int p_neighborCount, bool p_withMeta, BasicResult* p_results) const;
virtual std::string GetParameter(const std::string& p_param) const;
diff --git a/AnnService/inc/Helper/VectorSetReader.h b/AnnService/inc/Helper/VectorSetReader.h
new file mode 100644
index 000000000..cd148c1d0
--- /dev/null
+++ b/AnnService/inc/Helper/VectorSetReader.h
@@ -0,0 +1,59 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#ifndef _SPTAG_HELPER_VECTORSETREADER_H_
+#define _SPTAG_HELPER_VECTORSETREADER_H_
+
+#include "inc/Core/Common.h"
+#include "inc/Core/VectorSet.h"
+#include "inc/Core/MetadataSet.h"
+#include "inc/Helper/ArgumentsParser.h"
+
+#include
+
+namespace SPTAG
+{
+namespace Helper
+{
+
+// Options consumed by vector set readers. Derives from ArgumentsParser;
+// the constructor registers every field below as a parser option.
+class ReaderOptions : public ArgumentsParser
+{
+public:
+ // Seeds the fields with the given values. The delimiter defaults to "|"
+ // and the thread count to 32; value type and dimension must be supplied.
+ ReaderOptions(VectorValueType p_valueType, DimensionType p_dimension, std::string p_vectorDelimiter = "|", std::uint32_t p_threadNum = 32);
+
+ ~ReaderOptions();
+
+ // Thread number (option -t / --thread).
+ std::uint32_t m_threadNum;
+
+ // Dimension of vector (option -d / --dimension).
+ DimensionType m_dimension;
+
+ // Vector delimiter (option --delimiter).
+ std::string m_vectorDelimiter;
+
+ // Input vector data type (option -v / --vectortype).
+ SPTAG::VectorValueType m_inputValueType;
+};
+
+// Abstract interface for reading vectors (and their metadata) from input
+// files. Concrete readers implement the three pure virtual methods below.
+class VectorSetReader
+{
+public:
+ // Keeps the supplied options handle in m_options.
+ VectorSetReader(std::shared_ptr p_options);
+
+ virtual ~VectorSetReader();
+
+ // Loads the input identified by p_filePath and reports the outcome as an ErrorCode.
+ virtual ErrorCode LoadFile(const std::string& p_filePath) = 0;
+
+ // Accessor for the vector set held by the reader.
+ virtual std::shared_ptr GetVectorSet() const = 0;
+
+ // Accessor for the metadata set held by the reader.
+ virtual std::shared_ptr GetMetadataSet() const = 0;
+
+ // Factory: builds a reader for the given options (the implementation in
+ // VectorSetReader.cpp constructs a DefaultReader).
+ static std::shared_ptr CreateInstance(std::shared_ptr p_options);
+
+protected:
+ // Options handle received at construction, available to derived readers.
+ std::shared_ptr m_options;
+};
+
+
+
+} // namespace Helper
+} // namespace SPTAG
+
+#endif // _SPTAG_HELPER_VECTORSETREADER_H_
diff --git a/AnnService/inc/IndexBuilder/VectorSetReaders/DefaultReader.h b/AnnService/inc/Helper/VectorSetReaders/DefaultReader.h
similarity index 83%
rename from AnnService/inc/IndexBuilder/VectorSetReaders/DefaultReader.h
rename to AnnService/inc/Helper/VectorSetReaders/DefaultReader.h
index e3e1911a9..52c8404ca 100644
--- a/AnnService/inc/IndexBuilder/VectorSetReaders/DefaultReader.h
+++ b/AnnService/inc/Helper/VectorSetReaders/DefaultReader.h
@@ -1,8 +1,8 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
-#ifndef _SPTAG_INDEXBUILDER_VECTORSETREADERS_DEFAULTREADER_H_
-#define _SPTAG_INDEXBUILDER_VECTORSETREADERS_DEFAULTREADER_H_
+#ifndef _SPTAG_HELPER_VECTORSETREADERS_DEFAULTREADER_H_
+#define _SPTAG_HELPER_VECTORSETREADERS_DEFAULTREADER_H_
#include "../VectorSetReader.h"
#include "inc/Helper/Concurrent.h"
@@ -13,13 +13,13 @@
namespace SPTAG
{
-namespace IndexBuilder
+namespace Helper
{
class DefaultReader : public VectorSetReader
{
public:
- DefaultReader(std::shared_ptr p_options);
+ DefaultReader(std::shared_ptr p_options);
virtual ~DefaultReader();
@@ -44,7 +44,7 @@ class DefaultReader : public VectorSetReader
template
bool TranslateVector(char* p_str, DataType* p_vector)
{
- std::uint32_t eleCount = 0;
+ DimensionType eleCount = 0;
char* next = p_str;
while ((*next) != '\0')
{
@@ -85,11 +85,11 @@ class DefaultReader : public VectorSetReader
std::size_t m_subTaskBlocksize;
- std::atomic m_totalRecordCount;
+ std::atomic m_totalRecordCount;
std::atomic m_totalRecordVectorBytes;
- std::vector m_subTaskRecordCount;
+ std::vector m_subTaskRecordCount;
std::string m_vectorOutput;
@@ -102,7 +102,7 @@ class DefaultReader : public VectorSetReader
-} // namespace IndexBuilder
+} // namespace Helper
} // namespace SPTAG
-#endif // _SPTAG_INDEXBUILDER_VECTORSETREADERS_DEFAULT_H_
+#endif // _SPTAG_HELPER_VECTORSETREADERS_DEFAULTREADER_H_
diff --git a/AnnService/inc/IndexBuilder/Options.h b/AnnService/inc/IndexBuilder/Options.h
index 7c939efae..4ebe6b27a 100644
--- a/AnnService/inc/IndexBuilder/Options.h
+++ b/AnnService/inc/IndexBuilder/Options.h
@@ -5,7 +5,7 @@
#define _SPTAG_INDEXBUILDER_OPTIONS_H_
#include "inc/Core/Common.h"
-#include "inc/Helper/ArgumentsParser.h"
+// Declares Helper::ReaderOptions, the base class of BuilderOptions.
+#include "inc/Helper/VectorSetReader.h"
#include
#include
@@ -16,21 +16,13 @@ namespace SPTAG
namespace IndexBuilder
{
-class BuilderOptions : public Helper::ArgumentsParser
+class BuilderOptions : public Helper::ReaderOptions
{
public:
BuilderOptions();
~BuilderOptions();
- std::uint32_t m_threadNum;
-
- std::uint32_t m_dimension;
-
- std::string m_vectorDelimiter;
-
- SPTAG::VectorValueType m_inputValueType;
-
std::string m_inputFiles;
std::string m_outputFolder;
diff --git a/AnnService/inc/IndexBuilder/VectorSetReader.h b/AnnService/inc/IndexBuilder/VectorSetReader.h
deleted file mode 100644
index 6bb3026aa..000000000
--- a/AnnService/inc/IndexBuilder/VectorSetReader.h
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#ifndef _SPTAG_INDEXBUILDER_VECTORSETREADER_H_
-#define _SPTAG_INDEXBUILDER_VECTORSETREADER_H_
-
-#include "inc/Core/Common.h"
-#include "inc/Core/VectorSet.h"
-#include "inc/Core/MetadataSet.h"
-#include "Options.h"
-
-#include
-
-namespace SPTAG
-{
-namespace IndexBuilder
-{
-
-class VectorSetReader
-{
-public:
- VectorSetReader(std::shared_ptr p_options);
-
- virtual ~VectorSetReader();
-
- virtual ErrorCode LoadFile(const std::string& p_filePath) = 0;
-
- virtual std::shared_ptr GetVectorSet() const = 0;
-
- virtual std::shared_ptr GetMetadataSet() const = 0;
-
- static std::shared_ptr CreateInstance(std::shared_ptr p_options);
-
-protected:
- std::shared_ptr m_options;
-};
-
-
-
-} // namespace IndexBuilder
-} // namespace SPTAG
-
-#endif // _SPTAG_INDEXBUILDER_VECTORSETREADER_H_
diff --git a/AnnService/src/Core/VectorIndex.cpp b/AnnService/src/Core/VectorIndex.cpp
index 5ed9b36af..9c7ccf549 100644
--- a/AnnService/src/Core/VectorIndex.cpp
+++ b/AnnService/src/Core/VectorIndex.cpp
@@ -315,6 +315,17 @@ VectorIndex::DeleteIndex(ByteArray p_meta) {
}
+// Resolves a metadata blob to the sample registered for it.
+// Returns nullptr when there is no metadata-to-vector map or when the
+// metadata is not found in it; otherwise forwards the mapped value to the
+// other GetSample overload and returns that result.
+const void* VectorIndex::GetSample(ByteArray p_meta)
+{
+    if (nullptr == m_pMetaToVec)
+    {
+        return nullptr;
+    }
+
+    // The lookup uses the metadata bytes copied into a std::string.
+    const std::string key((char*)p_meta.Data(), p_meta.Length());
+    const auto found = m_pMetaToVec->find(key);
+    return (m_pMetaToVec->end() == found) ? nullptr : GetSample(found->second);
+}
+
+
std::shared_ptr
VectorIndex::CreateInstance(IndexAlgoType p_algo, VectorValueType p_valuetype)
{
diff --git a/AnnService/src/Helper/VectorSetReader.cpp b/AnnService/src/Helper/VectorSetReader.cpp
new file mode 100644
index 000000000..44371ae24
--- /dev/null
+++ b/AnnService/src/Helper/VectorSetReader.cpp
@@ -0,0 +1,44 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "inc/Helper/VectorSetReader.h"
+#include "inc/Helper/VectorSetReaders/DefaultReader.h"
+
+
+using namespace SPTAG;
+using namespace SPTAG::Helper;
+
+
+// Seeds the option fields from the arguments, then registers each field as
+// an option with the base ArgumentsParser.
+//   p_valueType       - initial input vector value type.
+//   p_dimension       - initial vector dimension.
+//   p_vectorDelimiter - initial vector delimiter; taken by value and moved
+//                       into the member to avoid a second string copy.
+//   p_threadNum       - initial thread count.
+ReaderOptions::ReaderOptions(VectorValueType p_valueType, DimensionType p_dimension, std::string p_vectorDelimiter, std::uint32_t p_threadNum)
+    : m_threadNum(p_threadNum),
+      m_dimension(p_dimension),
+      m_vectorDelimiter(std::move(p_vectorDelimiter)),
+      m_inputValueType(p_valueType)
+{
+    // Thread count and delimiter are optional options; dimension and value
+    // type are registered as required.
+    AddOptionalOption(m_threadNum, "-t", "--thread", "Thread Number.");
+    AddOptionalOption(m_vectorDelimiter, "", "--delimiter", "Vector delimiter.");
+    AddRequiredOption(m_dimension, "-d", "--dimension", "Dimension of vector.");
+    AddRequiredOption(m_inputValueType, "-v", "--vectortype", "Input vector data type. Default is float.");
+}
+
+
+// Nothing to release explicitly; members clean up after themselves.
+ReaderOptions::~ReaderOptions() = default;
+
+
+// Stores the options handle for use by the reader.
+// p_options is a by-value sink parameter, so it is moved into the member
+// rather than copied (avoids an extra shared_ptr reference-count update).
+VectorSetReader::VectorSetReader(std::shared_ptr p_options)
+    : m_options(std::move(p_options))
+{
+}
+
+
+// Out-of-line definition of the virtual destructor; no explicit cleanup.
+VectorSetReader::~VectorSetReader() = default;
+
+
+// Factory for vector set readers: constructs a DefaultReader from the given
+// options and returns it wrapped in a shared_ptr.
+std::shared_ptr
+VectorSetReader::CreateInstance(std::shared_ptr p_options)
+{
+ // The options handle is moved into the newly created reader.
+ return std::shared_ptr(new DefaultReader(std::move(p_options)));
+}
+
+
diff --git a/AnnService/src/IndexBuilder/VectorSetReaders/DefaultReader.cpp b/AnnService/src/Helper/VectorSetReaders/DefaultReader.cpp
similarity index 92%
rename from AnnService/src/IndexBuilder/VectorSetReaders/DefaultReader.cpp
rename to AnnService/src/Helper/VectorSetReaders/DefaultReader.cpp
index 7f7c4187b..fb079c895 100644
--- a/AnnService/src/IndexBuilder/VectorSetReaders/DefaultReader.cpp
+++ b/AnnService/src/Helper/VectorSetReaders/DefaultReader.cpp
@@ -1,17 +1,17 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "inc/IndexBuilder/VectorSetReaders/DefaultReader.h"
+#include "inc/Helper/VectorSetReaders/DefaultReader.h"
#include "inc/Helper/StringConvert.h"
#include "inc/Helper/CommonHelper.h"
-#include "inc/IndexBuilder/ThreadPool.h"
#include
#include
#include
+#include
using namespace SPTAG;
-using namespace SPTAG::IndexBuilder;
+using namespace SPTAG::Helper;
namespace
{
@@ -139,10 +139,13 @@ class BinaryLineReader
} // namespace Local
} // namespace
-DefaultReader::DefaultReader(std::shared_ptr p_options)
+
+DefaultReader::DefaultReader(std::shared_ptr p_options)
: VectorSetReader(std::move(p_options)),
- m_subTaskBlocksize(0)
+ m_subTaskBlocksize(0)
{
+ omp_set_num_threads(m_options->m_threadNum);
+
std::string tempFolder("tempfolder");
if (!direxists(tempFolder.c_str()))
{
@@ -180,7 +183,7 @@ DefaultReader::LoadFile(const std::string& p_filePaths)
{
const auto& files = GetFileSizes(p_filePaths);
std::vector> subWorks;
- subWorks.reserve(files.size() * ThreadPool::CurrentThreadNum());
+ subWorks.reserve(files.size() * m_options->m_threadNum);
m_subTaskCount = 0;
for (const auto& fileInfo : files)
@@ -197,7 +200,7 @@ DefaultReader::LoadFile(const std::string& p_filePaths)
std::size_t blockSize = m_subTaskBlocksize;
if (0 == blockSize)
{
- fileTaskCount = ThreadPool::CurrentThreadNum();
+ fileTaskCount = m_options->m_threadNum;
blockSize = (fileInfo.second + fileTaskCount - 1) / fileTaskCount;
}
else
@@ -223,9 +226,10 @@ DefaultReader::LoadFile(const std::string& p_filePaths)
m_waitSignal.Reset(m_subTaskCount);
- for (auto& workItem : subWorks)
+#pragma omp parallel for schedule(dynamic)
+ for (int64_t i = 0; i < (int64_t)subWorks.size(); i++)
{
- ThreadPool::Queue(std::move(workItem));
+ subWorks[i]();
}
m_waitSignal.Wait();
@@ -244,7 +248,7 @@ DefaultReader::GetVectorSet() const
std::ifstream inputStream;
inputStream.open(m_vectorOutput, std::ifstream::binary);
- inputStream.seekg(sizeof(uint32_t) + sizeof(uint32_t), std::ifstream::beg);
+ inputStream.seekg(sizeof(SizeType) + sizeof(DimensionType), std::ifstream::beg);
inputStream.read(vecBuf, m_totalRecordVectorBytes);
inputStream.close();
@@ -276,7 +280,7 @@ DefaultReader::LoadFileInternal(const std::string& p_filePath,
std::ofstream metaStreamContent;
std::ofstream metaStreamIndex;
- std::uint32_t recordCount = 0;
+ SizeType recordCount = 0;
std::uint64_t metaOffset = 0;
std::size_t totalRead = 0;
std::streamoff startpos = p_fileBlockID * p_fileBlockSize;
@@ -400,11 +404,11 @@ DefaultReader::MergeData()
std::unique_ptr bufferHolder(new char[bufferSize]);
char* buf = bufferHolder.get();
- std::uint32_t uint32Var = m_totalRecordCount;
+ SizeType totalRecordCount = m_totalRecordCount;
outputStream.open(m_vectorOutput, std::ofstream::binary);
- outputStream.write(reinterpret_cast(&uint32Var), sizeof(uint32Var));
+ outputStream.write(reinterpret_cast(&totalRecordCount), sizeof(totalRecordCount));
outputStream.write(reinterpret_cast(&(m_options->m_dimension)), sizeof(m_options->m_dimension));
for (std::uint32_t i = 0; i < m_subTaskCount; ++i)
@@ -442,7 +446,7 @@ DefaultReader::MergeData()
outputStream.open(m_metadataIndexOutput, std::ofstream::binary);
- outputStream.write(reinterpret_cast(&uint32Var), sizeof(uint32Var));
+ outputStream.write(reinterpret_cast(&totalRecordCount), sizeof(totalRecordCount));
std::uint64_t totalOffset = 0;
for (std::uint32_t i = 0; i < m_subTaskCount; ++i)
@@ -453,18 +457,18 @@ DefaultReader::MergeData()
file += ".tmp";
inputStream.open(file, std::ifstream::binary);
- for (std::uint32_t remains = m_subTaskRecordCount[i]; remains > 0;)
+ for (SizeType remains = m_subTaskRecordCount[i]; remains > 0;)
{
std::size_t readBytesCount = min(remains * sizeof(std::uint64_t), bufferSizeTrim64);
inputStream.read(buf, readBytesCount);
std::uint64_t* offset = reinterpret_cast(buf);
- for (std::uint32_t i = 0; i < readBytesCount / sizeof(std::uint64_t); ++i)
+ for (std::uint64_t i = 0; i < readBytesCount / sizeof(std::uint64_t); ++i)
{
offset[i] += totalOffset;
}
outputStream.write(buf, readBytesCount);
- remains -= static_cast(readBytesCount / sizeof(std::uint64_t));
+ remains -= static_cast(readBytesCount / sizeof(std::uint64_t));
}
inputStream.read(buf, sizeof(std::uint64_t));
diff --git a/AnnService/src/IndexBuilder/Options.cpp b/AnnService/src/IndexBuilder/Options.cpp
index d0fcd0fd8..6360b73c2 100644
--- a/AnnService/src/IndexBuilder/Options.cpp
+++ b/AnnService/src/IndexBuilder/Options.cpp
@@ -11,14 +11,8 @@ using namespace SPTAG::IndexBuilder;
BuilderOptions::BuilderOptions()
- : m_threadNum(32),
- m_inputValueType(VectorValueType::Float),
- m_vectorDelimiter("|")
+ : Helper::ReaderOptions(VectorValueType::Float, 0, "|", 32)
{
- AddOptionalOption(m_threadNum, "-t", "--thread", "Thread Number.");
- AddOptionalOption(m_vectorDelimiter, "", "--delimiter", "Vector delimiter.");
- AddRequiredOption(m_dimension, "-d", "--dimension", "Dimension of vector.");
- AddRequiredOption(m_inputValueType, "-v", "--vectortype", "Input vector data type. Default is float.");
AddRequiredOption(m_inputFiles, "-i", "--input", "Input raw data.");
AddRequiredOption(m_outputFolder, "-o", "--outputfolder", "Output folder.");
AddRequiredOption(m_indexAlgoType, "-a", "--algo", "Index Algorithm type.");
diff --git a/AnnService/src/IndexBuilder/VectorSetReader.cpp b/AnnService/src/IndexBuilder/VectorSetReader.cpp
deleted file mode 100644
index e50f6f5eb..000000000
--- a/AnnService/src/IndexBuilder/VectorSetReader.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "inc/IndexBuilder/VectorSetReader.h"
-#include "inc/IndexBuilder/VectorSetReaders/DefaultReader.h"
-
-
-using namespace SPTAG;
-using namespace SPTAG::IndexBuilder;
-
-VectorSetReader::VectorSetReader(std::shared_ptr p_options)
- : m_options(p_options)
-{
-}
-
-
-VectorSetReader:: ~VectorSetReader()
-{
-}
-
-
-std::shared_ptr
-VectorSetReader::CreateInstance(std::shared_ptr p_options)
-{
- return std::shared_ptr(new DefaultReader(std::move(p_options)));
-}
-
diff --git a/AnnService/src/IndexBuilder/main.cpp b/AnnService/src/IndexBuilder/main.cpp
index 055cd3265..040703c3c 100644
--- a/AnnService/src/IndexBuilder/main.cpp
+++ b/AnnService/src/IndexBuilder/main.cpp
@@ -1,9 +1,8 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "inc/IndexBuilder/ThreadPool.h"
#include "inc/IndexBuilder/Options.h"
-#include "inc/IndexBuilder/VectorSetReader.h"
+#include "inc/Helper/VectorSetReader.h"
#include "inc/Core/VectorIndex.h"
#include "inc/Core/Common.h"
#include "inc/Helper/SimpleIniReader.h"
@@ -20,7 +19,7 @@ int main(int argc, char* argv[])
{
exit(1);
}
- IndexBuilder::ThreadPool::Init(options->m_threadNum);
+
auto indexBuilder = VectorIndex::CreateInstance(options->m_indexAlgoType, options->m_inputValueType);
Helper::IniReader iniReader;
@@ -82,7 +81,7 @@ int main(int argc, char* argv[])
indexBuilder->SaveIndex(options->m_outputFolder);
}
else {
- auto vectorReader = IndexBuilder::VectorSetReader::CreateInstance(options);
+ auto vectorReader = Helper::VectorSetReader::CreateInstance(options);
if (ErrorCode::Success != vectorReader->LoadFile(options->m_inputFiles))
{
fprintf(stderr, "Failed to read input file.\n");