From e574ee078ddcb1b5ffd58d5f54edd184a82008d0 Mon Sep 17 00:00:00 2001 From: MaggieQi Date: Tue, 20 Aug 2019 10:23:58 +0800 Subject: [PATCH] fix some type definition in the Reader and add more support to create Reader (#93) * remove dup code * Update Readme.md * Fix DataSet GNU compile fail bug * fix GNU Windows align alloc bugs * add copyright in each file * fix copy right in dataset * change kdt distance judgement * change code structure and add more wrappers * Update docs * fix search result * change IndexBuilder to support binary input data * temp remove java related projects * remove javaclient and javacore from the windows build * Fix SetData issue * Add vector record count and dimension for reuse and debug * change default parameter definition * add uint8 support * small fix for cosine distance of uint8 * fix AVX distance calculation epu8 * update readme * Update DistanceUtils.h * fix python wrapper cannot load larger than 4G memory error * try to add C# wrapper * fix owner of C# wrapper * add C# cmake support * fix byte array copy * fix tab to space * Try to make shared_ptr as Array template * fix copy * add Parameters documents * remove tbb dependency * fix concurrent_set * fix gcc 5.x cannot support shared_mutex * move concurrentset to Helper folder and change find to contains * Update README.md * try to use shared_lock to replace lock and unlock, try to use block to manage the increased memory * fix filling -1 * fix initialization * change to memset * add CLR CoreInterface for managed dll * try to reserve incBlocks capacity * fix return ErrorCode for AddBatch in Dataset.h * change return type to ErrorCode for AddBatch * remove the tbb dependency (#71) (#10) * remove dup code * Update Readme.md * Fix DataSet GNU compile fail bug * fix GNU Windows align alloc bugs * add copyright in each file * fix copy right in dataset * change kdt distance judgement * change code structure and add more wrappers * Update docs * fix search result * change IndexBuilder to support binary input data * temp remove java related projects * remove javaclient and javacore from the windows build * Fix SetData issue * Add vector record count and dimension for reuse and debug * change default parameter definition * add uint8 support * small fix for cosine distance of uint8 * fix AVX distance calculation epu8 * update readme * Update DistanceUtils.h * fix python wrapper cannot load larger than 4G memory error * try to add C# wrapper * fix owner of C# wrapper * add C# cmake support * fix byte array copy * fix tab to space * Try to make shared_ptr as Array template * fix copy * add Parameters documents * remove tbb dependency * fix concurrent_set * fix gcc 5.x cannot support shared_mutex * move concurrentset to Helper folder and change find to contains * Update README.md * try to use shared_lock to replace lock and unlock, try to use block to manage the increased memory * fix filling -1 * fix initialization * change to memset * add CLR CoreInterface for managed dll * try to reserve incBlocks capacity * fix return ErrorCode for AddBatch in Dataset.h * change return type to ErrorCode for AddBatch * fix type definition * change incremental update design * fix all type * fix debug mode memory delete assert * add deletePercentageForRefine judgement * add dump and load from byte array * add dump and load from byte array * fix getNumThreads * fix loadindex and add index bugs * Update AlgoTest to add metamapping test * fix compling error in g++7 * fix largest cluster cannot be split during clustering * update fresh ANN implementation (#85) (#12) * remove dup code * Update Readme.md * Fix DataSet GNU compile fail bug * fix GNU Windows align alloc bugs * add copyright in each file * fix copy right in dataset * change kdt distance judgement * change code structure and add more wrappers * Update docs * fix search result * change IndexBuilder to support binary input data * temp remove java related projects * remove javaclient and javacore from the windows build * Fix SetData issue * Add vector record count and dimension for reuse and debug * change default parameter definition * add uint8 support * small fix for cosine distance of uint8 * fix AVX distance calculation epu8 * update readme * Update DistanceUtils.h * fix python wrapper cannot load larger than 4G memory error * try to add C# wrapper * fix owner of C# wrapper * add C# cmake support * fix byte array copy * fix tab to space * Try to make shared_ptr as Array template * fix copy * add Parameters documents * remove tbb dependency * fix concurrent_set * fix gcc 5.x cannot support shared_mutex * move concurrentset to Helper folder and change find to contains * Update README.md * try to use shared_lock to replace lock and unlock, try to use block to manage the increased memory * fix filling -1 * fix initialization * change to memset * add CLR CoreInterface for managed dll * try to reserve incBlocks capacity * fix return ErrorCode for AddBatch in Dataset.h * change return type to ErrorCode for AddBatch * remove the tbb dependency (#71) (#10) * remove dup code * Update Readme.md * Fix DataSet GNU compile fail bug * fix GNU Windows align alloc bugs * add copyright in each file * fix copy right in dataset * change kdt distance judgement * change code structure and add more wrappers * Update docs * fix search result * change IndexBuilder to support binary input data * temp remove java related projects * remove javaclient and javacore from the windows build * Fix SetData issue * Add vector record count and dimension for reuse and debug * change default parameter definition * add uint8 support * small fix for cosine distance of uint8 * fix AVX distance calculation epu8 * update readme * Update DistanceUtils.h * fix python wrapper cannot load larger than 4G memory error * try to add C# wrapper * fix owner of C# wrapper * add C# cmake support * fix byte array copy * fix tab to space * Try to make shared_ptr as Array template * fix copy * add Parameters documents * remove tbb dependency * fix concurrent_set * fix gcc 5.x cannot support shared_mutex * move concurrentset to Helper folder and change find to contains * Update README.md * try to use shared_lock to replace lock and unlock, try to use block to manage the increased memory * fix filling -1 * fix initialization * change to memset * add CLR CoreInterface for managed dll * try to reserve incBlocks capacity * fix return ErrorCode for AddBatch in Dataset.h * change return type to ErrorCode for AddBatch * fix type definition * change incremental update design * fix all type * fix debug mode memory delete assert * add deletePercentageForRefine judgement * add dump and load from byte array * add dump and load from byte array * fix getNumThreads * fix loadindex and add index bugs * Update AlgoTest to add metamapping test * fix compling error in g++7 * fix largest cluster cannot be split during clustering * fix maxcluster is -1 bug * fix Reader type definition and add more support * fix maxcluster is -1 bug (#91) (#14) * remove dup code * Update Readme.md * Fix DataSet GNU compile fail bug * fix GNU Windows align alloc bugs * add copyright in each file * fix copy right in dataset * change kdt distance judgement * change code structure and add more wrappers * Update docs * fix search result * change IndexBuilder to support binary input data * temp remove java related projects * remove javaclient and javacore from the windows build * Fix SetData issue * Add vector record count and dimension for reuse and debug * change default parameter definition * add uint8 support * small fix for cosine distance of uint8 * fix AVX distance calculation epu8 * update readme * Update DistanceUtils.h * fix python wrapper cannot load larger than 4G memory error * try to add C# wrapper * fix owner of C# wrapper * add C# cmake support * fix byte array copy * fix tab to space * Try to make shared_ptr as Array template * fix copy * add Parameters documents * remove tbb dependency * fix concurrent_set * fix gcc 5.x cannot support shared_mutex * move concurrentset to Helper folder and change find to contains * Update README.md * try to use shared_lock to replace lock and unlock, try to use block to manage the increased memory * fix filling -1 * fix initialization * change to memset * add CLR CoreInterface for managed dll * try to reserve incBlocks capacity * fix return ErrorCode for AddBatch in Dataset.h * change return type to ErrorCode for AddBatch * remove the tbb dependency (#71) (#10) * remove dup code * Update Readme.md * Fix DataSet GNU compile fail bug * fix GNU Windows align alloc bugs * add copyright in each file * fix copy right in dataset * change kdt distance judgement * change code structure and add more wrappers * Update docs * fix search result * change IndexBuilder to support binary input data * temp remove java related projects * remove javaclient and javacore from the windows build * Fix SetData issue * Add vector record count and dimension for reuse and debug * change default parameter definition * add uint8 support * small fix for cosine distance of uint8 * fix AVX distance calculation epu8 * update readme * Update DistanceUtils.h * fix python wrapper cannot load larger than 4G memory error * try to add C# wrapper * fix owner of C# wrapper * add C# cmake support * fix byte array copy * fix tab to space * Try to make shared_ptr as Array template * fix copy * add Parameters documents * remove tbb dependency * fix concurrent_set * fix gcc 5.x cannot support shared_mutex * move concurrentset to Helper folder and change find to contains * Update README.md * try to use shared_lock to replace lock and unlock, try to use block to manage the increased memory * fix filling -1 * fix initialization * change to memset * add CLR CoreInterface for managed dll * try to reserve incBlocks capacity * fix return ErrorCode for AddBatch in Dataset.h * change return type to ErrorCode for AddBatch * fix type definition * change incremental update design * fix all type * fix debug mode memory delete assert * add deletePercentageForRefine judgement * add dump and load from byte array * add dump and load from byte array * fix getNumThreads * fix loadindex and add index bugs * Update AlgoTest to add metamapping test * fix compling error in g++7 * fix largest cluster cannot be split during clustering * update fresh ANN implementation (#85) (#12) * remove dup code * Update Readme.md * Fix DataSet GNU compile fail bug * fix GNU Windows align alloc bugs * add copyright in each file * fix copy right in dataset * change kdt distance judgement * change code structure and add more wrappers * Update docs * fix search result * change IndexBuilder to support binary input data * temp remove java related projects * remove javaclient and javacore from the windows build * Fix SetData issue * Add vector record count and dimension for reuse and debug * change default parameter definition * add uint8 support * small fix for cosine distance of uint8 * fix AVX distance calculation epu8 * update readme * Update DistanceUtils.h * fix python wrapper cannot load larger than 4G memory error * try to add C# wrapper * fix owner of C# wrapper * add C# cmake support * fix byte array copy * fix tab to space * Try to make shared_ptr as Array template * fix copy * add Parameters documents * remove tbb dependency * fix concurrent_set * fix gcc 5.x cannot support shared_mutex * move concurrentset to Helper folder and change find to contains * Update README.md * try to use shared_lock to replace lock and unlock, try to use block to manage the increased memory * fix filling -1 * fix initialization * change to memset * add CLR CoreInterface for managed dll * try to reserve incBlocks capacity * fix return ErrorCode for AddBatch in Dataset.h * change return type to ErrorCode for AddBatch * remove the tbb dependency (#71) (#10) * remove dup code * Update Readme.md * Fix DataSet GNU compile fail bug * fix GNU Windows align alloc bugs * add copyright in each file * fix copy right in dataset * change kdt distance judgement * change code structure and add more wrappers * Update docs * fix search result * change IndexBuilder to support binary input data * temp remove java related projects * remove javaclient and javacore from the windows build * Fix SetData issue * Add vector record count and dimension for reuse and debug * change default parameter definition * add uint8 support * small fix for cosine distance of uint8 * fix AVX distance calculation epu8 * update readme * Update DistanceUtils.h * fix python wrapper cannot load larger than 4G memory error * try to add C# wrapper * fix owner of C# wrapper * add C# cmake support * fix byte array copy * fix tab to space * Try to make shared_ptr as Array template * fix copy * add Parameters documents * remove tbb dependency * fix concurrent_set * fix gcc 5.x cannot support shared_mutex * move concurrentset to Helper folder and change find to contains * Update README.md * try to use shared_lock to replace lock and unlock, try to use block to manage the increased memory * fix filling -1 * fix initialization * change to memset * add CLR CoreInterface for managed dll * try to reserve incBlocks capacity * fix return ErrorCode for AddBatch in Dataset.h * change return type to ErrorCode for AddBatch * fix type definition * change incremental update design * fix all type * fix debug mode memory delete assert * add deletePercentageForRefine judgement * add dump and load from byte array * add dump and load from byte array * fix getNumThreads * fix loadindex and add index bugs * Update AlgoTest to add metamapping test * fix compling error in g++7 * fix largest cluster cannot be split during clustering * fix maxcluster is -1 bug * move threadPool init into DefaultReader * try to move VectorsetReader into CordLibrary * fix bktree cluster split issue --- AnnService/CoreLibrary.vcxproj | 4 ++ AnnService/CoreLibrary.vcxproj.filters | 18 ++++++ AnnService/IndexBuilder.vcxproj | 4 -- AnnService/IndexBuilder.vcxproj.filters | 24 +------- AnnService/inc/Core/Common/BKTree.h | 5 +- AnnService/inc/Core/VectorIndex.h | 2 + AnnService/inc/Helper/VectorSetReader.h | 59 +++++++++++++++++++ .../VectorSetReaders/DefaultReader.h | 18 +++--- AnnService/inc/IndexBuilder/Options.h | 12 +--- AnnService/inc/IndexBuilder/VectorSetReader.h | 43 -------------- AnnService/src/Core/VectorIndex.cpp | 11 ++++ AnnService/src/Helper/VectorSetReader.cpp | 44 ++++++++++++++ .../VectorSetReaders/DefaultReader.cpp | 38 ++++++------ AnnService/src/IndexBuilder/Options.cpp | 8 +-- .../src/IndexBuilder/VectorSetReader.cpp | 27 --------- AnnService/src/IndexBuilder/main.cpp | 7 +-- 16 files changed, 179 insertions(+), 145 deletions(-) create mode 100644 AnnService/inc/Helper/VectorSetReader.h rename AnnService/inc/{IndexBuilder => Helper}/VectorSetReaders/DefaultReader.h (83%) delete mode 100644 AnnService/inc/IndexBuilder/VectorSetReader.h create mode 100644 AnnService/src/Helper/VectorSetReader.cpp rename AnnService/src/{IndexBuilder => Helper}/VectorSetReaders/DefaultReader.cpp (92%) delete mode 100644 AnnService/src/IndexBuilder/VectorSetReader.cpp diff --git a/AnnService/CoreLibrary.vcxproj b/AnnService/CoreLibrary.vcxproj index 1ff4f4c03..08921f244 100644 --- a/AnnService/CoreLibrary.vcxproj +++ b/AnnService/CoreLibrary.vcxproj @@ -164,6 +164,8 @@ + + @@ -178,6 +180,8 @@ + + diff --git a/AnnService/CoreLibrary.vcxproj.filters b/AnnService/CoreLibrary.vcxproj.filters index 656682fea..94f27df3f 100644 --- a/AnnService/CoreLibrary.vcxproj.filters +++ b/AnnService/CoreLibrary.vcxproj.filters @@ -38,6 +38,12 @@ {8fb36afb-73ed-4c3d-8c9b-c3581d80c5d1} + + {f7bc0bc7-1af5-4870-b8ee-fabdbabdb4c4} + + + {5c1449e0-38b7-4c82-976e-cbdc488d3fb5} + @@ -139,6 +145,12 @@ Header Files\Helper + + Header Files\Helper\VectorSetReaders + + + Header Files\Helper + @@ -177,6 +189,12 @@ Source Files\Core\Common + + Source Files\Helper\VectorSetReaders + + + Source Files\Helper + diff --git a/AnnService/IndexBuilder.vcxproj b/AnnService/IndexBuilder.vcxproj index d3c9e0fcd..a5d05fb47 100644 --- a/AnnService/IndexBuilder.vcxproj +++ b/AnnService/IndexBuilder.vcxproj @@ -139,15 +139,11 @@ - - - - diff --git a/AnnService/IndexBuilder.vcxproj.filters b/AnnService/IndexBuilder.vcxproj.filters index dcd29861c..0733fae1c 100644 --- a/AnnService/IndexBuilder.vcxproj.filters +++ b/AnnService/IndexBuilder.vcxproj.filters @@ -1,4 +1,4 @@ - + @@ -9,12 +9,6 @@ {93995380-89BD-4b04-88EB-625FBE52EBFB} h;hh;hpp;hxx;hm;inl;inc;xsd - - {cf68b421-6a65-44f2-bf43-438b13940d7d} - - - {41ac91f9-6b6d-4341-8791-12f672d6ad5c} - @@ -23,27 +17,15 @@ Header Files - - Header Files - - - Header Files\VectorSetReaders - Source Files - - Source Files - - + Source Files - - Source Files\VectorSetReaders - - + Source Files diff --git a/AnnService/inc/Core/Common/BKTree.h b/AnnService/inc/Core/Common/BKTree.h index 4526d00a0..cce56dfc1 100644 --- a/AnnService/inc/Core/Common/BKTree.h +++ b/AnnService/inc/Core/Common/BKTree.h @@ -1,4 +1,4 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. + // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. #ifndef _SPTAG_COMMON_BKTREE_H_ @@ -366,8 +366,7 @@ namespace SPTAG int maxcluster = -1; SizeType maxCount = 0; for (int k = 0; k < m_iBKTKmeansK; k++) { - void* currCenter = (void*)(args.centers + k * p_index->GetFeatureDim()); - if (args.newCounts[k] > maxCount && args.clusterDist[k] > p_index->ComputeDistance(currCenter, currCenter) + lambda*args.counts[k]) + if (args.newCounts[k] > maxCount && DistanceUtils::ComputeL2Distance((T*)p_index->GetSample(args.clusterIdx[k]), args.centers + k * p_index->GetFeatureDim(), p_index->GetFeatureDim()) > 1e-6) { maxcluster = k; maxCount = args.newCounts[k]; diff --git a/AnnService/inc/Core/VectorIndex.h b/AnnService/inc/Core/VectorIndex.h index 3552d8eec..49475794d 100644 --- a/AnnService/inc/Core/VectorIndex.h +++ b/AnnService/inc/Core/VectorIndex.h @@ -61,6 +61,8 @@ class VectorIndex virtual ErrorCode DeleteIndex(ByteArray p_meta); + virtual const void* GetSample(ByteArray p_meta); + virtual ErrorCode SearchIndex(const void* p_vector, int p_neighborCount, bool p_withMeta, BasicResult* p_results) const; virtual std::string GetParameter(const std::string& p_param) const; diff --git a/AnnService/inc/Helper/VectorSetReader.h b/AnnService/inc/Helper/VectorSetReader.h new file mode 100644 index 000000000..cd148c1d0 --- /dev/null +++ b/AnnService/inc/Helper/VectorSetReader.h @@ -0,0 +1,59 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifndef _SPTAG_HELPER_VECTORSETREADER_H_ +#define _SPTAG_HELPER_VECTORSETREADER_H_ + +#include "inc/Core/Common.h" +#include "inc/Core/VectorSet.h" +#include "inc/Core/MetadataSet.h" +#include "inc/Helper/ArgumentsParser.h" + +#include + +namespace SPTAG +{ +namespace Helper +{ + +class ReaderOptions : public ArgumentsParser +{ +public: + ReaderOptions(VectorValueType p_valueType, DimensionType p_dimension, std::string p_vectorDelimiter = "|", std::uint32_t p_threadNum = 32); + + ~ReaderOptions(); + + std::uint32_t m_threadNum; + + DimensionType m_dimension; + + std::string m_vectorDelimiter; + + SPTAG::VectorValueType m_inputValueType; +}; + +class VectorSetReader +{ +public: + VectorSetReader(std::shared_ptr p_options); + + virtual ~VectorSetReader(); + + virtual ErrorCode LoadFile(const std::string& p_filePath) = 0; + + virtual std::shared_ptr GetVectorSet() const = 0; + + virtual std::shared_ptr GetMetadataSet() const = 0; + + static std::shared_ptr CreateInstance(std::shared_ptr p_options); + +protected: + std::shared_ptr m_options; +}; + + + +} // namespace Helper +} // namespace SPTAG + +#endif // _SPTAG_HELPER_VECTORSETREADER_H_ diff --git a/AnnService/inc/IndexBuilder/VectorSetReaders/DefaultReader.h b/AnnService/inc/Helper/VectorSetReaders/DefaultReader.h similarity index 83% rename from AnnService/inc/IndexBuilder/VectorSetReaders/DefaultReader.h rename to AnnService/inc/Helper/VectorSetReaders/DefaultReader.h index e3e1911a9..52c8404ca 100644 --- a/AnnService/inc/IndexBuilder/VectorSetReaders/DefaultReader.h +++ b/AnnService/inc/Helper/VectorSetReaders/DefaultReader.h @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#ifndef _SPTAG_INDEXBUILDER_VECTORSETREADERS_DEFAULTREADER_H_ -#define _SPTAG_INDEXBUILDER_VECTORSETREADERS_DEFAULTREADER_H_ +#ifndef _SPTAG_HELPER_VECTORSETREADERS_DEFAULTREADER_H_ +#define _SPTAG_HELPER_VECTORSETREADERS_DEFAULTREADER_H_ #include "../VectorSetReader.h" #include "inc/Helper/Concurrent.h" @@ -13,13 +13,13 @@ namespace SPTAG { -namespace IndexBuilder +namespace Helper { class DefaultReader : public VectorSetReader { public: - DefaultReader(std::shared_ptr p_options); + DefaultReader(std::shared_ptr p_options); virtual ~DefaultReader(); @@ -44,7 +44,7 @@ class DefaultReader : public VectorSetReader template bool TranslateVector(char* p_str, DataType* p_vector) { - std::uint32_t eleCount = 0; + DimensionType eleCount = 0; char* next = p_str; while ((*next) != '\0') { @@ -85,11 +85,11 @@ class DefaultReader : public VectorSetReader std::size_t m_subTaskBlocksize; - std::atomic m_totalRecordCount; + std::atomic m_totalRecordCount; std::atomic m_totalRecordVectorBytes; - std::vector m_subTaskRecordCount; + std::vector m_subTaskRecordCount; std::string m_vectorOutput; @@ -102,7 +102,7 @@ class DefaultReader : public VectorSetReader -} // namespace IndexBuilder +} // namespace Helper } // namespace SPTAG -#endif // _SPTAG_INDEXBUILDER_VECTORSETREADERS_DEFAULT_H_ +#endif // _SPTAG_HELPER_VECTORSETREADERS_DEFAULT_H_ diff --git a/AnnService/inc/IndexBuilder/Options.h b/AnnService/inc/IndexBuilder/Options.h index 7c939efae..4ebe6b27a 100644 --- a/AnnService/inc/IndexBuilder/Options.h +++ b/AnnService/inc/IndexBuilder/Options.h @@ -5,7 +5,7 @@ #define _SPTAG_INDEXBUILDER_OPTIONS_H_ #include "inc/Core/Common.h" -#include "inc/Helper/ArgumentsParser.h" +#include "inc/Helper/VectorsetReader.h" #include #include @@ -16,21 +16,13 @@ namespace SPTAG namespace IndexBuilder { -class BuilderOptions : public Helper::ArgumentsParser +class BuilderOptions : public Helper::ReaderOptions { public: BuilderOptions(); ~BuilderOptions(); - std::uint32_t m_threadNum; - - std::uint32_t m_dimension; - - std::string m_vectorDelimiter; - - SPTAG::VectorValueType m_inputValueType; - std::string m_inputFiles; std::string m_outputFolder; diff --git a/AnnService/inc/IndexBuilder/VectorSetReader.h b/AnnService/inc/IndexBuilder/VectorSetReader.h deleted file mode 100644 index 6bb3026aa..000000000 --- a/AnnService/inc/IndexBuilder/VectorSetReader.h +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#ifndef _SPTAG_INDEXBUILDER_VECTORSETREADER_H_ -#define _SPTAG_INDEXBUILDER_VECTORSETREADER_H_ - -#include "inc/Core/Common.h" -#include "inc/Core/VectorSet.h" -#include "inc/Core/MetadataSet.h" -#include "Options.h" - -#include - -namespace SPTAG -{ -namespace IndexBuilder -{ - -class VectorSetReader -{ -public: - VectorSetReader(std::shared_ptr p_options); - - virtual ~VectorSetReader(); - - virtual ErrorCode LoadFile(const std::string& p_filePath) = 0; - - virtual std::shared_ptr GetVectorSet() const = 0; - - virtual std::shared_ptr GetMetadataSet() const = 0; - - static std::shared_ptr CreateInstance(std::shared_ptr p_options); - -protected: - std::shared_ptr m_options; -}; - - - -} // namespace IndexBuilder -} // namespace SPTAG - -#endif // _SPTAG_INDEXBUILDER_VECTORSETREADER_H_ diff --git a/AnnService/src/Core/VectorIndex.cpp b/AnnService/src/Core/VectorIndex.cpp index 5ed9b36af..9c7ccf549 100644 --- a/AnnService/src/Core/VectorIndex.cpp +++ b/AnnService/src/Core/VectorIndex.cpp @@ -315,6 +315,17 @@ VectorIndex::DeleteIndex(ByteArray p_meta) { } +const void* VectorIndex::GetSample(ByteArray p_meta) +{ + if (m_pMetaToVec == nullptr) return nullptr; + + std::string meta((char*)p_meta.Data(), p_meta.Length()); + auto iter = m_pMetaToVec->find(meta); + if (iter != m_pMetaToVec->end()) return GetSample(iter->second); + return nullptr; +} + + std::shared_ptr VectorIndex::CreateInstance(IndexAlgoType p_algo, VectorValueType p_valuetype) { diff --git a/AnnService/src/Helper/VectorSetReader.cpp b/AnnService/src/Helper/VectorSetReader.cpp new file mode 100644 index 000000000..44371ae24 --- /dev/null +++ b/AnnService/src/Helper/VectorSetReader.cpp @@ -0,0 +1,44 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "inc/Helper/VectorSetReader.h" +#include "inc/Helper/VectorSetReaders/DefaultReader.h" + + +using namespace SPTAG; +using namespace SPTAG::Helper; + + +ReaderOptions::ReaderOptions(VectorValueType p_valueType, DimensionType p_dimension, std::string p_vectorDelimiter, std::uint32_t p_threadNum) + : m_threadNum(p_threadNum), m_dimension(p_dimension), m_vectorDelimiter(p_vectorDelimiter), m_inputValueType(p_valueType) +{ + AddOptionalOption(m_threadNum, "-t", "--thread", "Thread Number."); + AddOptionalOption(m_vectorDelimiter, "", "--delimiter", "Vector delimiter."); + AddRequiredOption(m_dimension, "-d", "--dimension", "Dimension of vector."); + AddRequiredOption(m_inputValueType, "-v", "--vectortype", "Input vector data type. Default is float."); +} + + +ReaderOptions::~ReaderOptions() +{ +} + + +VectorSetReader::VectorSetReader(std::shared_ptr p_options) + : m_options(p_options) +{ +} + + +VectorSetReader:: ~VectorSetReader() +{ +} + + +std::shared_ptr +VectorSetReader::CreateInstance(std::shared_ptr p_options) +{ + return std::shared_ptr(new DefaultReader(std::move(p_options))); +} + + diff --git a/AnnService/src/IndexBuilder/VectorSetReaders/DefaultReader.cpp b/AnnService/src/Helper/VectorSetReaders/DefaultReader.cpp similarity index 92% rename from AnnService/src/IndexBuilder/VectorSetReaders/DefaultReader.cpp rename to AnnService/src/Helper/VectorSetReaders/DefaultReader.cpp index 7f7c4187b..fb079c895 100644 --- a/AnnService/src/IndexBuilder/VectorSetReaders/DefaultReader.cpp +++ b/AnnService/src/Helper/VectorSetReaders/DefaultReader.cpp @@ -1,17 +1,17 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "inc/IndexBuilder/VectorSetReaders/DefaultReader.h" +#include "inc/Helper/VectorSetReaders/DefaultReader.h" #include "inc/Helper/StringConvert.h" #include "inc/Helper/CommonHelper.h" -#include "inc/IndexBuilder/ThreadPool.h" #include #include #include +#include using namespace SPTAG; -using namespace SPTAG::IndexBuilder; +using namespace SPTAG::Helper; namespace { @@ -139,10 +139,13 @@ class BinaryLineReader } // namespace Local } // namespace -DefaultReader::DefaultReader(std::shared_ptr p_options) + +DefaultReader::DefaultReader(std::shared_ptr p_options) : VectorSetReader(std::move(p_options)), - m_subTaskBlocksize(0) + m_subTaskBlocksize(0) { + omp_set_num_threads(m_options->m_threadNum); + std::string tempFolder("tempfolder"); if (!direxists(tempFolder.c_str())) { @@ -180,7 +183,7 @@ DefaultReader::LoadFile(const std::string& p_filePaths) { const auto& files = GetFileSizes(p_filePaths); std::vector> subWorks; - subWorks.reserve(files.size() * ThreadPool::CurrentThreadNum()); + subWorks.reserve(files.size() * m_options->m_threadNum); m_subTaskCount = 0; for (const auto& fileInfo : files) @@ -197,7 +200,7 @@ DefaultReader::LoadFile(const std::string& p_filePaths) std::size_t blockSize = m_subTaskBlocksize; if (0 == blockSize) { - fileTaskCount = ThreadPool::CurrentThreadNum(); + fileTaskCount = m_options->m_threadNum; blockSize = (fileInfo.second + fileTaskCount - 1) / fileTaskCount; } else @@ -223,9 +226,10 @@ DefaultReader::LoadFile(const std::string& p_filePaths) m_waitSignal.Reset(m_subTaskCount); - for (auto& workItem : subWorks) +#pragma omp parallel for schedule(dynamic) + for (int64_t i = 0; i < (int64_t)subWorks.size(); i++) { - ThreadPool::Queue(std::move(workItem)); + subWorks[i](); } m_waitSignal.Wait(); @@ -244,7 +248,7 @@ DefaultReader::GetVectorSet() const std::ifstream inputStream; inputStream.open(m_vectorOutput, std::ifstream::binary); - inputStream.seekg(sizeof(uint32_t) + sizeof(uint32_t), std::ifstream::beg); + inputStream.seekg(sizeof(SizeType) + sizeof(DimensionType), std::ifstream::beg); inputStream.read(vecBuf, m_totalRecordVectorBytes); inputStream.close(); @@ -276,7 +280,7 @@ DefaultReader::LoadFileInternal(const std::string& p_filePath, std::ofstream metaStreamContent; std::ofstream metaStreamIndex; - std::uint32_t recordCount = 0; + SizeType recordCount = 0; std::uint64_t metaOffset = 0; std::size_t totalRead = 0; std::streamoff startpos = p_fileBlockID * p_fileBlockSize; @@ -400,11 +404,11 @@ DefaultReader::MergeData() std::unique_ptr bufferHolder(new char[bufferSize]); char* buf = bufferHolder.get(); - std::uint32_t uint32Var = m_totalRecordCount; + SizeType totalRecordCount = m_totalRecordCount; outputStream.open(m_vectorOutput, std::ofstream::binary); - outputStream.write(reinterpret_cast(&uint32Var), sizeof(uint32Var)); + outputStream.write(reinterpret_cast(&totalRecordCount), sizeof(totalRecordCount)); outputStream.write(reinterpret_cast(&(m_options->m_dimension)), sizeof(m_options->m_dimension)); for (std::uint32_t i = 0; i < m_subTaskCount; ++i) @@ -442,7 +446,7 @@ DefaultReader::MergeData() outputStream.open(m_metadataIndexOutput, std::ofstream::binary); - outputStream.write(reinterpret_cast(&uint32Var), sizeof(uint32Var)); + outputStream.write(reinterpret_cast(&totalRecordCount), sizeof(totalRecordCount)); std::uint64_t totalOffset = 0; for (std::uint32_t i = 0; i < m_subTaskCount; ++i) @@ -453,18 +457,18 @@ DefaultReader::MergeData() file += ".tmp"; inputStream.open(file, std::ifstream::binary); - for (std::uint32_t remains = m_subTaskRecordCount[i]; remains > 0;) + for (SizeType remains = m_subTaskRecordCount[i]; remains > 0;) { std::size_t readBytesCount = min(remains * sizeof(std::uint64_t), bufferSizeTrim64); inputStream.read(buf, readBytesCount); std::uint64_t* offset = reinterpret_cast(buf); - for (std::uint32_t i = 0; i < readBytesCount / sizeof(std::uint64_t); ++i) + for (std::uint64_t i = 0; i < readBytesCount / sizeof(std::uint64_t); ++i) { offset[i] += totalOffset; } outputStream.write(buf, readBytesCount); - remains -= static_cast(readBytesCount / sizeof(std::uint64_t)); + remains -= static_cast(readBytesCount / sizeof(std::uint64_t)); } inputStream.read(buf, sizeof(std::uint64_t)); diff --git a/AnnService/src/IndexBuilder/Options.cpp b/AnnService/src/IndexBuilder/Options.cpp index d0fcd0fd8..6360b73c2 100644 --- a/AnnService/src/IndexBuilder/Options.cpp +++ b/AnnService/src/IndexBuilder/Options.cpp @@ -11,14 +11,8 @@ using namespace SPTAG::IndexBuilder; BuilderOptions::BuilderOptions() - : m_threadNum(32), - m_inputValueType(VectorValueType::Float), - m_vectorDelimiter("|") + : Helper::ReaderOptions(VectorValueType::Float, 0, "|", 32) { - AddOptionalOption(m_threadNum, "-t", "--thread", "Thread Number."); - AddOptionalOption(m_vectorDelimiter, "", "--delimiter", "Vector delimiter."); - AddRequiredOption(m_dimension, "-d", "--dimension", "Dimension of vector."); - AddRequiredOption(m_inputValueType, "-v", "--vectortype", "Input vector data type. Default is float."); AddRequiredOption(m_inputFiles, "-i", "--input", "Input raw data."); AddRequiredOption(m_outputFolder, "-o", "--outputfolder", "Output folder."); AddRequiredOption(m_indexAlgoType, "-a", "--algo", "Index Algorithm type."); diff --git a/AnnService/src/IndexBuilder/VectorSetReader.cpp b/AnnService/src/IndexBuilder/VectorSetReader.cpp deleted file mode 100644 index e50f6f5eb..000000000 --- a/AnnService/src/IndexBuilder/VectorSetReader.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "inc/IndexBuilder/VectorSetReader.h" -#include "inc/IndexBuilder/VectorSetReaders/DefaultReader.h" - - -using namespace SPTAG; -using namespace SPTAG::IndexBuilder; - -VectorSetReader::VectorSetReader(std::shared_ptr p_options) - : m_options(p_options) -{ -} - - -VectorSetReader:: ~VectorSetReader() -{ -} - - -std::shared_ptr -VectorSetReader::CreateInstance(std::shared_ptr p_options) -{ - return std::shared_ptr(new DefaultReader(std::move(p_options))); -} - diff --git a/AnnService/src/IndexBuilder/main.cpp b/AnnService/src/IndexBuilder/main.cpp index 055cd3265..040703c3c 100644 --- a/AnnService/src/IndexBuilder/main.cpp +++ b/AnnService/src/IndexBuilder/main.cpp @@ -1,9 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "inc/IndexBuilder/ThreadPool.h" #include "inc/IndexBuilder/Options.h" -#include "inc/IndexBuilder/VectorSetReader.h" +#include "inc/Helper/VectorSetReader.h" #include "inc/Core/VectorIndex.h" #include "inc/Core/Common.h" #include "inc/Helper/SimpleIniReader.h" @@ -20,7 +19,7 @@ int main(int argc, char* argv[]) { exit(1); } - IndexBuilder::ThreadPool::Init(options->m_threadNum); + auto indexBuilder = VectorIndex::CreateInstance(options->m_indexAlgoType, options->m_inputValueType); Helper::IniReader iniReader; @@ -82,7 +81,7 @@ int main(int argc, char* argv[]) indexBuilder->SaveIndex(options->m_outputFolder); } else { - auto vectorReader = IndexBuilder::VectorSetReader::CreateInstance(options); + auto vectorReader = Helper::VectorSetReader::CreateInstance(options); if (ErrorCode::Success != vectorReader->LoadFile(options->m_inputFiles)) { fprintf(stderr, "Failed to read input file.\n");