From 36bebeede45b2d0ba9815c4ed25f20e1d8c477c2 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Tue, 11 Jan 2022 00:15:28 +0900 Subject: [PATCH 1/2] Found a subtle bug caused by data race. --- CMakeLists.txt | 12 ++++++++---- cpp_sources/child_worker.cpp | 8 ++++---- cpp_sources/trainer_base.cpp | 12 +++++++----- cpp_sources/trainer_base.hpp | 3 +++ tests/test_lda.py | 9 ++++++++- tests/test_llda.py | 6 +++--- 6 files changed, 33 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 776be75..ee55f51 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,11 @@ -cmake_minimum_required(VERSION 2.8.12) +cmake_minimum_required(VERSION 3.0.0) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -project(lda11) +set(CMAKE_RELS) +set(CMAKE_BUILD_TYPE Debug) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS_INIT} -std=c++11 -fPIC -O0") +set(PYBIND11_PYTHON_VERSION 3.10) add_subdirectory(pybind11) -include_directories(eigen-3.3.7) -pybind11_add_module(lda11 src/wrapper.cpp src/labelled_lda.cpp src/trainer_base.cpp src/trainer.cpp src/predictor.cpp src/child_worker.cpp) +include_directories(eigen-3.4.0) +pybind11_add_module(_lda cpp_sources/wrapper.cpp cpp_sources/labelled_lda.cpp cpp_sources/trainer_base.cpp cpp_sources/trainer.cpp cpp_sources/predictor.cpp cpp_sources/child_worker.cpp) diff --git a/cpp_sources/child_worker.cpp b/cpp_sources/child_worker.cpp index ed28969..130fba6 100644 --- a/cpp_sources/child_worker.cpp +++ b/cpp_sources/child_worker.cpp @@ -73,6 +73,7 @@ void LDATrainerBase::ChildWorker::sync_topic( const Eigen::Ref &word_topic_global, const Eigen::Ref &doc_topic_global, const Eigen::Ref &topic_counts) { + word_topic_local = word_topic_global; topic_counts_local = topic_counts; for (size_t internal_dix = 0; internal_dix < global_indices.size(); @@ -86,7 +87,6 @@ void LDATrainerBase::ChildWorker::do_work( Eigen::Ref word_topic, Eigen::Ref doc_topic, Eigen::Ref topic_counts, const Eigen::Ref &topic_word_prior) { - this->sync_topic(word_topic, doc_topic, topic_counts); this->decr_count(word_topic, doc_topic, topic_counts); Real eta_sum = topic_word_prior.sum(); RealVector p_(n_topics_); @@ -98,8 +98,7 @@ void LDATrainerBase::ChildWorker::do_work( size_t global_dix = global_indices[ws.doc_id]; p_ = (word_topic_local.row(ws.word_id).cast().transpose().array() + - topic_word_prior.array()) - .array() / + topic_word_prior(ws.word_id)) / (topic_counts_local.cast().array() + eta_sum) * (doc_topic_local.row(ws.doc_id).cast().transpose().array() + parent_->obtain_doc_topic_prior(global_dix).array()); @@ -111,6 +110,7 @@ void LDATrainerBase::ChildWorker::do_work( topic_counts_local(ws.topic_id)++; } this->add_count(word_topic, doc_topic, topic_counts); + this->epoch++; } RealMatrix LDATrainerBase::ChildWorker::obtain_phi( @@ -126,7 +126,7 @@ RealMatrix LDATrainerBase::ChildWorker::obtain_phi( topic_counts_local(ws.topic_id)--; size_t global_dix = global_indices[ws.doc_id]; p_ = (word_topic_local.row(ws.word_id).cast().transpose().array() + - topic_word_prior.array()) + topic_word_prior(ws.word_id)) .array() / (topic_counts_local.cast().array() + eta_sum) * (doc_topic_local.row(ws.doc_id).cast().transpose().array() + diff --git a/cpp_sources/trainer_base.cpp b/cpp_sources/trainer_base.cpp index 6c72dad..f12904f 100644 --- a/cpp_sources/trainer_base.cpp +++ b/cpp_sources/trainer_base.cpp @@ -21,7 +21,7 @@ LDATrainerBase::LDATrainerBase(Eigen::Ref counts, if (n_workers == 0) { throw std::invalid_argument("n_workers has to be strictly positive."); - } else if (n_workers == 1) { + } else if (n_workers == 1u) { for (size_t i = 0; i < n_; i++) { size_t c = counts(i); size_t dix = dixs(i); @@ -43,7 +43,7 @@ LDATrainerBase::LDATrainerBase(Eigen::Ref counts, size_t assigned; if (dix_seen.find(dix) == dix_seen.end()) { assigned = std::floor(urand_.rand() * n_workers); - assigned = assigned < n_workers - 1 ? assigned : n_workers - 1; + assigned = assigned < (n_workers - 1) ? assigned : n_workers - 1; dix_seen.insert({dix, assigned}); children[assigned]->add_doc(dix); } else { @@ -92,8 +92,7 @@ void LDATrainerBase::iterate_gibbs(Eigen::Ref topic_word_prior, topic_counts(ws.topic_id)--; p_ = (word_topic.row(ws.word_id).cast().transpose().array() + - topic_word_prior.array()) - .array() / + topic_word_prior(ws.word_id)) / (topic_counts.cast().array() + eta_sum) * (doc_topic.row(ws.doc_id).cast().transpose().array() + obtain_doc_topic_prior(ws.doc_id).array()); @@ -106,6 +105,9 @@ void LDATrainerBase::iterate_gibbs(Eigen::Ref topic_word_prior, } } else { std::vector workers; + for (auto &child : children) { + child->sync_topic(word_topic, doc_topic, topic_counts); + } for (auto &child : children) { workers.emplace_back([&child, &word_topic, &doc_topic, &topic_counts, &topic_word_prior] { @@ -135,7 +137,7 @@ LDATrainerBase::obtain_phi(const Eigen::Ref &topic_word_prior, word_topic(ws.word_id, ws.topic_id)--; topic_counts(ws.topic_id)--; p_ = (word_topic.row(ws.word_id).cast().transpose().array() + - topic_word_prior.array()) + topic_word_prior(ws.word_id)) .array() / (topic_counts.cast().array() + eta_sum) * (doc_topic.row(ws.doc_id).cast().transpose().array() + diff --git a/cpp_sources/trainer_base.hpp b/cpp_sources/trainer_base.hpp index bb819ce..b170b58 100644 --- a/cpp_sources/trainer_base.hpp +++ b/cpp_sources/trainer_base.hpp @@ -54,12 +54,15 @@ struct LDATrainerBase { UrandDevice urand_; std::vector word_states_local; + + private: IntegerMatrix doc_topic_local; IntegerMatrix word_topic_local; IntegerVector topic_counts_local; std::vector global_indices; std::unordered_map dix_to_internal_index; + int epoch; }; protected: diff --git a/tests/test_lda.py b/tests/test_lda.py index a7cb1d7..9e52936 100644 --- a/tests/test_lda.py +++ b/tests/test_lda.py @@ -7,7 +7,14 @@ def test_lda(docs_gen: Docs) -> None: (X1, _), true_theta = docs_gen.gen_doc(1000) - lda = LDA(2, n_iter=50, optimize_interval=1, optimize_burn_in=25, use_cgs_p=False) + lda = LDA( + 2, + n_iter=50, + optimize_interval=1, + optimize_burn_in=25, + use_cgs_p=False, + n_workers=4, + ) lda.fit(X1) phi1 = lda.phi diff --git a/tests/test_llda.py b/tests/test_llda.py index 6d74d30..2611770 100644 --- a/tests/test_llda.py +++ b/tests/test_llda.py @@ -43,7 +43,7 @@ def test_llda() -> None: A_word_index = np.where(TOPIC_A > 0.1)[0] B_word_index = np.where(TOPIC_A < 0.1)[0] - for A_index, cgs_p, n_threads in zip([1, 2], [False, True], [1, 2]): + for A_index, cgs_p in zip([1, 2], [False, True]): if A_index == 1: language = LabelledLanguage(TOPIC_A, TOPIC_B) B_index = 2 @@ -53,7 +53,7 @@ def test_llda() -> None: X, Y = language.gen_doc(1000) - llda = LabelledLDA(use_cgs_p=cgs_p, n_workers=n_threads).fit(X, Y) + llda = LabelledLDA(use_cgs_p=cgs_p, n_workers=1, n_iter=100).fit(X, Y) if sys.platform.startswith("linux"): with NamedTemporaryFile() as temp_fs: pickle.dump(llda, temp_fs) @@ -70,7 +70,7 @@ def test_llda() -> None: A_DOC = np.asarray(([0, 10, 0, 10]), dtype=np.int32) for mode in ["mf", "gibbs"]: - theta = llda_new.transform(A_DOC, mode=mode, n_workers=n_threads)[0] # type: ignore + theta = llda_new.transform(A_DOC, mode=mode, n_workers=1)[0] # type: ignore if A_index == 1: assert (theta[1] / theta[2]) > 5 else: From 114de6d6e8c66ea8950e235f719e004c9b675caa Mon Sep 17 00:00:00 2001 From: Tomoki Date: Tue, 11 Jan 2022 00:22:32 +0900 Subject: [PATCH 2/2] efficient test path --- tests/test_lda.py | 2 +- tests/test_mlda.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_lda.py b/tests/test_lda.py index 9e52936..6acdbaf 100644 --- a/tests/test_lda.py +++ b/tests/test_lda.py @@ -12,7 +12,7 @@ def test_lda(docs_gen: Docs) -> None: n_iter=50, optimize_interval=1, optimize_burn_in=25, - use_cgs_p=False, + use_cgs_p=True, n_workers=4, ) lda.fit(X1) diff --git a/tests/test_mlda.py b/tests/test_mlda.py index 5ee03ce..fb0525e 100644 --- a/tests/test_mlda.py +++ b/tests/test_mlda.py @@ -9,7 +9,9 @@ def test_mlda(docs_gen: Docs) -> None: (X1, X2), true_theta = docs_gen.gen_doc(1000) X2 = sps.lil_matrix(X2) - lda = MultilingualLDA(2, n_iter=50, optimize_interval=1, optimize_burn_in=25) + lda = MultilingualLDA( + 2, n_iter=50, optimize_interval=1, optimize_burn_in=25, use_cgs_p=False + ) lda.fit(X1, X2) phi1, phi2 = lda.phis