From a4269d7d3ff0eac91c9b4c510d667bcbf31208f5 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Sun, 9 Jan 2022 21:48:45 +0900 Subject: [PATCH 01/32] add pre-commit --- .gitignore | 4 +-- .pre-commit-config.yaml | 28 ++++++++++++++++ CMakeLists.txt | 2 +- README.md | 2 -- examples/compare_perplexity.py | 50 ++++++++++++++--------------- lda11/__init__.py | 2 +- lda11/labelled_lda.py | 58 ++++++++++++++++++---------------- lda11/lda.py | 14 ++++---- lda11/util.py | 5 +-- pyproject.toml | 15 +++++++++ setup.py | 12 ++++--- src/defs.hpp | 2 +- 12 files changed, 119 insertions(+), 75 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 pyproject.toml diff --git a/.gitignore b/.gitignore index 66885ed..90a0672 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.python-version **.ipynb_checkpoints** eigen-3.3.7/ build/* @@ -9,10 +10,9 @@ lda11.egg-info .vscode/* *.so test/* -pubind11/ .eggs/ var/ dist/ compile_commands.json .clangd/ -**.ipynb_checkpoints/** \ No newline at end of file +**.ipynb_checkpoints/** diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..cef10c9 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,28 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v3.2.0 + hooks: + - id: check-merge-conflict + - id: check-yaml + - id: end-of-file-fixer + - id: no-commit-to-branch + args: [--branch, main] + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files + - repo: https://github.com/PyCQA/isort + rev: 5.6.4 + hooks: + - id: isort + name: isort +# - repo: https://github.com/pre-commit/mirrors-mypy +# rev: v0.790 # Use the sha / tag you want to point at +# hooks: +# - id: mypy + - repo: https://github.com/psf/black + rev: 20.8b1 + hooks: + - id: black diff --git a/CMakeLists.txt b/CMakeLists.txt index 
ce7d696..776be75 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 2.8.12) -set(CMAKE_EXPORT_COMPILE_COMMANDS, True) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) project(lda11) add_subdirectory(pybind11) diff --git a/README.md b/README.md index 7acf19d..4f900df 100644 --- a/README.md +++ b/README.md @@ -21,5 +21,3 @@ type ``` EIGEN3_INCLUDE_DIR=/path/to/eigen pip install git+https://github.com/tohtsky/lda11 ``` - - diff --git a/examples/compare_perplexity.py b/examples/compare_perplexity.py index fcf7d5b..41c0df0 100644 --- a/examples/compare_perplexity.py +++ b/examples/compare_perplexity.py @@ -3,23 +3,22 @@ import numpy as np from scipy import sparse as sps -from sklearn.feature_extraction.text import CountVectorizer from sklearn.datasets import fetch_20newsgroups -from sklearn.model_selection import train_test_split from sklearn.decomposition import LatentDirichletAllocation as LDA_vb +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.model_selection import train_test_split from lda11 import LDA as LDA_cgs_p from lda11.util import rowwise_train_test_split N_TOPICS = 16 -print('reading data...') -dataset = fetch_20newsgroups(shuffle=False, - remove=('headers', 'footers', 'quotes')) +print("reading data...") +dataset = fetch_20newsgroups(shuffle=False, remove=("headers", "footers", "quotes")) data_samples = dataset.data train_docs, test_docs = train_test_split(data_samples, random_state=42) -print('priparing Count Vectorizer') -tf_vectorizer = CountVectorizer(max_df=1.0, stop_words='english') +print("priparing Count Vectorizer") +tf_vectorizer = CountVectorizer(max_df=1.0, stop_words="english") X_train = tf_vectorizer.fit_transform(train_docs) X_test = tf_vectorizer.transform(test_docs) @@ -28,36 +27,36 @@ tf_vectorizer.get_stop_words() -print('Splitting test documents...') +print("Splitting test documents...") X_test_train, X_test_test = rowwise_train_test_split(X_test, random_seed=114514) 
-print('Start fitting sk-learn model...') +print("Start fitting sk-learn model...") start = time() vb_model = LDA_vb(n_components=N_TOPICS) vb_model.fit(X_train) -phi_vb = vb_model.components_ / \ - vb_model.components_.sum(axis=1)[:, np.newaxis] +phi_vb = vb_model.components_ / vb_model.components_.sum(axis=1)[:, np.newaxis] end = time() -print('done in {:.2f} seconds'.format((end-start))) +print("done in {:.2f} seconds".format((end - start))) -print('Start fitting our lda model...') +print("Start fitting our lda model...") start = time() cgs_p_model = LDA_cgs_p(n_components=N_TOPICS, n_iter=500) cgs_p_model.fit(X_train) phi_cgs_p = cgs_p_model.phi.transpose() end = time() -print('done in {:.2f} seconds'.format((end-start))) +print("done in {:.2f} seconds".format((end - start))) -print('Start fitting paralleized CGS sampler with hyper-parameter optimization...') +print("Start fitting paralleized CGS sampler with hyper-parameter optimization...") start = time() parallel_cgs_model = LDA_cgs_p( - n_components=N_TOPICS, n_iter=500, n_workers=2, optimize_interval=50) + n_components=N_TOPICS, n_iter=500, n_workers=2, optimize_interval=50 +) parallel_cgs_model.fit(X_train) phi_parallel_cgs = parallel_cgs_model.phi.transpose() end = time() -print('done in {:.2f} seconds'.format((end-start))) +print("done in {:.2f} seconds".format((end - start))) def test_perplexity(model, phi, **kwargs): @@ -65,27 +64,24 @@ def test_perplexity(model, phi, **kwargs): log_ps = np.log(theta.dot(phi)) coo = X_test_test.tocoo() # perplexity - return np.exp(- (log_ps[coo.row, coo.col] * coo.data).sum() / coo.data.sum()) + return np.exp(-(log_ps[coo.row, coo.col] * coo.data).sum() / coo.data.sum()) -print('Start testing vb model') +print("Start testing vb model") start = time() ll_vb = test_perplexity(vb_model, phi_vb) end = time() -print('Done in {:.2f} seconds, test perplexity = {:.2f}'.format( - end - start, ll_vb)) +print("Done in {:.2f} seconds, test perplexity = {:.2f}".format(end - start, 
ll_vb)) -print('Start testing cgs_p model') +print("Start testing cgs_p model") start = time() ll_cgs_p = test_perplexity(cgs_p_model, phi_cgs_p) end = time() -print('Done in {:.2f} seconds, test perplexity = {:.2f}'.format( - end - start, ll_cgs_p)) +print("Done in {:.2f} seconds, test perplexity = {:.2f}".format(end - start, ll_cgs_p)) -print('Start testing parallelized + optimized cgs model') +print("Start testing parallelized + optimized cgs model") start = time() ll_cgs_p = test_perplexity(parallel_cgs_model, phi_parallel_cgs, n_workers=4) end = time() -print('Done in {:.2f} seconds, test perplexity = {:.2f}'.format( - end - start, ll_cgs_p)) +print("Done in {:.2f} seconds, test perplexity = {:.2f}".format(end - start, ll_cgs_p)) diff --git a/lda11/__init__.py b/lda11/__init__.py index f696d9a..6775be3 100644 --- a/lda11/__init__.py +++ b/lda11/__init__.py @@ -1,2 +1,2 @@ -from .lda import LDA, MultipleContextLDA from .labelled_lda import LabelledLDA +from .lda import LDA, MultipleContextLDA diff --git a/lda11/labelled_lda.py b/lda11/labelled_lda.py index dd1dcba..6d25844 100644 --- a/lda11/labelled_lda.py +++ b/lda11/labelled_lda.py @@ -1,22 +1,30 @@ import numpy as np from scipy import sparse as sps from tqdm import tqdm + from ._lda import LabelledLDATrainer from .lda import ( + IndexType, + IntegerType, + LDAPredictorMixin, Predictor, - RealType, IntegerType, IndexType, - number_to_array, check_array, - LDAPredictorMixin + RealType, + check_array, + number_to_array, ) class LabelledLDA(LDAPredictorMixin): - def __init__(self, - alpha=1e-2, epsilon=1e-30, topic_word_prior=None, add_dummy_topic=False, - n_iter=1000, - n_workers=1, - use_cgs_p=True - ): + def __init__( + self, + alpha=1e-2, + epsilon=1e-30, + topic_word_prior=None, + add_dummy_topic=False, + n_iter=1000, + n_workers=1, + use_cgs_p=True, + ): self.n_components = None self.topic_word_prior = topic_word_prior self.alpha = alpha @@ -49,51 +57,45 @@ def _fit(self, X, Y, ll_freq=10): 
self.n_components = Y.shape[1] ones_topic = np.ones(self.n_components, dtype=RealType) self.topic_word_prior = number_to_array( - X.shape[1], 1 / float(self.n_components), - self.topic_word_prior + X.shape[1], 1 / float(self.n_components), self.topic_word_prior ) try: count, dix, wix = check_array(X) except: - print('Check for X failed.') + print("Check for X failed.") raise - doc_topic = np.zeros( - (X.shape[0], self.n_components), dtype=IntegerType) - word_topic = np.zeros( - (X.shape[1], self.n_components), dtype=IntegerType) + doc_topic = np.zeros((X.shape[0], self.n_components), dtype=IntegerType) + word_topic = np.zeros((X.shape[1], self.n_components), dtype=IntegerType) topic_counts = np.zeros(self.n_components, dtype=IntegerType) docstate = LabelledLDATrainer( self.alpha, self.epsilon, Y, - count, dix, wix, self.n_components, 42, - self.n_workers + count, + dix, + wix, + self.n_components, + 42, + self.n_workers, ) docstate.initialize(word_topic, doc_topic, topic_counts) with tqdm(range(self.n_iter)) as pbar: for _ in pbar: docstate.iterate_gibbs( - self.topic_word_prior, - doc_topic, - word_topic, - topic_counts + self.topic_word_prior, doc_topic, word_topic, topic_counts ) - doc_topic_prior = ( - self.alpha * np.ones(self.n_components, dtype=RealType)) + doc_topic_prior = self.alpha * np.ones(self.n_components, dtype=RealType) self.components_ = word_topic.transpose() predictor = Predictor(self.n_components, doc_topic_prior, 42) if self.use_cgs_p: phi = docstate.obtain_phi( - self.topic_word_prior, - doc_topic, - word_topic, - topic_counts + self.topic_word_prior, doc_topic, word_topic, topic_counts ) else: phi = word_topic + self.topic_word_prior[:, np.newaxis] diff --git a/lda11/lda.py b/lda11/lda.py index 277e1aa..512829f 100644 --- a/lda11/lda.py +++ b/lda11/lda.py @@ -1,16 +1,18 @@ -import numpy as np -from numbers import Number from gc import collect +from numbers import Number + +import numpy as np +from scipy import sparse as sps +from 
scipy.special import digamma +from tqdm import tqdm + from ._lda import ( LDATrainer, - log_likelihood_doc_topic, Predictor, learn_dirichlet, learn_dirichlet_symmetric, + log_likelihood_doc_topic, ) -from tqdm import tqdm -from scipy import sparse as sps -from scipy.special import digamma RealType = np.float64 IntegerType = np.int32 diff --git a/lda11/util.py b/lda11/util.py index 64120c3..c6e01ee 100644 --- a/lda11/util.py +++ b/lda11/util.py @@ -1,7 +1,8 @@ import numpy as np from scipy import sparse as sps -from .lda import RealType, IntegerType + from ._lda import train_test_split +from .lda import IntegerType, RealType def rowwise_train_test_split(X, random_seed=None, test_ratio=0.5): @@ -9,6 +10,6 @@ def rowwise_train_test_split(X, random_seed=None, test_ratio=0.5): split matrix randomly """ if random_seed is None: - random_seed = np.random.randint(-2**63, 2**63-1) + random_seed = np.random.randint(-(2 ** 63), 2 ** 63 - 1) X = sps.csr_matrix(X, dtype=IntegerType) return train_test_split(X, test_ratio, random_seed) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..6796ae2 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,15 @@ +[tool.black] +ensure_newline_before_comments = true +force_grid_wrap = 0 +include_trailing_comma = true +line_length = 88 +multi_line_output = 3 +use_parentheses = true + +[tool.isort] +ensure_newline_before_comments = true +force_grid_wrap = 0 +include_trailing_comma = true +line_length = 88 +multi_line_output = 3 +use_parentheses = true diff --git a/setup.py b/setup.py index 9bce798..82fe294 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,9 @@ -from setuptools import setup, Extension -from setuptools.command.build_ext import build_ext +import os import sys + import setuptools -import os +from setuptools import Extension, setup +from setuptools.command.build_ext import build_ext __version__ = "0.2.2.0" install_requires = ["pybind11>=2.5", "numpy >= 1.11", "tqdm", "scipy>=1.0.0"] @@ -27,9 +28,10 @@ def 
__str__(self): return target_dir download_target_dir = os.path.join(basedir, "eigen3.zip") - import requests import zipfile + import requests + response = requests.get(self.EIGEN3_URL, stream=True) with open(download_target_dir, "wb") as ofs: for chunk in response.iter_content(chunk_size=1024): @@ -45,7 +47,7 @@ class get_pybind_include(object): """Helper class to determine the pybind11 include path The purpose of this class is to postpone importing pybind11 until it is actually installed, so that the ``get_include()`` - method can be invoked. """ + method can be invoked.""" def __init__(self, user=False): self.user = user diff --git a/src/defs.hpp b/src/defs.hpp index 7646504..1213a36 100644 --- a/src/defs.hpp +++ b/src/defs.hpp @@ -36,4 +36,4 @@ struct UrandDevice { private: std::mt19937 random_state_; std::uniform_real_distribution udist_; -}; \ No newline at end of file +}; From b46d5c566c5d5519365f20e747e138b77d12f593 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Sun, 9 Jan 2022 21:56:19 +0900 Subject: [PATCH 02/32] Added stubs for _lda --- .gitignore | 1 + create_pb_stubs.sh | 13 ++ lda11/_lda.pyi | 193 ++++++++++++++++++++++++++++ src/wrapper.cpp | 3 - stubs/lda11/_lda-stubs/__init__.pyi | 58 +++++++++ tests/__init__.py | 0 6 files changed, 265 insertions(+), 3 deletions(-) create mode 100755 create_pb_stubs.sh create mode 100644 lda11/_lda.pyi create mode 100644 stubs/lda11/_lda-stubs/__init__.pyi create mode 100644 tests/__init__.py diff --git a/.gitignore b/.gitignore index 90a0672..19ff8cb 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ dist/ compile_commands.json .clangd/ **.ipynb_checkpoints/** +.cache/clangd diff --git a/create_pb_stubs.sh b/create_pb_stubs.sh new file mode 100755 index 0000000..ab0c720 --- /dev/null +++ b/create_pb_stubs.sh @@ -0,0 +1,13 @@ +#!/bin/sh +module_name="lda11._lda" +echo "Create stub for $module_name" +pybind11-stubgen -o stubs --no-setup-py "$module_name" +output_path="$(echo "${module_name}" | sed 
's/\./\//g').pyi" +input_path="stubs/$(echo "${module_name}" | sed 's/\./\//g')-stubs/__init__.pyi" +rm "${output_path}" +echo 'm: int +n: int +from numpy import float32 +' >> "${output_path}" +cat "${input_path}" >> "${output_path}" +black "${output_path}" diff --git a/lda11/_lda.pyi b/lda11/_lda.pyi new file mode 100644 index 0000000..4349ab9 --- /dev/null +++ b/lda11/_lda.pyi @@ -0,0 +1,193 @@ +m: int +n: int +from numpy import float32 + +"""Backend C++ inplementation for lda11.""" +from __future__ import annotations +import lda11._lda +import typing +import numpy +import scipy.sparse + +_Shape = typing.Tuple[int, ...] + +__all__ = [ + "LDATrainer", + "LabelledLDATrainer", + "Predictor", + "learn_dirichlet", + "learn_dirichlet_symmetric", + "log_likelihood_doc_topic", + "train_test_split", +] + +class LDATrainer: + def __init__( + self, + arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], + arg1: numpy.ndarray[numpy.int32, _Shape[m, 1]], + arg2: numpy.ndarray[numpy.uint64, _Shape[m, 1]], + arg3: numpy.ndarray[numpy.uint64, _Shape[m, 1]], + arg4: int, + arg5: int, + arg6: int, + ) -> None: ... + def initialize( + self, + arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], + arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], + arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]], + ) -> None: ... + def iterate_gibbs( + self, + arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], + arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], + arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], + arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]], + ) -> None: ... + def log_likelihood( + self, + arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], + arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], + ) -> float: ... + def obtain_phi( + self, + arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], + arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], + arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], + arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]], + ) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ... 
+ def set_doc_topic_prior( + self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]] + ) -> None: ... + pass + +class LabelledLDATrainer: + def __init__( + self, + arg0: float, + arg1: float, + arg2: scipy.sparse.csr_matrix[numpy.int32], + arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]], + arg4: numpy.ndarray[numpy.uint64, _Shape[m, 1]], + arg5: numpy.ndarray[numpy.uint64, _Shape[m, 1]], + arg6: int, + arg7: int, + arg8: int, + ) -> None: ... + def initialize( + self, + arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], + arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], + arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]], + ) -> None: ... + def iterate_gibbs( + self, + arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], + arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], + arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], + arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]], + ) -> None: ... + def log_likelihood( + self, + arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], + arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], + ) -> float: ... + def obtain_phi( + self, + arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], + arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], + arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], + arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]], + ) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ... + pass + +class Predictor: + def __getstate__(self) -> tuple: ... + def __init__( + self, arg0: int, arg1: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg2: int + ) -> None: ... + def __setstate__(self, arg0: tuple) -> None: ... + def add_beta(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, n]]) -> None: ... + def predict_gibbs( + self, + arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], + arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], + arg2: int, + arg3: int, + arg4: int, + arg5: bool, + ) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]: ... 
+ def predict_gibbs_batch( + self, + arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]], + arg1: int, + arg2: int, + arg3: int, + arg4: bool, + arg5: int, + ) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ... + def predict_gibbs_with_word_assignment( + self, + arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], + arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], + arg2: int, + arg3: int, + arg4: int, + arg5: bool, + ) -> typing.Tuple[ + numpy.ndarray[numpy.float64, _Shape[m, 1]], + typing.List[typing.Dict[int, numpy.ndarray[numpy.int32, _Shape[m, 1]]]], + ]: ... + def predict_mf( + self, + arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], + arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], + arg2: int, + arg3: float, + ) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]: ... + def predict_mf_batch( + self, + arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]], + arg1: int, + arg2: float, + arg3: int, + ) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ... 
+ @property + def phis(self) -> typing.List[numpy.ndarray[numpy.float64, _Shape[m, n]]]: + """ + :type: typing.List[numpy.ndarray[numpy.float64, _Shape[m, n]]] + """ + pass + +def learn_dirichlet( + arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], + arg1: numpy.ndarray[numpy.float64, _Shape[m, 1]], + arg2: float, + arg3: float, + arg4: int, +) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]: + pass + +def learn_dirichlet_symmetric( + arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], + arg1: float, + arg2: float, + arg3: float, + arg4: int, +) -> float: + pass + +def log_likelihood_doc_topic( + arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], + arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], + arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]], +) -> float: + pass + +def train_test_split( + arg0: scipy.sparse.csr_matrix[numpy.int32], arg1: float, arg2: int +) -> typing.Tuple[ + scipy.sparse.csr_matrix[numpy.int32], scipy.sparse.csr_matrix[numpy.int32] +]: + pass diff --git a/src/wrapper.cpp b/src/wrapper.cpp index c5287f7..c73915b 100644 --- a/src/wrapper.cpp +++ b/src/wrapper.cpp @@ -145,8 +145,6 @@ Real learn_dirichlet_symmetric(const Eigen::Ref &counts, } Real alpha_current(alpha_start); - Real numerator; - vector doc_length; vector doc_length_freq; @@ -183,7 +181,6 @@ Real learn_dirichlet_symmetric(const Eigen::Ref &counts, } for (size_t it = 0; it < iteration; it++) { Real alpha_sum = n_topic * alpha_current; - numerator = 0; Real denominator = ((vector_to_eigen(doc_length).array() + alpha_sum).digamma() - digamma(alpha_sum)) diff --git a/stubs/lda11/_lda-stubs/__init__.pyi b/stubs/lda11/_lda-stubs/__init__.pyi new file mode 100644 index 0000000..ed1336a --- /dev/null +++ b/stubs/lda11/_lda-stubs/__init__.pyi @@ -0,0 +1,58 @@ +"""Backend C++ inplementation for lda11.""" +from __future__ import annotations +import lda11._lda +import typing +import numpy +import scipy.sparse +_Shape = typing.Tuple[int, ...] 
+ +__all__ = [ + "LDATrainer", + "LabelledLDATrainer", + "Predictor", + "learn_dirichlet", + "learn_dirichlet_symmetric", + "log_likelihood_doc_topic", + "train_test_split" +] + + +class LDATrainer(): + def __init__(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, 1]], arg2: numpy.ndarray[numpy.uint64, _Shape[m, 1]], arg3: numpy.ndarray[numpy.uint64, _Shape[m, 1]], arg4: int, arg5: int, arg6: int) -> None: ... + def initialize(self, arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> None: ... + def iterate_gibbs(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> None: ... + def log_likelihood(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]]) -> float: ... + def obtain_phi(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ... + def set_doc_topic_prior(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]]) -> None: ... + pass +class LabelledLDATrainer(): + def __init__(self, arg0: float, arg1: float, arg2: scipy.sparse.csr_matrix[numpy.int32], arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]], arg4: numpy.ndarray[numpy.uint64, _Shape[m, 1]], arg5: numpy.ndarray[numpy.uint64, _Shape[m, 1]], arg6: int, arg7: int, arg8: int) -> None: ... + def initialize(self, arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> None: ... 
+ def iterate_gibbs(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> None: ... + def log_likelihood(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]]) -> float: ... + def obtain_phi(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ... + pass +class Predictor(): + def __getstate__(self) -> tuple: ... + def __init__(self, arg0: int, arg1: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg2: int) -> None: ... + def __setstate__(self, arg0: tuple) -> None: ... + def add_beta(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, n]]) -> None: ... + def predict_gibbs(self, arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg2: int, arg3: int, arg4: int, arg5: bool) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]: ... + def predict_gibbs_batch(self, arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]], arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ... + def predict_gibbs_with_word_assignment(self, arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg2: int, arg3: int, arg4: int, arg5: bool) -> typing.Tuple[numpy.ndarray[numpy.float64, _Shape[m, 1]], typing.List[typing.Dict[int, numpy.ndarray[numpy.int32, _Shape[m, 1]]]]]: ... + def predict_mf(self, arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg2: int, arg3: float) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]: ... 
+ def predict_mf_batch(self, arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]], arg1: int, arg2: float, arg3: int) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ... + @property + def phis(self) -> typing.List[numpy.ndarray[numpy.float64, _Shape[m, n]]]: + """ + :type: typing.List[numpy.ndarray[numpy.float64, _Shape[m, n]]] + """ + pass +def learn_dirichlet(arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], arg1: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg2: float, arg3: float, arg4: int) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]: + pass +def learn_dirichlet_symmetric(arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], arg1: float, arg2: float, arg3: float, arg4: int) -> float: + pass +def log_likelihood_doc_topic(arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> float: + pass +def train_test_split(arg0: scipy.sparse.csr_matrix[numpy.int32], arg1: float, arg2: int) -> typing.Tuple[scipy.sparse.csr_matrix[numpy.int32], scipy.sparse.csr_matrix[numpy.int32]]: + pass diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 From bad17635fd6f4017413e9a2820fe4edeb1bff204 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Sun, 9 Jan 2022 23:28:59 +0900 Subject: [PATCH 03/32] mypy --- .pre-commit-config.yaml | 8 +- lda11/labelled_lda.py | 7 +- lda11/lda.py | 219 ++++++++++++++++++++++------------------ setup.py | 3 +- 4 files changed, 129 insertions(+), 108 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cef10c9..809da50 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,10 +18,10 @@ repos: hooks: - id: isort name: isort -# - repo: https://github.com/pre-commit/mirrors-mypy -# rev: v0.790 # Use the sha / tag you want to point at -# hooks: -# - id: mypy + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v0.790 # Use the sha / tag you want to point at + hooks: + - id: mypy - 
repo: https://github.com/psf/black rev: 20.8b1 hooks: diff --git a/lda11/labelled_lda.py b/lda11/labelled_lda.py index 6d25844..add8f46 100644 --- a/lda11/labelled_lda.py +++ b/lda11/labelled_lda.py @@ -3,11 +3,11 @@ from tqdm import tqdm from ._lda import LabelledLDATrainer +from ._lda import Predictor as CorePredictor from .lda import ( IndexType, IntegerType, LDAPredictorMixin, - Predictor, RealType, check_array, number_to_array, @@ -48,7 +48,7 @@ def fit_transform(self, X, Y, **kwargs): result /= result.sum(axis=1)[:, np.newaxis] return result - def _fit(self, X, Y, ll_freq=10): + def _fit(self, X, Y, ll_freq: int = 10): if not sps.issparse(Y): Y = sps.csr_matrix(Y).astype(IntegerType) else: @@ -92,7 +92,7 @@ def _fit(self, X, Y, ll_freq=10): doc_topic_prior = self.alpha * np.ones(self.n_components, dtype=RealType) self.components_ = word_topic.transpose() - predictor = Predictor(self.n_components, doc_topic_prior, 42) + predictor = CorePredictor(self.n_components, doc_topic_prior, 42) if self.use_cgs_p: phi = docstate.obtain_phi( self.topic_word_prior, doc_topic, word_topic, topic_counts @@ -108,4 +108,5 @@ def _fit(self, X, Y, ll_freq=10): @property def phi(self): + assert self.predictor is not None return self.predictor.phis[0] diff --git a/lda11/lda.py b/lda11/lda.py index 512829f..dff1cfb 100644 --- a/lda11/lda.py +++ b/lda11/lda.py @@ -1,61 +1,76 @@ from gc import collect from numbers import Number +from typing import Dict, List, Literal, NamedTuple, Optional, Tuple, Union import numpy as np +from numpy import integer +from numpy import typing as npt from scipy import sparse as sps -from scipy.special import digamma from tqdm import tqdm -from ._lda import ( - LDATrainer, - Predictor, - learn_dirichlet, - learn_dirichlet_symmetric, - log_likelihood_doc_topic, -) +from ._lda import LDATrainer +from ._lda import Predictor as CorePredictor +from ._lda import learn_dirichlet, learn_dirichlet_symmetric, log_likelihood_doc_topic RealType = np.float64 + 
IntegerType = np.int32 IndexType = np.uint64 -def number_to_array(n_components, default, arg=None, ensure_symmetry=False): - if arg is None: - arg = default - if isinstance(arg, Number): - return np.ones(n_components, dtype=RealType) * RealType(arg) - elif isinstance(arg, np.ndarray): - assert arg.shape[0] == n_components - if ensure_symmetry and np.unique(arg).shape[0] > 1: +ValidXType = Union[sps.spmatrix, np.ndarray] +PriorType = Union[np.ndarray, float, None] + + +class LDAInput(NamedTuple): + counts: np.ndarray + dix: np.ndarray + wis: np.ndarray + + +def number_to_array( + n_components: int, + default: float, + arg_: Union[float, None, np.ndarray] = None, + ensure_symmetry: bool = False, +) -> np.ndarray: + if arg_ is None or isinstance(arg_, float): + value_ = default if arg_ is None else float(arg_) + return np.ones(n_components, dtype=RealType) * value_ + if isinstance(arg_, np.ndarray): + assert arg_.shape[0] == n_components + if ensure_symmetry and np.unique(arg_).shape[0] > 1: raise ValueError("Symmetric array required.") - return arg.astype(RealType) - return None + return arg_.astype(RealType) + raise ValueError("Number of ndarray is required.") -def check_array(X): +def check_array(X: ValidXType) -> LDAInput: + assert X.dtype == np.int32 or X.dtype == np.int64 if isinstance(X, np.ndarray): assert len(X.shape) == 2 - assert X.dtype == np.int32 or X.dtype == np.int64 - dix, wix = X.nonzero() - counts = X[dix, wix] + counts: np.ndarray = X[dix, wix] elif sps.issparse(X): # if X is either types of, scipy.sparse X has this attribute. 
- X = X.tocsr() + X = sps.csr_matrix(X) X.sort_indices() dix, wix = X.nonzero() - counts = X.data + counts = X.data.astype(np.int32) else: raise ValueError("The input must be either np.ndarray or sparse array.") - return counts.astype(IntegerType), dix.astype(IndexType), wix.astype(IndexType) + return LDAInput( + counts.astype(IntegerType), dix.astype(IndexType), wix.astype(IndexType) + ) -def bow_row_to_counts(X, i): +def bow_row_to_counts(X: ValidXType, i: int) -> Tuple[np.ndarray, np.ndarray]: + wix: np.ndarray if isinstance(X, np.ndarray): assert len(X.shape) == 2 assert X.dtype == np.int32 or X.dtype == np.int64 (wix,) = X[i].nonzero() - counts = X[i, wix] + counts: np.ndarray = X[i, wix] else: _, wix = X[i].nonzero() counts = X[i, wix].toarray().ravel() @@ -63,7 +78,7 @@ def bow_row_to_counts(X, i): return counts.astype(IntegerType), wix.astype(IndexType) -def to_sparse(X): +def to_valid_csr(X: ValidXType) -> sps.csr_matrix: result = sps.csr_matrix(X) result.data = result.data.astype(IntegerType) return result @@ -77,33 +92,38 @@ class LDAPredictorMixin: are needed """ + topic_word_priors_: Optional[List[np.ndarray]] + predictor: Optional[CorePredictor] + def transform( self, - *Xs, - n_iter=100, - random_seed=42, - mode="gibbs", - mf_tolerance=1e-10, - gibbs_burn_in=10, - use_cgs_p=True, + *Xs: Union[ValidXType, None], + n_iter: int = 100, + random_seed: int = 42, + mode: Literal["gibbs", "mf"] = "gibbs", + mf_tolerance: float = 1e-10, + gibbs_burn_in: int = 10, + use_cgs_p: bool = True, n_workers=1 - ): - shapes = set({X.shape[0] for X in Xs}) + ) -> np.ndarray: + assert self.topic_word_priors_ is not None + assert self.predictor is not None + shapes = set({int(X.shape[0]) for X in Xs if X is not None}) if len(shapes) != 1: raise ValueError("Got different shape for Xs.") shape = list(shapes)[0] - Xs_csr = [] + Xs_csr: List[sps.csr_matrix] = [] for i, X in enumerate(Xs): if X is None: Xs_csr.append( sps.csr_matrix( - ([], ([], [])), - shape=(shape, 
self.topic_word_priors[i].shape[0]), + shape=(shape, self.topic_word_priors_[i].shape[0]), + dtype=IntegerType, ) ) else: - Xs_csr.append(to_sparse(X)) + Xs_csr.append(to_valid_csr(X)) if mode == "gibbs": return self.predictor.predict_gibbs_batch( @@ -116,7 +136,9 @@ def transform( def word_topic_assignment( self, *Xs, n_iter=100, random_seed=42, gibbs_burn_in=10, use_cgs_p=True - ): + ) -> List[Tuple[np.ndarray, List[Dict[int, np.ndarray]]]]: + assert self.topic_word_priors_ is not None + assert self.predictor is not None n_domains = len(Xs) shapes = set({X.shape[0] for X in Xs}) if len(shapes) != 1: @@ -129,7 +151,7 @@ def word_topic_assignment( Xs_csr.append( sps.csr_matrix( ([], ([], [])), - shape=(shape, self.topic_word_priors[i].shape[0]), + shape=(shape, self.topic_word_priors_[i].shape[0]), ) ) results = [] @@ -149,38 +171,40 @@ def word_topic_assignment( return results @property - def phis(self): + def phis(self) -> List[np.ndarray]: + assert self.predictor is not None return self.predictor.phis class MultipleContextLDA(LDAPredictorMixin): def __init__( self, - n_components=100, - doc_topic_prior=None, - topic_word_priors=None, - n_iter=1000, - optimize_interval=None, - optimize_burn_in=None, - n_workers=1, - use_cgs_p=True, - is_phi_symmetric=True, + n_components: int = 100, + doc_topic_prior: PriorType = None, + n_iter: int = 1000, + optimize_interval: Optional[int] = None, + optimize_burn_in: Optional[int] = None, + n_workers: int = 1, + use_cgs_p: bool = True, + is_phi_symmetric: bool = True, ): n_components = int(n_components) assert n_iter >= 1 assert n_components >= 1 self.n_components = n_components - self.doc_topic_prior = doc_topic_prior - self.topic_word_priors = topic_word_priors + self.doc_topic_prior = number_to_array( + self.n_components, 1 / float(self.n_components), doc_topic_prior + ) + self.topic_word_priors_ = None self.is_phi_symmetric = is_phi_symmetric - self.n_vocabs = None + self.n_vocabs: Optional[List[int]] = None self.docstate_ 
= None - self.components_ = None - self.n_modals = None + self.components_: Optional[int] = None + self.n_modals: Optional[int] = None - self.predictor = None - self.use_cgs_p = use_cgs_p + self.predictor: Optional[CorePredictor] = None + self.use_cgs_p: bool = use_cgs_p self.n_iter = n_iter self.optimize_interval = optimize_interval @@ -197,35 +221,34 @@ def fit(self, *X, **kwargs): self._fit(*X, **kwargs) return self - def _fit(self, *Xs, ll_freq=10): + def _fit(self, *Xs: ValidXType, ll_freq=10) -> np.ndarray: """ Xs should be a list of contents. All entries must have the same shape[0]. """ - n_vocabs = [] self.modality = len(Xs) - self.doc_topic_prior = number_to_array( - self.n_components, 1 / float(self.n_components), self.doc_topic_prior - ) - if self.topic_word_priors is None: - self.topic_word_priors = [None for i in range(self.modality)] + topic_word_priors_canonical: List[np.ndarray] = [] - self.topic_word_priors = [ - number_to_array( - X.shape[1], - 1 / float(self.n_components), - ensure_symmetry=self.is_phi_symmetric, - ) - for X, val in zip(Xs, self.topic_word_priors) - ] + doc_tuples: List[LDAInput] = [] - doc_tuples = [] + n_rows: Optional[int] = None for X in Xs: - doc_tuples.append((check_array(X))) + doc_tuples.append(check_array(X)) + if n_rows is None: + n_rows = X.shape[0] + else: + assert n_rows == X.shape[0] + topic_word_priors_canonical.append( + number_to_array( + X.shape[1], + 1 / float(self.n_components), + ensure_symmetry=True, + ) + ) - doc_topic = np.zeros((X.shape[0], self.n_components), dtype=IntegerType) + doc_topic: np.ndarray = np.zeros((n_rows, self.n_components), dtype=IntegerType) topic_counts = np.zeros(self.n_components, dtype=IntegerType) @@ -233,7 +256,7 @@ def _fit(self, *Xs, ll_freq=10): np.zeros((X.shape[1], self.n_components), dtype=IntegerType) for X in Xs ] - docstates = [] + docstates: List[LDATrainer] = [] for (count, dix, wix), word_topic in zip(doc_tuples, word_topics): docstate = LDATrainer( 
self.doc_topic_prior, @@ -246,11 +269,11 @@ def _fit(self, *Xs, ll_freq=10): ) docstates.append(docstate) docstate.initialize(word_topic, doc_topic, topic_counts) - doc_length = doc_topic.sum(axis=1).astype(IntegerType) + doc_length: np.ndarray = doc_topic.sum(axis=1).astype(IntegerType) ll = log_likelihood_doc_topic(self.doc_topic_prior, doc_topic, doc_length) for topic_word_prior, word_topic, docstate in zip( - self.topic_word_priors, word_topics, docstates + topic_word_priors_canonical, word_topics, docstates ): ll += docstate.log_likelihood(topic_word_prior, word_topic) @@ -258,7 +281,7 @@ def _fit(self, *Xs, ll_freq=10): pbar.set_description("Log Likelihood = {0:.2f}".format(ll)) for i in pbar: for topic_word_prior, word_topic, docstate in zip( - self.topic_word_priors, word_topics, docstates + topic_word_priors_canonical, word_topics, docstates ): docstate.iterate_gibbs( topic_word_prior, doc_topic, word_topic, topic_counts @@ -269,7 +292,7 @@ def _fit(self, *Xs, ll_freq=10): ) for topic_word_prior, word_topic, docstate in zip( - self.topic_word_priors, word_topics, docstates + topic_word_priors_canonical, word_topics, docstates ): ll += docstate.log_likelihood(topic_word_prior, word_topic) pbar.set_description("Log Likelihood = {0:.2f}".format(ll)) @@ -287,7 +310,7 @@ def _fit(self, *Xs, ll_freq=10): 100, ) for topic_word_prior, word_topic, docstate in zip( - self.topic_word_priors, word_topics, docstates + topic_word_priors_canonical, word_topics, docstates ): if self.is_phi_symmetric: topic_word_prior_new = np.ones_like( @@ -310,8 +333,9 @@ def _fit(self, *Xs, ll_freq=10): topic_word_prior[:] = topic_word_prior_new self.doc_topic_prior = doc_topic_prior_new docstate.set_doc_topic_prior(doc_topic_prior_new) + self.topic_word_priors = topic_word_priors_canonical - predictor = Predictor(self.n_components, self.doc_topic_prior, 42) + predictor = CorePredictor(self.n_components, self.doc_topic_prior, 42) for i, (twp, wt, docstate) in enumerate( 
zip(self.topic_word_priors, word_topics, docstates) @@ -334,25 +358,19 @@ class LDA(MultipleContextLDA): def __init__( self, - n_components=100, - doc_topic_prior=None, - topic_word_prior=None, - n_iter=1000, - optimize_burn_in=None, - optimize_interval=None, - n_workers=1, - use_cgs_p=True, - is_phi_symmetric=True, + n_components: int = 100, + doc_topic_prior: Optional[np.ndarray] = None, + n_iter: int = 1000, + optimize_burn_in: Optional[int] = None, + optimize_interval: Optional[int] = None, + n_workers: int = 1, + use_cgs_p: bool = True, + is_phi_symmetric: bool = True, ): - if topic_word_prior is not None: - topic_word_priors = [topic_word_prior] - else: - topic_word_priors = None super(LDA, self).__init__( n_components=n_components, doc_topic_prior=doc_topic_prior, - topic_word_priors=topic_word_priors, n_iter=n_iter, optimize_burn_in=optimize_burn_in, optimize_interval=optimize_interval, @@ -366,5 +384,6 @@ def fit(self, X, **kwargs): return self @property - def phi(self): + def phi(self) -> np.ndarray: + assert self.predictor is not None return self.predictor.phis[0] diff --git a/setup.py b/setup.py index 82fe294..9354f54 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,6 @@ import os import sys +from typing import Dict, List import setuptools from setuptools import Extension, setup @@ -117,7 +118,7 @@ class BuildExt(build_ext): "msvc": ["/EHsc"], "unix": [], } - l_opts = { + l_opts: Dict[str, List[str]] = { "msvc": [], "unix": [], } From eb343e6e469d37d1b30e6808ca8166d29d32dbe0 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 11:02:13 +0900 Subject: [PATCH 04/32] mypy check now passes --- lda11/__init__.py | 4 +- lda11/_lda.pyi | 125 +++++++++---------- lda11/labelled_lda.py | 44 ++++--- lda11/lda.py | 108 +++++++++++----- lda11/util.py | 10 +- pyproject.toml | 11 ++ stubs/lda11/_lda-stubs/__init__.pyi | 187 +++++++++++++++++++++++----- 7 files changed, 339 insertions(+), 150 deletions(-) diff --git a/lda11/__init__.py b/lda11/__init__.py index 
6775be3..7981c69 100644 --- a/lda11/__init__.py +++ b/lda11/__init__.py @@ -1,2 +1,4 @@ from .labelled_lda import LabelledLDA -from .lda import LDA, MultipleContextLDA +from .lda import LDA, MultilingualLDA + +__all__ = ["LDA", "LabelledLDA", "MultilingualLDA"] diff --git a/lda11/_lda.pyi b/lda11/_lda.pyi index 4349ab9..902111c 100644 --- a/lda11/_lda.pyi +++ b/lda11/_lda.pyi @@ -1,12 +1,9 @@ -m: int -n: int -from numpy import float32 - """Backend C++ inplementation for lda11.""" from __future__ import annotations import lda11._lda import typing import numpy +import numpy.typing as npt import scipy.sparse _Shape = typing.Tuple[int, ...] @@ -24,42 +21,40 @@ __all__ = [ class LDATrainer: def __init__( self, - arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], - arg1: numpy.ndarray[numpy.int32, _Shape[m, 1]], - arg2: numpy.ndarray[numpy.uint64, _Shape[m, 1]], - arg3: numpy.ndarray[numpy.uint64, _Shape[m, 1]], + arg0: npt.NDArray[numpy.float64], + arg1: npt.NDArray[numpy.int32], + arg2: npt.NDArray[numpy.uint64], + arg3: npt.NDArray[numpy.uint64], arg4: int, arg5: int, arg6: int, ) -> None: ... def initialize( self, - arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], - arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], - arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]], + arg0: npt.NDArray[numpy.int32], + arg1: npt.NDArray[numpy.int32], + arg2: npt.NDArray[numpy.int32], ) -> None: ... def iterate_gibbs( self, - arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], - arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], - arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], - arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]], + arg0: npt.NDArray[numpy.float64], + arg1: npt.NDArray[numpy.int32], + arg2: npt.NDArray[numpy.int32], + arg3: npt.NDArray[numpy.int32], ) -> None: ... def log_likelihood( self, - arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], - arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], + arg0: npt.NDArray[numpy.float64], + arg1: npt.NDArray[numpy.int32], ) -> float: ... 
def obtain_phi( self, - arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], - arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], - arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], - arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]], - ) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ... - def set_doc_topic_prior( - self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]] - ) -> None: ... + arg0: npt.NDArray[numpy.float64], + arg1: npt.NDArray[numpy.int32], + arg2: npt.NDArray[numpy.int32], + arg3: npt.NDArray[numpy.int32], + ) -> npt.NDArray[numpy.float64]: ... + def set_doc_topic_prior(self, arg0: npt.NDArray[numpy.float64]) -> None: ... pass class LabelledLDATrainer: @@ -68,56 +63,56 @@ class LabelledLDATrainer: arg0: float, arg1: float, arg2: scipy.sparse.csr_matrix[numpy.int32], - arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]], - arg4: numpy.ndarray[numpy.uint64, _Shape[m, 1]], - arg5: numpy.ndarray[numpy.uint64, _Shape[m, 1]], + arg3: npt.NDArray[numpy.int32], + arg4: npt.NDArray[numpy.uint64], + arg5: npt.NDArray[numpy.uint64], arg6: int, arg7: int, arg8: int, ) -> None: ... def initialize( self, - arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], - arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], - arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]], + arg0: npt.NDArray[numpy.int32], + arg1: npt.NDArray[numpy.int32], + arg2: npt.NDArray[numpy.int32], ) -> None: ... def iterate_gibbs( self, - arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], - arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], - arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], - arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]], + arg0: npt.NDArray[numpy.float64], + arg1: npt.NDArray[numpy.int32], + arg2: npt.NDArray[numpy.int32], + arg3: npt.NDArray[numpy.int32], ) -> None: ... def log_likelihood( self, - arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], - arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], + arg0: npt.NDArray[numpy.float64], + arg1: npt.NDArray[numpy.int32], ) -> float: ... 
def obtain_phi( self, - arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], - arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], - arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], - arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]], - ) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ... + arg0: npt.NDArray[numpy.float64], + arg1: npt.NDArray[numpy.int32], + arg2: npt.NDArray[numpy.int32], + arg3: npt.NDArray[numpy.int32], + ) -> npt.NDArray[numpy.float64]: ... pass class Predictor: def __getstate__(self) -> tuple: ... def __init__( - self, arg0: int, arg1: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg2: int + self, arg0: int, arg1: npt.NDArray[numpy.float64], arg2: int ) -> None: ... def __setstate__(self, arg0: tuple) -> None: ... - def add_beta(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, n]]) -> None: ... + def add_beta(self, arg0: npt.NDArray[numpy.float64]) -> None: ... def predict_gibbs( self, - arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], - arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], + arg0: typing.List[npt.NDArray[numpy.int32]], + arg1: typing.List[npt.NDArray[numpy.int32]], arg2: int, arg3: int, arg4: int, arg5: bool, - ) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]: ... + ) -> npt.NDArray[numpy.float64]: ... def predict_gibbs_batch( self, arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]], @@ -126,51 +121,51 @@ class Predictor: arg3: int, arg4: bool, arg5: int, - ) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ... + ) -> npt.NDArray[numpy.float64]: ... 
def predict_gibbs_with_word_assignment( self, - arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], - arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], + arg0: typing.List[npt.NDArray[numpy.int32]], + arg1: typing.List[npt.NDArray[numpy.int32]], arg2: int, arg3: int, arg4: int, arg5: bool, ) -> typing.Tuple[ - numpy.ndarray[numpy.float64, _Shape[m, 1]], - typing.List[typing.Dict[int, numpy.ndarray[numpy.int32, _Shape[m, 1]]]], + npt.NDArray[numpy.float64], + typing.List[typing.Dict[int, npt.NDArray[numpy.int32]]], ]: ... def predict_mf( self, - arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], - arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], + arg0: typing.List[npt.NDArray[numpy.int32]], + arg1: typing.List[npt.NDArray[numpy.int32]], arg2: int, arg3: float, - ) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]: ... + ) -> npt.NDArray[numpy.float64]: ... def predict_mf_batch( self, arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]], arg1: int, arg2: float, arg3: int, - ) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ... + ) -> npt.NDArray[numpy.float64]: ... 
@property - def phis(self) -> typing.List[numpy.ndarray[numpy.float64, _Shape[m, n]]]: + def phis(self) -> typing.List[npt.NDArray[numpy.float64]]: """ - :type: typing.List[numpy.ndarray[numpy.float64, _Shape[m, n]]] + :type: typing.List[npt.NDArray[numpy.float64]] """ pass def learn_dirichlet( - arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], - arg1: numpy.ndarray[numpy.float64, _Shape[m, 1]], + arg0: npt.NDArray[numpy.int32], + arg1: npt.NDArray[numpy.float64], arg2: float, arg3: float, arg4: int, -) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]: +) -> npt.NDArray[numpy.float64]: pass def learn_dirichlet_symmetric( - arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], + arg0: npt.NDArray[numpy.int32], arg1: float, arg2: float, arg3: float, @@ -179,9 +174,9 @@ def learn_dirichlet_symmetric( pass def log_likelihood_doc_topic( - arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], - arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], - arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]], + arg0: npt.NDArray[numpy.float64], + arg1: npt.NDArray[numpy.int32], + arg2: npt.NDArray[numpy.int32], ) -> float: pass diff --git a/lda11/labelled_lda.py b/lda11/labelled_lda.py index add8f46..b19f13b 100644 --- a/lda11/labelled_lda.py +++ b/lda11/labelled_lda.py @@ -1,4 +1,7 @@ +from typing import Optional + import numpy as np +from numpy import typing as npt from scipy import sparse as sps from tqdm import tqdm @@ -9,6 +12,7 @@ IntegerType, LDAPredictorMixin, RealType, + ValidXType, check_array, number_to_array, ) @@ -17,47 +21,41 @@ class LabelledLDA(LDAPredictorMixin): def __init__( self, - alpha=1e-2, - epsilon=1e-30, - topic_word_prior=None, - add_dummy_topic=False, - n_iter=1000, - n_workers=1, - use_cgs_p=True, + alpha: float = 1e-2, + epsilon: float = 1e-30, + n_iter: int = 1000, + n_workers: int = 1, + use_cgs_p: bool = True, ): - self.n_components = None - self.topic_word_prior = topic_word_prior + self.n_components_: Optional[int] = None self.alpha = alpha self.epsilon = 1e-20 
self.n_vocabs = None self.docstate_ = None - self.components_ = None + self.components_: Optional[npt.NDArray[np.int32]] = None self.predictor = None self.n_workers = n_workers self.epsilon = epsilon - self.add_dummy_topic = add_dummy_topic self.n_iter = n_iter self.use_cgs_p = use_cgs_p - def fit(self, X, Y): - self._fit(X, Y) + def fit(self, X: ValidXType, Y: ValidXType) -> "LabelledLDA": + self._fit_llda(X, Y) return self - def fit_transform(self, X, Y, **kwargs): - result = self._fit(X, **kwargs) + self.doc_topic_prior[np.newaxis, :] - result /= result.sum(axis=1)[:, np.newaxis] - return result - - def _fit(self, X, Y, ll_freq: int = 10): + def _fit_llda( + self, + X: ValidXType, + Y: ValidXType, + ) -> npt.NDArray[np.int32]: if not sps.issparse(Y): Y = sps.csr_matrix(Y).astype(IntegerType) else: Y = Y.astype(IntegerType) - self.n_components = Y.shape[1] - ones_topic = np.ones(self.n_components, dtype=RealType) + self.n_components = int(Y.shape[1]) self.topic_word_prior = number_to_array( - X.shape[1], 1 / float(self.n_components), self.topic_word_prior + X.shape[1], 1 / float(self.n_components), None ) try: @@ -107,6 +105,6 @@ def _fit(self, X, Y, ll_freq: int = 10): return doc_topic @property - def phi(self): + def phi(self) -> npt.NDArray[np.float64]: assert self.predictor is not None return self.predictor.phis[0] diff --git a/lda11/lda.py b/lda11/lda.py index dff1cfb..d0f71f0 100644 --- a/lda11/lda.py +++ b/lda11/lda.py @@ -1,9 +1,16 @@ -from gc import collect -from numbers import Number -from typing import Dict, List, Literal, NamedTuple, Optional, Tuple, Union +from typing import ( + TYPE_CHECKING, + Any, + Dict, + List, + Literal, + NamedTuple, + Optional, + Tuple, + Union, +) import numpy as np -from numpy import integer from numpy import typing as npt from scipy import sparse as sps from tqdm import tqdm @@ -18,7 +25,7 @@ IndexType = np.uint64 -ValidXType = Union[sps.spmatrix, np.ndarray] +ValidXType = Union[sps.spmatrix, npt.NDArray[np.int32], 
npt.NDArray[np.int64]] PriorType = Union[np.ndarray, float, None] @@ -33,7 +40,7 @@ def number_to_array( default: float, arg_: Union[float, None, np.ndarray] = None, ensure_symmetry: bool = False, -) -> np.ndarray: +) -> npt.NDArray[np.float64]: if arg_ is None or isinstance(arg_, float): value_ = default if arg_ is None else float(arg_) return np.ones(n_components, dtype=RealType) * value_ @@ -85,13 +92,6 @@ def to_valid_csr(X: ValidXType) -> sps.csr_matrix: class LDAPredictorMixin: - """ - self.components_ - self.n_components - self.predictor - are needed - """ - topic_word_priors_: Optional[List[np.ndarray]] predictor: Optional[CorePredictor] @@ -104,7 +104,7 @@ def transform( mf_tolerance: float = 1e-10, gibbs_burn_in: int = 10, use_cgs_p: bool = True, - n_workers=1 + n_workers: int = 1 ) -> np.ndarray: assert self.topic_word_priors_ is not None assert self.predictor is not None @@ -118,7 +118,7 @@ def transform( if X is None: Xs_csr.append( sps.csr_matrix( - shape=(shape, self.topic_word_priors_[i].shape[0]), + (shape, self.topic_word_priors_[i].shape[0]), dtype=IntegerType, ) ) @@ -129,18 +129,25 @@ def transform( return self.predictor.predict_gibbs_batch( Xs_csr, n_iter, gibbs_burn_in, random_seed, use_cgs_p, n_workers ) - else: + elif mode == "mf": return self.predictor.predict_mf_batch( Xs_csr, n_iter, mf_tolerance, n_workers ) + else: + raise ValueError('"mode" argument must be either "gibbs" for "mf".') def word_topic_assignment( - self, *Xs, n_iter=100, random_seed=42, gibbs_burn_in=10, use_cgs_p=True + self, + *Xs: Union[ValidXType, None], + n_iter: int = 100, + random_seed: int = 42, + gibbs_burn_in: int = 10, + use_cgs_p: bool = True ) -> List[Tuple[np.ndarray, List[Dict[int, np.ndarray]]]]: assert self.topic_word_priors_ is not None assert self.predictor is not None n_domains = len(Xs) - shapes = set({X.shape[0] for X in Xs}) + shapes = set({X.shape[0] for X in Xs if X is not None}) if len(shapes) != 1: raise ValueError("Got different shape for 
Xs.") @@ -150,8 +157,7 @@ def word_topic_assignment( if X is None: Xs_csr.append( sps.csr_matrix( - ([], ([], [])), - shape=(shape, self.topic_word_priors_[i].shape[0]), + (shape, self.topic_word_priors_[i].shape[0]), dtype=IntegerType ) ) results = [] @@ -176,7 +182,7 @@ def phis(self) -> List[np.ndarray]: return self.predictor.phis -class MultipleContextLDA(LDAPredictorMixin): +class LDABase(LDAPredictorMixin): def __init__( self, n_components: int = 100, @@ -217,11 +223,7 @@ def __init__( self.n_workers = n_workers - def fit(self, *X, **kwargs): - self._fit(*X, **kwargs) - return self - - def _fit(self, *Xs: ValidXType, ll_freq=10) -> np.ndarray: + def _fit(self, *Xs: ValidXType, ll_freq: int = 10) -> npt.NDArray[IntegerType]: """ Xs should be a list of contents. All entries must have the same shape[0]. @@ -247,6 +249,8 @@ def _fit(self, *Xs: ValidXType, ll_freq=10) -> np.ndarray: ensure_symmetry=True, ) ) + if n_rows is None: + raise ValueError("At least one doc-term matrix must be given.") doc_topic: np.ndarray = np.zeros((n_rows, self.n_components), dtype=IntegerType) @@ -353,7 +357,53 @@ def _fit(self, *Xs: ValidXType, ll_freq=10) -> np.ndarray: return doc_topic -class LDA(MultipleContextLDA): +class MultilingualLDA(LDABase): + def __init__( + self, + n_components: int = 100, + doc_topic_prior: PriorType = None, + n_iter: int = 1000, + optimize_interval: Optional[int] = None, + optimize_burn_in: Optional[int] = None, + n_workers: int = 1, + use_cgs_p: bool = True, + is_phi_symmetric: bool = True, + ): + n_components = int(n_components) + assert n_iter >= 1 + assert n_components >= 1 + self.n_components = n_components + + self.doc_topic_prior = number_to_array( + self.n_components, 1 / float(self.n_components), doc_topic_prior + ) + self.topic_word_priors_ = None + self.is_phi_symmetric = is_phi_symmetric + self.n_vocabs: Optional[List[int]] = None + self.docstate_ = None + self.components_: Optional[int] = None + self.n_modals: Optional[int] = None + + 
self.predictor: Optional[CorePredictor] = None + self.use_cgs_p: bool = use_cgs_p + + self.n_iter = n_iter + self.optimize_interval = optimize_interval + if optimize_interval is not None: + if optimize_burn_in is None: + optimize_burn_in = n_iter // 2 + else: + optimize_burn_in = optimize_burn_in + self.optimize_burn_in = optimize_burn_in + + self.n_workers = n_workers + + def fit(self, *X: ValidXType, ll_freq: int = 10) -> "MultilingualLDA": + self._fit(*X, ll_freq=ll_freq) + return self + + +class LDA(LDABase): pass def __init__( @@ -379,8 +429,8 @@ def __init__( is_phi_symmetric=is_phi_symmetric, ) - def fit(self, X, **kwargs): - super(LDA, self).fit(X, **kwargs) + def fit(self, X: ValidXType, ll_freq: int = 10) -> "LDA": + self._fit(X, ll_freq=ll_freq) return self @property diff --git a/lda11/util.py b/lda11/util.py index c6e01ee..dee0dee 100644 --- a/lda11/util.py +++ b/lda11/util.py @@ -1,15 +1,19 @@ +from typing import Optional, Tuple + import numpy as np from scipy import sparse as sps from ._lda import train_test_split -from .lda import IntegerType, RealType +from .lda import IntegerType, RealType, ValidXType -def rowwise_train_test_split(X, random_seed=None, test_ratio=0.5): +def rowwise_train_test_split( + X: ValidXType, random_seed: Optional[int] = None, test_ratio: float = 0.5 +) -> Tuple[sps.csr_matrix, sps.csr_matrix]: """ split matrix randomly """ if random_seed is None: - random_seed = np.random.randint(-(2 ** 63), 2 ** 63 - 1) + random_seed = np.random.randint(-(2 ** 31), 2 ** 31 - 1) X = sps.csr_matrix(X, dtype=IntegerType) return train_test_split(X, test_ratio, random_seed) diff --git a/pyproject.toml b/pyproject.toml index 6796ae2..901f417 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,3 +13,14 @@ include_trailing_comma = true line_length = 88 multi_line_output = 3 use_parentheses = true + +[tool.mypy] +disallow_untyped_defs = true +exclude = [] +ignore_missing_imports = true +plugins = "numpy.typing.mypy_plugin" +python_version = 
"3.8" +warn_return_any = true +warn_unused_configs = true + +# mypy per-module options: diff --git a/stubs/lda11/_lda-stubs/__init__.pyi b/stubs/lda11/_lda-stubs/__init__.pyi index ed1336a..818f26d 100644 --- a/stubs/lda11/_lda-stubs/__init__.pyi +++ b/stubs/lda11/_lda-stubs/__init__.pyi @@ -4,6 +4,7 @@ import lda11._lda import typing import numpy import scipy.sparse + _Shape = typing.Tuple[int, ...] __all__ = [ @@ -13,46 +14,174 @@ __all__ = [ "learn_dirichlet", "learn_dirichlet_symmetric", "log_likelihood_doc_topic", - "train_test_split" + "train_test_split", ] - -class LDATrainer(): - def __init__(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, 1]], arg2: numpy.ndarray[numpy.uint64, _Shape[m, 1]], arg3: numpy.ndarray[numpy.uint64, _Shape[m, 1]], arg4: int, arg5: int, arg6: int) -> None: ... - def initialize(self, arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> None: ... - def iterate_gibbs(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> None: ... - def log_likelihood(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]]) -> float: ... - def obtain_phi(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ... - def set_doc_topic_prior(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]]) -> None: ... 
+class LDATrainer: + def __init__( + self, + arg0: npt.NDArray[numpy.float64], + arg1: npt.NDArray[numpy.int32], + arg2: npt.NDArray[numpy.uint64], + arg3: npt.NDArray[numpy.uint64], + arg4: int, + arg5: int, + arg6: int, + ) -> None: ... + def initialize( + self, + arg0: npt.NDArray[numpy.int32], + arg1: npt.NDArray[numpy.int32], + arg2: npt.NDArray[numpy.int32], + ) -> None: ... + def iterate_gibbs( + self, + arg0: npt.NDArray[numpy.float64], + arg1: npt.NDArray[numpy.int32], + arg2: npt.NDArray[numpy.int32], + arg3: npt.NDArray[numpy.int32], + ) -> None: ... + def log_likelihood( + self, + arg0: npt.NDArray[numpy.float64], + arg1: npt.NDArray[numpy.int32], + ) -> float: ... + def obtain_phi( + self, + arg0: npt.NDArray[numpy.float64], + arg1: npt.NDArray[numpy.int32], + arg2: npt.NDArray[numpy.int32], + arg3: npt.NDArray[numpy.int32], + ) -> npt.NDArray[numpy.float64]: ... + def set_doc_topic_prior(self, arg0: npt.NDArray[numpy.float64]) -> None: ... pass -class LabelledLDATrainer(): - def __init__(self, arg0: float, arg1: float, arg2: scipy.sparse.csr_matrix[numpy.int32], arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]], arg4: numpy.ndarray[numpy.uint64, _Shape[m, 1]], arg5: numpy.ndarray[numpy.uint64, _Shape[m, 1]], arg6: int, arg7: int, arg8: int) -> None: ... - def initialize(self, arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> None: ... - def iterate_gibbs(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> None: ... - def log_likelihood(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]]) -> float: ... 
- def obtain_phi(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ... + +class LabelledLDATrainer: + def __init__( + self, + arg0: float, + arg1: float, + arg2: scipy.sparse.csr_matrix[numpy.int32], + arg3: npt.NDArray[numpy.int32], + arg4: npt.NDArray[numpy.uint64], + arg5: npt.NDArray[numpy.uint64], + arg6: int, + arg7: int, + arg8: int, + ) -> None: ... + def initialize( + self, + arg0: npt.NDArray[numpy.int32], + arg1: npt.NDArray[numpy.int32], + arg2: npt.NDArray[numpy.int32], + ) -> None: ... + def iterate_gibbs( + self, + arg0: npt.NDArray[numpy.float64], + arg1: npt.NDArray[numpy.int32], + arg2: npt.NDArray[numpy.int32], + arg3: npt.NDArray[numpy.int32], + ) -> None: ... + def log_likelihood( + self, + arg0: npt.NDArray[numpy.float64], + arg1: npt.NDArray[numpy.int32], + ) -> float: ... + def obtain_phi( + self, + arg0: npt.NDArray[numpy.float64], + arg1: npt.NDArray[numpy.int32], + arg2: npt.NDArray[numpy.int32], + arg3: npt.NDArray[numpy.int32], + ) -> npt.NDArray[numpy.float64]: ... pass -class Predictor(): + +class Predictor: def __getstate__(self) -> tuple: ... - def __init__(self, arg0: int, arg1: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg2: int) -> None: ... + def __init__( + self, arg0: int, arg1: npt.NDArray[numpy.float64], arg2: int + ) -> None: ... def __setstate__(self, arg0: tuple) -> None: ... - def add_beta(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, n]]) -> None: ... - def predict_gibbs(self, arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg2: int, arg3: int, arg4: int, arg5: bool) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]: ... 
- def predict_gibbs_batch(self, arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]], arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ... - def predict_gibbs_with_word_assignment(self, arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg2: int, arg3: int, arg4: int, arg5: bool) -> typing.Tuple[numpy.ndarray[numpy.float64, _Shape[m, 1]], typing.List[typing.Dict[int, numpy.ndarray[numpy.int32, _Shape[m, 1]]]]]: ... - def predict_mf(self, arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg2: int, arg3: float) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]: ... - def predict_mf_batch(self, arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]], arg1: int, arg2: float, arg3: int) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ... + def add_beta(self, arg0: npt.NDArray[numpy.float64]) -> None: ... + def predict_gibbs( + self, + arg0: typing.List[npt.NDArray[numpy.int32]], + arg1: typing.List[npt.NDArray[numpy.int32]], + arg2: int, + arg3: int, + arg4: int, + arg5: bool, + ) -> npt.NDArray[numpy.float64]: ... + def predict_gibbs_batch( + self, + arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]], + arg1: int, + arg2: int, + arg3: int, + arg4: bool, + arg5: int, + ) -> npt.NDArray[numpy.float64]: ... + def predict_gibbs_with_word_assignment( + self, + arg0: typing.List[npt.NDArray[numpy.int32]], + arg1: typing.List[npt.NDArray[numpy.int32]], + arg2: int, + arg3: int, + arg4: int, + arg5: bool, + ) -> typing.Tuple[ + npt.NDArray[numpy.float64], + typing.List[typing.Dict[int, npt.NDArray[numpy.int32]]], + ]: ... + def predict_mf( + self, + arg0: typing.List[npt.NDArray[numpy.int32]], + arg1: typing.List[npt.NDArray[numpy.int32]], + arg2: int, + arg3: float, + ) -> npt.NDArray[numpy.float64]: ... 
+ def predict_mf_batch( + self, + arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]], + arg1: int, + arg2: float, + arg3: int, + ) -> npt.NDArray[numpy.float64]: ... @property - def phis(self) -> typing.List[numpy.ndarray[numpy.float64, _Shape[m, n]]]: + def phis(self) -> typing.List[npt.NDArray[numpy.float64]]: """ - :type: typing.List[numpy.ndarray[numpy.float64, _Shape[m, n]]] + :type: typing.List[npt.NDArray[numpy.float64]] """ pass -def learn_dirichlet(arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], arg1: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg2: float, arg3: float, arg4: int) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]: + +def learn_dirichlet( + arg0: npt.NDArray[numpy.int32], + arg1: npt.NDArray[numpy.float64], + arg2: float, + arg3: float, + arg4: int, +) -> npt.NDArray[numpy.float64]: pass -def learn_dirichlet_symmetric(arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], arg1: float, arg2: float, arg3: float, arg4: int) -> float: + +def learn_dirichlet_symmetric( + arg0: npt.NDArray[numpy.int32], + arg1: float, + arg2: float, + arg3: float, + arg4: int, +) -> float: pass -def log_likelihood_doc_topic(arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> float: + +def log_likelihood_doc_topic( + arg0: npt.NDArray[numpy.float64], + arg1: npt.NDArray[numpy.int32], + arg2: npt.NDArray[numpy.int32], +) -> float: pass -def train_test_split(arg0: scipy.sparse.csr_matrix[numpy.int32], arg1: float, arg2: int) -> typing.Tuple[scipy.sparse.csr_matrix[numpy.int32], scipy.sparse.csr_matrix[numpy.int32]]: + +def train_test_split( + arg0: scipy.sparse.csr_matrix[numpy.int32], arg1: float, arg2: int +) -> typing.Tuple[ + scipy.sparse.csr_matrix[numpy.int32], scipy.sparse.csr_matrix[numpy.int32] +]: pass From 93bf647c34b114fe8c800ab8d05e5a018b3ef8c3 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 11:49:47 +0900 Subject: [PATCH 05/32] adding tests --- 
tests/conftest.py | 16 ++++++++++++++++ tests/language.py | 40 +++++++++++++++++++++++++++++++++++++++ tests/test_mlds.py | 47 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+) create mode 100644 tests/conftest.py create mode 100644 tests/language.py create mode 100644 tests/test_mlds.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..0d307e3 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,16 @@ +import numpy as np +import pytest + +from .language import Docs, Language + + +@pytest.fixture +def docs_gen() -> Docs: + language_1 = Language( + np.asfarray([1, 1, 1, 0.01, 0.01, 0.01]), + np.asfarray([0.01, 0.01, 0.01, 1, 1, 1]), + ) + language_2 = Language( + np.asfarray([1, 0.01, 1, 0.01]), np.asfarray([0.01, 1, 0.01, 1]) + ) + return Docs([language_1, language_2]) diff --git a/tests/language.py b/tests/language.py new file mode 100644 index 0000000..0ae1373 --- /dev/null +++ b/tests/language.py @@ -0,0 +1,40 @@ +from typing import List, Tuple + +import numpy as np +import numpy.typing as npt + +N_DOCS = 1000 + + +class Language: + def __init__( + self, TOPIC1: npt.NDArray[np.float64], TOPIC2: npt.NDArray[np.float64] + ): + self.topic_1: npt.NDArray[np.float64] = TOPIC1 / TOPIC1.sum() + self.topic_2: npt.NDArray[np.float64] = TOPIC2 / TOPIC2.sum() + + +class Docs: + def __init__(self, languages: List[Language]): + self.languages = languages + + def gen_doc( + self, n_docs: int + ) -> Tuple[List[npt.NDArray[np.int32]], npt.NDArray[np.float64]]: + rns = np.random.RandomState(0) + words: List[List[npt.NDArray[np.int64]]] = [ + [] for _ in range(len(self.languages)) + ] + thetas: List[np.ndarray] = [] + for _ in range(n_docs): + theta = rns.dirichlet(np.asfarray([1.0, 1.0])) + thetas.append(theta) + for lind, language in enumerate(self.languages): + cnt = rns.poisson(3) + wdist = ( + float(theta[0]) * language.topic_1 + + float(theta[1]) * language.topic_2 + ) + words[lind].append(rns.multinomial(cnt, 
wdist)) + + return [np.vstack(x) for x in words], np.vstack(thetas) diff --git a/tests/test_mlds.py b/tests/test_mlds.py new file mode 100644 index 0000000..ddea5f4 --- /dev/null +++ b/tests/test_mlds.py @@ -0,0 +1,47 @@ +import numpy as np + +from lda11 import MultilingualLDA + +from .conftest import Docs + + +def test_mlda(docs_gen: Docs) -> None: + (X1, X2), true_theta = docs_gen.gen_doc(1000) + lda = MultilingualLDA(2, n_iter=50, optimize_interval=1, optimize_burn_in=25) + lda.fit(X1, X2) + phi1, phi2 = lda.phis + + # determin which is TOPIC1 + + lang1_topic1_strong_index = np.where(docs_gen.languages[0].topic_1 > 0.1)[0] + lang1_topic2_strong_index = np.where(docs_gen.languages[0].topic_1 < 0.1)[0] + if ( + phi1[lang1_topic1_strong_index, 0].mean() + > phi1[lang1_topic2_strong_index, 0].mean() + ): + topic1_index = 0 + topic2_index = 1 + else: + topic1_index = 1 + topic2_index = 0 + for i in lang1_topic1_strong_index: + for j in lang1_topic2_strong_index: + assert phi1[i, topic1_index] > phi1[j, topic1_index] + assert phi1[i, topic2_index] < phi1[j, topic2_index] + + lang2_topic1_strong_index = np.where(docs_gen.languages[1].topic_1 > 0.1)[0] + lang2_topic2_strong_index = np.where(docs_gen.languages[1].topic_1 < 0.1)[0] + for i in lang2_topic1_strong_index: + for j in lang2_topic2_strong_index: + assert phi2[i, topic1_index] > phi2[j, topic1_index] + assert phi2[i, topic2_index] < phi2[j, topic2_index] + + # just check it works. 
+ for algo in ["mf", "gibbs"]: + checked_cnt = 0 + theta_inferred = lda.transform(X1, X2, mode=algo) # type: ignore + for i in range(X1.shape[0]): + if (true_theta[i, 0] / true_theta[i, 1]) > 10: + checked_cnt += 1 + assert theta_inferred[i, topic1_index] > theta_inferred[i, topic2_index] + assert checked_cnt > 0 From a072f9fee5a355b643b9004bf3ab2462ded581ca Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 11:51:01 +0900 Subject: [PATCH 06/32] Type --- lda11/lda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lda11/lda.py b/lda11/lda.py index d0f71f0..21bd42d 100644 --- a/lda11/lda.py +++ b/lda11/lda.py @@ -337,12 +337,12 @@ def _fit(self, *Xs: ValidXType, ll_freq: int = 10) -> npt.NDArray[IntegerType]: topic_word_prior[:] = topic_word_prior_new self.doc_topic_prior = doc_topic_prior_new docstate.set_doc_topic_prior(doc_topic_prior_new) - self.topic_word_priors = topic_word_priors_canonical + self.topic_word_priors_ = topic_word_priors_canonical predictor = CorePredictor(self.n_components, self.doc_topic_prior, 42) for i, (twp, wt, docstate) in enumerate( - zip(self.topic_word_priors, word_topics, docstates) + zip(self.topic_word_priors_, word_topics, docstates) ): if self.use_cgs_p: phi = docstate.obtain_phi(twp, doc_topic, wt, topic_counts) From e50cb28c3274c37a27ba0868d022a93063825c78 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 12:03:15 +0900 Subject: [PATCH 07/32] add test for word_topic_assignment --- .gitignore | 1 + lda11/lda.py | 36 +++++++++--------------------------- tests/test_mlds.py | 25 +++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 27 deletions(-) diff --git a/.gitignore b/.gitignore index 19ff8cb..f3e2f8e 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,4 @@ compile_commands.json .clangd/ **.ipynb_checkpoints/** .cache/clangd +.coverage diff --git a/lda11/lda.py b/lda11/lda.py index 21bd42d..fb1a198 100644 --- a/lda11/lda.py +++ b/lda11/lda.py @@ -369,34 +369,16 @@ def 
__init__( use_cgs_p: bool = True, is_phi_symmetric: bool = True, ): - n_components = int(n_components) - assert n_iter >= 1 - assert n_components >= 1 - self.n_components = n_components - - self.doc_topic_prior = number_to_array( - self.n_components, 1 / float(self.n_components), doc_topic_prior + super().__init__( + n_components, + doc_topic_prior=doc_topic_prior, + n_iter=n_iter, + optimize_interval=optimize_interval, + optimize_burn_in=optimize_burn_in, + n_workers=n_workers, + use_cgs_p=use_cgs_p, + is_phi_symmetric=is_phi_symmetric, ) - self.topic_word_priors_ = None - self.is_phi_symmetric = is_phi_symmetric - self.n_vocabs: Optional[List[int]] = None - self.docstate_ = None - self.components_: Optional[int] = None - self.n_modals: Optional[int] = None - - self.predictor: Optional[CorePredictor] = None - self.use_cgs_p: bool = use_cgs_p - - self.n_iter = n_iter - self.optimize_interval = optimize_interval - if optimize_interval is not None: - if optimize_burn_in is None: - optimize_burn_in = n_iter // 2 - else: - optimize_burn_in = optimize_burn_in - self.optimize_burn_in = optimize_burn_in - - self.n_workers = n_workers def fit(self, *X: ValidXType, ll_freq: int = 10) -> "MultilingualLDA": self._fit(*X, ll_freq=ll_freq) diff --git a/tests/test_mlds.py b/tests/test_mlds.py index ddea5f4..5ee03ce 100644 --- a/tests/test_mlds.py +++ b/tests/test_mlds.py @@ -1,4 +1,5 @@ import numpy as np +from scipy import sparse as sps from lda11 import MultilingualLDA @@ -7,6 +8,7 @@ def test_mlda(docs_gen: Docs) -> None: (X1, X2), true_theta = docs_gen.gen_doc(1000) + X2 = sps.lil_matrix(X2) lda = MultilingualLDA(2, n_iter=50, optimize_interval=1, optimize_burn_in=25) lda.fit(X1, X2) phi1, phi2 = lda.phis @@ -45,3 +47,26 @@ def test_mlda(docs_gen: Docs) -> None: checked_cnt += 1 assert theta_inferred[i, topic1_index] > theta_inferred[i, topic2_index] assert checked_cnt > 0 + + wdt = lda.word_topic_assignment(X1, X2) + assert len(wdt) == 1000 + for i, wdt_result_doc in 
enumerate(wdt): + theta = wdt_result_doc[0] + if (true_theta[i, 0] / true_theta[i, 1]) > 10: + assert theta[topic1_index] > theta[topic2_index] + m = wdt_result_doc[1] + assert len(m) == 2 + # lang 1 + lang1_assignment = m[0] + for word, topic in lang1_assignment.items(): + if (topic[topic1_index] / (1e-10 + topic[topic2_index])) > 10: + assert word in lang1_topic1_strong_index + if (topic[topic2_index] / (1e-10 + topic[topic1_index])) > 10: + assert word in lang1_topic2_strong_index + + lang2_assignment = m[1] + for word, topic in lang2_assignment.items(): + if (topic[topic1_index] / (1e-10 + topic[topic2_index])) > 10: + assert word in lang2_topic1_strong_index + if (topic[topic2_index] / (1e-10 + topic[topic1_index])) > 10: + assert word in lang2_topic2_strong_index From 5341ec468876a70a608b8069cb6621ee6ee81d68 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 12:06:42 +0900 Subject: [PATCH 08/32] add test for usual lda --- tests/test_lda.py | 30 ++++++++++++++++++++++++++++ tests/{test_mlds.py => test_mlda.py} | 0 2 files changed, 30 insertions(+) create mode 100644 tests/test_lda.py rename tests/{test_mlds.py => test_mlda.py} (100%) diff --git a/tests/test_lda.py b/tests/test_lda.py new file mode 100644 index 0000000..a7cb1d7 --- /dev/null +++ b/tests/test_lda.py @@ -0,0 +1,30 @@ +import numpy as np + +from lda11 import LDA + +from .conftest import Docs + + +def test_lda(docs_gen: Docs) -> None: + (X1, _), true_theta = docs_gen.gen_doc(1000) + lda = LDA(2, n_iter=50, optimize_interval=1, optimize_burn_in=25, use_cgs_p=False) + lda.fit(X1) + phi1 = lda.phi + + # determin which is TOPIC1 + + lang1_topic1_strong_index = np.where(docs_gen.languages[0].topic_1 > 0.1)[0] + lang1_topic2_strong_index = np.where(docs_gen.languages[0].topic_1 < 0.1)[0] + if ( + phi1[lang1_topic1_strong_index, 0].mean() + > phi1[lang1_topic2_strong_index, 0].mean() + ): + topic1_index = 0 + topic2_index = 1 + else: + topic1_index = 1 + topic2_index = 0 + for i in 
lang1_topic1_strong_index: + for j in lang1_topic2_strong_index: + assert phi1[i, topic1_index] > phi1[j, topic1_index] + assert phi1[i, topic2_index] < phi1[j, topic2_index] diff --git a/tests/test_mlds.py b/tests/test_mlda.py similarity index 100% rename from tests/test_mlds.py rename to tests/test_mlda.py From 70b4089079dbe5eb58930a5e2c4013913f4affa2 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 12:17:40 +0900 Subject: [PATCH 09/32] test utils --- lda11/__init__.py | 3 ++- tests/test_util.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 tests/test_util.py diff --git a/lda11/__init__.py b/lda11/__init__.py index 7981c69..4c3681a 100644 --- a/lda11/__init__.py +++ b/lda11/__init__.py @@ -1,4 +1,5 @@ from .labelled_lda import LabelledLDA from .lda import LDA, MultilingualLDA +from .util import rowwise_train_test_split -__all__ = ["LDA", "LabelledLDA", "MultilingualLDA"] +__all__ = ["LDA", "LabelledLDA", "MultilingualLDA", "rowwise_train_test_split"] diff --git a/tests/test_util.py b/tests/test_util.py new file mode 100644 index 0000000..d943d73 --- /dev/null +++ b/tests/test_util.py @@ -0,0 +1,17 @@ +import numpy as np +from scipy import sparse as sps + +from lda11 import rowwise_train_test_split + +from .conftest import Docs + + +def test_split(docs_gen: Docs) -> None: + (X1, X2), _ = docs_gen.gen_doc(1000) + X2_sp = sps.lil_matrix(X2) + X1_tr, X1_te = rowwise_train_test_split(X1) + assert np.all(np.asarray(X1 - X1_tr - X1_te) == 0) + X2_tr, X2_te = rowwise_train_test_split(X2_sp, random_seed=0) + # raise RuntimeError((X2.tocsr() - X2_tr - X2_te)) + v = np.abs(X2 - X2_tr.toarray() - X2_te.toarray()).sum() + assert v == 0 From 435d52b171c8ba05e6b97facf9e6d721eb959307 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 12:45:58 +0900 Subject: [PATCH 10/32] llda test --- lda11/labelled_lda.py | 12 +++++----- tests/test_llda.py | 55 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 
insertions(+), 6 deletions(-) create mode 100644 tests/test_llda.py diff --git a/lda11/labelled_lda.py b/lda11/labelled_lda.py index b19f13b..3bb28fe 100644 --- a/lda11/labelled_lda.py +++ b/lda11/labelled_lda.py @@ -54,9 +54,9 @@ def _fit_llda( Y = Y.astype(IntegerType) self.n_components = int(Y.shape[1]) - self.topic_word_prior = number_to_array( - X.shape[1], 1 / float(self.n_components), None - ) + self.topic_word_priors_ = [ + number_to_array(X.shape[1], 1 / float(self.n_components), None) + ] try: count, dix, wix = check_array(X) @@ -84,7 +84,7 @@ def _fit_llda( with tqdm(range(self.n_iter)) as pbar: for _ in pbar: docstate.iterate_gibbs( - self.topic_word_prior, doc_topic, word_topic, topic_counts + self.topic_word_priors_[0], doc_topic, word_topic, topic_counts ) doc_topic_prior = self.alpha * np.ones(self.n_components, dtype=RealType) @@ -93,10 +93,10 @@ def _fit_llda( predictor = CorePredictor(self.n_components, doc_topic_prior, 42) if self.use_cgs_p: phi = docstate.obtain_phi( - self.topic_word_prior, doc_topic, word_topic, topic_counts + self.topic_word_priors_[0], doc_topic, word_topic, topic_counts ) else: - phi = word_topic + self.topic_word_prior[:, np.newaxis] + phi = word_topic + self.topic_word_priors_[0][:, np.newaxis] phi /= phi.sum(axis=0)[np.newaxis, :] phi = phi.transpose() predictor.add_beta(phi.transpose()) diff --git a/tests/test_llda.py b/tests/test_llda.py new file mode 100644 index 0000000..b20820f --- /dev/null +++ b/tests/test_llda.py @@ -0,0 +1,55 @@ +from typing import Tuple + +import numpy as np +import numpy.typing as npt + +from lda11 import LabelledLDA + + +class LabelledLanguage: + def __init__( + self, TOPIC1: npt.NDArray[np.float64], TOPIC2: npt.NDArray[np.float64] + ): + self.topic_1: npt.NDArray[np.float64] = TOPIC1 / TOPIC1.sum() + self.topic_2: npt.NDArray[np.float64] = TOPIC2 / TOPIC2.sum() + self.common = np.ones_like(TOPIC1) / TOPIC1.shape[0] + + def gen_doc( + self, n_docs: int + ) -> Tuple[npt.NDArray[np.int64], 
npt.NDArray[np.int64]]: + rns = np.random.RandomState(0) + Xs = [] + labels = [] + for i in range(n_docs): + cnt = rns.poisson(10) + label = np.asfarray([1, rns.binomial(1, 0.5), rns.binomial(1, 0.5)]) + p = ( + label[0] * self.common + + label[1] * self.topic_1 + + label[2] * self.topic_2 + ) + words = rns.multinomial(cnt, p / p.sum()) + Xs.append(words) + labels.append(label) + return np.vstack(Xs), np.vstack(labels) + + +def test_llda() -> None: + TOPIC_A = np.asfarray([0.01, 1, 0.01, 1]) + TOPIC_B = np.asfarray([1, 0.01, 1, 0.01]) + for A_index in [1, 2]: + if A_index == 1: + language = LabelledLanguage(TOPIC_A, TOPIC_B) + else: + language = LabelledLanguage(TOPIC_B, TOPIC_A) + X, Y = language.gen_doc(1000) + + llda = LabelledLDA().fit(X, Y) + + A_DOC = np.asarray(([0, 10, 0, 10]), dtype=np.int32) + for mode in ["mf", "gibbs"]: + theta = llda.transform(A_DOC, mode=mode)[0] # type: ignore + if A_index == 1: + assert (theta[1] / theta[2]) > 5 + else: + assert (theta[2] / theta[1]) > 5 From 89b8cec9b15324b5ae3a5156106765f352479e26 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 12:49:27 +0900 Subject: [PATCH 11/32] further llda test --- tests/test_llda.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/test_llda.py b/tests/test_llda.py index b20820f..a77e5f3 100644 --- a/tests/test_llda.py +++ b/tests/test_llda.py @@ -37,15 +37,26 @@ def gen_doc( def test_llda() -> None: TOPIC_A = np.asfarray([0.01, 1, 0.01, 1]) TOPIC_B = np.asfarray([1, 0.01, 1, 0.01]) + A_word_index = np.where(TOPIC_A > 0.1)[0] + B_word_index = np.where(TOPIC_A < 0.1)[0] + for A_index in [1, 2]: if A_index == 1: language = LabelledLanguage(TOPIC_A, TOPIC_B) + B_index = 2 else: language = LabelledLanguage(TOPIC_B, TOPIC_A) + B_index = 1 + X, Y = language.gen_doc(1000) llda = LabelledLDA().fit(X, Y) + for a_word in A_word_index: + for b_word in B_word_index: + assert llda.phi[a_word, A_index] > llda.phi[b_word, A_index] + assert llda.phi[a_word, B_index] < 
llda.phi[b_word, B_index] + A_DOC = np.asarray(([0, 10, 0, 10]), dtype=np.int32) for mode in ["mf", "gibbs"]: theta = llda.transform(A_DOC, mode=mode)[0] # type: ignore From 8bf8baa367bd6cd90b53f37ab4388245d69f6942 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 14:08:49 +0900 Subject: [PATCH 12/32] Add test workflow --- .github/workflows/test.yml | 36 ++++++++++++++++++++++++++++++++++++ .pre-commit-config.yaml | 4 ---- setup.py | 24 ++++++++++++++++++++++-- tests/test_llda.py | 4 ++-- 4 files changed, 60 insertions(+), 8 deletions(-) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..0d99b19 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,36 @@ +name: Full Test & Upload coverage +on: [push, pull_request] +jobs: + run_pytest_upload_coverage: + runs-on: ubuntu-latest + env: + OS: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + - name: Setup Python + uses: actions/setup-python@master + with: + python-version: 3.10 + - name: Build lda11 + run: | + pip install --upgrade pip + sudo apt-get install lcov + TEST_BUILD=true python setup.py develop + - name: Run pytest + run: | + pip install pytest pytest-cov + pytest --cov=./lda11 tests/ + - name: Generate coverage (ubuntu) + run: | + coverage xml + lcov -d `pwd` -c -o coverage.info + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1 + with: + files: ./coverage.xml,./coverage.info + verbose: false + env_vars: OS,PYTHON + name: codecov-umbrella + fail_ci_if_error: false diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 809da50..97f9721 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,10 +18,6 @@ repos: hooks: - id: isort name: isort - - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.790 # Use the sha / tag you want to point at - hooks: - - id: mypy - repo: https://github.com/psf/black rev: 20.8b1 
hooks: diff --git a/setup.py b/setup.py index 9354f54..0da795b 100644 --- a/setup.py +++ b/setup.py @@ -6,13 +6,15 @@ from setuptools import Extension, setup from setuptools.command.build_ext import build_ext -__version__ = "0.2.2.0" -install_requires = ["pybind11>=2.5", "numpy >= 1.11", "tqdm", "scipy>=1.0.0"] +__version__ = "0.3.0.0" +install_requires = ["pybind11>=2.5", "numpy >= 1.22", "tqdm", "scipy>=1.0.0"] eigen_include_dir = os.environ.get("EIGEN3_INCLUDE_DIR", None) if eigen_include_dir is None: install_requires.append("requests") +TEST_BUILD = os.environ.get("TEST_BUILD", None) is not None + class get_eigen_include(object): EIGEN3_URL = "https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.zip" @@ -122,6 +124,24 @@ class BuildExt(build_ext): "msvc": [], "unix": [], } + if TEST_BUILD: + c_opts: Dict[str, List[str]] = { + "msvc": ["/EHsc"], + "unix": ["-O0", "-coverage", "-g"], + } + l_opts: Dict[str, List[str]] = { + "msvc": [], + "unix": ["-coverage"], + } + else: + c_opts = { + "msvc": ["/EHsc"], + "unix": [], + } + l_opts = { + "msvc": [], + "unix": [], + } if sys.platform == "darwin": darwin_opts = ["-stdlib=libc++", "-mmacosx-version-min=10.7"] diff --git a/tests/test_llda.py b/tests/test_llda.py index a77e5f3..1571485 100644 --- a/tests/test_llda.py +++ b/tests/test_llda.py @@ -40,7 +40,7 @@ def test_llda() -> None: A_word_index = np.where(TOPIC_A > 0.1)[0] B_word_index = np.where(TOPIC_A < 0.1)[0] - for A_index in [1, 2]: + for A_index, cgs_p in zip([1, 2], [True, False]): if A_index == 1: language = LabelledLanguage(TOPIC_A, TOPIC_B) B_index = 2 @@ -50,7 +50,7 @@ def test_llda() -> None: X, Y = language.gen_doc(1000) - llda = LabelledLDA().fit(X, Y) + llda = LabelledLDA(use_cgs_p=cgs_p).fit(X, Y) for a_word in A_word_index: for b_word in B_word_index: From c72481e20b216c37035f907e003e919a945a5fc0 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 14:09:38 +0900 Subject: [PATCH 13/32] 3.10 -> str --- .github/workflows/test.yml | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0d99b19..a9d9efc 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -12,7 +12,7 @@ jobs: - name: Setup Python uses: actions/setup-python@master with: - python-version: 3.10 + python-version: "3.10" - name: Build lda11 run: | pip install --upgrade pip From d88cd8c5fffeeacb3737a955e3b445758179eba1 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 14:22:33 +0900 Subject: [PATCH 14/32] Add threading test --- setup.py | 29 ++++++++++++++--------------- tests/test_llda.py | 6 +++--- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/setup.py b/setup.py index 0da795b..300ea80 100644 --- a/setup.py +++ b/setup.py @@ -1,13 +1,15 @@ import os import sys -from typing import Dict, List +from typing import Any, Dict, List import setuptools from setuptools import Extension, setup from setuptools.command.build_ext import build_ext __version__ = "0.3.0.0" -install_requires = ["pybind11>=2.5", "numpy >= 1.22", "tqdm", "scipy>=1.0.0"] +install_requires = ["numpy >= 1.22", "tqdm", "scipy>=1.0.0"] +setup_requires = ["pybind11>=2.5", "requests", "setuptools_scm"] + eigen_include_dir = os.environ.get("EIGEN3_INCLUDE_DIR", None) if eigen_include_dir is None: @@ -20,7 +22,7 @@ class get_eigen_include(object): EIGEN3_URL = "https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.zip" EIGEN3_DIRNAME = "eigen-3.3.7" - def __str__(self): + def __str__(self) -> str: if eigen_include_dir is not None: return eigen_include_dir @@ -55,7 +57,7 @@ class get_pybind_include(object): def __init__(self, user=False): self.user = user - def __str__(self): + def __str__(self) -> str: import pybind11 return pybind11.get_include(self.user) @@ -85,7 +87,7 @@ def __str__(self): # As of Python 3.6, CCompiler has a `has_flag` method. 
# cf http://bugs.python.org/issue26689 -def has_flag(compiler, flagname): +def has_flag(compiler, flagname) -> bool: """Return a boolean indicating whether a flag name is supported on the specified compiler. """ @@ -100,7 +102,7 @@ def has_flag(compiler, flagname): return True -def cpp_flag(compiler): +def cpp_flag(compiler) -> str: """Return the -std=c++[11/14/17] compiler flag. The newer version is prefered over c++11 (when it is available). """ @@ -116,14 +118,6 @@ def cpp_flag(compiler): class BuildExt(build_ext): """A custom build extension for adding compiler-specific options.""" - c_opts = { - "msvc": ["/EHsc"], - "unix": [], - } - l_opts: Dict[str, List[str]] = { - "msvc": [], - "unix": [], - } if TEST_BUILD: c_opts: Dict[str, List[str]] = { "msvc": ["/EHsc"], @@ -148,7 +142,7 @@ class BuildExt(build_ext): c_opts["unix"] += darwin_opts l_opts["unix"] += darwin_opts - def build_extensions(self): + def build_extensions(self) -> None: ct = self.compiler.compiler_type opts = self.c_opts.get(ct, []) link_opts = self.l_opts.get(ct, []) @@ -165,8 +159,13 @@ def build_extensions(self): build_ext.build_extensions(self) +def local_scheme(version: Any) -> str: + return "" + + setup( name="lda11", + use_scm_version={"local_scheme": local_scheme}, version=__version__, author="Tomoki Ohtsuki", url="https://github.com/tohtsky/lda11", diff --git a/tests/test_llda.py b/tests/test_llda.py index 1571485..0bb18d1 100644 --- a/tests/test_llda.py +++ b/tests/test_llda.py @@ -40,7 +40,7 @@ def test_llda() -> None: A_word_index = np.where(TOPIC_A > 0.1)[0] B_word_index = np.where(TOPIC_A < 0.1)[0] - for A_index, cgs_p in zip([1, 2], [True, False]): + for A_index, cgs_p, n_threads in zip([1, 2], [True, False], [1, 2]): if A_index == 1: language = LabelledLanguage(TOPIC_A, TOPIC_B) B_index = 2 @@ -50,7 +50,7 @@ def test_llda() -> None: X, Y = language.gen_doc(1000) - llda = LabelledLDA(use_cgs_p=cgs_p).fit(X, Y) + llda = LabelledLDA(use_cgs_p=cgs_p, n_workers=n_threads).fit(X, Y) 
for a_word in A_word_index: for b_word in B_word_index: @@ -59,7 +59,7 @@ def test_llda() -> None: A_DOC = np.asarray(([0, 10, 0, 10]), dtype=np.int32) for mode in ["mf", "gibbs"]: - theta = llda.transform(A_DOC, mode=mode)[0] # type: ignore + theta = llda.transform(A_DOC, mode=mode, n_workers=n_threads)[0] # type: ignore if A_index == 1: assert (theta[1] / theta[2]) > 5 else: From 5ffa6052b8599f8dedc97bb5e5ceb2bd5289a392 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 14:24:34 +0900 Subject: [PATCH 15/32] Remove unused func --- lda11/_lda.pyi | 7 -- src/predictor.cpp | 43 ------- src/predictor.hpp | 3 - src/wrapper.cpp | 1 - stubs/lda11/_lda-stubs/__init__.pyi | 187 ---------------------------- 5 files changed, 241 deletions(-) delete mode 100644 stubs/lda11/_lda-stubs/__init__.pyi diff --git a/lda11/_lda.pyi b/lda11/_lda.pyi index 902111c..a2322ff 100644 --- a/lda11/_lda.pyi +++ b/lda11/_lda.pyi @@ -134,13 +134,6 @@ class Predictor: npt.NDArray[numpy.float64], typing.List[typing.Dict[int, npt.NDArray[numpy.int32]]], ]: ... - def predict_mf( - self, - arg0: typing.List[npt.NDArray[numpy.int32]], - arg1: typing.List[npt.NDArray[numpy.int32]], - arg2: int, - arg3: float, - ) -> npt.NDArray[numpy.float64]: ... 
def predict_mf_batch( self, arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]], diff --git a/src/predictor.cpp b/src/predictor.cpp index 885afda..722583b 100644 --- a/src/predictor.cpp +++ b/src/predictor.cpp @@ -106,49 +106,6 @@ RealMatrix Predictor::predict_mf_batch(std::vector Xs, } return result; } -RealVector Predictor::predict_mf(std::vector nonzeros, - std::vector counts, - std::size_t iter, Real delta) const { - size_t dim_buffer = 0; - for (size_t n = 0; n < n_domains_; n++) { - dim_buffer += counts[n].sum(); - } - if (dim_buffer == 0) { - return doc_topic_prior_ / doc_topic_prior_.sum(); - } - RealMatrix current_prob(dim_buffer, n_topics_); - current_prob.array() = 0; - RealMatrix new_prob(dim_buffer, n_topics_); - RealMatrix beta_rel(dim_buffer, n_topics_); - - size_t current_iter = 0; - for (size_t n = 0; n < n_domains_; n++) { - size_t n_unique_words = nonzeros[n].rows(); - for (size_t j = 0; j < n_unique_words; j++) { - size_t wid = nonzeros[n](j); - size_t count = counts[n][j]; - for (size_t k = 0; k < count; k++) { - beta_rel.row(current_iter) = betas_[n].row(wid); - current_iter++; - } - } - } - - for (size_t i = 0; i <= iter; i++) { - new_prob = -current_prob; - new_prob.rowwise() += current_prob.colwise().sum(); - new_prob.rowwise() += doc_topic_prior_.transpose(); - new_prob.array() = new_prob.array() * beta_rel.array(); - new_prob.array().colwise() /= new_prob.array().rowwise().sum(); - double diff = (new_prob - current_prob).array().abs().sum(); - current_prob = new_prob; - if (diff < delta) - break; - } - RealVector theta = current_prob.array().colwise().sum().transpose(); - theta /= theta.sum(); - return theta; -} RealVector Predictor::predict_gibbs_write_assignment( const std::vector &nonzeros, diff --git a/src/predictor.hpp b/src/predictor.hpp index 4268997..cfc390f 100644 --- a/src/predictor.hpp +++ b/src/predictor.hpp @@ -8,9 +8,6 @@ struct Predictor { void add_beta(const RealMatrix &beta); - RealVector predict_mf(std::vector 
nonzeros, - std::vector counts, size_t iter, - Real delta) const; RealMatrix predict_mf_batch(std::vector Xs, std::size_t iter, Real delta, size_t n_workers) const; diff --git a/src/wrapper.cpp b/src/wrapper.cpp index c73915b..1583a0f 100644 --- a/src/wrapper.cpp +++ b/src/wrapper.cpp @@ -249,7 +249,6 @@ PYBIND11_MODULE(_lda, m) { .def("predict_gibbs_with_word_assignment", &Predictor::predict_gibbs_with_word_assignment) .def("predict_gibbs_batch", &Predictor::predict_gibbs_batch) - .def("predict_mf", &Predictor::predict_mf) .def("predict_mf_batch", &Predictor::predict_mf_batch) .def_readonly("phis", &Predictor::betas_) .def(py::pickle( diff --git a/stubs/lda11/_lda-stubs/__init__.pyi b/stubs/lda11/_lda-stubs/__init__.pyi deleted file mode 100644 index 818f26d..0000000 --- a/stubs/lda11/_lda-stubs/__init__.pyi +++ /dev/null @@ -1,187 +0,0 @@ -"""Backend C++ inplementation for lda11.""" -from __future__ import annotations -import lda11._lda -import typing -import numpy -import scipy.sparse - -_Shape = typing.Tuple[int, ...] - -__all__ = [ - "LDATrainer", - "LabelledLDATrainer", - "Predictor", - "learn_dirichlet", - "learn_dirichlet_symmetric", - "log_likelihood_doc_topic", - "train_test_split", -] - -class LDATrainer: - def __init__( - self, - arg0: npt.NDArray[numpy.float64], - arg1: npt.NDArray[numpy.int32], - arg2: npt.NDArray[numpy.uint64], - arg3: npt.NDArray[numpy.uint64], - arg4: int, - arg5: int, - arg6: int, - ) -> None: ... - def initialize( - self, - arg0: npt.NDArray[numpy.int32], - arg1: npt.NDArray[numpy.int32], - arg2: npt.NDArray[numpy.int32], - ) -> None: ... - def iterate_gibbs( - self, - arg0: npt.NDArray[numpy.float64], - arg1: npt.NDArray[numpy.int32], - arg2: npt.NDArray[numpy.int32], - arg3: npt.NDArray[numpy.int32], - ) -> None: ... - def log_likelihood( - self, - arg0: npt.NDArray[numpy.float64], - arg1: npt.NDArray[numpy.int32], - ) -> float: ... 
- def obtain_phi( - self, - arg0: npt.NDArray[numpy.float64], - arg1: npt.NDArray[numpy.int32], - arg2: npt.NDArray[numpy.int32], - arg3: npt.NDArray[numpy.int32], - ) -> npt.NDArray[numpy.float64]: ... - def set_doc_topic_prior(self, arg0: npt.NDArray[numpy.float64]) -> None: ... - pass - -class LabelledLDATrainer: - def __init__( - self, - arg0: float, - arg1: float, - arg2: scipy.sparse.csr_matrix[numpy.int32], - arg3: npt.NDArray[numpy.int32], - arg4: npt.NDArray[numpy.uint64], - arg5: npt.NDArray[numpy.uint64], - arg6: int, - arg7: int, - arg8: int, - ) -> None: ... - def initialize( - self, - arg0: npt.NDArray[numpy.int32], - arg1: npt.NDArray[numpy.int32], - arg2: npt.NDArray[numpy.int32], - ) -> None: ... - def iterate_gibbs( - self, - arg0: npt.NDArray[numpy.float64], - arg1: npt.NDArray[numpy.int32], - arg2: npt.NDArray[numpy.int32], - arg3: npt.NDArray[numpy.int32], - ) -> None: ... - def log_likelihood( - self, - arg0: npt.NDArray[numpy.float64], - arg1: npt.NDArray[numpy.int32], - ) -> float: ... - def obtain_phi( - self, - arg0: npt.NDArray[numpy.float64], - arg1: npt.NDArray[numpy.int32], - arg2: npt.NDArray[numpy.int32], - arg3: npt.NDArray[numpy.int32], - ) -> npt.NDArray[numpy.float64]: ... - pass - -class Predictor: - def __getstate__(self) -> tuple: ... - def __init__( - self, arg0: int, arg1: npt.NDArray[numpy.float64], arg2: int - ) -> None: ... - def __setstate__(self, arg0: tuple) -> None: ... - def add_beta(self, arg0: npt.NDArray[numpy.float64]) -> None: ... - def predict_gibbs( - self, - arg0: typing.List[npt.NDArray[numpy.int32]], - arg1: typing.List[npt.NDArray[numpy.int32]], - arg2: int, - arg3: int, - arg4: int, - arg5: bool, - ) -> npt.NDArray[numpy.float64]: ... - def predict_gibbs_batch( - self, - arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]], - arg1: int, - arg2: int, - arg3: int, - arg4: bool, - arg5: int, - ) -> npt.NDArray[numpy.float64]: ... 
- def predict_gibbs_with_word_assignment( - self, - arg0: typing.List[npt.NDArray[numpy.int32]], - arg1: typing.List[npt.NDArray[numpy.int32]], - arg2: int, - arg3: int, - arg4: int, - arg5: bool, - ) -> typing.Tuple[ - npt.NDArray[numpy.float64], - typing.List[typing.Dict[int, npt.NDArray[numpy.int32]]], - ]: ... - def predict_mf( - self, - arg0: typing.List[npt.NDArray[numpy.int32]], - arg1: typing.List[npt.NDArray[numpy.int32]], - arg2: int, - arg3: float, - ) -> npt.NDArray[numpy.float64]: ... - def predict_mf_batch( - self, - arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]], - arg1: int, - arg2: float, - arg3: int, - ) -> npt.NDArray[numpy.float64]: ... - @property - def phis(self) -> typing.List[npt.NDArray[numpy.float64]]: - """ - :type: typing.List[npt.NDArray[numpy.float64]] - """ - pass - -def learn_dirichlet( - arg0: npt.NDArray[numpy.int32], - arg1: npt.NDArray[numpy.float64], - arg2: float, - arg3: float, - arg4: int, -) -> npt.NDArray[numpy.float64]: - pass - -def learn_dirichlet_symmetric( - arg0: npt.NDArray[numpy.int32], - arg1: float, - arg2: float, - arg3: float, - arg4: int, -) -> float: - pass - -def log_likelihood_doc_topic( - arg0: npt.NDArray[numpy.float64], - arg1: npt.NDArray[numpy.int32], - arg2: npt.NDArray[numpy.int32], -) -> float: - pass - -def train_test_split( - arg0: scipy.sparse.csr_matrix[numpy.int32], arg1: float, arg2: int -) -> typing.Tuple[ - scipy.sparse.csr_matrix[numpy.int32], scipy.sparse.csr_matrix[numpy.int32] -]: - pass From 58e01a33d3df2e38ef18c5ad03d02bf925301ef0 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 14:25:55 +0900 Subject: [PATCH 16/32] add setup_requires --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 300ea80..175f774 100644 --- a/setup.py +++ b/setup.py @@ -174,7 +174,7 @@ def local_scheme(version: Any) -> str: long_description="", ext_modules=ext_modules, install_requires=install_requires, - setup_requires=install_requires, + 
setup_requires=setup_requires, cmdclass={"build_ext": BuildExt}, packages=["lda11"], zip_safe=False, From b9249acca8fe00290fa25d33eabc2843eed8a136 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 14:28:07 +0900 Subject: [PATCH 17/32] Fix numpy deps --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 175f774..8c12b82 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ from setuptools.command.build_ext import build_ext __version__ = "0.3.0.0" -install_requires = ["numpy >= 1.22", "tqdm", "scipy>=1.0.0"] +install_requires = ["numpy>=1.22", "tqdm", "scipy>=1.0.0"] setup_requires = ["pybind11>=2.5", "requests", "setuptools_scm"] From 18f96985a79cdb554ddb46fcef1a2fb911d1815e Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 17:53:57 +0900 Subject: [PATCH 18/32] pre-install numpy & scipy --- .github/workflows/test.yml | 1 + setup.py | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a9d9efc..4ee7fca 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,6 +16,7 @@ jobs: - name: Build lda11 run: | pip install --upgrade pip + pip install numpy scipy sudo apt-get install lcov TEST_BUILD=true python setup.py develop - name: Run pytest diff --git a/setup.py b/setup.py index 8c12b82..f2899f8 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,12 @@ from setuptools.command.build_ext import build_ext __version__ = "0.3.0.0" -install_requires = ["numpy>=1.22", "tqdm", "scipy>=1.0.0"] +install_requires = [ + "numpy>=1.22", + "tqdm", + "scipy>=1.0.0", + "typing_extensions>=3.0", +] setup_requires = ["pybind11>=2.5", "requests", "setuptools_scm"] From fb344a1233c4f7a30e9e5aef2ea3e2a2ff75d8fe Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 18:01:00 +0900 Subject: [PATCH 19/32] use scm for versioning --- lda11/__init__.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff 
--git a/lda11/__init__.py b/lda11/__init__.py index 4c3681a..b3e4181 100644 --- a/lda11/__init__.py +++ b/lda11/__init__.py @@ -1,5 +1,18 @@ +from pkg_resources import DistributionNotFound, get_distribution + from .labelled_lda import LabelledLDA from .lda import LDA, MultilingualLDA from .util import rowwise_train_test_split -__all__ = ["LDA", "LabelledLDA", "MultilingualLDA", "rowwise_train_test_split"] +try: + __version__ = get_distribution("lda11").version +except DistributionNotFound: # pragma: no cover + __version__ = "unknown" + +__all__ = [ + "__version__", + "LDA", + "LabelledLDA", + "MultilingualLDA", + "rowwise_train_test_split", +] From 5cfd095031e787e3842ac1cce9d843b085b602de Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 18:03:13 +0900 Subject: [PATCH 20/32] More efficient test path. --- tests/test_llda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_llda.py b/tests/test_llda.py index 0bb18d1..a6027dd 100644 --- a/tests/test_llda.py +++ b/tests/test_llda.py @@ -40,7 +40,7 @@ def test_llda() -> None: A_word_index = np.where(TOPIC_A > 0.1)[0] B_word_index = np.where(TOPIC_A < 0.1)[0] - for A_index, cgs_p, n_threads in zip([1, 2], [True, False], [1, 2]): + for A_index, cgs_p, n_threads in zip([1, 2], [False, True], [1, 2]): if A_index == 1: language = LabelledLanguage(TOPIC_A, TOPIC_B) B_index = 2 From 86557f71578158f0f0800926eb72ab2f9ef5b5cc Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 18:11:25 +0900 Subject: [PATCH 21/32] Add mypy check before testing. 
--- .github/workflows/pre-commit.yaml | 13 +++++++++++++ .github/workflows/test.yml | 4 ++++ lda11/__init__.py | 2 +- 3 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/pre-commit.yaml diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml new file mode 100644 index 0000000..96bb78b --- /dev/null +++ b/.github/workflows/pre-commit.yaml @@ -0,0 +1,13 @@ +name: pre-commit +on: + pull_request: + push: +jobs: + pre-commit: + runs-on: ubuntu-latest + env: + SKIP: no-commit-to-branch + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + - uses: pre-commit/action@v2.0.0 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4ee7fca..691ae5f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -19,6 +19,10 @@ jobs: pip install numpy scipy sudo apt-get install lcov TEST_BUILD=true python setup.py develop + - name: mypy + run: | + pip install mypy + mypy lda11 --ignore-missing-imports - name: Run pytest run: | pip install pytest pytest-cov diff --git a/lda11/__init__.py b/lda11/__init__.py index b3e4181..e00c01c 100644 --- a/lda11/__init__.py +++ b/lda11/__init__.py @@ -1,4 +1,4 @@ -from pkg_resources import DistributionNotFound, get_distribution +from pkg_resources import DistributionNotFound, get_distribution # type: ignore from .labelled_lda import LabelledLDA from .lda import LDA, MultilingualLDA From c2196ba51202c3b3bb9d495c0ba3855fddd9fde8 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 18:19:29 +0900 Subject: [PATCH 22/32] add test for pickling. 
--- tests/test_llda.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tests/test_llda.py b/tests/test_llda.py index a6027dd..6d74d30 100644 --- a/tests/test_llda.py +++ b/tests/test_llda.py @@ -1,9 +1,12 @@ +import pickle +import sys +from tempfile import NamedTemporaryFile from typing import Tuple import numpy as np import numpy.typing as npt -from lda11 import LabelledLDA +from lda11 import LabelledLDA, labelled_lda class LabelledLanguage: @@ -51,15 +54,23 @@ def test_llda() -> None: X, Y = language.gen_doc(1000) llda = LabelledLDA(use_cgs_p=cgs_p, n_workers=n_threads).fit(X, Y) + if sys.platform.startswith("linux"): + with NamedTemporaryFile() as temp_fs: + pickle.dump(llda, temp_fs) + temp_fs.seek(0) + del llda + llda_new: LabelledLDA = pickle.load(temp_fs) + else: + llda_new = llda for a_word in A_word_index: for b_word in B_word_index: - assert llda.phi[a_word, A_index] > llda.phi[b_word, A_index] - assert llda.phi[a_word, B_index] < llda.phi[b_word, B_index] + assert llda_new.phi[a_word, A_index] > llda_new.phi[b_word, A_index] + assert llda_new.phi[a_word, B_index] < llda_new.phi[b_word, B_index] A_DOC = np.asarray(([0, 10, 0, 10]), dtype=np.int32) for mode in ["mf", "gibbs"]: - theta = llda.transform(A_DOC, mode=mode, n_workers=n_threads)[0] # type: ignore + theta = llda_new.transform(A_DOC, mode=mode, n_workers=n_threads)[0] # type: ignore if A_index == 1: assert (theta[1] / theta[2]) > 5 else: From 680a728a9bcecb90a1d9673e06e926ce52733286 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 18:30:56 +0900 Subject: [PATCH 23/32] Add wheel build workflow --- .github/workflows/wheels.yaml | 163 ++++++++++++++++++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 .github/workflows/wheels.yaml diff --git a/.github/workflows/wheels.yaml b/.github/workflows/wheels.yaml new file mode 100644 index 0000000..e82f7ad --- /dev/null +++ b/.github/workflows/wheels.yaml @@ -0,0 +1,163 @@ +name: Build 
+on: + push: +# branches: +# - main + release: + types: + - created +env: + cibuildwheel_version: "2.2.2" +jobs: + build_sdist: + name: Build source distribution + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + - uses: actions/setup-python@v2 + name: Install Python + with: + python-version: '3.7' + - name: Build sdist + run: python setup.py sdist + - uses: actions/upload-artifact@v2 + with: + path: dist/*.tar.gz + build_wheels: + name: Build wheels on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + env: + MACOSX_DEPLOYMENT_TARGET: "10.9" + CIBW_BUILD_VERBOSITY: "1" + CIBW_BUILD: "${{ matrix.cibw.build || '*' }}" + CIBW_SKIP: "${{ matrix.cibw.skip || '' }}" + CIBW_ENVIRONMENT: "${{ matrix.cibw.env || '' }}" + CIBW_TEST_COMMAND: pytest {project}/tests + CIBW_TEST_REQUIRES: pytest + CIBW_MANYLINUX_X86_64_IMAGE: "${{ matrix.cibw.manylinux_image }}" + CIBW_MANYLINUX_I686_IMAGE: "${{ matrix.cibw.manylinux_image }}" + CIBW_MANYLINUX_AARCH64_IMAGE: "${{ matrix.cibw.manylinux_image }}" + CIBW_ARCHS_LINUX: "${{ matrix.cibw.arch || 'auto' }}" + CIBW_ARCHS_MACOS: "${{ matrix.cibw.arch || 'auto' }}" + strategy: + matrix: + include: + - os: macos-10.15 + name: mac + cibw: + arch: x86_64 + env: CFLAGS='-march=core-avx-i' + build: "cp37* cp38*" + + - os: macos-10.15 + name: mac-arm + cibw: + arch: universal2 + build: "cp39* cp310*" + env: '' + + - os: ubuntu-20.04 + name: manylinux1 + cibw: + build: "cp37*" + skip: "*musllinux*" + manylinux_image: manylinux2010 + env: CFLAGS='-march=core-avx-i' + arch: auto64 + + - os: ubuntu-20.04 + name: manylinux2014 + cibw: + build: "cp38* cp39* cp310" + skip: "*musllinux*" + manylinux_image: manylinux2014 + env: CFLAGS='-march=core-avx-i' + arch: auto64 + + - os: ubuntu-20.04 + name: manylinux_aarch64_cp37 + cibw: + build: "cp37*" + skip: "*musllinux*" + manylinux_image: manylinux2014 + arch: aarch64 + + - os: ubuntu-20.04 + name: manylinux_aarch64_cp38 + cibw: + build: "cp38*" + skip: "*musllinux*" + 
manylinux_image: manylinux2014 + arch: aarch64 + + - os: ubuntu-20.04 + name: manylinux_aarch64_cp39 + cibw: + build: "cp39*" + skip: "*musllinux*" + manylinux_image: manylinux2014 + arch: aarch64 + + - os: ubuntu-20.04 + name: manylinux_aarch64_cp310 + cibw: + build: "cp310*" + skip: "*musllinux*" + manylinux_image: manylinux2014 + arch: aarch64 + + - os: windows-2019 + name: win_amd64 + architecture: x64 + cibw: + build: "cp*win_amd64" + env: "CL='/arch:AVX'" + + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + - uses: actions/setup-python@v2 + name: Install Python + - name: register qemu + if: contains(matrix.cibw.arch, 'aarch64') + run: | + docker run --rm --privileged hypriot/qemu-register:v4.2.0 + - name: Install cibuildwheel + run: python -m pip install cibuildwheel=="${{env.cibuildwheel_version}}" + - name: Build wheels + run: python -m cibuildwheel --output-dir wheelhouse + + + - uses: actions/upload-artifact@v2 + with: + path: ./wheelhouse/*.whl + + upload_pypi: + needs: [build_wheels, build_sdist] + runs-on: ubuntu-latest + steps: + - uses: actions/download-artifact@v2 + with: + name: artifact + path: dist + - name: Publish package to TestPyPI + uses: pypa/gh-action-pypi-publish@master + with: + user: __token__ + password: ${{ secrets.TEST_PYPI_APITOKEN }} + packages_dir: dist/ + repository_url: https://test.pypi.org/legacy/ + verbose: true + skip_existing: true + - name: Publish package to PyPI + if: github.event_name == 'release' + uses: pypa/gh-action-pypi-publish@master + with: + user: __token__ + password: ${{ secrets.PYPI_APITOKEN }} + packages_dir: dist/ + verbose: true + skip_existing: true From 5c4f14c133f0257f7fb31a2b2c4a42dd6d2b3900 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 18:42:39 +0900 Subject: [PATCH 24/32] numpy 1.21 instead & typing_extentions --- lda11/lda.py | 13 ++----------- setup.py | 4 ++-- src/wrapper.cpp | 8 +++++++- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/lda11/lda.py 
b/lda11/lda.py index fb1a198..c3ed90b 100644 --- a/lda11/lda.py +++ b/lda11/lda.py @@ -1,19 +1,10 @@ -from typing import ( - TYPE_CHECKING, - Any, - Dict, - List, - Literal, - NamedTuple, - Optional, - Tuple, - Union, -) +from typing import Dict, List, NamedTuple, Optional, Tuple, Union import numpy as np from numpy import typing as npt from scipy import sparse as sps from tqdm import tqdm +from typing_extensions import Literal from ._lda import LDATrainer from ._lda import Predictor as CorePredictor diff --git a/setup.py b/setup.py index f2899f8..d5d8fb6 100644 --- a/setup.py +++ b/setup.py @@ -8,10 +8,10 @@ __version__ = "0.3.0.0" install_requires = [ - "numpy>=1.22", + "numpy>=1.21", "tqdm", "scipy>=1.0.0", - "typing_extensions>=3.0", + "typing_extensions>=3.10", ] setup_requires = ["pybind11>=2.5", "requests", "setuptools_scm"] diff --git a/src/wrapper.cpp b/src/wrapper.cpp index 1583a0f..defd152 100644 --- a/src/wrapper.cpp +++ b/src/wrapper.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -216,7 +217,12 @@ Real log_likelihood_doc_topic(const Eigen::Ref &doc_topic_prior, } PYBIND11_MODULE(_lda, m) { - m.doc() = "Backend C++ inplementation for lda11."; + std::stringstream doc_stream; + doc_stream << "Backend C++ implementation for lda11." << std::endl + << "Built to use" << std::endl + << "\t" << Eigen::SimdInstructionSetsInUse(); + + m.doc() = doc_stream.str(); py::class_(m, "LDATrainer") .def(py::init, Eigen::Ref, Eigen::Ref, From 1e1cb90689b72b6fab55b3bc28b43dbb53a99886 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 18:51:01 +0900 Subject: [PATCH 25/32] Fix readme & packaging. --- README.md | 17 ++++++++++------- setup.py | 4 ++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 4f900df..80c88d1 100644 --- a/README.md +++ b/README.md @@ -2,22 +2,25 @@ ## Features -- Use [Eigen](http://eigen.tuxfamily.org/index.php?title=Main_Page) for faster array multiplication. 
-- Use [pybind11](https://github.com/pybind/pybind11) to bind the code into python. - Support parallelized sampler proposed in [Distributed Inference for Latent Dirichlet Allocation](https://dl.acm.org/doi/abs/10.5555/2981562.2981698). - Implement [CGS_p estimator](http://www.jmlr.org/papers/volume18/16-526/16-526.pdf) for more precise point estimate of topic-word distribution. - Implement [Labelled LDA](https://www-nlp.stanford.edu/cmanning/papers/llda-emnlp09.pdf) +- Able to obtain per-word topic frequency. + +The implementation relies on [Eigen](http://eigen.tuxfamily.org/index.php?title=Main_Page) for faster array multiplication and [pybind11](https://github.com/pybind/pybind11) for simple binding. + ## Installation +You can install the wheel from pypi: + ``` -pip install git+https://github.com/tohtsky/lda11 +pip install lda11 ``` -The above command will automatically download Eigen (ver 3.3.7). -If you want to use an existing version of Eigen (located on `path/to/eigen`), -type +For x64 architecture, the above wheel is built using AVX. +If it is not convenient for you, try e.g.
``` -EIGEN3_INCLUDE_DIR=/path/to/eigen pip install git+https://github.com/tohtsky/lda11 +CFLAGS="-march=native" pip install git+https://github.com/tohtsky/lda11 ``` diff --git a/setup.py b/setup.py index d5d8fb6..3c9618b 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from typing import Any, Dict, List import setuptools -from setuptools import Extension, setup +from setuptools import Extension, find_packages, setup from setuptools.command.build_ext import build_ext __version__ = "0.3.0.0" @@ -181,6 +181,6 @@ def local_scheme(version: Any) -> str: install_requires=install_requires, setup_requires=setup_requires, cmdclass={"build_ext": BuildExt}, - packages=["lda11"], + packages=find_packages(), zip_safe=False, ) From 4b9519548bbe2191d32698f027b9386a5cbac411 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 18:59:07 +0900 Subject: [PATCH 26/32] Bump eigen version --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 3c9618b..83ec118 100644 --- a/setup.py +++ b/setup.py @@ -24,8 +24,8 @@ class get_eigen_include(object): - EIGEN3_URL = "https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.zip" - EIGEN3_DIRNAME = "eigen-3.3.7" + EIGEN3_URL = "https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.zip" + EIGEN3_DIRNAME = "eigen-3.4.0" def __str__(self) -> str: if eigen_include_dir is not None: From 467ae2d48e9186a512c2f836fa072c0163688ec9 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 19:58:22 +0900 Subject: [PATCH 27/32] Fix setup.py --- .gitignore | 1 + setup.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index f3e2f8e..36a4bb0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .python-version **.ipynb_checkpoints** eigen-3.3.7/ +eigen-3.4.0/ build/* **__pycache__** tmp/** diff --git a/setup.py b/setup.py index 83ec118..9beb43d 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,16 @@ import os import sys 
+from pathlib import Path from typing import Any, Dict, List import setuptools from setuptools import Extension, find_packages, setup from setuptools.command.build_ext import build_ext +SETUP_DIRECTORY = Path(__file__).resolve().parent +with (SETUP_DIRECTORY / "README.md").open() as ifs: + LONG_DESCRIPTION = ifs.read() + __version__ = "0.3.0.0" install_requires = [ "numpy>=1.21", @@ -174,13 +179,14 @@ def local_scheme(version: Any) -> str: version=__version__, author="Tomoki Ohtsuki", url="https://github.com/tohtsky/lda11", - author_email="tomoki.ohtsuki129@gmail.com", + author_email="tomoki.ohtsuki.19937@outook.jp", description="Yet another CGS sampler for Latent Dirichlet Allocation.", - long_description="", + long_description=LONG_DESCRIPTION, + long_description_content_type="text/markdown", ext_modules=ext_modules, install_requires=install_requires, setup_requires=setup_requires, cmdclass={"build_ext": BuildExt}, packages=find_packages(), - zip_safe=False, + include_package_data=True, ) From 8a16cec1f73c92501c5b28af7014daefcc0eb84a Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 20:08:35 +0900 Subject: [PATCH 28/32] Manually specify packages. 
--- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9beb43d..c652ed7 100644 --- a/setup.py +++ b/setup.py @@ -187,6 +187,7 @@ def local_scheme(version: Any) -> str: install_requires=install_requires, setup_requires=setup_requires, cmdclass={"build_ext": BuildExt}, - packages=find_packages(), + packages=["lda11", "lda11._lda"], include_package_data=True, + zip_safe=False, ) From 86cc42fed7a8b09fdb37f2eb2be2604192cf2f21 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 20:40:19 +0900 Subject: [PATCH 29/32] fix layout --- .github/workflows/wheels.yaml | 2 +- {src => cpp_sources}/child_worker.cpp | 0 {src => cpp_sources}/defs.hpp | 0 {src => cpp_sources}/labelled_lda.cpp | 0 {src => cpp_sources}/labelled_lda.hpp | 0 {src => cpp_sources}/predictor.cpp | 0 {src => cpp_sources}/predictor.hpp | 0 {src => cpp_sources}/trainer.cpp | 0 {src => cpp_sources}/trainer.hpp | 0 {src => cpp_sources}/trainer_base.cpp | 0 {src => cpp_sources}/trainer_base.hpp | 0 {src => cpp_sources}/util.hpp | 0 {src => cpp_sources}/wrapper.cpp | 0 setup.py | 15 ++++++++------- {lda11 => src/lda11}/__init__.py | 0 {lda11 => src/lda11}/_lda.pyi | 1 - {lda11 => src/lda11}/labelled_lda.py | 1 - {lda11 => src/lda11}/lda.py | 0 {lda11 => src/lda11}/util.py | 0 19 files changed, 9 insertions(+), 10 deletions(-) rename {src => cpp_sources}/child_worker.cpp (100%) rename {src => cpp_sources}/defs.hpp (100%) rename {src => cpp_sources}/labelled_lda.cpp (100%) rename {src => cpp_sources}/labelled_lda.hpp (100%) rename {src => cpp_sources}/predictor.cpp (100%) rename {src => cpp_sources}/predictor.hpp (100%) rename {src => cpp_sources}/trainer.cpp (100%) rename {src => cpp_sources}/trainer.hpp (100%) rename {src => cpp_sources}/trainer_base.cpp (100%) rename {src => cpp_sources}/trainer_base.hpp (100%) rename {src => cpp_sources}/util.hpp (100%) rename {src => cpp_sources}/wrapper.cpp (100%) rename {lda11 => src/lda11}/__init__.py (100%) rename 
{lda11 => src/lda11}/_lda.pyi (99%) rename {lda11 => src/lda11}/labelled_lda.py (99%) rename {lda11 => src/lda11}/lda.py (100%) rename {lda11 => src/lda11}/util.py (100%) diff --git a/.github/workflows/wheels.yaml b/.github/workflows/wheels.yaml index e82f7ad..e9fa3b2 100644 --- a/.github/workflows/wheels.yaml +++ b/.github/workflows/wheels.yaml @@ -34,7 +34,7 @@ jobs: CIBW_BUILD: "${{ matrix.cibw.build || '*' }}" CIBW_SKIP: "${{ matrix.cibw.skip || '' }}" CIBW_ENVIRONMENT: "${{ matrix.cibw.env || '' }}" - CIBW_TEST_COMMAND: pytest {project}/tests + CIBW_TEST_COMMAND: "pytest {project}/tests" CIBW_TEST_REQUIRES: pytest CIBW_MANYLINUX_X86_64_IMAGE: "${{ matrix.cibw.manylinux_image }}" CIBW_MANYLINUX_I686_IMAGE: "${{ matrix.cibw.manylinux_image }}" diff --git a/src/child_worker.cpp b/cpp_sources/child_worker.cpp similarity index 100% rename from src/child_worker.cpp rename to cpp_sources/child_worker.cpp diff --git a/src/defs.hpp b/cpp_sources/defs.hpp similarity index 100% rename from src/defs.hpp rename to cpp_sources/defs.hpp diff --git a/src/labelled_lda.cpp b/cpp_sources/labelled_lda.cpp similarity index 100% rename from src/labelled_lda.cpp rename to cpp_sources/labelled_lda.cpp diff --git a/src/labelled_lda.hpp b/cpp_sources/labelled_lda.hpp similarity index 100% rename from src/labelled_lda.hpp rename to cpp_sources/labelled_lda.hpp diff --git a/src/predictor.cpp b/cpp_sources/predictor.cpp similarity index 100% rename from src/predictor.cpp rename to cpp_sources/predictor.cpp diff --git a/src/predictor.hpp b/cpp_sources/predictor.hpp similarity index 100% rename from src/predictor.hpp rename to cpp_sources/predictor.hpp diff --git a/src/trainer.cpp b/cpp_sources/trainer.cpp similarity index 100% rename from src/trainer.cpp rename to cpp_sources/trainer.cpp diff --git a/src/trainer.hpp b/cpp_sources/trainer.hpp similarity index 100% rename from src/trainer.hpp rename to cpp_sources/trainer.hpp diff --git a/src/trainer_base.cpp b/cpp_sources/trainer_base.cpp 
similarity index 100% rename from src/trainer_base.cpp rename to cpp_sources/trainer_base.cpp diff --git a/src/trainer_base.hpp b/cpp_sources/trainer_base.hpp similarity index 100% rename from src/trainer_base.hpp rename to cpp_sources/trainer_base.hpp diff --git a/src/util.hpp b/cpp_sources/util.hpp similarity index 100% rename from src/util.hpp rename to cpp_sources/util.hpp diff --git a/src/wrapper.cpp b/cpp_sources/wrapper.cpp similarity index 100% rename from src/wrapper.cpp rename to cpp_sources/wrapper.cpp diff --git a/setup.py b/setup.py index c652ed7..bc6dd1f 100644 --- a/setup.py +++ b/setup.py @@ -77,12 +77,12 @@ def __str__(self) -> str: Extension( "lda11._lda", [ - "src/wrapper.cpp", - "src/predictor.cpp", - "src/trainer_base.cpp", - "src/trainer.cpp", - "src/child_worker.cpp", - "src/labelled_lda.cpp", + "cpp_sources/wrapper.cpp", + "cpp_sources/predictor.cpp", + "cpp_sources/trainer_base.cpp", + "cpp_sources/trainer.cpp", + "cpp_sources/child_worker.cpp", + "cpp_sources/labelled_lda.cpp", ], include_dirs=[ # Path to pybind11 headers @@ -187,7 +187,8 @@ def local_scheme(version: Any) -> str: install_requires=install_requires, setup_requires=setup_requires, cmdclass={"build_ext": BuildExt}, - packages=["lda11", "lda11._lda"], + packages=find_packages("src"), include_package_data=True, zip_safe=False, + package_dir={"": "src"}, ) diff --git a/lda11/__init__.py b/src/lda11/__init__.py similarity index 100% rename from lda11/__init__.py rename to src/lda11/__init__.py diff --git a/lda11/_lda.pyi b/src/lda11/_lda.pyi similarity index 99% rename from lda11/_lda.pyi rename to src/lda11/_lda.pyi index a2322ff..bcac5e4 100644 --- a/lda11/_lda.pyi +++ b/src/lda11/_lda.pyi @@ -1,6 +1,5 @@ """Backend C++ inplementation for lda11.""" from __future__ import annotations -import lda11._lda import typing import numpy import numpy.typing as npt diff --git a/lda11/labelled_lda.py b/src/lda11/labelled_lda.py similarity index 99% rename from lda11/labelled_lda.py rename 
to src/lda11/labelled_lda.py index 3bb28fe..c41fc0d 100644 --- a/lda11/labelled_lda.py +++ b/src/lda11/labelled_lda.py @@ -8,7 +8,6 @@ from ._lda import LabelledLDATrainer from ._lda import Predictor as CorePredictor from .lda import ( - IndexType, IntegerType, LDAPredictorMixin, RealType, diff --git a/lda11/lda.py b/src/lda11/lda.py similarity index 100% rename from lda11/lda.py rename to src/lda11/lda.py diff --git a/lda11/util.py b/src/lda11/util.py similarity index 100% rename from lda11/util.py rename to src/lda11/util.py From 61e6d1d571a7153f818a60742fa83b21eb0b57f6 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 20:45:39 +0900 Subject: [PATCH 30/32] Fix workflows. --- .github/workflows/test.yml | 2 +- .github/workflows/{wheels.yaml => wheels.yml} | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) rename .github/workflows/{wheels.yaml => wheels.yml} (99%) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 691ae5f..36586a8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -22,7 +22,7 @@ jobs: - name: mypy run: | pip install mypy - mypy lda11 --ignore-missing-imports + mypy src/lda11 --ignore-missing-imports - name: Run pytest run: | pip install pytest pytest-cov diff --git a/.github/workflows/wheels.yaml b/.github/workflows/wheels.yml similarity index 99% rename from .github/workflows/wheels.yaml rename to .github/workflows/wheels.yml index e9fa3b2..1c8320b 100644 --- a/.github/workflows/wheels.yaml +++ b/.github/workflows/wheels.yml @@ -112,6 +112,7 @@ jobs: name: win_amd64 architecture: x64 cibw: + skip: "cp36*" build: "cp*win_amd64" env: "CL='/arch:AVX'" From 880334f5e2b4db99010bb308639608b0cb1e894e Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 21:00:21 +0900 Subject: [PATCH 31/32] Restore branch restriction --- .github/workflows/wheels.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 
1c8320b..0849f2e 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -1,8 +1,8 @@ name: Build on: push: -# branches: -# - main + branches: + - main release: types: - created From 813f8895bf1353fd515ffd659749455eefe1b3c5 Mon Sep 17 00:00:00 2001 From: Tomoki Date: Mon, 10 Jan 2022 21:03:03 +0900 Subject: [PATCH 32/32] Fix test workflow --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 36586a8..4c176ca 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -26,7 +26,7 @@ jobs: - name: Run pytest run: | pip install pytest pytest-cov - pytest --cov=./lda11 tests/ + pytest --cov=./src/lda11 tests/ - name: Generate coverage (ubuntu) run: | coverage xml