From a4269d7d3ff0eac91c9b4c510d667bcbf31208f5 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Sun, 9 Jan 2022 21:48:45 +0900
Subject: [PATCH 01/32] add pre-commit

---
 .gitignore                     |  4 +--
 .pre-commit-config.yaml        | 28 ++++++++++++++++
 CMakeLists.txt                 |  2 +-
 README.md                      |  2 --
 examples/compare_perplexity.py | 50 ++++++++++++++---------------
 lda11/__init__.py              |  2 +-
 lda11/labelled_lda.py          | 58 ++++++++++++++++++----------------
 lda11/lda.py                   | 14 ++++----
 lda11/util.py                  |  5 +--
 pyproject.toml                 | 15 +++++++++
 setup.py                       | 12 ++++---
 src/defs.hpp                   |  2 +-
 12 files changed, 119 insertions(+), 75 deletions(-)
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 pyproject.toml

diff --git a/.gitignore b/.gitignore
index 66885ed..90a0672 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+.python-version
 **.ipynb_checkpoints**
 eigen-3.3.7/
 build/*
@@ -9,10 +10,9 @@ lda11.egg-info
 .vscode/*
 *.so
 test/*
-pubind11/
 .eggs/
 var/
 dist/
 compile_commands.json
 .clangd/
-**.ipynb_checkpoints/**
\ No newline at end of file
+**.ipynb_checkpoints/**
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..cef10c9
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,26 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v3.2.0
+    hooks:
+      - id: check-merge-conflict
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: no-commit-to-branch  # refuse commits made directly on main
+        args: [--branch, main]
+      - id: trailing-whitespace
+      - id: check-added-large-files
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.6.4
+    hooks:
+      - id: isort
+        name: isort
+#  - repo: https://github.com/pre-commit/mirrors-mypy
+#    rev: v0.790 # Use the sha / tag you want to point at
+#    hooks:
+#      - id: mypy
+  - repo: https://github.com/psf/black
+    rev: 20.8b1
+    hooks:
+      - id: black
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ce7d696..776be75 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 2.8.12)
-set(CMAKE_EXPORT_COMPILE_COMMANDS, True)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 project(lda11)
 
 add_subdirectory(pybind11)
diff --git a/README.md b/README.md
index 7acf19d..4f900df 100644
--- a/README.md
+++ b/README.md
@@ -21,5 +21,3 @@ type
 ```
 EIGEN3_INCLUDE_DIR=/path/to/eigen pip install git+https://github.com/tohtsky/lda11
 ```
-
-
diff --git a/examples/compare_perplexity.py b/examples/compare_perplexity.py
index fcf7d5b..41c0df0 100644
--- a/examples/compare_perplexity.py
+++ b/examples/compare_perplexity.py
@@ -3,23 +3,22 @@
 
 import numpy as np
 from scipy import sparse as sps
-from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.datasets import fetch_20newsgroups
-from sklearn.model_selection import train_test_split
 from sklearn.decomposition import LatentDirichletAllocation as LDA_vb
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.model_selection import train_test_split
 
 from lda11 import LDA as LDA_cgs_p
 from lda11.util import rowwise_train_test_split
 
 N_TOPICS = 16
-print('reading data...')
-dataset = fetch_20newsgroups(shuffle=False,
-                             remove=('headers', 'footers', 'quotes'))
+print("reading data...")
+dataset = fetch_20newsgroups(shuffle=False, remove=("headers", "footers", "quotes"))
 data_samples = dataset.data
 train_docs, test_docs = train_test_split(data_samples, random_state=42)
 
-print('priparing Count Vectorizer')
-tf_vectorizer = CountVectorizer(max_df=1.0, stop_words='english')
+print("priparing Count Vectorizer")
+tf_vectorizer = CountVectorizer(max_df=1.0, stop_words="english")
 
 X_train = tf_vectorizer.fit_transform(train_docs)
 X_test = tf_vectorizer.transform(test_docs)
@@ -28,36 +27,36 @@
 
 tf_vectorizer.get_stop_words()
 
-print('Splitting test documents...')
+print("Splitting test documents...")
 X_test_train, X_test_test = rowwise_train_test_split(X_test, random_seed=114514)
 
 
-print('Start fitting sk-learn model...')
+print("Start fitting sk-learn model...")
 start = time()
 vb_model = LDA_vb(n_components=N_TOPICS)
 vb_model.fit(X_train)
 
-phi_vb = vb_model.components_ / \
-    vb_model.components_.sum(axis=1)[:, np.newaxis]
+phi_vb = vb_model.components_ / vb_model.components_.sum(axis=1)[:, np.newaxis]
 end = time()
-print('done in {:.2f} seconds'.format((end-start)))
+print("done in {:.2f} seconds".format((end - start)))
 
-print('Start fitting our lda model...')
+print("Start fitting our lda model...")
 start = time()
 cgs_p_model = LDA_cgs_p(n_components=N_TOPICS, n_iter=500)
 cgs_p_model.fit(X_train)
 phi_cgs_p = cgs_p_model.phi.transpose()
 end = time()
-print('done in {:.2f} seconds'.format((end-start)))
+print("done in {:.2f} seconds".format((end - start)))
 
-print('Start fitting paralleized CGS sampler with hyper-parameter optimization...')
+print("Start fitting paralleized CGS sampler with hyper-parameter optimization...")
 start = time()
 parallel_cgs_model = LDA_cgs_p(
-    n_components=N_TOPICS, n_iter=500, n_workers=2, optimize_interval=50)
+    n_components=N_TOPICS, n_iter=500, n_workers=2, optimize_interval=50
+)
 parallel_cgs_model.fit(X_train)
 phi_parallel_cgs = parallel_cgs_model.phi.transpose()
 end = time()
-print('done in {:.2f} seconds'.format((end-start)))
+print("done in {:.2f} seconds".format((end - start)))
 
 
 def test_perplexity(model, phi, **kwargs):
@@ -65,27 +64,24 @@ def test_perplexity(model, phi, **kwargs):
     log_ps = np.log(theta.dot(phi))
     coo = X_test_test.tocoo()
     # perplexity
-    return np.exp(- (log_ps[coo.row, coo.col] * coo.data).sum() / coo.data.sum())
+    return np.exp(-(log_ps[coo.row, coo.col] * coo.data).sum() / coo.data.sum())
 
 
-print('Start testing vb model')
+print("Start testing vb model")
 start = time()
 ll_vb = test_perplexity(vb_model, phi_vb)
 end = time()
-print('Done in {:.2f} seconds, test perplexity = {:.2f}'.format(
-    end - start, ll_vb))
+print("Done in {:.2f} seconds, test perplexity = {:.2f}".format(end - start, ll_vb))
 
 
-print('Start testing cgs_p model')
+print("Start testing cgs_p model")
 start = time()
 ll_cgs_p = test_perplexity(cgs_p_model, phi_cgs_p)
 end = time()
-print('Done in {:.2f} seconds, test perplexity = {:.2f}'.format(
-    end - start, ll_cgs_p))
+print("Done in {:.2f} seconds, test perplexity = {:.2f}".format(end - start, ll_cgs_p))
 
-print('Start testing parallelized + optimized cgs model')
+print("Start testing parallelized + optimized cgs model")
 start = time()
 ll_cgs_p = test_perplexity(parallel_cgs_model, phi_parallel_cgs, n_workers=4)
 end = time()
-print('Done in {:.2f} seconds, test perplexity = {:.2f}'.format(
-    end - start, ll_cgs_p))
+print("Done in {:.2f} seconds, test perplexity = {:.2f}".format(end - start, ll_cgs_p))
diff --git a/lda11/__init__.py b/lda11/__init__.py
index f696d9a..6775be3 100644
--- a/lda11/__init__.py
+++ b/lda11/__init__.py
@@ -1,2 +1,2 @@
-from .lda import LDA, MultipleContextLDA
 from .labelled_lda import LabelledLDA
+from .lda import LDA, MultipleContextLDA
diff --git a/lda11/labelled_lda.py b/lda11/labelled_lda.py
index dd1dcba..6d25844 100644
--- a/lda11/labelled_lda.py
+++ b/lda11/labelled_lda.py
@@ -1,22 +1,30 @@
 import numpy as np
 from scipy import sparse as sps
 from tqdm import tqdm
+
 from ._lda import LabelledLDATrainer
 from .lda import (
+    IndexType,
+    IntegerType,
+    LDAPredictorMixin,
     Predictor,
-    RealType, IntegerType, IndexType,
-    number_to_array, check_array,
-    LDAPredictorMixin
+    RealType,
+    check_array,
+    number_to_array,
 )
 
 
 class LabelledLDA(LDAPredictorMixin):
-    def __init__(self,
-                 alpha=1e-2, epsilon=1e-30, topic_word_prior=None, add_dummy_topic=False,
-                 n_iter=1000,
-                 n_workers=1,
-                 use_cgs_p=True
-                 ):
+    def __init__(
+        self,
+        alpha=1e-2,
+        epsilon=1e-30,
+        topic_word_prior=None,
+        add_dummy_topic=False,
+        n_iter=1000,
+        n_workers=1,
+        use_cgs_p=True,
+    ):
         self.n_components = None
         self.topic_word_prior = topic_word_prior
         self.alpha = alpha
@@ -49,51 +57,45 @@ def _fit(self, X, Y, ll_freq=10):
         self.n_components = Y.shape[1]
         ones_topic = np.ones(self.n_components, dtype=RealType)
         self.topic_word_prior = number_to_array(
-            X.shape[1], 1 / float(self.n_components),
-            self.topic_word_prior
+            X.shape[1], 1 / float(self.n_components), self.topic_word_prior
         )
 
         try:
             count, dix, wix = check_array(X)
         except:
-            print('Check for X failed.')
+            print("Check for X failed.")
             raise
 
-        doc_topic = np.zeros(
-            (X.shape[0], self.n_components), dtype=IntegerType)
-        word_topic = np.zeros(
-            (X.shape[1], self.n_components), dtype=IntegerType)
+        doc_topic = np.zeros((X.shape[0], self.n_components), dtype=IntegerType)
+        word_topic = np.zeros((X.shape[1], self.n_components), dtype=IntegerType)
         topic_counts = np.zeros(self.n_components, dtype=IntegerType)
 
         docstate = LabelledLDATrainer(
             self.alpha,
             self.epsilon,
             Y,
-            count, dix, wix, self.n_components, 42,
-            self.n_workers
+            count,
+            dix,
+            wix,
+            self.n_components,
+            42,
+            self.n_workers,
         )
         docstate.initialize(word_topic, doc_topic, topic_counts)
 
         with tqdm(range(self.n_iter)) as pbar:
             for _ in pbar:
                 docstate.iterate_gibbs(
-                    self.topic_word_prior,
-                    doc_topic,
-                    word_topic,
-                    topic_counts
+                    self.topic_word_prior, doc_topic, word_topic, topic_counts
                 )
 
-        doc_topic_prior = (
-            self.alpha * np.ones(self.n_components, dtype=RealType))
+        doc_topic_prior = self.alpha * np.ones(self.n_components, dtype=RealType)
 
         self.components_ = word_topic.transpose()
         predictor = Predictor(self.n_components, doc_topic_prior, 42)
         if self.use_cgs_p:
             phi = docstate.obtain_phi(
-                self.topic_word_prior,
-                doc_topic,
-                word_topic,
-                topic_counts
+                self.topic_word_prior, doc_topic, word_topic, topic_counts
             )
         else:
             phi = word_topic + self.topic_word_prior[:, np.newaxis]
diff --git a/lda11/lda.py b/lda11/lda.py
index 277e1aa..512829f 100644
--- a/lda11/lda.py
+++ b/lda11/lda.py
@@ -1,16 +1,18 @@
-import numpy as np
-from numbers import Number
 from gc import collect
+from numbers import Number
+
+import numpy as np
+from scipy import sparse as sps
+from scipy.special import digamma
+from tqdm import tqdm
+
 from ._lda import (
     LDATrainer,
-    log_likelihood_doc_topic,
     Predictor,
     learn_dirichlet,
     learn_dirichlet_symmetric,
+    log_likelihood_doc_topic,
 )
-from tqdm import tqdm
-from scipy import sparse as sps
-from scipy.special import digamma
 
 RealType = np.float64
 IntegerType = np.int32
diff --git a/lda11/util.py b/lda11/util.py
index 64120c3..c6e01ee 100644
--- a/lda11/util.py
+++ b/lda11/util.py
@@ -1,7 +1,8 @@
 import numpy as np
 from scipy import sparse as sps
-from .lda import RealType, IntegerType
+
 from ._lda import train_test_split
+from .lda import IntegerType, RealType
 
 
 def rowwise_train_test_split(X, random_seed=None, test_ratio=0.5):
@@ -9,6 +10,6 @@ def rowwise_train_test_split(X, random_seed=None, test_ratio=0.5):
     split matrix randomly
     """
     if random_seed is None:
-        random_seed = np.random.randint(-2**63, 2**63-1)
+        random_seed = np.random.randint(-(2 ** 63), 2 ** 63 - 1)
     X = sps.csr_matrix(X, dtype=IntegerType)
     return train_test_split(X, test_ratio, random_seed)
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..6796ae2
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,15 @@
+[tool.black]
+ensure_newline_before_comments = true
+force_grid_wrap = 0
+include_trailing_comma = true
+line_length = 88
+multi_line_output = 3
+use_parentheses = true
+
+[tool.isort]
+ensure_newline_before_comments = true
+force_grid_wrap = 0
+include_trailing_comma = true
+line_length = 88
+multi_line_output = 3
+use_parentheses = true
diff --git a/setup.py b/setup.py
index 9bce798..82fe294 100644
--- a/setup.py
+++ b/setup.py
@@ -1,8 +1,9 @@
-from setuptools import setup, Extension
-from setuptools.command.build_ext import build_ext
+import os
 import sys
+
 import setuptools
-import os
+from setuptools import Extension, setup
+from setuptools.command.build_ext import build_ext
 
 __version__ = "0.2.2.0"
 install_requires = ["pybind11>=2.5", "numpy >= 1.11", "tqdm", "scipy>=1.0.0"]
@@ -27,9 +28,10 @@ def __str__(self):
             return target_dir
 
         download_target_dir = os.path.join(basedir, "eigen3.zip")
-        import requests
         import zipfile
 
+        import requests
+
         response = requests.get(self.EIGEN3_URL, stream=True)
         with open(download_target_dir, "wb") as ofs:
             for chunk in response.iter_content(chunk_size=1024):
@@ -45,7 +47,7 @@ class get_pybind_include(object):
     """Helper class to determine the pybind11 include path
     The purpose of this class is to postpone importing pybind11
     until it is actually installed, so that the ``get_include()``
-    method can be invoked. """
+    method can be invoked."""
 
     def __init__(self, user=False):
         self.user = user
diff --git a/src/defs.hpp b/src/defs.hpp
index 7646504..1213a36 100644
--- a/src/defs.hpp
+++ b/src/defs.hpp
@@ -36,4 +36,4 @@ struct UrandDevice {
 private:
   std::mt19937 random_state_;
   std::uniform_real_distribution<Real> udist_;
-};
\ No newline at end of file
+};

From b46d5c566c5d5519365f20e747e138b77d12f593 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Sun, 9 Jan 2022 21:56:19 +0900
Subject: [PATCH 02/32] Added stubs for _lda

---
 .gitignore                          |   1 +
 create_pb_stubs.sh                  |  13 ++
 lda11/_lda.pyi                      | 193 ++++++++++++++++++++++++++++
 src/wrapper.cpp                     |   3 -
 stubs/lda11/_lda-stubs/__init__.pyi |  58 +++++++++
 tests/__init__.py                   |   0
 6 files changed, 265 insertions(+), 3 deletions(-)
 create mode 100755 create_pb_stubs.sh
 create mode 100644 lda11/_lda.pyi
 create mode 100644 stubs/lda11/_lda-stubs/__init__.pyi
 create mode 100644 tests/__init__.py

diff --git a/.gitignore b/.gitignore
index 90a0672..19ff8cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,4 @@ dist/
 compile_commands.json
 .clangd/
 **.ipynb_checkpoints/**
+.cache/clangd
diff --git a/create_pb_stubs.sh b/create_pb_stubs.sh
new file mode 100755
index 0000000..ab0c720
--- /dev/null
+++ b/create_pb_stubs.sh
@@ -0,0 +1,13 @@
+#!/bin/sh
+module_name="lda11._lda"
+echo "Create stub for $module_name"
+pybind11-stubgen -o stubs --no-setup-py "$module_name" || exit 1  # never reuse a stale stub
+output_path="$(echo "${module_name}" | sed 's/\./\//g').pyi"
+input_path="stubs/$(echo "${module_name}" | sed 's/\./\//g')-stubs/__init__.pyi"
+rm -f "${output_path}"  # -f: on the first run there is no previous stub to delete
+echo 'm: int
+n: int
+from numpy import float32
+' >> "${output_path}"
+cat "${input_path}" >> "${output_path}"
+black "${output_path}"
diff --git a/lda11/_lda.pyi b/lda11/_lda.pyi
new file mode 100644
index 0000000..4349ab9
--- /dev/null
+++ b/lda11/_lda.pyi
@@ -0,0 +1,193 @@
+m: int
+n: int
+from numpy import float32
+
+"""Backend C++ inplementation for lda11."""
+from __future__ import annotations
+import lda11._lda
+import typing
+import numpy
+import scipy.sparse
+
+_Shape = typing.Tuple[int, ...]
+
+__all__ = [
+    "LDATrainer",
+    "LabelledLDATrainer",
+    "Predictor",
+    "learn_dirichlet",
+    "learn_dirichlet_symmetric",
+    "log_likelihood_doc_topic",
+    "train_test_split",
+]
+
+class LDATrainer:
+    def __init__(
+        self,
+        arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]],
+        arg1: numpy.ndarray[numpy.int32, _Shape[m, 1]],
+        arg2: numpy.ndarray[numpy.uint64, _Shape[m, 1]],
+        arg3: numpy.ndarray[numpy.uint64, _Shape[m, 1]],
+        arg4: int,
+        arg5: int,
+        arg6: int,
+    ) -> None: ...
+    def initialize(
+        self,
+        arg0: numpy.ndarray[numpy.int32, _Shape[m, n]],
+        arg1: numpy.ndarray[numpy.int32, _Shape[m, n]],
+        arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]],
+    ) -> None: ...
+    def iterate_gibbs(
+        self,
+        arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]],
+        arg1: numpy.ndarray[numpy.int32, _Shape[m, n]],
+        arg2: numpy.ndarray[numpy.int32, _Shape[m, n]],
+        arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]],
+    ) -> None: ...
+    def log_likelihood(
+        self,
+        arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]],
+        arg1: numpy.ndarray[numpy.int32, _Shape[m, n]],
+    ) -> float: ...
+    def obtain_phi(
+        self,
+        arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]],
+        arg1: numpy.ndarray[numpy.int32, _Shape[m, n]],
+        arg2: numpy.ndarray[numpy.int32, _Shape[m, n]],
+        arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]],
+    ) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ...
+    def set_doc_topic_prior(
+        self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]]
+    ) -> None: ...
+    pass
+
+class LabelledLDATrainer:
+    def __init__(
+        self,
+        arg0: float,
+        arg1: float,
+        arg2: scipy.sparse.csr_matrix[numpy.int32],
+        arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]],
+        arg4: numpy.ndarray[numpy.uint64, _Shape[m, 1]],
+        arg5: numpy.ndarray[numpy.uint64, _Shape[m, 1]],
+        arg6: int,
+        arg7: int,
+        arg8: int,
+    ) -> None: ...
+    def initialize(
+        self,
+        arg0: numpy.ndarray[numpy.int32, _Shape[m, n]],
+        arg1: numpy.ndarray[numpy.int32, _Shape[m, n]],
+        arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]],
+    ) -> None: ...
+    def iterate_gibbs(
+        self,
+        arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]],
+        arg1: numpy.ndarray[numpy.int32, _Shape[m, n]],
+        arg2: numpy.ndarray[numpy.int32, _Shape[m, n]],
+        arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]],
+    ) -> None: ...
+    def log_likelihood(
+        self,
+        arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]],
+        arg1: numpy.ndarray[numpy.int32, _Shape[m, n]],
+    ) -> float: ...
+    def obtain_phi(
+        self,
+        arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]],
+        arg1: numpy.ndarray[numpy.int32, _Shape[m, n]],
+        arg2: numpy.ndarray[numpy.int32, _Shape[m, n]],
+        arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]],
+    ) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ...
+    pass
+
+class Predictor:
+    def __getstate__(self) -> tuple: ...
+    def __init__(
+        self, arg0: int, arg1: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg2: int
+    ) -> None: ...
+    def __setstate__(self, arg0: tuple) -> None: ...
+    def add_beta(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, n]]) -> None: ...
+    def predict_gibbs(
+        self,
+        arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]],
+        arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]],
+        arg2: int,
+        arg3: int,
+        arg4: int,
+        arg5: bool,
+    ) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]: ...
+    def predict_gibbs_batch(
+        self,
+        arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]],
+        arg1: int,
+        arg2: int,
+        arg3: int,
+        arg4: bool,
+        arg5: int,
+    ) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ...
+    def predict_gibbs_with_word_assignment(
+        self,
+        arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]],
+        arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]],
+        arg2: int,
+        arg3: int,
+        arg4: int,
+        arg5: bool,
+    ) -> typing.Tuple[
+        numpy.ndarray[numpy.float64, _Shape[m, 1]],
+        typing.List[typing.Dict[int, numpy.ndarray[numpy.int32, _Shape[m, 1]]]],
+    ]: ...
+    def predict_mf(
+        self,
+        arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]],
+        arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]],
+        arg2: int,
+        arg3: float,
+    ) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]: ...
+    def predict_mf_batch(
+        self,
+        arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]],
+        arg1: int,
+        arg2: float,
+        arg3: int,
+    ) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ...
+    @property
+    def phis(self) -> typing.List[numpy.ndarray[numpy.float64, _Shape[m, n]]]:
+        """
+        :type: typing.List[numpy.ndarray[numpy.float64, _Shape[m, n]]]
+        """
+    pass
+
+def learn_dirichlet(
+    arg0: numpy.ndarray[numpy.int32, _Shape[m, n]],
+    arg1: numpy.ndarray[numpy.float64, _Shape[m, 1]],
+    arg2: float,
+    arg3: float,
+    arg4: int,
+) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]:
+    pass
+
+def learn_dirichlet_symmetric(
+    arg0: numpy.ndarray[numpy.int32, _Shape[m, n]],
+    arg1: float,
+    arg2: float,
+    arg3: float,
+    arg4: int,
+) -> float:
+    pass
+
+def log_likelihood_doc_topic(
+    arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]],
+    arg1: numpy.ndarray[numpy.int32, _Shape[m, n]],
+    arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]],
+) -> float:
+    pass
+
+def train_test_split(
+    arg0: scipy.sparse.csr_matrix[numpy.int32], arg1: float, arg2: int
+) -> typing.Tuple[
+    scipy.sparse.csr_matrix[numpy.int32], scipy.sparse.csr_matrix[numpy.int32]
+]:
+    pass
diff --git a/src/wrapper.cpp b/src/wrapper.cpp
index c5287f7..c73915b 100644
--- a/src/wrapper.cpp
+++ b/src/wrapper.cpp
@@ -145,8 +145,6 @@ Real learn_dirichlet_symmetric(const Eigen::Ref<IntegerMatrix> &counts,
   }
   Real alpha_current(alpha_start);
 
-  Real numerator;
-
   vector<Real> doc_length;
   vector<Real> doc_length_freq;
 
@@ -183,7 +181,6 @@ Real learn_dirichlet_symmetric(const Eigen::Ref<IntegerMatrix> &counts,
   }
   for (size_t it = 0; it < iteration; it++) {
     Real alpha_sum = n_topic * alpha_current;
-    numerator = 0;
     Real denominator =
         ((vector_to_eigen(doc_length).array() + alpha_sum).digamma() -
          digamma(alpha_sum))
diff --git a/stubs/lda11/_lda-stubs/__init__.pyi b/stubs/lda11/_lda-stubs/__init__.pyi
new file mode 100644
index 0000000..ed1336a
--- /dev/null
+++ b/stubs/lda11/_lda-stubs/__init__.pyi
@@ -0,0 +1,58 @@
+"""Backend C++ inplementation for lda11."""
+from __future__ import annotations
+import lda11._lda
+import typing
+import numpy
+import scipy.sparse
+_Shape = typing.Tuple[int, ...]
+
+__all__ = [
+    "LDATrainer",
+    "LabelledLDATrainer",
+    "Predictor",
+    "learn_dirichlet",
+    "learn_dirichlet_symmetric",
+    "log_likelihood_doc_topic",
+    "train_test_split"
+]
+
+
+class LDATrainer():
+    def __init__(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, 1]], arg2: numpy.ndarray[numpy.uint64, _Shape[m, 1]], arg3: numpy.ndarray[numpy.uint64, _Shape[m, 1]], arg4: int, arg5: int, arg6: int) -> None: ...
+    def initialize(self, arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> None: ...
+    def iterate_gibbs(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> None: ...
+    def log_likelihood(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]]) -> float: ...
+    def obtain_phi(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ...
+    def set_doc_topic_prior(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]]) -> None: ...
+    pass
+class LabelledLDATrainer():
+    def __init__(self, arg0: float, arg1: float, arg2: scipy.sparse.csr_matrix[numpy.int32], arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]], arg4: numpy.ndarray[numpy.uint64, _Shape[m, 1]], arg5: numpy.ndarray[numpy.uint64, _Shape[m, 1]], arg6: int, arg7: int, arg8: int) -> None: ...
+    def initialize(self, arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> None: ...
+    def iterate_gibbs(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> None: ...
+    def log_likelihood(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]]) -> float: ...
+    def obtain_phi(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ...
+    pass
+class Predictor():
+    def __getstate__(self) -> tuple: ...
+    def __init__(self, arg0: int, arg1: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg2: int) -> None: ...
+    def __setstate__(self, arg0: tuple) -> None: ...
+    def add_beta(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, n]]) -> None: ...
+    def predict_gibbs(self, arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg2: int, arg3: int, arg4: int, arg5: bool) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]: ...
+    def predict_gibbs_batch(self, arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]], arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ...
+    def predict_gibbs_with_word_assignment(self, arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg2: int, arg3: int, arg4: int, arg5: bool) -> typing.Tuple[numpy.ndarray[numpy.float64, _Shape[m, 1]], typing.List[typing.Dict[int, numpy.ndarray[numpy.int32, _Shape[m, 1]]]]]: ...
+    def predict_mf(self, arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg2: int, arg3: float) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]: ...
+    def predict_mf_batch(self, arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]], arg1: int, arg2: float, arg3: int) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ...
+    @property
+    def phis(self) -> typing.List[numpy.ndarray[numpy.float64, _Shape[m, n]]]:
+        """
+        :type: typing.List[numpy.ndarray[numpy.float64, _Shape[m, n]]]
+        """
+    pass
+def learn_dirichlet(arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], arg1: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg2: float, arg3: float, arg4: int) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]:
+    pass
+def learn_dirichlet_symmetric(arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], arg1: float, arg2: float, arg3: float, arg4: int) -> float:
+    pass
+def log_likelihood_doc_topic(arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> float:
+    pass
+def train_test_split(arg0: scipy.sparse.csr_matrix[numpy.int32], arg1: float, arg2: int) -> typing.Tuple[scipy.sparse.csr_matrix[numpy.int32], scipy.sparse.csr_matrix[numpy.int32]]:
+    pass
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29

From bad17635fd6f4017413e9a2820fe4edeb1bff204 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Sun, 9 Jan 2022 23:28:59 +0900
Subject: [PATCH 03/32] mypy

---
 .pre-commit-config.yaml |   8 +-
 lda11/labelled_lda.py   |   7 +-
 lda11/lda.py            | 219 ++++++++++++++++++++++------------------
 setup.py                |   3 +-
 4 files changed, 129 insertions(+), 108 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index cef10c9..809da50 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -18,10 +18,10 @@ repos:
     hooks:
       - id: isort
         name: isort
-#  - repo: https://github.com/pre-commit/mirrors-mypy
-#    rev: v0.790 # Use the sha / tag you want to point at
-#    hooks:
-#      - id: mypy
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v0.790 # Use the sha / tag you want to point at
+    hooks:
+      - id: mypy
   - repo: https://github.com/psf/black
     rev: 20.8b1
     hooks:
diff --git a/lda11/labelled_lda.py b/lda11/labelled_lda.py
index 6d25844..add8f46 100644
--- a/lda11/labelled_lda.py
+++ b/lda11/labelled_lda.py
@@ -3,11 +3,11 @@
 from tqdm import tqdm
 
 from ._lda import LabelledLDATrainer
+from ._lda import Predictor as CorePredictor
 from .lda import (
     IndexType,
     IntegerType,
     LDAPredictorMixin,
-    Predictor,
     RealType,
     check_array,
     number_to_array,
@@ -48,7 +48,7 @@ def fit_transform(self, X, Y, **kwargs):
         result /= result.sum(axis=1)[:, np.newaxis]
         return result
 
-    def _fit(self, X, Y, ll_freq=10):
+    def _fit(self, X, Y, ll_freq: int = 10):
         if not sps.issparse(Y):
             Y = sps.csr_matrix(Y).astype(IntegerType)
         else:
@@ -92,7 +92,7 @@ def _fit(self, X, Y, ll_freq=10):
         doc_topic_prior = self.alpha * np.ones(self.n_components, dtype=RealType)
 
         self.components_ = word_topic.transpose()
-        predictor = Predictor(self.n_components, doc_topic_prior, 42)
+        predictor = CorePredictor(self.n_components, doc_topic_prior, 42)
         if self.use_cgs_p:
             phi = docstate.obtain_phi(
                 self.topic_word_prior, doc_topic, word_topic, topic_counts
@@ -108,4 +108,5 @@ def _fit(self, X, Y, ll_freq=10):
 
     @property
     def phi(self):
+        assert self.predictor is not None
         return self.predictor.phis[0]
diff --git a/lda11/lda.py b/lda11/lda.py
index 512829f..dff1cfb 100644
--- a/lda11/lda.py
+++ b/lda11/lda.py
@@ -1,61 +1,76 @@
 from gc import collect
 from numbers import Number
+from typing import Dict, List, Literal, NamedTuple, Optional, Tuple, Union
 
 import numpy as np
+from numpy import integer
+from numpy import typing as npt
 from scipy import sparse as sps
-from scipy.special import digamma
 from tqdm import tqdm
 
-from ._lda import (
-    LDATrainer,
-    Predictor,
-    learn_dirichlet,
-    learn_dirichlet_symmetric,
-    log_likelihood_doc_topic,
-)
+from ._lda import LDATrainer
+from ._lda import Predictor as CorePredictor
+from ._lda import learn_dirichlet, learn_dirichlet_symmetric, log_likelihood_doc_topic
 
 RealType = np.float64
+
 IntegerType = np.int32
 IndexType = np.uint64
 
 
-def number_to_array(n_components, default, arg=None, ensure_symmetry=False):
-    if arg is None:
-        arg = default
-    if isinstance(arg, Number):
-        return np.ones(n_components, dtype=RealType) * RealType(arg)
-    elif isinstance(arg, np.ndarray):
-        assert arg.shape[0] == n_components
-        if ensure_symmetry and np.unique(arg).shape[0] > 1:
+ValidXType = Union[sps.spmatrix, np.ndarray]
+PriorType = Union[np.ndarray, float, None]
+
+
+class LDAInput(NamedTuple):
+    counts: np.ndarray
+    dix: np.ndarray
+    wis: np.ndarray
+
+
+def number_to_array(
+    n_components: int,
+    default: float,
+    arg_: Union[float, None, np.ndarray] = None,
+    ensure_symmetry: bool = False,
+) -> np.ndarray:
+    if arg_ is None or isinstance(arg_, float):
+        value_ = default if arg_ is None else float(arg_)
+        return np.ones(n_components, dtype=RealType) * value_
+    if isinstance(arg_, np.ndarray):
+        assert arg_.shape[0] == n_components
+        if ensure_symmetry and np.unique(arg_).shape[0] > 1:
             raise ValueError("Symmetric array required.")
-        return arg.astype(RealType)
-    return None
+        return arg_.astype(RealType)
+    raise ValueError("Number or ndarray is required.")
 
 
-def check_array(X):
+def check_array(X: ValidXType) -> LDAInput:
+    assert X.dtype == np.int32 or X.dtype == np.int64
     if isinstance(X, np.ndarray):
         assert len(X.shape) == 2
-        assert X.dtype == np.int32 or X.dtype == np.int64
-
         dix, wix = X.nonzero()
-        counts = X[dix, wix]
+        counts: np.ndarray = X[dix, wix]
     elif sps.issparse(X):
         # if X is either types of, scipy.sparse X has this attribute.
-        X = X.tocsr()
+        X = sps.csr_matrix(X)
         X.sort_indices()
         dix, wix = X.nonzero()
-        counts = X.data
+        counts = X.data.astype(np.int32)
     else:
         raise ValueError("The input must be either np.ndarray or sparse array.")
-    return counts.astype(IntegerType), dix.astype(IndexType), wix.astype(IndexType)
+    return LDAInput(
+        counts.astype(IntegerType), dix.astype(IndexType), wix.astype(IndexType)
+    )
 
 
-def bow_row_to_counts(X, i):
+def bow_row_to_counts(X: ValidXType, i: int) -> Tuple[np.ndarray, np.ndarray]:
+    wix: np.ndarray
     if isinstance(X, np.ndarray):
         assert len(X.shape) == 2
         assert X.dtype == np.int32 or X.dtype == np.int64
         (wix,) = X[i].nonzero()
-        counts = X[i, wix]
+        counts: np.ndarray = X[i, wix]
     else:
         _, wix = X[i].nonzero()
         counts = X[i, wix].toarray().ravel()
@@ -63,7 +78,7 @@ def bow_row_to_counts(X, i):
     return counts.astype(IntegerType), wix.astype(IndexType)
 
 
-def to_sparse(X):
+def to_valid_csr(X: ValidXType) -> sps.csr_matrix:
     result = sps.csr_matrix(X)
     result.data = result.data.astype(IntegerType)
     return result
@@ -77,33 +92,38 @@ class LDAPredictorMixin:
     are needed
     """
 
+    topic_word_priors_: Optional[List[np.ndarray]]
+    predictor: Optional[CorePredictor]
+
     def transform(
         self,
-        *Xs,
-        n_iter=100,
-        random_seed=42,
-        mode="gibbs",
-        mf_tolerance=1e-10,
-        gibbs_burn_in=10,
-        use_cgs_p=True,
+        *Xs: Union[ValidXType, None],
+        n_iter: int = 100,
+        random_seed: int = 42,
+        mode: Literal["gibbs", "mf"] = "gibbs",
+        mf_tolerance: float = 1e-10,
+        gibbs_burn_in: int = 10,
+        use_cgs_p: bool = True,
         n_workers=1
-    ):
-        shapes = set({X.shape[0] for X in Xs})
+    ) -> np.ndarray:
+        assert self.topic_word_priors_ is not None
+        assert self.predictor is not None
+        shapes = set({int(X.shape[0]) for X in Xs if X is not None})
         if len(shapes) != 1:
             raise ValueError("Got different shape for Xs.")
         shape = list(shapes)[0]
 
-        Xs_csr = []
+        Xs_csr: List[sps.csr_matrix] = []
         for i, X in enumerate(Xs):
             if X is None:
                 Xs_csr.append(
                     sps.csr_matrix(
-                        ([], ([], [])),
-                        shape=(shape, self.topic_word_priors[i].shape[0]),
+                        shape=(shape, self.topic_word_priors_[i].shape[0]),
+                        dtype=IntegerType,
                     )
                 )
             else:
-                Xs_csr.append(to_sparse(X))
+                Xs_csr.append(to_valid_csr(X))
 
         if mode == "gibbs":
             return self.predictor.predict_gibbs_batch(
@@ -116,7 +136,9 @@ def transform(
 
     def word_topic_assignment(
         self, *Xs, n_iter=100, random_seed=42, gibbs_burn_in=10, use_cgs_p=True
-    ):
+    ) -> List[Tuple[np.ndarray, List[Dict[int, np.ndarray]]]]:
+        assert self.topic_word_priors_ is not None
+        assert self.predictor is not None
         n_domains = len(Xs)
         shapes = set({X.shape[0] for X in Xs})
         if len(shapes) != 1:
@@ -129,7 +151,7 @@ def word_topic_assignment(
                 Xs_csr.append(
                     sps.csr_matrix(
                         ([], ([], [])),
-                        shape=(shape, self.topic_word_priors[i].shape[0]),
+                        shape=(shape, self.topic_word_priors_[i].shape[0]),
                     )
                 )
         results = []
@@ -149,38 +171,40 @@ def word_topic_assignment(
         return results
 
     @property
-    def phis(self):
+    def phis(self) -> List[np.ndarray]:
+        assert self.predictor is not None
         return self.predictor.phis
 
 
 class MultipleContextLDA(LDAPredictorMixin):
     def __init__(
         self,
-        n_components=100,
-        doc_topic_prior=None,
-        topic_word_priors=None,
-        n_iter=1000,
-        optimize_interval=None,
-        optimize_burn_in=None,
-        n_workers=1,
-        use_cgs_p=True,
-        is_phi_symmetric=True,
+        n_components: int = 100,
+        doc_topic_prior: PriorType = None,
+        n_iter: int = 1000,
+        optimize_interval: Optional[int] = None,
+        optimize_burn_in: Optional[int] = None,
+        n_workers: int = 1,
+        use_cgs_p: bool = True,
+        is_phi_symmetric: bool = True,
     ):
         n_components = int(n_components)
         assert n_iter >= 1
         assert n_components >= 1
         self.n_components = n_components
 
-        self.doc_topic_prior = doc_topic_prior
-        self.topic_word_priors = topic_word_priors
+        self.doc_topic_prior = number_to_array(
+            self.n_components, 1 / float(self.n_components), doc_topic_prior
+        )
+        self.topic_word_priors_ = None
         self.is_phi_symmetric = is_phi_symmetric
-        self.n_vocabs = None
+        self.n_vocabs: Optional[List[int]] = None
         self.docstate_ = None
-        self.components_ = None
-        self.n_modals = None
+        self.components_: Optional[int] = None
+        self.n_modals: Optional[int] = None
 
-        self.predictor = None
-        self.use_cgs_p = use_cgs_p
+        self.predictor: Optional[CorePredictor] = None
+        self.use_cgs_p: bool = use_cgs_p
 
         self.n_iter = n_iter
         self.optimize_interval = optimize_interval
@@ -197,35 +221,34 @@ def fit(self, *X, **kwargs):
         self._fit(*X, **kwargs)
         return self
 
-    def _fit(self, *Xs, ll_freq=10):
+    def _fit(self, *Xs: ValidXType, ll_freq=10) -> np.ndarray:
         """
         Xs should be a list of contents.
         All entries must have the same shape[0].
         """
-        n_vocabs = []
 
         self.modality = len(Xs)
-        self.doc_topic_prior = number_to_array(
-            self.n_components, 1 / float(self.n_components), self.doc_topic_prior
-        )
 
-        if self.topic_word_priors is None:
-            self.topic_word_priors = [None for i in range(self.modality)]
+        topic_word_priors_canonical: List[np.ndarray] = []
 
-        self.topic_word_priors = [
-            number_to_array(
-                X.shape[1],
-                1 / float(self.n_components),
-                ensure_symmetry=self.is_phi_symmetric,
-            )
-            for X, val in zip(Xs, self.topic_word_priors)
-        ]
+        doc_tuples: List[LDAInput] = []
 
-        doc_tuples = []
+        n_rows: Optional[int] = None
         for X in Xs:
-            doc_tuples.append((check_array(X)))
+            doc_tuples.append(check_array(X))
+            if n_rows is None:
+                n_rows = X.shape[0]
+            else:
+                assert n_rows == X.shape[0]
+            topic_word_priors_canonical.append(
+                number_to_array(
+                    X.shape[1],
+                    1 / float(self.n_components),
+                    ensure_symmetry=True,
+                )
+            )
 
-        doc_topic = np.zeros((X.shape[0], self.n_components), dtype=IntegerType)
+        doc_topic: np.ndarray = np.zeros((n_rows, self.n_components), dtype=IntegerType)
 
         topic_counts = np.zeros(self.n_components, dtype=IntegerType)
 
@@ -233,7 +256,7 @@ def _fit(self, *Xs, ll_freq=10):
             np.zeros((X.shape[1], self.n_components), dtype=IntegerType) for X in Xs
         ]
 
-        docstates = []
+        docstates: List[LDATrainer] = []
         for (count, dix, wix), word_topic in zip(doc_tuples, word_topics):
             docstate = LDATrainer(
                 self.doc_topic_prior,
@@ -246,11 +269,11 @@ def _fit(self, *Xs, ll_freq=10):
             )
             docstates.append(docstate)
             docstate.initialize(word_topic, doc_topic, topic_counts)
-        doc_length = doc_topic.sum(axis=1).astype(IntegerType)
+        doc_length: np.ndarray = doc_topic.sum(axis=1).astype(IntegerType)
 
         ll = log_likelihood_doc_topic(self.doc_topic_prior, doc_topic, doc_length)
         for topic_word_prior, word_topic, docstate in zip(
-            self.topic_word_priors, word_topics, docstates
+            topic_word_priors_canonical, word_topics, docstates
         ):
             ll += docstate.log_likelihood(topic_word_prior, word_topic)
 
@@ -258,7 +281,7 @@ def _fit(self, *Xs, ll_freq=10):
             pbar.set_description("Log Likelihood = {0:.2f}".format(ll))
             for i in pbar:
                 for topic_word_prior, word_topic, docstate in zip(
-                    self.topic_word_priors, word_topics, docstates
+                    topic_word_priors_canonical, word_topics, docstates
                 ):
                     docstate.iterate_gibbs(
                         topic_word_prior, doc_topic, word_topic, topic_counts
@@ -269,7 +292,7 @@ def _fit(self, *Xs, ll_freq=10):
                     )
 
                     for topic_word_prior, word_topic, docstate in zip(
-                        self.topic_word_priors, word_topics, docstates
+                        topic_word_priors_canonical, word_topics, docstates
                     ):
                         ll += docstate.log_likelihood(topic_word_prior, word_topic)
                     pbar.set_description("Log Likelihood = {0:.2f}".format(ll))
@@ -287,7 +310,7 @@ def _fit(self, *Xs, ll_freq=10):
                         100,
                     )
                     for topic_word_prior, word_topic, docstate in zip(
-                        self.topic_word_priors, word_topics, docstates
+                        topic_word_priors_canonical, word_topics, docstates
                     ):
                         if self.is_phi_symmetric:
                             topic_word_prior_new = np.ones_like(
@@ -310,8 +333,9 @@ def _fit(self, *Xs, ll_freq=10):
                         topic_word_prior[:] = topic_word_prior_new
                         self.doc_topic_prior = doc_topic_prior_new
                         docstate.set_doc_topic_prior(doc_topic_prior_new)
+        self.topic_word_priors = self.topic_word_priors_ = topic_word_priors_canonical
 
-        predictor = Predictor(self.n_components, self.doc_topic_prior, 42)
+        predictor = CorePredictor(self.n_components, self.doc_topic_prior, 42)
 
         for i, (twp, wt, docstate) in enumerate(
             zip(self.topic_word_priors, word_topics, docstates)
@@ -334,25 +358,19 @@ class LDA(MultipleContextLDA):
 
     def __init__(
         self,
-        n_components=100,
-        doc_topic_prior=None,
-        topic_word_prior=None,
-        n_iter=1000,
-        optimize_burn_in=None,
-        optimize_interval=None,
-        n_workers=1,
-        use_cgs_p=True,
-        is_phi_symmetric=True,
+        n_components: int = 100,
+        doc_topic_prior: Optional[np.ndarray] = None,
+        n_iter: int = 1000,
+        optimize_burn_in: Optional[int] = None,
+        optimize_interval: Optional[int] = None,
+        n_workers: int = 1,
+        use_cgs_p: bool = True,
+        is_phi_symmetric: bool = True,
     ):
-        if topic_word_prior is not None:
-            topic_word_priors = [topic_word_prior]
-        else:
-            topic_word_priors = None
 
         super(LDA, self).__init__(
             n_components=n_components,
             doc_topic_prior=doc_topic_prior,
-            topic_word_priors=topic_word_priors,
             n_iter=n_iter,
             optimize_burn_in=optimize_burn_in,
             optimize_interval=optimize_interval,
@@ -366,5 +384,6 @@ def fit(self, X, **kwargs):
         return self
 
     @property
-    def phi(self):
+    def phi(self) -> np.ndarray:
+        assert self.predictor is not None
         return self.predictor.phis[0]
diff --git a/setup.py b/setup.py
index 82fe294..9354f54 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,6 @@
 import os
 import sys
+from typing import Dict, List
 
 import setuptools
 from setuptools import Extension, setup
@@ -117,7 +118,7 @@ class BuildExt(build_ext):
         "msvc": ["/EHsc"],
         "unix": [],
     }
-    l_opts = {
+    l_opts: Dict[str, List[str]] = {
         "msvc": [],
         "unix": [],
     }

From eb343e6e469d37d1b30e6808ca8166d29d32dbe0 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 11:02:13 +0900
Subject: [PATCH 04/32] mypy check now passes

---
 lda11/__init__.py                   |   4 +-
 lda11/_lda.pyi                      | 125 +++++++++----------
 lda11/labelled_lda.py               |  44 ++++---
 lda11/lda.py                        | 108 +++++++++++-----
 lda11/util.py                       |  10 +-
 pyproject.toml                      |  11 ++
 stubs/lda11/_lda-stubs/__init__.pyi | 187 +++++++++++++++++++++++-----
 7 files changed, 339 insertions(+), 150 deletions(-)

diff --git a/lda11/__init__.py b/lda11/__init__.py
index 6775be3..7981c69 100644
--- a/lda11/__init__.py
+++ b/lda11/__init__.py
@@ -1,2 +1,4 @@
 from .labelled_lda import LabelledLDA
-from .lda import LDA, MultipleContextLDA
+from .lda import LDA, MultilingualLDA
+
+__all__ = ["LDA", "LabelledLDA", "MultilingualLDA"]
diff --git a/lda11/_lda.pyi b/lda11/_lda.pyi
index 4349ab9..902111c 100644
--- a/lda11/_lda.pyi
+++ b/lda11/_lda.pyi
@@ -1,12 +1,9 @@
-m: int
-n: int
-from numpy import float32
-
 """Backend C++ inplementation for lda11."""
 from __future__ import annotations
 import lda11._lda
 import typing
 import numpy
+import numpy.typing as npt
 import scipy.sparse
 
 _Shape = typing.Tuple[int, ...]
@@ -24,42 +21,40 @@ __all__ = [
 class LDATrainer:
     def __init__(
         self,
-        arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]],
-        arg1: numpy.ndarray[numpy.int32, _Shape[m, 1]],
-        arg2: numpy.ndarray[numpy.uint64, _Shape[m, 1]],
-        arg3: numpy.ndarray[numpy.uint64, _Shape[m, 1]],
+        arg0: npt.NDArray[numpy.float64],
+        arg1: npt.NDArray[numpy.int32],
+        arg2: npt.NDArray[numpy.uint64],
+        arg3: npt.NDArray[numpy.uint64],
         arg4: int,
         arg5: int,
         arg6: int,
     ) -> None: ...
     def initialize(
         self,
-        arg0: numpy.ndarray[numpy.int32, _Shape[m, n]],
-        arg1: numpy.ndarray[numpy.int32, _Shape[m, n]],
-        arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]],
+        arg0: npt.NDArray[numpy.int32],
+        arg1: npt.NDArray[numpy.int32],
+        arg2: npt.NDArray[numpy.int32],
     ) -> None: ...
     def iterate_gibbs(
         self,
-        arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]],
-        arg1: numpy.ndarray[numpy.int32, _Shape[m, n]],
-        arg2: numpy.ndarray[numpy.int32, _Shape[m, n]],
-        arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]],
+        arg0: npt.NDArray[numpy.float64],
+        arg1: npt.NDArray[numpy.int32],
+        arg2: npt.NDArray[numpy.int32],
+        arg3: npt.NDArray[numpy.int32],
     ) -> None: ...
     def log_likelihood(
         self,
-        arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]],
-        arg1: numpy.ndarray[numpy.int32, _Shape[m, n]],
+        arg0: npt.NDArray[numpy.float64],
+        arg1: npt.NDArray[numpy.int32],
     ) -> float: ...
     def obtain_phi(
         self,
-        arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]],
-        arg1: numpy.ndarray[numpy.int32, _Shape[m, n]],
-        arg2: numpy.ndarray[numpy.int32, _Shape[m, n]],
-        arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]],
-    ) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ...
-    def set_doc_topic_prior(
-        self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]]
-    ) -> None: ...
+        arg0: npt.NDArray[numpy.float64],
+        arg1: npt.NDArray[numpy.int32],
+        arg2: npt.NDArray[numpy.int32],
+        arg3: npt.NDArray[numpy.int32],
+    ) -> npt.NDArray[numpy.float64]: ...
+    def set_doc_topic_prior(self, arg0: npt.NDArray[numpy.float64]) -> None: ...
     pass
 
 class LabelledLDATrainer:
@@ -68,56 +63,56 @@ class LabelledLDATrainer:
         arg0: float,
         arg1: float,
         arg2: scipy.sparse.csr_matrix[numpy.int32],
-        arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]],
-        arg4: numpy.ndarray[numpy.uint64, _Shape[m, 1]],
-        arg5: numpy.ndarray[numpy.uint64, _Shape[m, 1]],
+        arg3: npt.NDArray[numpy.int32],
+        arg4: npt.NDArray[numpy.uint64],
+        arg5: npt.NDArray[numpy.uint64],
         arg6: int,
         arg7: int,
         arg8: int,
     ) -> None: ...
     def initialize(
         self,
-        arg0: numpy.ndarray[numpy.int32, _Shape[m, n]],
-        arg1: numpy.ndarray[numpy.int32, _Shape[m, n]],
-        arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]],
+        arg0: npt.NDArray[numpy.int32],
+        arg1: npt.NDArray[numpy.int32],
+        arg2: npt.NDArray[numpy.int32],
     ) -> None: ...
     def iterate_gibbs(
         self,
-        arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]],
-        arg1: numpy.ndarray[numpy.int32, _Shape[m, n]],
-        arg2: numpy.ndarray[numpy.int32, _Shape[m, n]],
-        arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]],
+        arg0: npt.NDArray[numpy.float64],
+        arg1: npt.NDArray[numpy.int32],
+        arg2: npt.NDArray[numpy.int32],
+        arg3: npt.NDArray[numpy.int32],
     ) -> None: ...
     def log_likelihood(
         self,
-        arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]],
-        arg1: numpy.ndarray[numpy.int32, _Shape[m, n]],
+        arg0: npt.NDArray[numpy.float64],
+        arg1: npt.NDArray[numpy.int32],
     ) -> float: ...
     def obtain_phi(
         self,
-        arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]],
-        arg1: numpy.ndarray[numpy.int32, _Shape[m, n]],
-        arg2: numpy.ndarray[numpy.int32, _Shape[m, n]],
-        arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]],
-    ) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ...
+        arg0: npt.NDArray[numpy.float64],
+        arg1: npt.NDArray[numpy.int32],
+        arg2: npt.NDArray[numpy.int32],
+        arg3: npt.NDArray[numpy.int32],
+    ) -> npt.NDArray[numpy.float64]: ...
     pass
 
 class Predictor:
     def __getstate__(self) -> tuple: ...
     def __init__(
-        self, arg0: int, arg1: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg2: int
+        self, arg0: int, arg1: npt.NDArray[numpy.float64], arg2: int
     ) -> None: ...
     def __setstate__(self, arg0: tuple) -> None: ...
-    def add_beta(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, n]]) -> None: ...
+    def add_beta(self, arg0: npt.NDArray[numpy.float64]) -> None: ...
     def predict_gibbs(
         self,
-        arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]],
-        arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]],
+        arg0: typing.List[npt.NDArray[numpy.int32]],
+        arg1: typing.List[npt.NDArray[numpy.int32]],
         arg2: int,
         arg3: int,
         arg4: int,
         arg5: bool,
-    ) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]: ...
+    ) -> npt.NDArray[numpy.float64]: ...
     def predict_gibbs_batch(
         self,
         arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]],
@@ -126,51 +121,51 @@ class Predictor:
         arg3: int,
         arg4: bool,
         arg5: int,
-    ) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ...
+    ) -> npt.NDArray[numpy.float64]: ...
     def predict_gibbs_with_word_assignment(
         self,
-        arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]],
-        arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]],
+        arg0: typing.List[npt.NDArray[numpy.int32]],
+        arg1: typing.List[npt.NDArray[numpy.int32]],
         arg2: int,
         arg3: int,
         arg4: int,
         arg5: bool,
     ) -> typing.Tuple[
-        numpy.ndarray[numpy.float64, _Shape[m, 1]],
-        typing.List[typing.Dict[int, numpy.ndarray[numpy.int32, _Shape[m, 1]]]],
+        npt.NDArray[numpy.float64],
+        typing.List[typing.Dict[int, npt.NDArray[numpy.int32]]],
     ]: ...
     def predict_mf(
         self,
-        arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]],
-        arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]],
+        arg0: typing.List[npt.NDArray[numpy.int32]],
+        arg1: typing.List[npt.NDArray[numpy.int32]],
         arg2: int,
         arg3: float,
-    ) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]: ...
+    ) -> npt.NDArray[numpy.float64]: ...
     def predict_mf_batch(
         self,
         arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]],
         arg1: int,
         arg2: float,
         arg3: int,
-    ) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ...
+    ) -> npt.NDArray[numpy.float64]: ...
     @property
-    def phis(self) -> typing.List[numpy.ndarray[numpy.float64, _Shape[m, n]]]:
+    def phis(self) -> typing.List[npt.NDArray[numpy.float64]]:
         """
-        :type: typing.List[numpy.ndarray[numpy.float64, _Shape[m, n]]]
+        :type: typing.List[npt.NDArray[numpy.float64]]
         """
     pass
 
 def learn_dirichlet(
-    arg0: numpy.ndarray[numpy.int32, _Shape[m, n]],
-    arg1: numpy.ndarray[numpy.float64, _Shape[m, 1]],
+    arg0: npt.NDArray[numpy.int32],
+    arg1: npt.NDArray[numpy.float64],
     arg2: float,
     arg3: float,
     arg4: int,
-) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]:
+) -> npt.NDArray[numpy.float64]:
     pass
 
 def learn_dirichlet_symmetric(
-    arg0: numpy.ndarray[numpy.int32, _Shape[m, n]],
+    arg0: npt.NDArray[numpy.int32],
     arg1: float,
     arg2: float,
     arg3: float,
@@ -179,9 +174,9 @@ def learn_dirichlet_symmetric(
     pass
 
 def log_likelihood_doc_topic(
-    arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]],
-    arg1: numpy.ndarray[numpy.int32, _Shape[m, n]],
-    arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]],
+    arg0: npt.NDArray[numpy.float64],
+    arg1: npt.NDArray[numpy.int32],
+    arg2: npt.NDArray[numpy.int32],
 ) -> float:
     pass
 
diff --git a/lda11/labelled_lda.py b/lda11/labelled_lda.py
index add8f46..b19f13b 100644
--- a/lda11/labelled_lda.py
+++ b/lda11/labelled_lda.py
@@ -1,4 +1,7 @@
+from typing import Optional
+
 import numpy as np
+from numpy import typing as npt
 from scipy import sparse as sps
 from tqdm import tqdm
 
@@ -9,6 +12,7 @@
     IntegerType,
     LDAPredictorMixin,
     RealType,
+    ValidXType,
     check_array,
     number_to_array,
 )
@@ -17,47 +21,41 @@
 class LabelledLDA(LDAPredictorMixin):
     def __init__(
         self,
-        alpha=1e-2,
-        epsilon=1e-30,
-        topic_word_prior=None,
-        add_dummy_topic=False,
-        n_iter=1000,
-        n_workers=1,
-        use_cgs_p=True,
+        alpha: float = 1e-2,
+        epsilon: float = 1e-30,
+        n_iter: int = 1000,
+        n_workers: int = 1,
+        use_cgs_p: bool = True,
     ):
-        self.n_components = None
-        self.topic_word_prior = topic_word_prior
+        self.n_components_: Optional[int] = None
         self.alpha = alpha
         self.epsilon = 1e-20
         self.n_vocabs = None
         self.docstate_ = None
-        self.components_ = None
+        self.components_: Optional[npt.NDArray[np.int32]] = None
         self.predictor = None
         self.n_workers = n_workers
         self.epsilon = epsilon
-        self.add_dummy_topic = add_dummy_topic
         self.n_iter = n_iter
         self.use_cgs_p = use_cgs_p
 
-    def fit(self, X, Y):
-        self._fit(X, Y)
+    def fit(self, X: ValidXType, Y: ValidXType) -> "LabelledLDA":
+        self._fit_llda(X, Y)
         return self
 
-    def fit_transform(self, X, Y, **kwargs):
-        result = self._fit(X, **kwargs) + self.doc_topic_prior[np.newaxis, :]
-        result /= result.sum(axis=1)[:, np.newaxis]
-        return result
-
-    def _fit(self, X, Y, ll_freq: int = 10):
+    def _fit_llda(
+        self,
+        X: ValidXType,
+        Y: ValidXType,
+    ) -> npt.NDArray[np.int32]:
         if not sps.issparse(Y):
             Y = sps.csr_matrix(Y).astype(IntegerType)
         else:
             Y = Y.astype(IntegerType)
 
-        self.n_components = Y.shape[1]
-        ones_topic = np.ones(self.n_components, dtype=RealType)
+        self.n_components = int(Y.shape[1])
         self.topic_word_prior = number_to_array(
-            X.shape[1], 1 / float(self.n_components), self.topic_word_prior
+            X.shape[1], 1 / float(self.n_components), None
         )
 
         try:
@@ -107,6 +105,6 @@ def _fit(self, X, Y, ll_freq: int = 10):
         return doc_topic
 
     @property
-    def phi(self):
+    def phi(self) -> npt.NDArray[np.float64]:
         assert self.predictor is not None
         return self.predictor.phis[0]
diff --git a/lda11/lda.py b/lda11/lda.py
index dff1cfb..d0f71f0 100644
--- a/lda11/lda.py
+++ b/lda11/lda.py
@@ -1,9 +1,16 @@
-from gc import collect
-from numbers import Number
-from typing import Dict, List, Literal, NamedTuple, Optional, Tuple, Union
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    List,
+    Literal,
+    NamedTuple,
+    Optional,
+    Tuple,
+    Union,
+)
 
 import numpy as np
-from numpy import integer
 from numpy import typing as npt
 from scipy import sparse as sps
 from tqdm import tqdm
@@ -18,7 +25,7 @@
 IndexType = np.uint64
 
 
-ValidXType = Union[sps.spmatrix, np.ndarray]
+ValidXType = Union[sps.spmatrix, npt.NDArray[np.int32], npt.NDArray[np.int64]]
 PriorType = Union[np.ndarray, float, None]
 
 
@@ -33,7 +40,7 @@ def number_to_array(
     default: float,
     arg_: Union[float, None, np.ndarray] = None,
     ensure_symmetry: bool = False,
-) -> np.ndarray:
+) -> npt.NDArray[np.float64]:
     if arg_ is None or isinstance(arg_, float):
         value_ = default if arg_ is None else float(arg_)
         return np.ones(n_components, dtype=RealType) * value_
@@ -85,13 +92,6 @@ def to_valid_csr(X: ValidXType) -> sps.csr_matrix:
 
 
 class LDAPredictorMixin:
-    """
-    self.components_
-    self.n_components
-    self.predictor
-    are needed
-    """
-
     topic_word_priors_: Optional[List[np.ndarray]]
     predictor: Optional[CorePredictor]
 
@@ -104,7 +104,7 @@ def transform(
         mf_tolerance: float = 1e-10,
         gibbs_burn_in: int = 10,
         use_cgs_p: bool = True,
-        n_workers=1
+        n_workers: int = 1
     ) -> np.ndarray:
         assert self.topic_word_priors_ is not None
         assert self.predictor is not None
@@ -118,7 +118,7 @@ def transform(
             if X is None:
                 Xs_csr.append(
                     sps.csr_matrix(
-                        shape=(shape, self.topic_word_priors_[i].shape[0]),
+                        (shape, self.topic_word_priors_[i].shape[0]),
                         dtype=IntegerType,
                     )
                 )
@@ -129,18 +129,25 @@ def transform(
             return self.predictor.predict_gibbs_batch(
                 Xs_csr, n_iter, gibbs_burn_in, random_seed, use_cgs_p, n_workers
             )
-        else:
+        elif mode == "mf":
             return self.predictor.predict_mf_batch(
                 Xs_csr, n_iter, mf_tolerance, n_workers
             )
+        else:
+            raise ValueError('"mode" argument must be either "gibbs" or "mf".')
 
     def word_topic_assignment(
-        self, *Xs, n_iter=100, random_seed=42, gibbs_burn_in=10, use_cgs_p=True
+        self,
+        *Xs: Union[ValidXType, None],
+        n_iter: int = 100,
+        random_seed: int = 42,
+        gibbs_burn_in: int = 10,
+        use_cgs_p: bool = True
     ) -> List[Tuple[np.ndarray, List[Dict[int, np.ndarray]]]]:
         assert self.topic_word_priors_ is not None
         assert self.predictor is not None
         n_domains = len(Xs)
-        shapes = set({X.shape[0] for X in Xs})
+        shapes = set({X.shape[0] for X in Xs if X is not None})
         if len(shapes) != 1:
             raise ValueError("Got different shape for Xs.")
 
@@ -150,8 +157,7 @@ def word_topic_assignment(
             if X is None:
                 Xs_csr.append(
                     sps.csr_matrix(
-                        ([], ([], [])),
-                        shape=(shape, self.topic_word_priors_[i].shape[0]),
+                        (shape, self.topic_word_priors_[i].shape[0]), dtype=IntegerType
                     )
                 )
         results = []
@@ -176,7 +182,7 @@ def phis(self) -> List[np.ndarray]:
         return self.predictor.phis
 
 
-class MultipleContextLDA(LDAPredictorMixin):
+class LDABase(LDAPredictorMixin):
     def __init__(
         self,
         n_components: int = 100,
@@ -217,11 +223,7 @@ def __init__(
 
         self.n_workers = n_workers
 
-    def fit(self, *X, **kwargs):
-        self._fit(*X, **kwargs)
-        return self
-
-    def _fit(self, *Xs: ValidXType, ll_freq=10) -> np.ndarray:
+    def _fit(self, *Xs: ValidXType, ll_freq: int = 10) -> npt.NDArray[IntegerType]:
         """
         Xs should be a list of contents.
         All entries must have the same shape[0].
@@ -247,6 +249,8 @@ def _fit(self, *Xs: ValidXType, ll_freq=10) -> np.ndarray:
                     ensure_symmetry=True,
                 )
             )
+        if n_rows is None:
+            raise ValueError("At least one doc-term matrix must be given.")
 
         doc_topic: np.ndarray = np.zeros((n_rows, self.n_components), dtype=IntegerType)
 
@@ -353,7 +357,53 @@ def _fit(self, *Xs: ValidXType, ll_freq=10) -> np.ndarray:
         return doc_topic
 
 
-class LDA(MultipleContextLDA):
+class MultilingualLDA(LDABase):
+    def __init__(
+        self,
+        n_components: int = 100,
+        doc_topic_prior: PriorType = None,
+        n_iter: int = 1000,
+        optimize_interval: Optional[int] = None,
+        optimize_burn_in: Optional[int] = None,
+        n_workers: int = 1,
+        use_cgs_p: bool = True,
+        is_phi_symmetric: bool = True,
+    ):
+        n_components = int(n_components)
+        assert n_iter >= 1
+        assert n_components >= 1
+        self.n_components = n_components
+
+        self.doc_topic_prior = number_to_array(
+            self.n_components, 1 / float(self.n_components), doc_topic_prior
+        )
+        self.topic_word_priors_ = None
+        self.is_phi_symmetric = is_phi_symmetric
+        self.n_vocabs: Optional[List[int]] = None
+        self.docstate_ = None
+        self.components_: Optional[int] = None
+        self.n_modals: Optional[int] = None
+
+        self.predictor: Optional[CorePredictor] = None
+        self.use_cgs_p: bool = use_cgs_p
+
+        self.n_iter = n_iter
+        self.optimize_interval = optimize_interval
+        if optimize_interval is not None:
+            if optimize_burn_in is None:
+                optimize_burn_in = n_iter // 2
+            else:
+                optimize_burn_in = optimize_burn_in
+        self.optimize_burn_in = optimize_burn_in
+
+        self.n_workers = n_workers
+
+    def fit(self, *X: ValidXType, ll_freq: int = 10) -> "MultilingualLDA":
+        self._fit(*X, ll_freq=ll_freq)
+        return self
+
+
+class LDA(LDABase):
     pass
 
     def __init__(
@@ -379,8 +429,8 @@ def __init__(
             is_phi_symmetric=is_phi_symmetric,
         )
 
-    def fit(self, X, **kwargs):
-        super(LDA, self).fit(X, **kwargs)
+    def fit(self, X: ValidXType, ll_freq: int = 10) -> "LDA":
+        self._fit(X, ll_freq=ll_freq)
         return self
 
     @property
diff --git a/lda11/util.py b/lda11/util.py
index c6e01ee..dee0dee 100644
--- a/lda11/util.py
+++ b/lda11/util.py
@@ -1,15 +1,19 @@
+from typing import Optional, Tuple
+
 import numpy as np
 from scipy import sparse as sps
 
 from ._lda import train_test_split
-from .lda import IntegerType, RealType
+from .lda import IntegerType, RealType, ValidXType
 
 
-def rowwise_train_test_split(X, random_seed=None, test_ratio=0.5):
+def rowwise_train_test_split(
+    X: ValidXType, random_seed: Optional[int] = None, test_ratio: float = 0.5
+) -> Tuple[sps.csr_matrix, sps.csr_matrix]:
     """
     split matrix randomly
     """
     if random_seed is None:
-        random_seed = np.random.randint(-(2 ** 63), 2 ** 63 - 1)
+        random_seed = np.random.randint(-(2 ** 31), 2 ** 31 - 1)
     X = sps.csr_matrix(X, dtype=IntegerType)
     return train_test_split(X, test_ratio, random_seed)
diff --git a/pyproject.toml b/pyproject.toml
index 6796ae2..901f417 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,3 +13,14 @@ include_trailing_comma = true
 line_length = 88
 multi_line_output = 3
 use_parentheses = true
+
+[tool.mypy]
+disallow_untyped_defs = true
+exclude = []
+ignore_missing_imports = true
+plugins = "numpy.typing.mypy_plugin"
+python_version = "3.8"
+warn_return_any = true
+warn_unused_configs = true
+
+# mypy per-module options:
diff --git a/stubs/lda11/_lda-stubs/__init__.pyi b/stubs/lda11/_lda-stubs/__init__.pyi
index ed1336a..818f26d 100644
--- a/stubs/lda11/_lda-stubs/__init__.pyi
+++ b/stubs/lda11/_lda-stubs/__init__.pyi
@@ -4,6 +4,7 @@ import lda11._lda
 import typing
 import numpy
 import scipy.sparse
+
 _Shape = typing.Tuple[int, ...]
 
 __all__ = [
@@ -13,46 +14,174 @@ __all__ = [
     "learn_dirichlet",
     "learn_dirichlet_symmetric",
     "log_likelihood_doc_topic",
-    "train_test_split"
+    "train_test_split",
 ]
 
-
-class LDATrainer():
-    def __init__(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, 1]], arg2: numpy.ndarray[numpy.uint64, _Shape[m, 1]], arg3: numpy.ndarray[numpy.uint64, _Shape[m, 1]], arg4: int, arg5: int, arg6: int) -> None: ...
-    def initialize(self, arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> None: ...
-    def iterate_gibbs(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> None: ...
-    def log_likelihood(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]]) -> float: ...
-    def obtain_phi(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ...
-    def set_doc_topic_prior(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]]) -> None: ...
+class LDATrainer:
+    def __init__(
+        self,
+        arg0: npt.NDArray[numpy.float64],
+        arg1: npt.NDArray[numpy.int32],
+        arg2: npt.NDArray[numpy.uint64],
+        arg3: npt.NDArray[numpy.uint64],
+        arg4: int,
+        arg5: int,
+        arg6: int,
+    ) -> None: ...
+    def initialize(
+        self,
+        arg0: npt.NDArray[numpy.int32],
+        arg1: npt.NDArray[numpy.int32],
+        arg2: npt.NDArray[numpy.int32],
+    ) -> None: ...
+    def iterate_gibbs(
+        self,
+        arg0: npt.NDArray[numpy.float64],
+        arg1: npt.NDArray[numpy.int32],
+        arg2: npt.NDArray[numpy.int32],
+        arg3: npt.NDArray[numpy.int32],
+    ) -> None: ...
+    def log_likelihood(
+        self,
+        arg0: npt.NDArray[numpy.float64],
+        arg1: npt.NDArray[numpy.int32],
+    ) -> float: ...
+    def obtain_phi(
+        self,
+        arg0: npt.NDArray[numpy.float64],
+        arg1: npt.NDArray[numpy.int32],
+        arg2: npt.NDArray[numpy.int32],
+        arg3: npt.NDArray[numpy.int32],
+    ) -> npt.NDArray[numpy.float64]: ...
+    def set_doc_topic_prior(self, arg0: npt.NDArray[numpy.float64]) -> None: ...
     pass
-class LabelledLDATrainer():
-    def __init__(self, arg0: float, arg1: float, arg2: scipy.sparse.csr_matrix[numpy.int32], arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]], arg4: numpy.ndarray[numpy.uint64, _Shape[m, 1]], arg5: numpy.ndarray[numpy.uint64, _Shape[m, 1]], arg6: int, arg7: int, arg8: int) -> None: ...
-    def initialize(self, arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> None: ...
-    def iterate_gibbs(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> None: ...
-    def log_likelihood(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]]) -> float: ...
-    def obtain_phi(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, n]], arg3: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ...
+
+class LabelledLDATrainer:
+    def __init__(
+        self,
+        arg0: float,
+        arg1: float,
+        arg2: scipy.sparse.csr_matrix[numpy.int32],
+        arg3: npt.NDArray[numpy.int32],
+        arg4: npt.NDArray[numpy.uint64],
+        arg5: npt.NDArray[numpy.uint64],
+        arg6: int,
+        arg7: int,
+        arg8: int,
+    ) -> None: ...
+    def initialize(
+        self,
+        arg0: npt.NDArray[numpy.int32],
+        arg1: npt.NDArray[numpy.int32],
+        arg2: npt.NDArray[numpy.int32],
+    ) -> None: ...
+    def iterate_gibbs(
+        self,
+        arg0: npt.NDArray[numpy.float64],
+        arg1: npt.NDArray[numpy.int32],
+        arg2: npt.NDArray[numpy.int32],
+        arg3: npt.NDArray[numpy.int32],
+    ) -> None: ...
+    def log_likelihood(
+        self,
+        arg0: npt.NDArray[numpy.float64],
+        arg1: npt.NDArray[numpy.int32],
+    ) -> float: ...
+    def obtain_phi(
+        self,
+        arg0: npt.NDArray[numpy.float64],
+        arg1: npt.NDArray[numpy.int32],
+        arg2: npt.NDArray[numpy.int32],
+        arg3: npt.NDArray[numpy.int32],
+    ) -> npt.NDArray[numpy.float64]: ...
     pass
-class Predictor():
+
+class Predictor:
     def __getstate__(self) -> tuple: ...
-    def __init__(self, arg0: int, arg1: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg2: int) -> None: ...
+    def __init__(
+        self, arg0: int, arg1: npt.NDArray[numpy.float64], arg2: int
+    ) -> None: ...
     def __setstate__(self, arg0: tuple) -> None: ...
-    def add_beta(self, arg0: numpy.ndarray[numpy.float64, _Shape[m, n]]) -> None: ...
-    def predict_gibbs(self, arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg2: int, arg3: int, arg4: int, arg5: bool) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]: ...
-    def predict_gibbs_batch(self, arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]], arg1: int, arg2: int, arg3: int, arg4: bool, arg5: int) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ...
-    def predict_gibbs_with_word_assignment(self, arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg2: int, arg3: int, arg4: int, arg5: bool) -> typing.Tuple[numpy.ndarray[numpy.float64, _Shape[m, 1]], typing.List[typing.Dict[int, numpy.ndarray[numpy.int32, _Shape[m, 1]]]]]: ...
-    def predict_mf(self, arg0: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg1: typing.List[numpy.ndarray[numpy.int32, _Shape[m, 1]]], arg2: int, arg3: float) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]: ...
-    def predict_mf_batch(self, arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]], arg1: int, arg2: float, arg3: int) -> numpy.ndarray[numpy.float64, _Shape[m, n]]: ...
+    def add_beta(self, arg0: npt.NDArray[numpy.float64]) -> None: ...
+    def predict_gibbs(
+        self,
+        arg0: typing.List[npt.NDArray[numpy.int32]],
+        arg1: typing.List[npt.NDArray[numpy.int32]],
+        arg2: int,
+        arg3: int,
+        arg4: int,
+        arg5: bool,
+    ) -> npt.NDArray[numpy.float64]: ...
+    def predict_gibbs_batch(
+        self,
+        arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]],
+        arg1: int,
+        arg2: int,
+        arg3: int,
+        arg4: bool,
+        arg5: int,
+    ) -> npt.NDArray[numpy.float64]: ...
+    def predict_gibbs_with_word_assignment(
+        self,
+        arg0: typing.List[npt.NDArray[numpy.int32]],
+        arg1: typing.List[npt.NDArray[numpy.int32]],
+        arg2: int,
+        arg3: int,
+        arg4: int,
+        arg5: bool,
+    ) -> typing.Tuple[
+        npt.NDArray[numpy.float64],
+        typing.List[typing.Dict[int, npt.NDArray[numpy.int32]]],
+    ]: ...
+    def predict_mf(
+        self,
+        arg0: typing.List[npt.NDArray[numpy.int32]],
+        arg1: typing.List[npt.NDArray[numpy.int32]],
+        arg2: int,
+        arg3: float,
+    ) -> npt.NDArray[numpy.float64]: ...
+    def predict_mf_batch(
+        self,
+        arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]],
+        arg1: int,
+        arg2: float,
+        arg3: int,
+    ) -> npt.NDArray[numpy.float64]: ...
     @property
-    def phis(self) -> typing.List[numpy.ndarray[numpy.float64, _Shape[m, n]]]:
+    def phis(self) -> typing.List[npt.NDArray[numpy.float64]]:
         """
-        :type: typing.List[numpy.ndarray[numpy.float64, _Shape[m, n]]]
+        :type: typing.List[npt.NDArray[numpy.float64]]
         """
     pass
-def learn_dirichlet(arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], arg1: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg2: float, arg3: float, arg4: int) -> numpy.ndarray[numpy.float64, _Shape[m, 1]]:
+
+def learn_dirichlet(
+    arg0: npt.NDArray[numpy.int32],
+    arg1: npt.NDArray[numpy.float64],
+    arg2: float,
+    arg3: float,
+    arg4: int,
+) -> npt.NDArray[numpy.float64]:
     pass
-def learn_dirichlet_symmetric(arg0: numpy.ndarray[numpy.int32, _Shape[m, n]], arg1: float, arg2: float, arg3: float, arg4: int) -> float:
+
+def learn_dirichlet_symmetric(
+    arg0: npt.NDArray[numpy.int32],
+    arg1: float,
+    arg2: float,
+    arg3: float,
+    arg4: int,
+) -> float:
     pass
-def log_likelihood_doc_topic(arg0: numpy.ndarray[numpy.float64, _Shape[m, 1]], arg1: numpy.ndarray[numpy.int32, _Shape[m, n]], arg2: numpy.ndarray[numpy.int32, _Shape[m, 1]]) -> float:
+
+def log_likelihood_doc_topic(
+    arg0: npt.NDArray[numpy.float64],
+    arg1: npt.NDArray[numpy.int32],
+    arg2: npt.NDArray[numpy.int32],
+) -> float:
     pass
-def train_test_split(arg0: scipy.sparse.csr_matrix[numpy.int32], arg1: float, arg2: int) -> typing.Tuple[scipy.sparse.csr_matrix[numpy.int32], scipy.sparse.csr_matrix[numpy.int32]]:
+
+def train_test_split(
+    arg0: scipy.sparse.csr_matrix[numpy.int32], arg1: float, arg2: int
+) -> typing.Tuple[
+    scipy.sparse.csr_matrix[numpy.int32], scipy.sparse.csr_matrix[numpy.int32]
+]:
     pass

From 93bf647c34b114fe8c800ab8d05e5a018b3ef8c3 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 11:49:47 +0900
Subject: [PATCH 05/32] adding tests

---
 tests/conftest.py  | 16 ++++++++++++++++
 tests/language.py  | 40 +++++++++++++++++++++++++++++++++++++++
 tests/test_mlds.py | 47 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 103 insertions(+)
 create mode 100644 tests/conftest.py
 create mode 100644 tests/language.py
 create mode 100644 tests/test_mlds.py

diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..0d307e3
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,16 @@
+import numpy as np
+import pytest
+
+from .language import Docs, Language
+
+
+@pytest.fixture
+def docs_gen() -> Docs:
+    language_1 = Language(
+        np.asfarray([1, 1, 1, 0.01, 0.01, 0.01]),
+        np.asfarray([0.01, 0.01, 0.01, 1, 1, 1]),
+    )
+    language_2 = Language(
+        np.asfarray([1, 0.01, 1, 0.01]), np.asfarray([0.01, 1, 0.01, 1])
+    )
+    return Docs([language_1, language_2])
diff --git a/tests/language.py b/tests/language.py
new file mode 100644
index 0000000..0ae1373
--- /dev/null
+++ b/tests/language.py
@@ -0,0 +1,40 @@
+from typing import List, Tuple
+
+import numpy as np
+import numpy.typing as npt
+
+N_DOCS = 1000
+
+
+class Language:
+    def __init__(
+        self, TOPIC1: npt.NDArray[np.float64], TOPIC2: npt.NDArray[np.float64]
+    ):
+        self.topic_1: npt.NDArray[np.float64] = TOPIC1 / TOPIC1.sum()
+        self.topic_2: npt.NDArray[np.float64] = TOPIC2 / TOPIC2.sum()
+
+
+class Docs:
+    def __init__(self, languages: List[Language]):
+        self.languages = languages
+
+    def gen_doc(
+        self, n_docs: int
+    ) -> Tuple[List[npt.NDArray[np.int32]], npt.NDArray[np.float64]]:
+        rns = np.random.RandomState(0)
+        words: List[List[npt.NDArray[np.int64]]] = [
+            [] for _ in range(len(self.languages))
+        ]
+        thetas: List[np.ndarray] = []
+        for _ in range(n_docs):
+            theta = rns.dirichlet(np.asfarray([1.0, 1.0]))
+            thetas.append(theta)
+            for lind, language in enumerate(self.languages):
+                cnt = rns.poisson(3)
+                wdist = (
+                    float(theta[0]) * language.topic_1
+                    + float(theta[1]) * language.topic_2
+                )
+                words[lind].append(rns.multinomial(cnt, wdist))
+
+        return [np.vstack(x) for x in words], np.vstack(thetas)
diff --git a/tests/test_mlds.py b/tests/test_mlds.py
new file mode 100644
index 0000000..ddea5f4
--- /dev/null
+++ b/tests/test_mlds.py
@@ -0,0 +1,47 @@
+import numpy as np
+
+from lda11 import MultilingualLDA
+
+from .conftest import Docs
+
+
+def test_mlda(docs_gen: Docs) -> None:
+    (X1, X2), true_theta = docs_gen.gen_doc(1000)
+    lda = MultilingualLDA(2, n_iter=50, optimize_interval=1, optimize_burn_in=25)
+    lda.fit(X1, X2)
+    phi1, phi2 = lda.phis
+
+    # determine which fitted topic index corresponds to TOPIC1
+
+    lang1_topic1_strong_index = np.where(docs_gen.languages[0].topic_1 > 0.1)[0]
+    lang1_topic2_strong_index = np.where(docs_gen.languages[0].topic_1 < 0.1)[0]
+    if (
+        phi1[lang1_topic1_strong_index, 0].mean()
+        > phi1[lang1_topic2_strong_index, 0].mean()
+    ):
+        topic1_index = 0
+        topic2_index = 1
+    else:
+        topic1_index = 1
+        topic2_index = 0
+    for i in lang1_topic1_strong_index:
+        for j in lang1_topic2_strong_index:
+            assert phi1[i, topic1_index] > phi1[j, topic1_index]
+            assert phi1[i, topic2_index] < phi1[j, topic2_index]
+
+    lang2_topic1_strong_index = np.where(docs_gen.languages[1].topic_1 > 0.1)[0]
+    lang2_topic2_strong_index = np.where(docs_gen.languages[1].topic_1 < 0.1)[0]
+    for i in lang2_topic1_strong_index:
+        for j in lang2_topic2_strong_index:
+            assert phi2[i, topic1_index] > phi2[j, topic1_index]
+            assert phi2[i, topic2_index] < phi2[j, topic2_index]
+
+    # just check it works.
+    for algo in ["mf", "gibbs"]:
+        checked_cnt = 0
+        theta_inferred = lda.transform(X1, X2, mode=algo)  # type: ignore
+        for i in range(X1.shape[0]):
+            if (true_theta[i, 0] / true_theta[i, 1]) > 10:
+                checked_cnt += 1
+                assert theta_inferred[i, topic1_index] > theta_inferred[i, topic2_index]
+        assert checked_cnt > 0

From a072f9fee5a355b643b9004bf3ab2462ded581ca Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 11:51:01 +0900
Subject: [PATCH 06/32] Type

---
 lda11/lda.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lda11/lda.py b/lda11/lda.py
index d0f71f0..21bd42d 100644
--- a/lda11/lda.py
+++ b/lda11/lda.py
@@ -337,12 +337,12 @@ def _fit(self, *Xs: ValidXType, ll_freq: int = 10) -> npt.NDArray[IntegerType]:
                         topic_word_prior[:] = topic_word_prior_new
                         self.doc_topic_prior = doc_topic_prior_new
                         docstate.set_doc_topic_prior(doc_topic_prior_new)
-        self.topic_word_priors = topic_word_priors_canonical
+        self.topic_word_priors_ = topic_word_priors_canonical
 
         predictor = CorePredictor(self.n_components, self.doc_topic_prior, 42)
 
         for i, (twp, wt, docstate) in enumerate(
-            zip(self.topic_word_priors, word_topics, docstates)
+            zip(self.topic_word_priors_, word_topics, docstates)
         ):
             if self.use_cgs_p:
                 phi = docstate.obtain_phi(twp, doc_topic, wt, topic_counts)

From e50cb28c3274c37a27ba0868d022a93063825c78 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 12:03:15 +0900
Subject: [PATCH 07/32] add test for word_topic_assignment

---
 .gitignore         |  1 +
 lda11/lda.py       | 36 +++++++++---------------------------
 tests/test_mlds.py | 25 +++++++++++++++++++++++++
 3 files changed, 35 insertions(+), 27 deletions(-)

diff --git a/.gitignore b/.gitignore
index 19ff8cb..f3e2f8e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,4 @@ compile_commands.json
 .clangd/
 **.ipynb_checkpoints/**
 .cache/clangd
+.coverage
diff --git a/lda11/lda.py b/lda11/lda.py
index 21bd42d..fb1a198 100644
--- a/lda11/lda.py
+++ b/lda11/lda.py
@@ -369,34 +369,16 @@ def __init__(
         use_cgs_p: bool = True,
         is_phi_symmetric: bool = True,
     ):
-        n_components = int(n_components)
-        assert n_iter >= 1
-        assert n_components >= 1
-        self.n_components = n_components
-
-        self.doc_topic_prior = number_to_array(
-            self.n_components, 1 / float(self.n_components), doc_topic_prior
+        super().__init__(
+            n_components,
+            doc_topic_prior=doc_topic_prior,
+            n_iter=n_iter,
+            optimize_interval=optimize_interval,
+            optimize_burn_in=optimize_burn_in,
+            n_workers=n_workers,
+            use_cgs_p=use_cgs_p,
+            is_phi_symmetric=is_phi_symmetric,
         )
-        self.topic_word_priors_ = None
-        self.is_phi_symmetric = is_phi_symmetric
-        self.n_vocabs: Optional[List[int]] = None
-        self.docstate_ = None
-        self.components_: Optional[int] = None
-        self.n_modals: Optional[int] = None
-
-        self.predictor: Optional[CorePredictor] = None
-        self.use_cgs_p: bool = use_cgs_p
-
-        self.n_iter = n_iter
-        self.optimize_interval = optimize_interval
-        if optimize_interval is not None:
-            if optimize_burn_in is None:
-                optimize_burn_in = n_iter // 2
-            else:
-                optimize_burn_in = optimize_burn_in
-        self.optimize_burn_in = optimize_burn_in
-
-        self.n_workers = n_workers
 
     def fit(self, *X: ValidXType, ll_freq: int = 10) -> "MultilingualLDA":
         self._fit(*X, ll_freq=ll_freq)
diff --git a/tests/test_mlds.py b/tests/test_mlds.py
index ddea5f4..5ee03ce 100644
--- a/tests/test_mlds.py
+++ b/tests/test_mlds.py
@@ -1,4 +1,5 @@
 import numpy as np
+from scipy import sparse as sps
 
 from lda11 import MultilingualLDA
 
@@ -7,6 +8,7 @@
 
 def test_mlda(docs_gen: Docs) -> None:
     (X1, X2), true_theta = docs_gen.gen_doc(1000)
+    X2 = sps.lil_matrix(X2)
     lda = MultilingualLDA(2, n_iter=50, optimize_interval=1, optimize_burn_in=25)
     lda.fit(X1, X2)
     phi1, phi2 = lda.phis
@@ -45,3 +47,26 @@ def test_mlda(docs_gen: Docs) -> None:
                 checked_cnt += 1
                 assert theta_inferred[i, topic1_index] > theta_inferred[i, topic2_index]
         assert checked_cnt > 0
+
+    wdt = lda.word_topic_assignment(X1, X2)
+    assert len(wdt) == 1000
+    for i, wdt_result_doc in enumerate(wdt):
+        theta = wdt_result_doc[0]
+        if (true_theta[i, 0] / true_theta[i, 1]) > 10:
+            assert theta[topic1_index] > theta[topic2_index]
+        m = wdt_result_doc[1]
+        assert len(m) == 2
+        # lang 1
+        lang1_assignment = m[0]
+        for word, topic in lang1_assignment.items():
+            if (topic[topic1_index] / (1e-10 + topic[topic2_index])) > 10:
+                assert word in lang1_topic1_strong_index
+            if (topic[topic2_index] / (1e-10 + topic[topic1_index])) > 10:
+                assert word in lang1_topic2_strong_index
+
+        lang2_assignment = m[1]
+        for word, topic in lang2_assignment.items():
+            if (topic[topic1_index] / (1e-10 + topic[topic2_index])) > 10:
+                assert word in lang2_topic1_strong_index
+            if (topic[topic2_index] / (1e-10 + topic[topic1_index])) > 10:
+                assert word in lang2_topic2_strong_index

From 5341ec468876a70a608b8069cb6621ee6ee81d68 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 12:06:42 +0900
Subject: [PATCH 08/32] add test for usual lda

---
 tests/test_lda.py                    | 30 ++++++++++++++++++++++++++++
 tests/{test_mlds.py => test_mlda.py} |  0
 2 files changed, 30 insertions(+)
 create mode 100644 tests/test_lda.py
 rename tests/{test_mlds.py => test_mlda.py} (100%)

diff --git a/tests/test_lda.py b/tests/test_lda.py
new file mode 100644
index 0000000..a7cb1d7
--- /dev/null
+++ b/tests/test_lda.py
@@ -0,0 +1,30 @@
+import numpy as np
+
+from lda11 import LDA
+
+from .conftest import Docs
+
+
+def test_lda(docs_gen: Docs) -> None:
+    (X1, _), true_theta = docs_gen.gen_doc(1000)
+    lda = LDA(2, n_iter=50, optimize_interval=1, optimize_burn_in=25, use_cgs_p=False)
+    lda.fit(X1)
+    phi1 = lda.phi
+
+    # determine which fitted topic index corresponds to TOPIC1
+
+    lang1_topic1_strong_index = np.where(docs_gen.languages[0].topic_1 > 0.1)[0]
+    lang1_topic2_strong_index = np.where(docs_gen.languages[0].topic_1 < 0.1)[0]
+    if (
+        phi1[lang1_topic1_strong_index, 0].mean()
+        > phi1[lang1_topic2_strong_index, 0].mean()
+    ):
+        topic1_index = 0
+        topic2_index = 1
+    else:
+        topic1_index = 1
+        topic2_index = 0
+    for i in lang1_topic1_strong_index:
+        for j in lang1_topic2_strong_index:
+            assert phi1[i, topic1_index] > phi1[j, topic1_index]
+            assert phi1[i, topic2_index] < phi1[j, topic2_index]
diff --git a/tests/test_mlds.py b/tests/test_mlda.py
similarity index 100%
rename from tests/test_mlds.py
rename to tests/test_mlda.py

From 70b4089079dbe5eb58930a5e2c4013913f4affa2 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 12:17:40 +0900
Subject: [PATCH 09/32] test utils

---
 lda11/__init__.py  |  3 ++-
 tests/test_util.py | 17 +++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_util.py

diff --git a/lda11/__init__.py b/lda11/__init__.py
index 7981c69..4c3681a 100644
--- a/lda11/__init__.py
+++ b/lda11/__init__.py
@@ -1,4 +1,5 @@
 from .labelled_lda import LabelledLDA
 from .lda import LDA, MultilingualLDA
+from .util import rowwise_train_test_split
 
-__all__ = ["LDA", "LabelledLDA", "MultilingualLDA"]
+__all__ = ["LDA", "LabelledLDA", "MultilingualLDA", "rowwise_train_test_split"]
diff --git a/tests/test_util.py b/tests/test_util.py
new file mode 100644
index 0000000..d943d73
--- /dev/null
+++ b/tests/test_util.py
@@ -0,0 +1,17 @@
+import numpy as np
+from scipy import sparse as sps
+
+from lda11 import rowwise_train_test_split
+
+from .conftest import Docs
+
+
+def test_split(docs_gen: Docs) -> None:
+    (X1, X2), _ = docs_gen.gen_doc(1000)
+    X2_sp = sps.lil_matrix(X2)
+    X1_tr, X1_te = rowwise_train_test_split(X1)
+    assert np.all(np.asarray(X1 - X1_tr - X1_te) == 0)
+    X2_tr, X2_te = rowwise_train_test_split(X2_sp, random_seed=0)
+    # X2 is a dense array here, so densify both sparse splits before comparing.
+    v = np.abs(X2 - X2_tr.toarray() - X2_te.toarray()).sum()
+    assert v == 0

From 435d52b171c8ba05e6b97facf9e6d721eb959307 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 12:45:58 +0900
Subject: [PATCH 10/32] llda test

---
 lda11/labelled_lda.py | 12 +++++-----
 tests/test_llda.py    | 55 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 6 deletions(-)
 create mode 100644 tests/test_llda.py

diff --git a/lda11/labelled_lda.py b/lda11/labelled_lda.py
index b19f13b..3bb28fe 100644
--- a/lda11/labelled_lda.py
+++ b/lda11/labelled_lda.py
@@ -54,9 +54,9 @@ def _fit_llda(
             Y = Y.astype(IntegerType)
 
         self.n_components = int(Y.shape[1])
-        self.topic_word_prior = number_to_array(
-            X.shape[1], 1 / float(self.n_components), None
-        )
+        self.topic_word_priors_ = [
+            number_to_array(X.shape[1], 1 / float(self.n_components), None)
+        ]
 
         try:
             count, dix, wix = check_array(X)
@@ -84,7 +84,7 @@ def _fit_llda(
         with tqdm(range(self.n_iter)) as pbar:
             for _ in pbar:
                 docstate.iterate_gibbs(
-                    self.topic_word_prior, doc_topic, word_topic, topic_counts
+                    self.topic_word_priors_[0], doc_topic, word_topic, topic_counts
                 )
 
         doc_topic_prior = self.alpha * np.ones(self.n_components, dtype=RealType)
@@ -93,10 +93,10 @@ def _fit_llda(
         predictor = CorePredictor(self.n_components, doc_topic_prior, 42)
         if self.use_cgs_p:
             phi = docstate.obtain_phi(
-                self.topic_word_prior, doc_topic, word_topic, topic_counts
+                self.topic_word_priors_[0], doc_topic, word_topic, topic_counts
             )
         else:
-            phi = word_topic + self.topic_word_prior[:, np.newaxis]
+            phi = word_topic + self.topic_word_priors_[0][:, np.newaxis]
             phi /= phi.sum(axis=0)[np.newaxis, :]
             phi = phi.transpose()
         predictor.add_beta(phi.transpose())
diff --git a/tests/test_llda.py b/tests/test_llda.py
new file mode 100644
index 0000000..b20820f
--- /dev/null
+++ b/tests/test_llda.py
@@ -0,0 +1,55 @@
+from typing import Tuple
+
+import numpy as np
+import numpy.typing as npt
+
+from lda11 import LabelledLDA
+
+
+class LabelledLanguage:
+    def __init__(
+        self, TOPIC1: npt.NDArray[np.float64], TOPIC2: npt.NDArray[np.float64]
+    ):
+        self.topic_1: npt.NDArray[np.float64] = TOPIC1 / TOPIC1.sum()
+        self.topic_2: npt.NDArray[np.float64] = TOPIC2 / TOPIC2.sum()
+        self.common = np.ones_like(TOPIC1) / TOPIC1.shape[0]
+
+    def gen_doc(
+        self, n_docs: int
+    ) -> Tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]:
+        rns = np.random.RandomState(0)
+        Xs = []
+        labels = []
+        for i in range(n_docs):
+            cnt = rns.poisson(10)
+            label = np.asfarray([1, rns.binomial(1, 0.5), rns.binomial(1, 0.5)])
+            p = (
+                label[0] * self.common
+                + label[1] * self.topic_1
+                + label[2] * self.topic_2
+            )
+            words = rns.multinomial(cnt, p / p.sum())
+            Xs.append(words)
+            labels.append(label)
+        return np.vstack(Xs), np.vstack(labels)
+
+
+def test_llda() -> None:
+    TOPIC_A = np.asfarray([0.01, 1, 0.01, 1])
+    TOPIC_B = np.asfarray([1, 0.01, 1, 0.01])
+    for A_index in [1, 2]:
+        if A_index == 1:
+            language = LabelledLanguage(TOPIC_A, TOPIC_B)
+        else:
+            language = LabelledLanguage(TOPIC_B, TOPIC_A)
+        X, Y = language.gen_doc(1000)
+
+        llda = LabelledLDA().fit(X, Y)
+
+        A_DOC = np.asarray(([0, 10, 0, 10]), dtype=np.int32)
+        for mode in ["mf", "gibbs"]:
+            theta = llda.transform(A_DOC, mode=mode)[0]  # type: ignore
+            if A_index == 1:
+                assert (theta[1] / theta[2]) > 5
+            else:
+                assert (theta[2] / theta[1]) > 5

From 89b8cec9b15324b5ae3a5156106765f352479e26 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 12:49:27 +0900
Subject: [PATCH 11/32] further llda test

---
 tests/test_llda.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/test_llda.py b/tests/test_llda.py
index b20820f..a77e5f3 100644
--- a/tests/test_llda.py
+++ b/tests/test_llda.py
@@ -37,15 +37,26 @@ def gen_doc(
 def test_llda() -> None:
     TOPIC_A = np.asfarray([0.01, 1, 0.01, 1])
     TOPIC_B = np.asfarray([1, 0.01, 1, 0.01])
+    A_word_index = np.where(TOPIC_A > 0.1)[0]
+    B_word_index = np.where(TOPIC_A < 0.1)[0]
+
     for A_index in [1, 2]:
         if A_index == 1:
             language = LabelledLanguage(TOPIC_A, TOPIC_B)
+            B_index = 2
         else:
             language = LabelledLanguage(TOPIC_B, TOPIC_A)
+            B_index = 1
+
         X, Y = language.gen_doc(1000)
 
         llda = LabelledLDA().fit(X, Y)
 
+        for a_word in A_word_index:
+            for b_word in B_word_index:
+                assert llda.phi[a_word, A_index] > llda.phi[b_word, A_index]
+                assert llda.phi[a_word, B_index] < llda.phi[b_word, B_index]
+
         A_DOC = np.asarray(([0, 10, 0, 10]), dtype=np.int32)
         for mode in ["mf", "gibbs"]:
             theta = llda.transform(A_DOC, mode=mode)[0]  # type: ignore

From 8bf8baa367bd6cd90b53f37ab4388245d69f6942 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 14:08:49 +0900
Subject: [PATCH 12/32] Add test workflow

---
 .github/workflows/test.yml | 36 ++++++++++++++++++++++++++++++++++++
 .pre-commit-config.yaml    |  4 ----
 setup.py                   | 24 ++++++++++++++++++++++--
 tests/test_llda.py         |  4 ++--
 4 files changed, 60 insertions(+), 8 deletions(-)
 create mode 100644 .github/workflows/test.yml

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..0d99b19
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,36 @@
+name: Full Test & Upload coverage
+on: [push, pull_request]
+jobs:
+  run_pytest_upload_coverage:
+    runs-on: ubuntu-latest
+    env:
+      OS: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Setup Python
+        uses: actions/setup-python@master
+        with:
+          python-version: 3.10
+      - name: Build lda11
+        run: |
+          pip install --upgrade pip
+          sudo apt-get install lcov
+          TEST_BUILD=true python setup.py develop
+      - name: Run pytest
+        run: |
+          pip install pytest pytest-cov
+          pytest --cov=./lda11 tests/
+      - name: Generate coverage (ubuntu)
+        run: |
+          coverage xml
+          lcov -d `pwd` -c -o coverage.info
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v1
+        with:
+          files: ./coverage.xml,./coverage.info
+          verbose: false
+          env_vars: OS,PYTHON
+          name: codecov-umbrella
+          fail_ci_if_error: false
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 809da50..97f9721 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -18,10 +18,6 @@ repos:
     hooks:
       - id: isort
         name: isort
-  - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v0.790 # Use the sha / tag you want to point at
-    hooks:
-      - id: mypy
   - repo: https://github.com/psf/black
     rev: 20.8b1
     hooks:
diff --git a/setup.py b/setup.py
index 9354f54..0da795b 100644
--- a/setup.py
+++ b/setup.py
@@ -6,13 +6,15 @@
 from setuptools import Extension, setup
 from setuptools.command.build_ext import build_ext
 
-__version__ = "0.2.2.0"
-install_requires = ["pybind11>=2.5", "numpy >= 1.11", "tqdm", "scipy>=1.0.0"]
+__version__ = "0.3.0.0"
+install_requires = ["pybind11>=2.5", "numpy >= 1.22", "tqdm", "scipy>=1.0.0"]
 
 eigen_include_dir = os.environ.get("EIGEN3_INCLUDE_DIR", None)
 if eigen_include_dir is None:
     install_requires.append("requests")
 
+TEST_BUILD = os.environ.get("TEST_BUILD", None) is not None
+
 
 class get_eigen_include(object):
     EIGEN3_URL = "https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.zip"
@@ -122,6 +124,24 @@ class BuildExt(build_ext):
         "msvc": [],
         "unix": [],
     }
+    if TEST_BUILD:
+        c_opts: Dict[str, List[str]] = {
+            "msvc": ["/EHsc"],
+            "unix": ["-O0", "-coverage", "-g"],
+        }
+        l_opts: Dict[str, List[str]] = {
+            "msvc": [],
+            "unix": ["-coverage"],
+        }
+    else:
+        c_opts = {
+            "msvc": ["/EHsc"],
+            "unix": [],
+        }
+        l_opts = {
+            "msvc": [],
+            "unix": [],
+        }
 
     if sys.platform == "darwin":
         darwin_opts = ["-stdlib=libc++", "-mmacosx-version-min=10.7"]
diff --git a/tests/test_llda.py b/tests/test_llda.py
index a77e5f3..1571485 100644
--- a/tests/test_llda.py
+++ b/tests/test_llda.py
@@ -40,7 +40,7 @@ def test_llda() -> None:
     A_word_index = np.where(TOPIC_A > 0.1)[0]
     B_word_index = np.where(TOPIC_A < 0.1)[0]
 
-    for A_index in [1, 2]:
+    for A_index, cgs_p in zip([1, 2], [True, False]):
         if A_index == 1:
             language = LabelledLanguage(TOPIC_A, TOPIC_B)
             B_index = 2
@@ -50,7 +50,7 @@ def test_llda() -> None:
 
         X, Y = language.gen_doc(1000)
 
-        llda = LabelledLDA().fit(X, Y)
+        llda = LabelledLDA(use_cgs_p=cgs_p).fit(X, Y)
 
         for a_word in A_word_index:
             for b_word in B_word_index:

From c72481e20b216c37035f907e003e919a945a5fc0 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 14:09:38 +0900
Subject: [PATCH 13/32] 3.10 -> str

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0d99b19..a9d9efc 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -12,7 +12,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@master
         with:
-          python-version: 3.10
+          python-version: "3.10"
       - name: Build lda11
         run: |
           pip install --upgrade pip

From d88cd8c5fffeeacb3737a955e3b445758179eba1 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 14:22:33 +0900
Subject: [PATCH 14/32] Add threading test

---
 setup.py           | 29 ++++++++++++++---------------
 tests/test_llda.py |  6 +++---
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/setup.py b/setup.py
index 0da795b..300ea80 100644
--- a/setup.py
+++ b/setup.py
@@ -1,13 +1,15 @@
 import os
 import sys
-from typing import Dict, List
+from typing import Any, Dict, List
 
 import setuptools
 from setuptools import Extension, setup
 from setuptools.command.build_ext import build_ext
 
 __version__ = "0.3.0.0"
-install_requires = ["pybind11>=2.5", "numpy >= 1.22", "tqdm", "scipy>=1.0.0"]
+install_requires = ["numpy >= 1.22", "tqdm", "scipy>=1.0.0"]
+setup_requires = ["pybind11>=2.5", "requests", "setuptools_scm"]
+
 
 eigen_include_dir = os.environ.get("EIGEN3_INCLUDE_DIR", None)
 if eigen_include_dir is None:
@@ -20,7 +22,7 @@ class get_eigen_include(object):
     EIGEN3_URL = "https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.zip"
     EIGEN3_DIRNAME = "eigen-3.3.7"
 
-    def __str__(self):
+    def __str__(self) -> str:
         if eigen_include_dir is not None:
             return eigen_include_dir
 
@@ -55,7 +57,7 @@ class get_pybind_include(object):
     def __init__(self, user=False):
         self.user = user
 
-    def __str__(self):
+    def __str__(self) -> str:
         import pybind11
 
         return pybind11.get_include(self.user)
@@ -85,7 +87,7 @@ def __str__(self):
 
 # As of Python 3.6, CCompiler has a `has_flag` method.
 # cf http://bugs.python.org/issue26689
-def has_flag(compiler, flagname):
+def has_flag(compiler, flagname) -> bool:
     """Return a boolean indicating whether a flag name is supported on
     the specified compiler.
     """
@@ -100,7 +102,7 @@ def has_flag(compiler, flagname):
     return True
 
 
-def cpp_flag(compiler):
+def cpp_flag(compiler) -> str:
     """Return the -std=c++[11/14/17] compiler flag.
     The newer version is prefered over c++11 (when it is available).
     """
@@ -116,14 +118,6 @@ def cpp_flag(compiler):
 class BuildExt(build_ext):
     """A custom build extension for adding compiler-specific options."""
 
-    c_opts = {
-        "msvc": ["/EHsc"],
-        "unix": [],
-    }
-    l_opts: Dict[str, List[str]] = {
-        "msvc": [],
-        "unix": [],
-    }
     if TEST_BUILD:
         c_opts: Dict[str, List[str]] = {
             "msvc": ["/EHsc"],
@@ -148,7 +142,7 @@ class BuildExt(build_ext):
         c_opts["unix"] += darwin_opts
         l_opts["unix"] += darwin_opts
 
-    def build_extensions(self):
+    def build_extensions(self) -> None:
         ct = self.compiler.compiler_type
         opts = self.c_opts.get(ct, [])
         link_opts = self.l_opts.get(ct, [])
@@ -165,8 +159,13 @@ def build_extensions(self):
         build_ext.build_extensions(self)
 
 
+def local_scheme(version: Any) -> str:
+    return ""
+
+
 setup(
     name="lda11",
+    use_scm_version={"local_scheme": local_scheme},
     version=__version__,
     author="Tomoki Ohtsuki",
     url="https://github.com/tohtsky/lda11",
diff --git a/tests/test_llda.py b/tests/test_llda.py
index 1571485..0bb18d1 100644
--- a/tests/test_llda.py
+++ b/tests/test_llda.py
@@ -40,7 +40,7 @@ def test_llda() -> None:
     A_word_index = np.where(TOPIC_A > 0.1)[0]
     B_word_index = np.where(TOPIC_A < 0.1)[0]
 
-    for A_index, cgs_p in zip([1, 2], [True, False]):
+    for A_index, cgs_p, n_threads in zip([1, 2], [True, False], [1, 2]):
         if A_index == 1:
             language = LabelledLanguage(TOPIC_A, TOPIC_B)
             B_index = 2
@@ -50,7 +50,7 @@ def test_llda() -> None:
 
         X, Y = language.gen_doc(1000)
 
-        llda = LabelledLDA(use_cgs_p=cgs_p).fit(X, Y)
+        llda = LabelledLDA(use_cgs_p=cgs_p, n_workers=n_threads).fit(X, Y)
 
         for a_word in A_word_index:
             for b_word in B_word_index:
@@ -59,7 +59,7 @@ def test_llda() -> None:
 
         A_DOC = np.asarray(([0, 10, 0, 10]), dtype=np.int32)
         for mode in ["mf", "gibbs"]:
-            theta = llda.transform(A_DOC, mode=mode)[0]  # type: ignore
+            theta = llda.transform(A_DOC, mode=mode, n_workers=n_threads)[0]  # type: ignore
             if A_index == 1:
                 assert (theta[1] / theta[2]) > 5
             else:

From 5ffa6052b8599f8dedc97bb5e5ceb2bd5289a392 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 14:24:34 +0900
Subject: [PATCH 15/32] Remove unused func

---
 lda11/_lda.pyi                      |   7 --
 src/predictor.cpp                   |  43 -------
 src/predictor.hpp                   |   3 -
 src/wrapper.cpp                     |   1 -
 stubs/lda11/_lda-stubs/__init__.pyi | 187 ----------------------------
 5 files changed, 241 deletions(-)
 delete mode 100644 stubs/lda11/_lda-stubs/__init__.pyi

diff --git a/lda11/_lda.pyi b/lda11/_lda.pyi
index 902111c..a2322ff 100644
--- a/lda11/_lda.pyi
+++ b/lda11/_lda.pyi
@@ -134,13 +134,6 @@ class Predictor:
         npt.NDArray[numpy.float64],
         typing.List[typing.Dict[int, npt.NDArray[numpy.int32]]],
     ]: ...
-    def predict_mf(
-        self,
-        arg0: typing.List[npt.NDArray[numpy.int32]],
-        arg1: typing.List[npt.NDArray[numpy.int32]],
-        arg2: int,
-        arg3: float,
-    ) -> npt.NDArray[numpy.float64]: ...
     def predict_mf_batch(
         self,
         arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]],
diff --git a/src/predictor.cpp b/src/predictor.cpp
index 885afda..722583b 100644
--- a/src/predictor.cpp
+++ b/src/predictor.cpp
@@ -106,49 +106,6 @@ RealMatrix Predictor::predict_mf_batch(std::vector<SparseIntegerMatrix> Xs,
   }
   return result;
 }
-RealVector Predictor::predict_mf(std::vector<IntegerVector> nonzeros,
-                                 std::vector<IntegerVector> counts,
-                                 std::size_t iter, Real delta) const {
-  size_t dim_buffer = 0;
-  for (size_t n = 0; n < n_domains_; n++) {
-    dim_buffer += counts[n].sum();
-  }
-  if (dim_buffer == 0) {
-    return doc_topic_prior_ / doc_topic_prior_.sum();
-  }
-  RealMatrix current_prob(dim_buffer, n_topics_);
-  current_prob.array() = 0;
-  RealMatrix new_prob(dim_buffer, n_topics_);
-  RealMatrix beta_rel(dim_buffer, n_topics_);
-
-  size_t current_iter = 0;
-  for (size_t n = 0; n < n_domains_; n++) {
-    size_t n_unique_words = nonzeros[n].rows();
-    for (size_t j = 0; j < n_unique_words; j++) {
-      size_t wid = nonzeros[n](j);
-      size_t count = counts[n][j];
-      for (size_t k = 0; k < count; k++) {
-        beta_rel.row(current_iter) = betas_[n].row(wid);
-        current_iter++;
-      }
-    }
-  }
-
-  for (size_t i = 0; i <= iter; i++) {
-    new_prob = -current_prob;
-    new_prob.rowwise() += current_prob.colwise().sum();
-    new_prob.rowwise() += doc_topic_prior_.transpose();
-    new_prob.array() = new_prob.array() * beta_rel.array();
-    new_prob.array().colwise() /= new_prob.array().rowwise().sum();
-    double diff = (new_prob - current_prob).array().abs().sum();
-    current_prob = new_prob;
-    if (diff < delta)
-      break;
-  }
-  RealVector theta = current_prob.array().colwise().sum().transpose();
-  theta /= theta.sum();
-  return theta;
-}
 
 RealVector Predictor::predict_gibbs_write_assignment(
     const std::vector<IntegerVector> &nonzeros,
diff --git a/src/predictor.hpp b/src/predictor.hpp
index 4268997..cfc390f 100644
--- a/src/predictor.hpp
+++ b/src/predictor.hpp
@@ -8,9 +8,6 @@ struct Predictor {
 
   void add_beta(const RealMatrix &beta);
 
-  RealVector predict_mf(std::vector<IntegerVector> nonzeros,
-                        std::vector<IntegerVector> counts, size_t iter,
-                        Real delta) const;
   RealMatrix predict_mf_batch(std::vector<SparseIntegerMatrix> Xs,
                               std::size_t iter, Real delta,
                               size_t n_workers) const;
diff --git a/src/wrapper.cpp b/src/wrapper.cpp
index c73915b..1583a0f 100644
--- a/src/wrapper.cpp
+++ b/src/wrapper.cpp
@@ -249,7 +249,6 @@ PYBIND11_MODULE(_lda, m) {
       .def("predict_gibbs_with_word_assignment",
            &Predictor::predict_gibbs_with_word_assignment)
       .def("predict_gibbs_batch", &Predictor::predict_gibbs_batch)
-      .def("predict_mf", &Predictor::predict_mf)
       .def("predict_mf_batch", &Predictor::predict_mf_batch)
       .def_readonly("phis", &Predictor::betas_)
       .def(py::pickle(
diff --git a/stubs/lda11/_lda-stubs/__init__.pyi b/stubs/lda11/_lda-stubs/__init__.pyi
deleted file mode 100644
index 818f26d..0000000
--- a/stubs/lda11/_lda-stubs/__init__.pyi
+++ /dev/null
@@ -1,187 +0,0 @@
-"""Backend C++ inplementation for lda11."""
-from __future__ import annotations
-import lda11._lda
-import typing
-import numpy
-import scipy.sparse
-
-_Shape = typing.Tuple[int, ...]
-
-__all__ = [
-    "LDATrainer",
-    "LabelledLDATrainer",
-    "Predictor",
-    "learn_dirichlet",
-    "learn_dirichlet_symmetric",
-    "log_likelihood_doc_topic",
-    "train_test_split",
-]
-
-class LDATrainer:
-    def __init__(
-        self,
-        arg0: npt.NDArray[numpy.float64],
-        arg1: npt.NDArray[numpy.int32],
-        arg2: npt.NDArray[numpy.uint64],
-        arg3: npt.NDArray[numpy.uint64],
-        arg4: int,
-        arg5: int,
-        arg6: int,
-    ) -> None: ...
-    def initialize(
-        self,
-        arg0: npt.NDArray[numpy.int32],
-        arg1: npt.NDArray[numpy.int32],
-        arg2: npt.NDArray[numpy.int32],
-    ) -> None: ...
-    def iterate_gibbs(
-        self,
-        arg0: npt.NDArray[numpy.float64],
-        arg1: npt.NDArray[numpy.int32],
-        arg2: npt.NDArray[numpy.int32],
-        arg3: npt.NDArray[numpy.int32],
-    ) -> None: ...
-    def log_likelihood(
-        self,
-        arg0: npt.NDArray[numpy.float64],
-        arg1: npt.NDArray[numpy.int32],
-    ) -> float: ...
-    def obtain_phi(
-        self,
-        arg0: npt.NDArray[numpy.float64],
-        arg1: npt.NDArray[numpy.int32],
-        arg2: npt.NDArray[numpy.int32],
-        arg3: npt.NDArray[numpy.int32],
-    ) -> npt.NDArray[numpy.float64]: ...
-    def set_doc_topic_prior(self, arg0: npt.NDArray[numpy.float64]) -> None: ...
-    pass
-
-class LabelledLDATrainer:
-    def __init__(
-        self,
-        arg0: float,
-        arg1: float,
-        arg2: scipy.sparse.csr_matrix[numpy.int32],
-        arg3: npt.NDArray[numpy.int32],
-        arg4: npt.NDArray[numpy.uint64],
-        arg5: npt.NDArray[numpy.uint64],
-        arg6: int,
-        arg7: int,
-        arg8: int,
-    ) -> None: ...
-    def initialize(
-        self,
-        arg0: npt.NDArray[numpy.int32],
-        arg1: npt.NDArray[numpy.int32],
-        arg2: npt.NDArray[numpy.int32],
-    ) -> None: ...
-    def iterate_gibbs(
-        self,
-        arg0: npt.NDArray[numpy.float64],
-        arg1: npt.NDArray[numpy.int32],
-        arg2: npt.NDArray[numpy.int32],
-        arg3: npt.NDArray[numpy.int32],
-    ) -> None: ...
-    def log_likelihood(
-        self,
-        arg0: npt.NDArray[numpy.float64],
-        arg1: npt.NDArray[numpy.int32],
-    ) -> float: ...
-    def obtain_phi(
-        self,
-        arg0: npt.NDArray[numpy.float64],
-        arg1: npt.NDArray[numpy.int32],
-        arg2: npt.NDArray[numpy.int32],
-        arg3: npt.NDArray[numpy.int32],
-    ) -> npt.NDArray[numpy.float64]: ...
-    pass
-
-class Predictor:
-    def __getstate__(self) -> tuple: ...
-    def __init__(
-        self, arg0: int, arg1: npt.NDArray[numpy.float64], arg2: int
-    ) -> None: ...
-    def __setstate__(self, arg0: tuple) -> None: ...
-    def add_beta(self, arg0: npt.NDArray[numpy.float64]) -> None: ...
-    def predict_gibbs(
-        self,
-        arg0: typing.List[npt.NDArray[numpy.int32]],
-        arg1: typing.List[npt.NDArray[numpy.int32]],
-        arg2: int,
-        arg3: int,
-        arg4: int,
-        arg5: bool,
-    ) -> npt.NDArray[numpy.float64]: ...
-    def predict_gibbs_batch(
-        self,
-        arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]],
-        arg1: int,
-        arg2: int,
-        arg3: int,
-        arg4: bool,
-        arg5: int,
-    ) -> npt.NDArray[numpy.float64]: ...
-    def predict_gibbs_with_word_assignment(
-        self,
-        arg0: typing.List[npt.NDArray[numpy.int32]],
-        arg1: typing.List[npt.NDArray[numpy.int32]],
-        arg2: int,
-        arg3: int,
-        arg4: int,
-        arg5: bool,
-    ) -> typing.Tuple[
-        npt.NDArray[numpy.float64],
-        typing.List[typing.Dict[int, npt.NDArray[numpy.int32]]],
-    ]: ...
-    def predict_mf(
-        self,
-        arg0: typing.List[npt.NDArray[numpy.int32]],
-        arg1: typing.List[npt.NDArray[numpy.int32]],
-        arg2: int,
-        arg3: float,
-    ) -> npt.NDArray[numpy.float64]: ...
-    def predict_mf_batch(
-        self,
-        arg0: typing.List[scipy.sparse.csr_matrix[numpy.int32]],
-        arg1: int,
-        arg2: float,
-        arg3: int,
-    ) -> npt.NDArray[numpy.float64]: ...
-    @property
-    def phis(self) -> typing.List[npt.NDArray[numpy.float64]]:
-        """
-        :type: typing.List[npt.NDArray[numpy.float64]]
-        """
-    pass
-
-def learn_dirichlet(
-    arg0: npt.NDArray[numpy.int32],
-    arg1: npt.NDArray[numpy.float64],
-    arg2: float,
-    arg3: float,
-    arg4: int,
-) -> npt.NDArray[numpy.float64]:
-    pass
-
-def learn_dirichlet_symmetric(
-    arg0: npt.NDArray[numpy.int32],
-    arg1: float,
-    arg2: float,
-    arg3: float,
-    arg4: int,
-) -> float:
-    pass
-
-def log_likelihood_doc_topic(
-    arg0: npt.NDArray[numpy.float64],
-    arg1: npt.NDArray[numpy.int32],
-    arg2: npt.NDArray[numpy.int32],
-) -> float:
-    pass
-
-def train_test_split(
-    arg0: scipy.sparse.csr_matrix[numpy.int32], arg1: float, arg2: int
-) -> typing.Tuple[
-    scipy.sparse.csr_matrix[numpy.int32], scipy.sparse.csr_matrix[numpy.int32]
-]:
-    pass

From 58e01a33d3df2e38ef18c5ad03d02bf925301ef0 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 14:25:55 +0900
Subject: [PATCH 16/32] add setup_requires

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 300ea80..175f774 100644
--- a/setup.py
+++ b/setup.py
@@ -174,7 +174,7 @@ def local_scheme(version: Any) -> str:
     long_description="",
     ext_modules=ext_modules,
     install_requires=install_requires,
-    setup_requires=install_requires,
+    setup_requires=setup_requires,
     cmdclass={"build_ext": BuildExt},
     packages=["lda11"],
     zip_safe=False,

From b9249acca8fe00290fa25d33eabc2843eed8a136 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 14:28:07 +0900
Subject: [PATCH 17/32] Fix numpy deps

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 175f774..8c12b82 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@
 from setuptools.command.build_ext import build_ext
 
 __version__ = "0.3.0.0"
-install_requires = ["numpy >= 1.22", "tqdm", "scipy>=1.0.0"]
+install_requires = ["numpy>=1.22", "tqdm", "scipy>=1.0.0"]
 setup_requires = ["pybind11>=2.5", "requests", "setuptools_scm"]
 
 

From 18f96985a79cdb554ddb46fcef1a2fb911d1815e Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 17:53:57 +0900
Subject: [PATCH 18/32] pre-install numpy & scipy

---
 .github/workflows/test.yml | 1 +
 setup.py                   | 7 ++++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index a9d9efc..4ee7fca 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -16,6 +16,7 @@ jobs:
       - name: Build lda11
         run: |
           pip install --upgrade pip
+          pip install numpy scipy
           sudo apt-get install lcov
           TEST_BUILD=true python setup.py develop
       - name: Run pytest
diff --git a/setup.py b/setup.py
index 8c12b82..f2899f8 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,12 @@
 from setuptools.command.build_ext import build_ext
 
 __version__ = "0.3.0.0"
-install_requires = ["numpy>=1.22", "tqdm", "scipy>=1.0.0"]
+install_requires = [
+    "numpy>=1.22",
+    "tqdm",
+    "scipy>=1.0.0",
+    "typing_extensions>=3.0",
+]
 setup_requires = ["pybind11>=2.5", "requests", "setuptools_scm"]
 
 

From fb344a1233c4f7a30e9e5aef2ea3e2a2ff75d8fe Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 18:01:00 +0900
Subject: [PATCH 19/32] use scm for versioning

---
 lda11/__init__.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/lda11/__init__.py b/lda11/__init__.py
index 4c3681a..b3e4181 100644
--- a/lda11/__init__.py
+++ b/lda11/__init__.py
@@ -1,5 +1,18 @@
+from pkg_resources import DistributionNotFound, get_distribution
+
 from .labelled_lda import LabelledLDA
 from .lda import LDA, MultilingualLDA
 from .util import rowwise_train_test_split
 
-__all__ = ["LDA", "LabelledLDA", "MultilingualLDA", "rowwise_train_test_split"]
+try:
+    __version__ = get_distribution("lda11").version
+except DistributionNotFound:  # pragma: no cover
+    __version__ = "unknown"
+
+__all__ = [
+    "__version__",
+    "LDA",
+    "LabelledLDA",
+    "MultilingualLDA",
+    "rowwise_train_test_split",
+]

From 5cfd095031e787e3842ac1cce9d843b085b602de Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 18:03:13 +0900
Subject: [PATCH 20/32] More efficient test path.

---
 tests/test_llda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_llda.py b/tests/test_llda.py
index 0bb18d1..a6027dd 100644
--- a/tests/test_llda.py
+++ b/tests/test_llda.py
@@ -40,7 +40,7 @@ def test_llda() -> None:
     A_word_index = np.where(TOPIC_A > 0.1)[0]
     B_word_index = np.where(TOPIC_A < 0.1)[0]
 
-    for A_index, cgs_p, n_threads in zip([1, 2], [True, False], [1, 2]):
+    for A_index, cgs_p, n_threads in zip([1, 2], [False, True], [1, 2]):
         if A_index == 1:
             language = LabelledLanguage(TOPIC_A, TOPIC_B)
             B_index = 2

From 86557f71578158f0f0800926eb72ab2f9ef5b5cc Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 18:11:25 +0900
Subject: [PATCH 21/32] Add mypy check before testing.

---
 .github/workflows/pre-commit.yaml | 13 +++++++++++++
 .github/workflows/test.yml        |  4 ++++
 lda11/__init__.py                 |  2 +-
 3 files changed, 18 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/pre-commit.yaml

diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml
new file mode 100644
index 0000000..96bb78b
--- /dev/null
+++ b/.github/workflows/pre-commit.yaml
@@ -0,0 +1,13 @@
+name: pre-commit
+on:
+  pull_request:
+  push:
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    env:
+      SKIP: no-commit-to-branch
+    steps:
+    - uses: actions/checkout@v2
+    - uses: actions/setup-python@v2
+    - uses: pre-commit/action@v2.0.0
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 4ee7fca..691ae5f 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -19,6 +19,10 @@ jobs:
           pip install numpy scipy
           sudo apt-get install lcov
           TEST_BUILD=true python setup.py develop
+      - name: mypy
+        run: |
+          pip install mypy
+          mypy lda11 --ignore-missing-imports
       - name: Run pytest
         run: |
           pip install pytest pytest-cov
diff --git a/lda11/__init__.py b/lda11/__init__.py
index b3e4181..e00c01c 100644
--- a/lda11/__init__.py
+++ b/lda11/__init__.py
@@ -1,4 +1,4 @@
-from pkg_resources import DistributionNotFound, get_distribution
+from pkg_resources import DistributionNotFound, get_distribution  # type: ignore
 
 from .labelled_lda import LabelledLDA
 from .lda import LDA, MultilingualLDA

From c2196ba51202c3b3bb9d495c0ba3855fddd9fde8 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 18:19:29 +0900
Subject: [PATCH 22/32] add test for pickling.

---
 tests/test_llda.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/tests/test_llda.py b/tests/test_llda.py
index a6027dd..6d74d30 100644
--- a/tests/test_llda.py
+++ b/tests/test_llda.py
@@ -1,9 +1,12 @@
+import pickle
+import sys
+from tempfile import NamedTemporaryFile
 from typing import Tuple
 
 import numpy as np
 import numpy.typing as npt
 
-from lda11 import LabelledLDA
+from lda11 import LabelledLDA, labelled_lda
 
 
 class LabelledLanguage:
@@ -51,15 +54,23 @@ def test_llda() -> None:
         X, Y = language.gen_doc(1000)
 
         llda = LabelledLDA(use_cgs_p=cgs_p, n_workers=n_threads).fit(X, Y)
+        if sys.platform.startswith("linux"):
+            with NamedTemporaryFile() as temp_fs:
+                pickle.dump(llda, temp_fs)
+                temp_fs.seek(0)
+                del llda
+                llda_new: LabelledLDA = pickle.load(temp_fs)
+        else:
+            llda_new = llda
 
         for a_word in A_word_index:
             for b_word in B_word_index:
-                assert llda.phi[a_word, A_index] > llda.phi[b_word, A_index]
-                assert llda.phi[a_word, B_index] < llda.phi[b_word, B_index]
+                assert llda_new.phi[a_word, A_index] > llda_new.phi[b_word, A_index]
+                assert llda_new.phi[a_word, B_index] < llda_new.phi[b_word, B_index]
 
         A_DOC = np.asarray(([0, 10, 0, 10]), dtype=np.int32)
         for mode in ["mf", "gibbs"]:
-            theta = llda.transform(A_DOC, mode=mode, n_workers=n_threads)[0]  # type: ignore
+            theta = llda_new.transform(A_DOC, mode=mode, n_workers=n_threads)[0]  # type: ignore
             if A_index == 1:
                 assert (theta[1] / theta[2]) > 5
             else:

From 680a728a9bcecb90a1d9673e06e926ce52733286 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 18:30:56 +0900
Subject: [PATCH 23/32] Add wheel build workflow

---
 .github/workflows/wheels.yaml | 163 ++++++++++++++++++++++++++++++++++
 1 file changed, 163 insertions(+)
 create mode 100644 .github/workflows/wheels.yaml

diff --git a/.github/workflows/wheels.yaml b/.github/workflows/wheels.yaml
new file mode 100644
index 0000000..e82f7ad
--- /dev/null
+++ b/.github/workflows/wheels.yaml
@@ -0,0 +1,163 @@
+name: Build
+on:
+  push:
+#    branches:
+#      - main
+  release:
+    types:
+      - created
+env:
+  cibuildwheel_version: "2.2.2"
+jobs:
+  build_sdist:
+    name: Build source distribution
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - uses: actions/setup-python@v2
+        name: Install Python
+        with:
+          python-version: '3.7'
+      - name: Build sdist
+        run: python setup.py sdist
+      - uses: actions/upload-artifact@v2
+        with:
+          path: dist/*.tar.gz
+  build_wheels:
+    name: Build wheels on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    env:
+      MACOSX_DEPLOYMENT_TARGET: "10.9"
+      CIBW_BUILD_VERBOSITY: "1"
+      CIBW_BUILD: "${{ matrix.cibw.build || '*' }}"
+      CIBW_SKIP: "${{ matrix.cibw.skip || '' }}"
+      CIBW_ENVIRONMENT: "${{ matrix.cibw.env || '' }}"
+      CIBW_TEST_COMMAND: pytest {project}/tests
+      CIBW_TEST_REQUIRES: pytest
+      CIBW_MANYLINUX_X86_64_IMAGE: "${{ matrix.cibw.manylinux_image }}"
+      CIBW_MANYLINUX_I686_IMAGE: "${{ matrix.cibw.manylinux_image }}"
+      CIBW_MANYLINUX_AARCH64_IMAGE: "${{ matrix.cibw.manylinux_image }}"
+      CIBW_ARCHS_LINUX: "${{ matrix.cibw.arch || 'auto' }}"
+      CIBW_ARCHS_MACOS: "${{ matrix.cibw.arch || 'auto' }}"
+    strategy:
+      matrix:
+        include:
+          - os: macos-10.15
+            name: mac
+            cibw:
+              arch: x86_64
+              env: CFLAGS='-march=core-avx-i'
+              build: "cp37* cp38*"
+
+          - os: macos-10.15
+            name: mac-arm
+            cibw:
+              arch: universal2
+              build: "cp39* cp310*"
+              env: ''
+
+          - os: ubuntu-20.04
+            name: manylinux1
+            cibw:
+              build: "cp37*"
+              skip: "*musllinux*"
+              manylinux_image: manylinux2010
+              env: CFLAGS='-march=core-avx-i'
+              arch: auto64
+
+          - os: ubuntu-20.04
+            name: manylinux2014
+            cibw:
+              build: "cp38* cp39* cp310*"  # selectors are globs over full build IDs, e.g. cp310-manylinux_x86_64
+              skip: "*musllinux*"
+              manylinux_image: manylinux2014
+              env: CFLAGS='-march=core-avx-i'
+              arch: auto64
+
+          - os: ubuntu-20.04
+            name: manylinux_aarch64_cp37
+            cibw:
+              build: "cp37*"
+              skip: "*musllinux*"
+              manylinux_image: manylinux2014
+              arch: aarch64
+
+          - os: ubuntu-20.04
+            name: manylinux_aarch64_cp38
+            cibw:
+              build: "cp38*"
+              skip: "*musllinux*"
+              manylinux_image: manylinux2014
+              arch: aarch64
+
+          - os: ubuntu-20.04
+            name: manylinux_aarch64_cp39
+            cibw:
+              build: "cp39*"
+              skip: "*musllinux*"
+              manylinux_image: manylinux2014
+              arch: aarch64
+
+          - os: ubuntu-20.04
+            name: manylinux_aarch64_cp310
+            cibw:
+              build: "cp310*"
+              skip: "*musllinux*"
+              manylinux_image: manylinux2014
+              arch: aarch64
+
+          - os: windows-2019
+            name: win_amd64
+            architecture: x64
+            cibw:
+              build: "cp*win_amd64"
+              env: "CL='/arch:AVX'"
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - uses: actions/setup-python@v2
+        name: Install Python
+      - name: register qemu
+        if: contains(matrix.cibw.arch, 'aarch64')
+        run: |
+          docker run --rm --privileged hypriot/qemu-register:v4.2.0
+      - name: Install cibuildwheel
+        run: python -m pip install cibuildwheel=="${{env.cibuildwheel_version}}"
+      - name: Build wheels
+        run: python -m cibuildwheel --output-dir wheelhouse
+
+
+      - uses: actions/upload-artifact@v2
+        with:
+          path: ./wheelhouse/*.whl
+
+  upload_pypi:
+    needs: [build_wheels, build_sdist]
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/download-artifact@v2
+        with:
+          name: artifact
+          path: dist
+      - name: Publish package to TestPyPI
+        uses: pypa/gh-action-pypi-publish@master
+        with:
+          user: __token__
+          password: ${{ secrets.TEST_PYPI_APITOKEN }}
+          packages_dir: dist/
+          repository_url: https://test.pypi.org/legacy/
+          verbose: true
+          skip_existing: true
+      - name: Publish package to PyPI
+        if: github.event_name == 'release'
+        uses: pypa/gh-action-pypi-publish@master
+        with:
+          user: __token__
+          password: ${{ secrets.PYPI_APITOKEN }}
+          packages_dir: dist/
+          verbose: true
+          skip_existing: true

From 5c4f14c133f0257f7fb31a2b2c4a42dd6d2b3900 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 18:42:39 +0900
Subject: [PATCH 24/32] numpy 1.21 instead & typing_extensions

---
 lda11/lda.py    | 13 ++-----------
 setup.py        |  4 ++--
 src/wrapper.cpp |  8 +++++++-
 3 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/lda11/lda.py b/lda11/lda.py
index fb1a198..c3ed90b 100644
--- a/lda11/lda.py
+++ b/lda11/lda.py
@@ -1,19 +1,10 @@
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Dict,
-    List,
-    Literal,
-    NamedTuple,
-    Optional,
-    Tuple,
-    Union,
-)
+from typing import Dict, List, NamedTuple, Optional, Tuple, Union
 
 import numpy as np
 from numpy import typing as npt
 from scipy import sparse as sps
 from tqdm import tqdm
+from typing_extensions import Literal
 
 from ._lda import LDATrainer
 from ._lda import Predictor as CorePredictor
diff --git a/setup.py b/setup.py
index f2899f8..d5d8fb6 100644
--- a/setup.py
+++ b/setup.py
@@ -8,10 +8,10 @@
 
 __version__ = "0.3.0.0"
 install_requires = [
-    "numpy>=1.22",
+    "numpy>=1.21",
     "tqdm",
     "scipy>=1.0.0",
-    "typing_extensions>=3.0",
+    "typing_extensions>=3.10",
 ]
 setup_requires = ["pybind11>=2.5", "requests", "setuptools_scm"]
 
diff --git a/src/wrapper.cpp b/src/wrapper.cpp
index 1583a0f..defd152 100644
--- a/src/wrapper.cpp
+++ b/src/wrapper.cpp
@@ -16,6 +16,7 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <random>
+#include <sstream>
 #include <stdexcept>
 #include <tuple>
 #include <vector>
@@ -216,7 +217,12 @@ Real log_likelihood_doc_topic(const Eigen::Ref<RealVector> &doc_topic_prior,
 }
 
 PYBIND11_MODULE(_lda, m) {
-  m.doc() = "Backend C++ inplementation for lda11.";
+  std::stringstream doc_stream;
+  doc_stream << "Backend C++ implementation for lda11." << std::endl
+             << "Built to use" << std::endl
+             << "\t" << Eigen::SimdInstructionSetsInUse();
+
+  m.doc() = doc_stream.str();
   py::class_<LDATrainer>(m, "LDATrainer")
       .def(py::init<const RealVector &, Eigen::Ref<IntegerVector>,
                     Eigen::Ref<IndexVector>, Eigen::Ref<IndexVector>,

From 1e1cb90689b72b6fab55b3bc28b43dbb53a99886 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 18:51:01 +0900
Subject: [PATCH 25/32] Fix readme & packaging.

---
 README.md | 17 ++++++++++-------
 setup.py  |  4 ++--
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 4f900df..80c88d1 100644
--- a/README.md
+++ b/README.md
@@ -2,22 +2,25 @@
 
 ## Features
 
-- Use [Eigen](http://eigen.tuxfamily.org/index.php?title=Main_Page) for faster array multiplication.
-- Use [pybind11](https://github.com/pybind/pybind11) to bind the code into python.
 - Support parallelized sampler proposed in [Distributed Inference for Latent Dirichlet Allocation](https://dl.acm.org/doi/abs/10.5555/2981562.2981698).
 - Implement [CGS_p estimator](http://www.jmlr.org/papers/volume18/16-526/16-526.pdf) for more precise point estimate of topic-word distribution.
 - Implement [Labelled LDA](https://www-nlp.stanford.edu/cmanning/papers/llda-emnlp09.pdf)
+- Able to obtain per-word topic frequency.
+
+The implementation relies on [Eigen](http://eigen.tuxfamily.org/index.php?title=Main_Page) for faster array multiplication and [pybind11](https://github.com/pybind/pybind11) for simple binding.
+
 
 ## Installation
 
+You can install the wheel from PyPI:
+
 ```
-pip install git+https://github.com/tohtsky/lda11
+pip install lda11
 ```
 
-The above command will automatically download Eigen (ver 3.3.7).
-If you want to use an existing version of Eigen (located on `path/to/eigen`),
-type
+On x64 architectures, the wheel above is built with AVX enabled.
+If that does not suit your environment, build from source instead, e.g.
 
 ```
-EIGEN3_INCLUDE_DIR=/path/to/eigen pip install git+https://github.com/tohtsky/lda11
+CFLAGS="-march=native" pip install git+https://github.com/tohtsky/lda11
 ```
diff --git a/setup.py b/setup.py
index d5d8fb6..3c9618b 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 from typing import Any, Dict, List
 
 import setuptools
-from setuptools import Extension, setup
+from setuptools import Extension, find_packages, setup
 from setuptools.command.build_ext import build_ext
 
 __version__ = "0.3.0.0"
@@ -181,6 +181,6 @@ def local_scheme(version: Any) -> str:
     install_requires=install_requires,
     setup_requires=setup_requires,
     cmdclass={"build_ext": BuildExt},
-    packages=["lda11"],
+    packages=find_packages(),
     zip_safe=False,
 )

From 4b9519548bbe2191d32698f027b9386a5cbac411 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 18:59:07 +0900
Subject: [PATCH 26/32] Bump eigen version

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 3c9618b..83ec118 100644
--- a/setup.py
+++ b/setup.py
@@ -24,8 +24,8 @@
 
 
 class get_eigen_include(object):
-    EIGEN3_URL = "https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.zip"
-    EIGEN3_DIRNAME = "eigen-3.3.7"
+    EIGEN3_URL = "https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.zip"
+    EIGEN3_DIRNAME = "eigen-3.4.0"
 
     def __str__(self) -> str:
         if eigen_include_dir is not None:

From 467ae2d48e9186a512c2f836fa072c0163688ec9 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 19:58:22 +0900
Subject: [PATCH 27/32] Fix setup.py

---
 .gitignore |  1 +
 setup.py   | 12 +++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index f3e2f8e..36a4bb0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 .python-version
 **.ipynb_checkpoints**
 eigen-3.3.7/
+eigen-3.4.0/
 build/*
 **__pycache__**
 tmp/**
diff --git a/setup.py b/setup.py
index 83ec118..9beb43d 100644
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,16 @@
 import os
 import sys
+from pathlib import Path
 from typing import Any, Dict, List
 
 import setuptools
 from setuptools import Extension, find_packages, setup
 from setuptools.command.build_ext import build_ext
 
+SETUP_DIRECTORY = Path(__file__).resolve().parent
+with (SETUP_DIRECTORY / "README.md").open() as ifs:
+    LONG_DESCRIPTION = ifs.read()
+
 __version__ = "0.3.0.0"
 install_requires = [
     "numpy>=1.21",
@@ -174,13 +179,14 @@ def local_scheme(version: Any) -> str:
     version=__version__,
     author="Tomoki Ohtsuki",
     url="https://github.com/tohtsky/lda11",
-    author_email="tomoki.ohtsuki129@gmail.com",
+    author_email="tomoki.ohtsuki.19937@outook.jp",
     description="Yet another CGS sampler for Latent Dirichlet Allocation.",
-    long_description="",
+    long_description=LONG_DESCRIPTION,
+    long_description_content_type="text/markdown",
     ext_modules=ext_modules,
     install_requires=install_requires,
     setup_requires=setup_requires,
     cmdclass={"build_ext": BuildExt},
     packages=find_packages(),
-    zip_safe=False,
+    include_package_data=True,
 )

From 8a16cec1f73c92501c5b28af7014daefcc0eb84a Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 20:08:35 +0900
Subject: [PATCH 28/32] Manually specify packages.

---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 9beb43d..c652ed7 100644
--- a/setup.py
+++ b/setup.py
@@ -187,6 +187,7 @@ def local_scheme(version: Any) -> str:
     install_requires=install_requires,
     setup_requires=setup_requires,
     cmdclass={"build_ext": BuildExt},
-    packages=find_packages(),
+    packages=["lda11", "lda11._lda"],
     include_package_data=True,
+    zip_safe=False,
 )

From 86cc42fed7a8b09fdb37f2eb2be2604192cf2f21 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 20:40:19 +0900
Subject: [PATCH 29/32] fix layout

---
 .github/workflows/wheels.yaml         |  2 +-
 {src => cpp_sources}/child_worker.cpp |  0
 {src => cpp_sources}/defs.hpp         |  0
 {src => cpp_sources}/labelled_lda.cpp |  0
 {src => cpp_sources}/labelled_lda.hpp |  0
 {src => cpp_sources}/predictor.cpp    |  0
 {src => cpp_sources}/predictor.hpp    |  0
 {src => cpp_sources}/trainer.cpp      |  0
 {src => cpp_sources}/trainer.hpp      |  0
 {src => cpp_sources}/trainer_base.cpp |  0
 {src => cpp_sources}/trainer_base.hpp |  0
 {src => cpp_sources}/util.hpp         |  0
 {src => cpp_sources}/wrapper.cpp      |  0
 setup.py                              | 15 ++++++++-------
 {lda11 => src/lda11}/__init__.py      |  0
 {lda11 => src/lda11}/_lda.pyi         |  1 -
 {lda11 => src/lda11}/labelled_lda.py  |  1 -
 {lda11 => src/lda11}/lda.py           |  0
 {lda11 => src/lda11}/util.py          |  0
 19 files changed, 9 insertions(+), 10 deletions(-)
 rename {src => cpp_sources}/child_worker.cpp (100%)
 rename {src => cpp_sources}/defs.hpp (100%)
 rename {src => cpp_sources}/labelled_lda.cpp (100%)
 rename {src => cpp_sources}/labelled_lda.hpp (100%)
 rename {src => cpp_sources}/predictor.cpp (100%)
 rename {src => cpp_sources}/predictor.hpp (100%)
 rename {src => cpp_sources}/trainer.cpp (100%)
 rename {src => cpp_sources}/trainer.hpp (100%)
 rename {src => cpp_sources}/trainer_base.cpp (100%)
 rename {src => cpp_sources}/trainer_base.hpp (100%)
 rename {src => cpp_sources}/util.hpp (100%)
 rename {src => cpp_sources}/wrapper.cpp (100%)
 rename {lda11 => src/lda11}/__init__.py (100%)
 rename {lda11 => src/lda11}/_lda.pyi (99%)
 rename {lda11 => src/lda11}/labelled_lda.py (99%)
 rename {lda11 => src/lda11}/lda.py (100%)
 rename {lda11 => src/lda11}/util.py (100%)

diff --git a/.github/workflows/wheels.yaml b/.github/workflows/wheels.yaml
index e82f7ad..e9fa3b2 100644
--- a/.github/workflows/wheels.yaml
+++ b/.github/workflows/wheels.yaml
@@ -34,7 +34,7 @@ jobs:
       CIBW_BUILD: "${{ matrix.cibw.build || '*' }}"
       CIBW_SKIP: "${{ matrix.cibw.skip || '' }}"
       CIBW_ENVIRONMENT: "${{ matrix.cibw.env || '' }}"
-      CIBW_TEST_COMMAND: pytest {project}/tests
+      CIBW_TEST_COMMAND: "pytest {project}/tests"
       CIBW_TEST_REQUIRES: pytest
       CIBW_MANYLINUX_X86_64_IMAGE: "${{ matrix.cibw.manylinux_image }}"
       CIBW_MANYLINUX_I686_IMAGE: "${{ matrix.cibw.manylinux_image }}"
diff --git a/src/child_worker.cpp b/cpp_sources/child_worker.cpp
similarity index 100%
rename from src/child_worker.cpp
rename to cpp_sources/child_worker.cpp
diff --git a/src/defs.hpp b/cpp_sources/defs.hpp
similarity index 100%
rename from src/defs.hpp
rename to cpp_sources/defs.hpp
diff --git a/src/labelled_lda.cpp b/cpp_sources/labelled_lda.cpp
similarity index 100%
rename from src/labelled_lda.cpp
rename to cpp_sources/labelled_lda.cpp
diff --git a/src/labelled_lda.hpp b/cpp_sources/labelled_lda.hpp
similarity index 100%
rename from src/labelled_lda.hpp
rename to cpp_sources/labelled_lda.hpp
diff --git a/src/predictor.cpp b/cpp_sources/predictor.cpp
similarity index 100%
rename from src/predictor.cpp
rename to cpp_sources/predictor.cpp
diff --git a/src/predictor.hpp b/cpp_sources/predictor.hpp
similarity index 100%
rename from src/predictor.hpp
rename to cpp_sources/predictor.hpp
diff --git a/src/trainer.cpp b/cpp_sources/trainer.cpp
similarity index 100%
rename from src/trainer.cpp
rename to cpp_sources/trainer.cpp
diff --git a/src/trainer.hpp b/cpp_sources/trainer.hpp
similarity index 100%
rename from src/trainer.hpp
rename to cpp_sources/trainer.hpp
diff --git a/src/trainer_base.cpp b/cpp_sources/trainer_base.cpp
similarity index 100%
rename from src/trainer_base.cpp
rename to cpp_sources/trainer_base.cpp
diff --git a/src/trainer_base.hpp b/cpp_sources/trainer_base.hpp
similarity index 100%
rename from src/trainer_base.hpp
rename to cpp_sources/trainer_base.hpp
diff --git a/src/util.hpp b/cpp_sources/util.hpp
similarity index 100%
rename from src/util.hpp
rename to cpp_sources/util.hpp
diff --git a/src/wrapper.cpp b/cpp_sources/wrapper.cpp
similarity index 100%
rename from src/wrapper.cpp
rename to cpp_sources/wrapper.cpp
diff --git a/setup.py b/setup.py
index c652ed7..bc6dd1f 100644
--- a/setup.py
+++ b/setup.py
@@ -77,12 +77,12 @@ def __str__(self) -> str:
     Extension(
         "lda11._lda",
         [
-            "src/wrapper.cpp",
-            "src/predictor.cpp",
-            "src/trainer_base.cpp",
-            "src/trainer.cpp",
-            "src/child_worker.cpp",
-            "src/labelled_lda.cpp",
+            "cpp_sources/wrapper.cpp",
+            "cpp_sources/predictor.cpp",
+            "cpp_sources/trainer_base.cpp",
+            "cpp_sources/trainer.cpp",
+            "cpp_sources/child_worker.cpp",
+            "cpp_sources/labelled_lda.cpp",
         ],
         include_dirs=[
             # Path to pybind11 headers
@@ -187,7 +187,8 @@ def local_scheme(version: Any) -> str:
     install_requires=install_requires,
     setup_requires=setup_requires,
     cmdclass={"build_ext": BuildExt},
-    packages=["lda11", "lda11._lda"],
+    packages=find_packages("src"),
     include_package_data=True,
     zip_safe=False,
+    package_dir={"": "src"},
 )
diff --git a/lda11/__init__.py b/src/lda11/__init__.py
similarity index 100%
rename from lda11/__init__.py
rename to src/lda11/__init__.py
diff --git a/lda11/_lda.pyi b/src/lda11/_lda.pyi
similarity index 99%
rename from lda11/_lda.pyi
rename to src/lda11/_lda.pyi
index a2322ff..bcac5e4 100644
--- a/lda11/_lda.pyi
+++ b/src/lda11/_lda.pyi
@@ -1,6 +1,5 @@
 """Backend C++ inplementation for lda11."""
 from __future__ import annotations
-import lda11._lda
 import typing
 import numpy
 import numpy.typing as npt
diff --git a/lda11/labelled_lda.py b/src/lda11/labelled_lda.py
similarity index 99%
rename from lda11/labelled_lda.py
rename to src/lda11/labelled_lda.py
index 3bb28fe..c41fc0d 100644
--- a/lda11/labelled_lda.py
+++ b/src/lda11/labelled_lda.py
@@ -8,7 +8,6 @@
 from ._lda import LabelledLDATrainer
 from ._lda import Predictor as CorePredictor
 from .lda import (
-    IndexType,
     IntegerType,
     LDAPredictorMixin,
     RealType,
diff --git a/lda11/lda.py b/src/lda11/lda.py
similarity index 100%
rename from lda11/lda.py
rename to src/lda11/lda.py
diff --git a/lda11/util.py b/src/lda11/util.py
similarity index 100%
rename from lda11/util.py
rename to src/lda11/util.py

From 61e6d1d571a7153f818a60742fa83b21eb0b57f6 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 20:45:39 +0900
Subject: [PATCH 30/32] Fix workflows.

---
 .github/workflows/test.yml                    | 2 +-
 .github/workflows/{wheels.yaml => wheels.yml} | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)
 rename .github/workflows/{wheels.yaml => wheels.yml} (99%)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 691ae5f..36586a8 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -22,7 +22,7 @@ jobs:
       - name: mypy
         run: |
           pip install mypy
-          mypy lda11 --ignore-missing-imports
+          mypy src/lda11 --ignore-missing-imports
       - name: Run pytest
         run: |
           pip install pytest pytest-cov
diff --git a/.github/workflows/wheels.yaml b/.github/workflows/wheels.yml
similarity index 99%
rename from .github/workflows/wheels.yaml
rename to .github/workflows/wheels.yml
index e9fa3b2..1c8320b 100644
--- a/.github/workflows/wheels.yaml
+++ b/.github/workflows/wheels.yml
@@ -112,6 +112,7 @@ jobs:
             name: win_amd64
             architecture: x64
             cibw:
+              skip: "cp36*"
               build: "cp*win_amd64"
               env: "CL='/arch:AVX'"
 

From 880334f5e2b4db99010bb308639608b0cb1e894e Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 21:00:21 +0900
Subject: [PATCH 31/32] Restore branch restriction

---
 .github/workflows/wheels.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 1c8320b..0849f2e 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -1,8 +1,8 @@
 name: Build
 on:
   push:
-#    branches:
-#      - main
+    branches:
+      - main
   release:
     types:
       - created

From 813f8895bf1353fd515ffd659749455eefe1b3c5 Mon Sep 17 00:00:00 2001
From: Tomoki <tomoki.ohtsuki129@gmail.com>
Date: Mon, 10 Jan 2022 21:03:03 +0900
Subject: [PATCH 32/32] Fix test workflow

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 36586a8..4c176ca 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -26,7 +26,7 @@ jobs:
       - name: Run pytest
         run: |
           pip install pytest pytest-cov
-          pytest --cov=./lda11 tests/
+          pytest --cov=./src/lda11 tests/
       - name: Generate coverage (ubuntu)
         run: |
           coverage xml