Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
a4269d7
add pre-commit
tohtsky Jan 9, 2022
b46d5c5
Added stubs for _lda
tohtsky Jan 9, 2022
bad1763
mypy
tohtsky Jan 9, 2022
eb343e6
mypy check now passes
tohtsky Jan 10, 2022
93bf647
adding tests
tohtsky Jan 10, 2022
a072f9f
Type
tohtsky Jan 10, 2022
e50cb28
add test for word_topic_assignment
tohtsky Jan 10, 2022
5341ec4
add test for usual lda
tohtsky Jan 10, 2022
70b4089
test utils
tohtsky Jan 10, 2022
435d52b
llda test
tohtsky Jan 10, 2022
89b8cec
further llda test
tohtsky Jan 10, 2022
8bf8baa
Add test workflow
tohtsky Jan 10, 2022
c72481e
3.10 -> str
tohtsky Jan 10, 2022
d88cd8c
Add threading test
tohtsky Jan 10, 2022
5ffa605
Remove unused func
tohtsky Jan 10, 2022
58e01a3
add setup_requires
tohtsky Jan 10, 2022
b9249ac
Fix numpy deps
tohtsky Jan 10, 2022
18f9698
pre-install numpy & scipy
tohtsky Jan 10, 2022
fb344a1
use scm for versioning
tohtsky Jan 10, 2022
5cfd095
More efficient test path.
tohtsky Jan 10, 2022
86557f7
Add mypy check before testing.
tohtsky Jan 10, 2022
c2196ba
add test for pickling.
tohtsky Jan 10, 2022
680a728
Add wheel build workflow
tohtsky Jan 10, 2022
5c4f14c
numpy 1.21 instead & typing_extentions
tohtsky Jan 10, 2022
1e1cb90
Fix readme & packaging.
tohtsky Jan 10, 2022
4b95195
Bump eigen version
tohtsky Jan 10, 2022
467ae2d
Fix setup.py
tohtsky Jan 10, 2022
8a16cec
Manually specify packages.
tohtsky Jan 10, 2022
86cc42f
fix layout
tohtsky Jan 10, 2022
61e6d1d
Fix workflows.
tohtsky Jan 10, 2022
880334f
Restore branch restriction
tohtsky Jan 10, 2022
813f889
Fix test workflow
tohtsky Jan 10, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .github/workflows/pre-commit.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
name: pre-commit
on:
pull_request:
push:
jobs:
pre-commit:
runs-on: ubuntu-latest
env:
SKIP: no-commit-to-branch
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
- uses: pre-commit/action@v2.0.0
41 changes: 41 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
name: Full Test & Upload coverage
on: [push, pull_request]
jobs:
run_pytest_upload_coverage:
runs-on: ubuntu-latest
env:
OS: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Setup Python
uses: actions/setup-python@master
with:
python-version: "3.10"
- name: Build lda11
run: |
pip install --upgrade pip
pip install numpy scipy
sudo apt-get install lcov
TEST_BUILD=true python setup.py develop
- name: mypy
run: |
pip install mypy
mypy src/lda11 --ignore-missing-imports
- name: Run pytest
run: |
pip install pytest pytest-cov
pytest --cov=./src/lda11 tests/
- name: Generate coverage (ubuntu)
run: |
coverage xml
lcov -d `pwd` -c -o coverage.info
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
with:
files: ./coverage.xml,./coverage.info
verbose: false
env_vars: OS,PYTHON
name: codecov-umbrella
fail_ci_if_error: false
164 changes: 164 additions & 0 deletions .github/workflows/wheels.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
name: Build
on:
push:
branches:
- main
release:
types:
- created
env:
cibuildwheel_version: "2.2.2"
jobs:
build_sdist:
name: Build source distribution
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- uses: actions/setup-python@v2
name: Install Python
with:
python-version: '3.7'
- name: Build sdist
run: python setup.py sdist
- uses: actions/upload-artifact@v2
with:
path: dist/*.tar.gz
build_wheels:
name: Build wheels on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
env:
MACOSX_DEPLOYMENT_TARGET: "10.9"
CIBW_BUILD_VERBOSITY: "1"
CIBW_BUILD: "${{ matrix.cibw.build || '*' }}"
CIBW_SKIP: "${{ matrix.cibw.skip || '' }}"
CIBW_ENVIRONMENT: "${{ matrix.cibw.env || '' }}"
CIBW_TEST_COMMAND: "pytest {project}/tests"
CIBW_TEST_REQUIRES: pytest
CIBW_MANYLINUX_X86_64_IMAGE: "${{ matrix.cibw.manylinux_image }}"
CIBW_MANYLINUX_I686_IMAGE: "${{ matrix.cibw.manylinux_image }}"
CIBW_MANYLINUX_AARCH64_IMAGE: "${{ matrix.cibw.manylinux_image }}"
CIBW_ARCHS_LINUX: "${{ matrix.cibw.arch || 'auto' }}"
CIBW_ARCHS_MACOS: "${{ matrix.cibw.arch || 'auto' }}"
strategy:
matrix:
include:
- os: macos-10.15
name: mac
cibw:
arch: x86_64
env: CFLAGS='-march=core-avx-i'
build: "cp37* cp38*"

- os: macos-10.15
name: mac-arm
cibw:
arch: universal2
build: "cp39* cp310*"
env: ''

- os: ubuntu-20.04
name: manylinux1
cibw:
build: "cp37*"
skip: "*musllinux*"
manylinux_image: manylinux2010
env: CFLAGS='-march=core-avx-i'
arch: auto64

- os: ubuntu-20.04
name: manylinux2014
cibw:
# Every selector needs a trailing wildcard: cibuildwheel build identifiers
# look like `cp310-manylinux_x86_64`, so a bare `cp310` matches nothing and
# the CPython 3.10 wheel would be skipped.
build: "cp38* cp39* cp310*"
skip: "*musllinux*"
manylinux_image: manylinux2014
env: CFLAGS='-march=core-avx-i'
arch: auto64

- os: ubuntu-20.04
name: manylinux_aarch64_cp37
cibw:
build: "cp37*"
skip: "*musllinux*"
manylinux_image: manylinux2014
arch: aarch64

- os: ubuntu-20.04
name: manylinux_aarch64_cp38
cibw:
build: "cp38*"
skip: "*musllinux*"
manylinux_image: manylinux2014
arch: aarch64

- os: ubuntu-20.04
name: manylinux_aarch64_cp39
cibw:
build: "cp39*"
skip: "*musllinux*"
manylinux_image: manylinux2014
arch: aarch64

- os: ubuntu-20.04
name: manylinux_aarch64_cp310
cibw:
build: "cp310*"
skip: "*musllinux*"
manylinux_image: manylinux2014
arch: aarch64

- os: windows-2019
name: win_amd64
architecture: x64
cibw:
skip: "cp36*"
build: "cp*win_amd64"
env: "CL='/arch:AVX'"

steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- uses: actions/setup-python@v2
name: Install Python
- name: register qemu
if: contains(matrix.cibw.arch, 'aarch64')
run: |
docker run --rm --privileged hypriot/qemu-register:v4.2.0
- name: Install cibuildwheel
run: python -m pip install cibuildwheel=="${{env.cibuildwheel_version}}"
- name: Build wheels
run: python -m cibuildwheel --output-dir wheelhouse


- uses: actions/upload-artifact@v2
with:
path: ./wheelhouse/*.whl

upload_pypi:
needs: [build_wheels, build_sdist]
runs-on: ubuntu-latest
steps:
- uses: actions/download-artifact@v2
with:
name: artifact
path: dist
- name: Publish package to TestPyPI
uses: pypa/gh-action-pypi-publish@master
with:
user: __token__
password: ${{ secrets.TEST_PYPI_APITOKEN }}
packages_dir: dist/
repository_url: https://test.pypi.org/legacy/
verbose: true
skip_existing: true
- name: Publish package to PyPI
if: github.event_name == 'release'
uses: pypa/gh-action-pypi-publish@master
with:
user: __token__
password: ${{ secrets.PYPI_APITOKEN }}
packages_dir: dist/
verbose: true
skip_existing: true
7 changes: 5 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
.python-version
**.ipynb_checkpoints**
eigen-3.3.7/
eigen-3.4.0/
build/*
**__pycache__**
tmp/**
Expand All @@ -9,10 +11,11 @@ lda11.egg-info
.vscode/*
*.so
test/*
pubind11/
.eggs/
var/
dist/
compile_commands.json
.clangd/
**.ipynb_checkpoints/**
**.ipynb_checkpoints/**
.cache/clangd
.coverage
24 changes: 24 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
# General-purpose hygiene hooks; each hook id is listed once so it runs once.
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.2.0
hooks:
- id: check-merge-conflict
- id: check-yaml
- id: end-of-file-fixer
- id: no-commit-to-branch
args: [--branch, main]
- id: trailing-whitespace
- id: check-added-large-files
- repo: https://github.com/PyCQA/isort
rev: 5.6.4
hooks:
- id: isort
name: isort
- repo: https://github.com/psf/black
rev: 20.8b1
hooks:
- id: black
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 2.8.12)
set(CMAKE_EXPORT_COMPILE_COMMANDS, True)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
project(lda11)

add_subdirectory(pybind11)
Expand Down
19 changes: 10 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,25 @@

## Features

- Use [Eigen](http://eigen.tuxfamily.org/index.php?title=Main_Page) for faster array multiplication.
- Use [pybind11](https://github.com/pybind/pybind11) to bind the code into python.
- Support parallelized sampler proposed in [Distributed Inference for Latent Dirichlet Allocation](https://dl.acm.org/doi/abs/10.5555/2981562.2981698).
- Implement [CGS_p estimator](http://www.jmlr.org/papers/volume18/16-526/16-526.pdf) for more precise point estimate of topic-word distribution.
- Implement [Labelled LDA](https://www-nlp.stanford.edu/cmanning/papers/llda-emnlp09.pdf)
- Able to obtain per-word topic frequency.

The implementation relies on [Eigen](http://eigen.tuxfamily.org/index.php?title=Main_Page) for faster array multiplication and [pybind11](https://github.com/pybind/pybind11) for simple binding.


## Installation

You can install the wheel from pypi:

```
pip install git+https://github.com/tohtsky/lda11
pip install lda11
```

The above command will automatically download Eigen (ver 3.3.7).
If you want to use an existing version of Eigen (located on `path/to/eigen`),
type
For the x64 architecture, the wheel above is built with AVX enabled.
If that is not suitable for your machine, install from the repository with your own compiler flags instead, e.g.

```
EIGEN3_INCLUDE_DIR=/path/to/eigen pip install git+https://github.com/tohtsky/lda11
CFLAGS="-march=native" pip install git+https://github.com/tohtsky/lda11
```


File renamed without changes.
2 changes: 1 addition & 1 deletion src/defs.hpp → cpp_sources/defs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,4 @@ struct UrandDevice {
private:
std::mt19937 random_state_;
std::uniform_real_distribution<Real> udist_;
};
};
File renamed without changes.
File renamed without changes.
43 changes: 0 additions & 43 deletions src/predictor.cpp → cpp_sources/predictor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,49 +106,6 @@ RealMatrix Predictor::predict_mf_batch(std::vector<SparseIntegerMatrix> Xs,
}
return result;
}
// Iteratively estimates a topic distribution for one document and returns it
// as a vector normalised to sum to 1.
// NOTE(review): "mf" presumably stands for mean-field; confirm against the
// project documentation.
//
// nonzeros[n](j) is used as a row index into betas_[n], and counts[n][j] is
// the number of times that row is repeated, for each of the n_domains_
// domains. `iter` bounds the number of update passes (the loop below runs at
// most iter + 1 times), and `delta` is the convergence threshold on the summed
// absolute change between consecutive passes.
RealVector Predictor::predict_mf(std::vector<IntegerVector> nonzeros,
std::vector<IntegerVector> counts,
std::size_t iter, Real delta) const {
// Total number of occurrences over all domains; one row is allocated per
// occurrence in the work matrices below.
size_t dim_buffer = 0;
for (size_t n = 0; n < n_domains_; n++) {
dim_buffer += counts[n].sum();
}
// Nothing observed: fall back to the normalised prior.
if (dim_buffer == 0) {
return doc_topic_prior_ / doc_topic_prior_.sum();
}
RealMatrix current_prob(dim_buffer, n_topics_);
current_prob.array() = 0;
RealMatrix new_prob(dim_buffer, n_topics_);
RealMatrix beta_rel(dim_buffer, n_topics_);

// Fill beta_rel with one copy of betas_[n].row(wid) per occurrence, so a row
// index with count c contributes c identical consecutive rows.
size_t current_iter = 0;
for (size_t n = 0; n < n_domains_; n++) {
size_t n_unique_words = nonzeros[n].rows();
for (size_t j = 0; j < n_unique_words; j++) {
size_t wid = nonzeros[n](j);
size_t count = counts[n][j];
for (size_t k = 0; k < count; k++) {
beta_rel.row(current_iter) = betas_[n].row(wid);
current_iter++;
}
}
}

// Fixed-point update. Note the inclusive bound: with current_prob starting at
// zero, the first pass reduces to (prior * beta_rel) row-normalised, and up to
// `iter` further passes follow.
for (size_t i = 0; i <= iter; i++) {
// For every row: (column sums of current_prob - that row itself + prior),
// i.e. the contribution of all *other* rows plus the prior.
new_prob = -current_prob;
new_prob.rowwise() += current_prob.colwise().sum();
new_prob.rowwise() += doc_topic_prior_.transpose();
// Weight element-wise by the per-occurrence beta rows, then normalise each
// row so it sums to 1.
new_prob.array() = new_prob.array() * beta_rel.array();
new_prob.array().colwise() /= new_prob.array().rowwise().sum();
// Sum of absolute element-wise changes since the previous pass.
double diff = (new_prob - current_prob).array().abs().sum();
current_prob = new_prob;
if (diff < delta)
break;
}
// Aggregate the per-occurrence rows into a single vector and normalise it.
RealVector theta = current_prob.array().colwise().sum().transpose();
theta /= theta.sum();
return theta;
}

RealVector Predictor::predict_gibbs_write_assignment(
const std::vector<IntegerVector> &nonzeros,
Expand Down
3 changes: 0 additions & 3 deletions src/predictor.hpp → cpp_sources/predictor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@ struct Predictor {

void add_beta(const RealMatrix &beta);

RealVector predict_mf(std::vector<IntegerVector> nonzeros,
std::vector<IntegerVector> counts, size_t iter,
Real delta) const;
RealMatrix predict_mf_batch(std::vector<SparseIntegerMatrix> Xs,
std::size_t iter, Real delta,
size_t n_workers) const;
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Loading