diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index bb628e5..096c096 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -1,6 +1,6 @@
# https://github.com/pypa/cibuildwheel/blob/main/examples/github-deploy.yml
# except no Windows
-name: Build and upload to PyPI
+name: build
# Build on every branch push, tag push, and pull request change:
on: [push, pull_request]
@@ -13,18 +13,54 @@ on: [push, pull_request]
# - published
jobs:
+ build-and-test:
+ name: Build executable and run tests
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: Build
+ run: |
+ mkdir build
+ cd build
+ cmake ..
+ make -j
+ - name: Test
+ run: |
+ cd build
+ ctest --no-tests=error --output-on-failure
+
build_wheels:
name: Build wheels on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
- os: [ubuntu-20.04, macos-11]
+ os: [ubuntu-latest, macos-latest, windows-latest]
steps:
- uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+
+ - name: Set up QEMU
+ if: runner.os == 'Linux' && (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/tags/v'))
+ uses: docker/setup-qemu-action@v2
+ with:
+ platforms: all
- - name: Build wheels
+ - name: Build wheels (development)
+ if: github.ref != 'refs/heads/master' && !startsWith(github.ref, 'refs/tags/v')
uses: pypa/cibuildwheel@v2.11.2
+ env:
+ CIBW_ARCHS_MACOS: "arm64"
+
+ - name: Build wheels (production)
+ if: github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/tags/v')
+ uses: pypa/cibuildwheel@v2.11.2
+ env:
+ CIBW_ARCHS_MACOS: "x86_64 arm64"
+ CIBW_ARCHS_LINUX: "auto aarch64"
- uses: actions/upload-artifact@v3
with:
@@ -33,8 +69,11 @@ jobs:
build_sdist:
name: Build source distribution
runs-on: ubuntu-latest
+ if: github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/tags/v')
steps:
- uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
- name: Build sdist
run: pipx run build --sdist
diff --git a/.gitignore b/.gitignore
index 7c53c88..bf179db 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,3 +40,4 @@ dbscan/build/
.DS_Store
dbscan.egg-info/
__pycache__
+pythonmodule/_version.py
diff --git a/MANIFEST.in b/MANIFEST.in
index 3a46ee3..323e43d 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1 +1,3 @@
recursive-include include *
+global-exclude *.py[co] .DS_Store
+exclude src/dbscan
diff --git a/README.md b/README.md
index 5b67d25..757e3d6 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,9 @@
-# Overview
+# Theoretically Efficient and Practical Parallel DBSCAN
+
+[arXiv](https://arxiv.org/abs/1912.06255)
+[Build status](https://github.com/wangyiqiu/dbscan-python/actions/workflows/build_wheels.yml)
+
+## Overview
This repository hosts fast parallel DBSCAN clustering code for low-dimensional Euclidean space. The code automatically uses the available threads on a parallel shared-memory machine to speed up DBSCAN clustering. It stems from a paper presented in SIGMOD'20: [Theoretically Efficient and Practical Parallel DBSCAN](https://dl.acm.org/doi/10.1145/3318464.3380582).
@@ -11,9 +16,9 @@ Data sets with dimensionality 2 - 20 are supported by default, which can be modi
-# Tutorial
+## Tutorial
-## Option 1: Use the binary executable
+### Option 1: Use the binary executable
Compile and run the program:
@@ -28,75 +33,30 @@ make -j # this will take a while
The input data file can be any CSV-like point data file, where each line contains a data point -- see an example [here](https://github.com/wangyiqiu/hdbscan/blob/main/example-data.csv). The data file can be either with or without a header. The cluster output `clusters.txt` will contain a cluster ID on each line (other than the first-line header), giving a cluster assignment in the same ordering as the input file. A noise point will have a cluster ID of `-1`.
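
For reference, here is a small Python sketch of preparing an input file and reading the result back; the file name ``points.csv`` is only a placeholder:

```python
import numpy as np

# Write a headerless CSV of 2-D points that the executable can consume.
points = np.random.rand(1000, 2)
np.savetxt("points.csv", points, delimiter=",")

# After running the executable, every line of clusters.txt past the header
# holds the cluster ID of the corresponding input point; -1 marks noise.
labels = np.loadtxt("clusters.txt", skiprows=1, dtype=int)
print("clusters found:", len(set(labels) - {-1}))
```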
-## Option 2: Use the Python binding
+### Option 2: Use the Python binding
There are two ways to install it:
-* Compile it yourself: First install dependencies ``pip3 install -r src/requirements.txt`` and ``sudo apt install libpython3-dev``. Run ``python3 setup.py build --inplace``, The compilation will take a few minutes, and generate a ``.so`` library containing the ``DBSCAN`` module.
-* ***OR*** Install it using PyPI: ``pip3 install --user dbscan`` (the latest version is 0.0.9)
-
-An example for using the Python module is provided in ``src/example.py``. If the dependencies above are installed, simply run ``python3 example.py`` from ``src/`` to reproduce the plots above.
-
+* Install it using PyPI: ``pip3 install --user dbscan`` (you can find the wheels [here](https://pypi.org/project/dbscan/#files))
+* (harder and not recommended) Compile it yourself: first install the dependencies with ``pip3 install -r src/requirements.txt`` and ``sudo apt install libpython3-dev``, then run ``python3 setup.py build --inplace``. The compilation will take a few minutes and generate a ``.so`` library containing the ``DBSCAN`` module.
To create a wheel that is supported universally across many Python versions for your given OS, run ``python setup.py bdist_wheel`` in an environment containing the oldest numpy version available for the version of Python that you are compiling for. For example, for Python 3.8, use numpy 1.17 to compile the wheel. Then, the wheel will work on all Python and numpy versions newer than that for your given OS. This is done automatically when installing via pip.
-## Option 3: Include directly in your own C++ program
-
-Create your own caller header and source file by instantiating the DBSCAN template function in "dbscan/algo.h".
+An example of using the Python module is provided in ``example.py``. If the dependencies above are installed, simply run ``python3 example.py`` from the root directory to reproduce the plots above.
-dbscan.h:
-```c++
-template<int dim>
-int DBSCAN(int n, double* PF, double epsilon, int minPts, bool* coreFlagOut, int* coreFlag, int* cluster);
-
-// equivalent to
-// int DBSCAN(intT n, floatT PF[n][dim], double epsilon, intT minPts, bool coreFlagOut[n], intT coreFlag[n], intT cluster[n])
-// if C++ syntax was a little more flexible
-
-template<>
-int DBSCAN<3>(int n, double* PF, double epsilon, int minPts, bool* coreFlagOut, int* coreFlag, int* cluster);
-```
-
-dbscan.cpp:
-```c++
-#include "dbscan/algo.h"
-#include "dbscan.h"
-```
-
-Calling the instantiated function:
-```c++
-int n = ...; // number of data points
-double data[n][3] = ...; // data points
-int labels[n]; // label ids get saved here
-bool core_samples[n]; // a flag determining whether or not the sample is a core sample is saved here
-{
- int ignore[n];
- DBSCAN<3>(n, (void*)data, 70, 100, core_samples, ignore, labels);
-}
-```
-
-Doing this will only compile the function for the number of dimensions that you want, which saves on compilation time.
-
-You can also include the "dbscan/capi.h" and define your own ``DBSCAN_MIN_DIMS`` and ``DBSCAN_MAX_DIMS`` macros the same way the Python extension uses it. The function exported has the following signature.
-```c++
-extern "C" int DBSCAN(int dim, int n, double* PF, double epsilon, int minPts, bool* coreFlag, int* cluster);
-```
-
-Right now, the only two files that are guaranteed to remain in the C/C++ API are "dbscan/algo.h" and "dbscan/capi.h" and the functions named DBSCAN within.
-
-### Python API
+#### Python API
```
from dbscan import DBSCAN
labels, core_samples_mask = DBSCAN(X, eps=0.3, min_samples=10)
```
-##### Input
+#### Input
* ``X``: A 2-D Numpy array (``dtype=np.float64``) containing the input data points. The first dimension of ``X`` is the number of data points ``n``, and the second dimension is the data set dimensionality (the maximum supported dimensionality is 20).
* ``eps``: The epsilon parameter (default 0.5).
* ``min_samples``: The minPts parameter (default 5).
-##### Output
+#### Output
* ``labels``: A length ``n`` Numpy array (``dtype=np.int32``) containing cluster IDs of the data points, in the same ordering as the input data. Noise points are given a pseudo-ID of ``-1``.
* ``core_samples_mask``: A length ``n`` Numpy array (``dtype=np.bool``) masking the core points, in the same ordering as the input data.
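
A minimal sketch of one call and of how the two outputs are commonly interpreted (the random input data here is only a placeholder):

```python
import numpy as np
from dbscan import DBSCAN

X = np.random.rand(1000, 2).astype(np.float64)  # n = 1000 points in 2-D

labels, core_samples_mask = DBSCAN(X, eps=0.1, min_samples=10)

# Noise points carry the pseudo-ID -1, so exclude it when counting clusters.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = int((labels == -1).sum())
n_core = int(core_samples_mask.sum())
print(f"clusters: {n_clusters}, noise points: {n_noise}, core points: {n_core}")
```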
@@ -146,6 +106,50 @@ plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
```
+### Option 3: Include directly in your own C++ program
+
+Create your own caller header and source file by instantiating the DBSCAN template function in "dbscan/algo.h".
+
+dbscan.h:
+```c++
+template<int dim>
+int DBSCAN(int n, double* PF, double epsilon, int minPts, bool* coreFlagOut, int* coreFlag, int* cluster);
+
+// equivalent to
+// int DBSCAN(intT n, floatT PF[n][dim], double epsilon, intT minPts, bool coreFlagOut[n], intT coreFlag[n], intT cluster[n])
+// if C++ syntax was a little more flexible
+
+template<>
+int DBSCAN<3>(int n, double* PF, double epsilon, int minPts, bool* coreFlagOut, int* coreFlag, int* cluster);
+```
+
+dbscan.cpp:
+```c++
+#include "dbscan/algo.h"
+#include "dbscan.h"
+```
+
+Calling the instantiated function:
+```c++
+int n = ...; // number of data points
+double data[n][3] = ...; // data points
+int labels[n]; // label ids get saved here
+bool core_samples[n]; // per-sample flags: whether each point is a core sample
+{
+ int ignore[n];
+ DBSCAN<3>(n, (double*)data, 70, 100, core_samples, ignore, labels);
+}
+```
+
+Doing this will only compile the function for the number of dimensions that you want, which saves on compilation time.
+
+You can also include "dbscan/capi.h" and define your own ``DBSCAN_MIN_DIMS`` and ``DBSCAN_MAX_DIMS`` macros the same way the Python extension uses them. The exported function has the following signature.
+```c++
+extern "C" int DBSCAN(int dim, int n, double* PF, double epsilon, int minPts, bool* coreFlag, int* cluster);
+```
+
+Right now, the only two files guaranteed to remain in the C/C++ API are "dbscan/algo.h" and "dbscan/capi.h", together with the functions named DBSCAN within them.
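+
+As a rough illustration of the exported signature, here is a minimal Python/ctypes sketch. It assumes the C API was compiled into a shared library named ``libdbscan.so``; that name and build step are hypothetical, not something this repository provides out of the box.
+
+```python
+import ctypes
+import numpy as np
+
+lib = ctypes.CDLL("./libdbscan.so")  # hypothetical library name
+lib.DBSCAN.restype = ctypes.c_int
+lib.DBSCAN.argtypes = [
+    ctypes.c_int,                     # dim
+    ctypes.c_int,                     # n
+    ctypes.POINTER(ctypes.c_double),  # PF: flattened n*dim point array
+    ctypes.c_double,                  # epsilon
+    ctypes.c_int,                     # minPts
+    ctypes.POINTER(ctypes.c_bool),    # coreFlag: length-n output
+    ctypes.POINTER(ctypes.c_int),     # cluster: length-n output
+]
+
+X = np.random.rand(1000, 3)                      # float64, C-contiguous
+core = np.zeros(len(X), dtype=np.bool_)
+cluster = np.zeros(len(X), dtype=np.int32)
+lib.DBSCAN(3, len(X),
+           X.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
+           0.1, 10,
+           core.ctypes.data_as(ctypes.POINTER(ctypes.c_bool)),
+           cluster.ctypes.data_as(ctypes.POINTER(ctypes.c_int)))
+```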
+
## Citation
If you use our work in a publication, we would appreciate citations:
diff --git a/executable/main.cpp b/executable/main.cpp
index 5278dfb..65b9db6 100644
--- a/executable/main.cpp
+++ b/executable/main.cpp
@@ -4,7 +4,6 @@
#include "dbscan/point.h"
#include "dbscan/geometryIO.h"
#include "dbscan/pbbs/parallel.h"
-#include "dbscan/pbbs/gettime.h"
#include "dbscan/pbbs/parseCommandLine.h"
diff --git a/include/dbscan/algo.h b/include/dbscan/algo.h
index 6169c14..b636685 100644
--- a/include/dbscan/algo.h
+++ b/include/dbscan/algo.h
@@ -5,7 +5,7 @@
#include "dbscan/shared.h"
#include "dbscan/grid.h"
#include "dbscan/coreBccp.h"
-#include "dbscan/pbbs/gettime.h"
+// #include "dbscan/pbbs/gettime.h"
#include "dbscan/pbbs/parallel.h"
#include "dbscan/pbbs/sampleSort.h"
#include "dbscan/pbbs/unionFind.h"
@@ -96,7 +96,6 @@ int DBSCAN(intT n, floatT* PF, double epsilon, intT minPts, bool* coreFlagOut, i
auto uf = unionFind(G->numCell());
- timing t1;
parallel_for(0, G->numCell(), [&](intT i) {
if (ccFlag[i]) {
auto procTj = [&](cellT* cj) {
diff --git a/include/dbscan/kdNode.h b/include/dbscan/kdNode.h
index fb5a183..0dea9d5 100644
--- a/include/dbscan/kdNode.h
+++ b/include/dbscan/kdNode.h
@@ -47,7 +47,8 @@ class kdNode {
}}
inline void boundingBoxParallel() {
- intT P = getWorkers()*8;
+ // intT P = getWorkers()*8;
+ static const intT P = 36 * 8;
intT blockSize = (n+P-1)/P;
pointT localMin[P];
pointT localMax[P];
diff --git a/include/dbscan/pbbs/gettime.h b/include/dbscan/pbbs/gettime.h
index d9dc4de..63be5fa 100644
--- a/include/dbscan/pbbs/gettime.h
+++ b/include/dbscan/pbbs/gettime.h
@@ -1,6 +1,7 @@
#ifndef GETTIME_H
#define GETTIME_H
+/*
#include
#include
#include
@@ -92,4 +93,6 @@ struct timing {
// #define nextTime(_string) _tm.reportNext(_string);
// #define nextTimeN() _tm.reportT(_tm.next());
+*/
+
#endif
diff --git a/include/dbscan/pbbs/sequence.h b/include/dbscan/pbbs/sequence.h
index 858cfa4..ab5b62c 100644
--- a/include/dbscan/pbbs/sequence.h
+++ b/include/dbscan/pbbs/sequence.h
@@ -28,8 +28,8 @@
#include "utils.h"
// For fast popcount
-#include
-#include
+// #include
+// #include
using namespace std;
diff --git a/include/dbscan/pbbs/unionFind.h b/include/dbscan/pbbs/unionFind.h
index 13bea25..3767e55 100644
--- a/include/dbscan/pbbs/unionFind.h
+++ b/include/dbscan/pbbs/unionFind.h
@@ -38,7 +38,8 @@ struct unionFind {
v = find(v);
if(u == v) break;
if(u > v) swap(u,v);
- if(hooks[u] == intMax() && __sync_bool_compare_and_swap(&hooks[u], intMax(), u)){
+ // if(hooks[u] == intMax() && __sync_bool_compare_and_swap(&hooks[u], intMax(), u)){
+ if(hooks[u] == intMax() && utils::myCAS(&hooks[u], intMax(), u)){
parents[u]=v;
break;
}}
@@ -79,7 +80,8 @@ edgeUnionFind(intT nn): n(nn) {
v = find(v);
if(u == v) break;
if(u > v) swap(u,v);
- if(hooks[u].first == intMax() && __sync_bool_compare_and_swap(&hooks[u].first, intMax(), c_from)){
+ // if(hooks[u].first == intMax() && __sync_bool_compare_and_swap(&hooks[u].first, intMax(), c_from)){
+ if(hooks[u].first == intMax() && utils::myCAS(&hooks[u].first, intMax(), c_from)){
parents[u]=v;
hooks[u].second=c_to;
break;
diff --git a/include/dbscan/pbbs/utils.h b/include/dbscan/pbbs/utils.h
index ade99ce..afddc80 100644
--- a/include/dbscan/pbbs/utils.h
+++ b/include/dbscan/pbbs/utils.h
@@ -26,7 +26,7 @@
#include
#include "parallel.h"
-
+/*
#if defined(__APPLE__)
#define PTCMPXCH " cmpxchgl %2,%1\n"
#else
@@ -39,6 +39,7 @@
static int __ii = mallopt(M_MMAP_MAX,0);
static int __jj = mallopt(M_TRIM_THRESHOLD,-1);
#endif
+*/
#define newA(__E,__n) (__E*) malloc((__n)*sizeof(__E))
diff --git a/include/dbscan/shared.h b/include/dbscan/shared.h
index 82cd53a..737437c 100644
--- a/include/dbscan/shared.h
+++ b/include/dbscan/shared.h
@@ -126,7 +126,8 @@ point pMinSerial(point* items, intT n) {
template
point pMinParallel(point* items, intT n) {
point pMin = point(items[0].x);
- intT P = getWorkers()*8;
+ // intT P = getWorkers()*8;
+ static const intT P = 36 * 8;
intT blockSize = (n+P-1)/P;
point localMin[P];
for (intT i=0; i<P; i++) {
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
python_requires=f'>={sys.version_info.major}.{sys.version_info.minor},<4',
- install_requires=[
- f'numpy>={numpy.__version__},<2'
- ],
+ install_requires=[f'numpy>={numpy.__version__},<2'],
extras_require={
'scikit-learn': ['scikit-learn'],
'example': ['scikit-learn', 'matplotlib'],
diff --git a/src/dbscanmodule.cpp b/src/dbscanmodule.cpp
index 5cb911f..7668ceb 100644
--- a/src/dbscanmodule.cpp
+++ b/src/dbscanmodule.cpp
@@ -1,5 +1,3 @@
-#define DBSCAN_VERSION "0.0.10"
-
#include "Python.h"
#include "numpy/arrayobject.h"
#include "dbscan/capi.h"
@@ -118,7 +116,11 @@ PyInit__dbscan(void)
{
import_array();
PyObject *module = PyModule_Create(&dbscanModule);
+#ifdef DBSCAN_VERSION
PyModule_AddStringConstant(module, "__version__", DBSCAN_VERSION);
+#endif
+ PyModule_AddIntMacro(module, DBSCAN_MIN_DIMS);
+ PyModule_AddIntMacro(module, DBSCAN_MAX_DIMS);
return module;
}