From 0c9a2d8847717535af1ef868e1f978e87fb1b3ab Mon Sep 17 00:00:00 2001 From: zachjweiner Date: Mon, 23 Sep 2019 22:16:01 -0500 Subject: [PATCH] initial release --- .gitignore | 20 + .readthedocs.yml | 21 + LICENSE | 21 + README.rst | 55 ++ doc/Makefile | 19 + doc/_static/copybutton.js | 65 ++ doc/changes.rst | 7 + doc/citing.rst | 12 + doc/conf.py | 176 ++++ doc/faq.rst | 4 + doc/index.rst | 46 + doc/installation.rst | 108 ++ doc/license.rst | 24 + doc/make.bat | 35 + doc/ref_codegen.rst | 23 + doc/ref_fourier.rst | 29 + doc/ref_multigrid.rst | 19 + doc/ref_numerics.rst | 14 + doc/ref_other.rst | 23 + doc/requirements.txt | 1 + environment.yml | 21 + examples/codegen-tutorial.ipynb | 1634 +++++++++++++++++++++++++++++++ examples/phi_chi.py | 182 ++++ examples/wave-equation.py | 45 + pystella/__init__.py | 136 +++ pystella/decomp.py | 585 +++++++++++ pystella/derivs.py | 360 +++++++ pystella/elementwise.py | 220 +++++ pystella/expansion.py | 170 ++++ pystella/field/__init__.py | 375 +++++++ pystella/field/diff.py | 89 ++ pystella/field/sympy.py | 143 +++ pystella/fourier/__init__.py | 37 + pystella/fourier/derivs.py | 161 +++ pystella/fourier/dft.py | 440 +++++++++ pystella/fourier/projectors.py | 343 +++++++ pystella/fourier/rayleigh.py | 395 ++++++++ pystella/fourier/spectra.py | 325 ++++++ pystella/multigrid/__init__.py | 493 ++++++++++ pystella/multigrid/relax.py | 376 +++++++ pystella/multigrid/transfer.py | 265 +++++ pystella/output.py | 177 ++++ pystella/reduction.py | 356 +++++++ pystella/sectors.py | 319 ++++++ pystella/stencil.py | 136 +++ pystella/step.py | 605 ++++++++++++ run_tests.sh | 8 + setup.cfg | 14 + setup.py | 66 ++ test/common.py | 63 ++ test/conftest.py | 22 + test/test_decomp.py | 196 ++++ test/test_derivs.py | 165 ++++ test/test_dft.py | 114 +++ test/test_elementwise.py | 104 ++ test/test_energy.py | 110 +++ test/test_examples.py | 61 ++ test/test_expansion.py | 82 ++ test/test_field.py | 213 ++++ test/test_multigrid.py | 122 +++ 
test/test_projectors.py | 392 ++++++++ test/test_rayleigh.py | 169 ++++ test/test_reduction.py | 200 ++++ test/test_relax.py | 139 +++ test/test_spectra.py | 198 ++++ test/test_stencil.py | 104 ++ test/test_step.py | 118 +++ test/test_transfer.py | 125 +++ 68 files changed, 11895 insertions(+) create mode 100644 .gitignore create mode 100644 .readthedocs.yml create mode 100644 LICENSE create mode 100644 README.rst create mode 100644 doc/Makefile create mode 100644 doc/_static/copybutton.js create mode 100644 doc/changes.rst create mode 100644 doc/citing.rst create mode 100644 doc/conf.py create mode 100644 doc/faq.rst create mode 100644 doc/index.rst create mode 100644 doc/installation.rst create mode 100644 doc/license.rst create mode 100644 doc/make.bat create mode 100644 doc/ref_codegen.rst create mode 100644 doc/ref_fourier.rst create mode 100644 doc/ref_multigrid.rst create mode 100644 doc/ref_numerics.rst create mode 100644 doc/ref_other.rst create mode 100644 doc/requirements.txt create mode 100644 environment.yml create mode 100644 examples/codegen-tutorial.ipynb create mode 100644 examples/phi_chi.py create mode 100644 examples/wave-equation.py create mode 100644 pystella/__init__.py create mode 100644 pystella/decomp.py create mode 100644 pystella/derivs.py create mode 100644 pystella/elementwise.py create mode 100644 pystella/expansion.py create mode 100644 pystella/field/__init__.py create mode 100644 pystella/field/diff.py create mode 100644 pystella/field/sympy.py create mode 100644 pystella/fourier/__init__.py create mode 100644 pystella/fourier/derivs.py create mode 100644 pystella/fourier/dft.py create mode 100644 pystella/fourier/projectors.py create mode 100644 pystella/fourier/rayleigh.py create mode 100644 pystella/fourier/spectra.py create mode 100644 pystella/multigrid/__init__.py create mode 100644 pystella/multigrid/relax.py create mode 100644 pystella/multigrid/transfer.py create mode 100644 pystella/output.py create mode 100644 
pystella/reduction.py create mode 100644 pystella/sectors.py create mode 100644 pystella/stencil.py create mode 100644 pystella/step.py create mode 100644 run_tests.sh create mode 100644 setup.cfg create mode 100644 setup.py create mode 100644 test/common.py create mode 100644 test/conftest.py create mode 100644 test/test_decomp.py create mode 100644 test/test_derivs.py create mode 100644 test/test_dft.py create mode 100644 test/test_elementwise.py create mode 100644 test/test_energy.py create mode 100644 test/test_examples.py create mode 100644 test/test_expansion.py create mode 100644 test/test_field.py create mode 100644 test/test_multigrid.py create mode 100644 test/test_projectors.py create mode 100644 test/test_rayleigh.py create mode 100644 test/test_reduction.py create mode 100644 test/test_relax.py create mode 100644 test/test_spectra.py create mode 100644 test/test_stencil.py create mode 100644 test/test_step.py create mode 100644 test/test_transfer.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ba8896f --- /dev/null +++ b/.gitignore @@ -0,0 +1,20 @@ +# blacklist all +/* +# whitelist directories +!/* +# now whitelist stuff +!.gitignore +!**.py +!README.md +!*.cfg +# blacklist +build +dist +*.pyc +*.pyo +*.egg-info +*/_git_rev.py +doc/_build +.vscode +*.h5 +.pytest_cache \ No newline at end of file diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000..326c9f9 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,21 @@ +# .readthedocs.yml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: doc/conf.py + +# Non-HTML is useless until the docs are much longer +formats: [] + +python: + version: 3.6 + install: + - method: setuptools + path: . 
+ - requirements: doc/requirements.txt + system_packages: true diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..5ab38b5 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Zachary J Weiner + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..590238d --- /dev/null +++ b/README.rst @@ -0,0 +1,55 @@ +pystella: a distributed and accelerated framework for PDE solving +================================================================= + +.. image:: https://readthedocs.org/projects/pystella/badge/?version=latest + :target: https://pystella.readthedocs.io/en/latest/?badge=latest + :alt: Documentation Status + +``pystella`` enables the easy expression of PDE systems and the algorithms to solve +them in high-performance computing environments within Python. 
+It provides interfaces to generate custom computational kernels +via `loopy `_ which are executed +on (multiple) CPUs or GPUs using +`pyopencl `_ +and `mpi4py `_. +Moreover, ``pystella`` implements a number of algorithms for PDE time evolution +and spatial discretization which can be readily applied to a variety of physical +systems. + +Its features include: + +* code generation for performant element-wise kernels, stencil-based computations, + and reductions +* distributed domain decomposition and grid boundary sychronization +* time-stepping algorithms, including low-storage Runge-Kutta schemes +* finite-difference and spectral-collocation methods for spatial derivatives +* wrappers to OpenCL-based Fast Fourier Transforms (FFTs) and distributed, + CPU FFTs +* methods for field analysis in Fourier space + +All of the above functionality is configured to run at high performance, as are +the interfaces for generating custom kernels by default (though this is +entirely user-configurable!). +Additionally, the provided functionality is intended to work seamlessly whether +running in distributed- (i.e., multiple devices) or shared-memory +(i.e., a single device) contexts, without sacrificing performance in either case. + +``pystella`` was designed for lattice field theory simulations of *preheating* after +cosmological inflation and provides functionality for the simple specification +of physical models of this process (as well as computing the resulting gravitational +wave emission). +However, ``pystella`` is much more generic; these components can be viewed +as examples for the symbolic representation of arbitrary physical systems as an +interface to its code generation routines. +Most systems discretized onto cartesian grids should be entirely within scope +(e.g., lattice field theory, (magneto)hydrodynamics, Einstein's equations, +electromagnetism, etc.). 
+``pystella`` provides entrypoints at varying levels of abstraction---so if you like +the idea of ``pystella`` but the algorithms you require are not implemented, +you can create new interfaces (or extend existing ones) for your purposes +with relative ease. +(Better yet, consider contributing a PR!) + +``pystella`` is `fully documented `_ +and is licensed under the liberal `MIT license +`_. diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..298ea9e --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,19 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/doc/_static/copybutton.js b/doc/_static/copybutton.js new file mode 100644 index 0000000..e567775 --- /dev/null +++ b/doc/_static/copybutton.js @@ -0,0 +1,65 @@ +// Copyright 2014 PSF. Licensed under the PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 +// File originates from the cpython source found in Doc/tools/sphinxext/static/copybutton.js + +$(document).ready(function() { + /* Add a [>>>] button on the top-right corner of code samples to hide + * the >>> and ... prompts and the output and thus make the code + * copyable. 
*/ + var div = $('.highlight-python .highlight,' + + '.highlight-default .highlight,' + + '.highlight-python3 .highlight') + var pre = div.find('pre'); + + // get the styles from the current theme + pre.parent().parent().css('position', 'relative'); + var hide_text = 'Hide the prompts and output'; + var show_text = 'Show the prompts and output'; + var border_width = pre.css('border-top-width'); + var border_style = pre.css('border-top-style'); + var border_color = pre.css('border-top-color'); + var button_styles = { + 'cursor':'pointer', 'position': 'absolute', 'top': '0', 'right': '0', + 'border-color': border_color, 'border-style': border_style, + 'border-width': border_width, 'color': border_color, 'text-size': '75%', + 'font-family': 'monospace', 'padding-left': '0.2em', 'padding-right': '0.2em', + 'border-radius': '0 3px 0 0' + } + + // create and add the button to all the code blocks that contain >>> + div.each(function(index) { + var jthis = $(this); + if (jthis.find('.gp').length > 0) { + var button = $('>>>'); + button.css(button_styles) + button.attr('title', hide_text); + button.data('hidden', 'false'); + jthis.prepend(button); + } + // tracebacks (.gt) contain bare text elements that need to be + // wrapped in a span to work with .nextUntil() (see later) + jthis.find('pre:has(.gt)').contents().filter(function() { + return ((this.nodeType == 3) && (this.data.trim().length > 0)); + }).wrap(''); + }); + + // define the behavior of the button when it's clicked + $('.copybutton').click(function(e){ + e.preventDefault(); + var button = $(this); + if (button.data('hidden') === 'false') { + // hide the code output + button.parent().find('.go, .gp, .gt').hide(); + button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'hidden'); + button.css('text-decoration', 'line-through'); + button.attr('title', show_text); + button.data('hidden', 'true'); + } else { + // show the code output + button.parent().find('.go, .gp, .gt').show(); + 
button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'visible'); + button.css('text-decoration', 'none'); + button.attr('title', hide_text); + button.data('hidden', 'false'); + } + }); +}); diff --git a/doc/changes.rst b/doc/changes.rst new file mode 100644 index 0000000..47b0c11 --- /dev/null +++ b/doc/changes.rst @@ -0,0 +1,7 @@ +User-visible Changes +==================== + +Version 2019.5 +-------------- + +* Initial release. diff --git a/doc/citing.rst b/doc/citing.rst new file mode 100644 index 0000000..eee8d9b --- /dev/null +++ b/doc/citing.rst @@ -0,0 +1,12 @@ +Citing pystella +=============== + +If you use :mod:`pystella` for your work, please cite the following pair of articles:: + + .. + +Here are Bibtex entries for your convenience:: + + @article{} + + @article{} diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 0000000..e101765 --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,176 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'pystella' +copyright = '2019, Zachary J Weiner' +author = 'Zachary J Weiner' + +import pkg_resources +version = pkg_resources.get_distribution('pystella').version +release = version + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. 
They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.intersphinx', + 'sphinx.ext.linkcode', + 'sphinx.ext.ifconfig', + # 'sphinx_copybutton' +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static'] + +intersphinx_mapping = { + 'python': ('https://docs.python.org/3', None), + 'numpy': ('https://docs.scipy.org/doc/numpy/', None), + 'scipy': ('https://docs.scipy.org/doc/scipy/reference/', None), + 'loopy': ('https://documen.tician.de/loopy', None), + 'pymbolic': ('https://documen.tician.de/pymbolic', None), + 'pyopencl': ('https://documen.tician.de/pyopencl', None), + 'mpi4py': ('https://mpi4py.readthedocs.io/en/stable/', None), + 'h5py': ('http://docs.h5py.org/en/stable/', None), + 'mpi4py_fft': ('https://mpi4py-fft.readthedocs.io/en/latest/', None), + } + +latex_elements = { + 'maxlistdepth': '99', +} + +autodoc_mock_imports = ['sympy', 'h5py'] + +import os +on_rtd = os.environ.get('READTHEDOCS') == 'True' + +if on_rtd: + exclude_patterns = ['*multigrid*'] + + +# setup copy button thing +def setup(app): + app.add_config_value('on_rtd', on_rtd, 'env') + app.add_javascript('copybutton.js') + + +# # Resolve function for the linkcode extension. +# def linkcode_resolve(domain, info): +# def find_source(): +# # try to find the file and line number, based on code from numpy: +# # https://github.com/numpy/numpy/blob/master/doc/source/conf.py#L286 +# import sys +# obj = sys.modules[info['module']] +# for part in info['fullname'].split('.'): +# obj = getattr(obj, part) +# import inspect +# import os +# fn = inspect.getsourcefile(obj) +# import pystella +# fn = os.path.relpath(fn, start=os.path.dirname(pystella.__file__)) +# source, lineno = inspect.getsourcelines(obj) +# return fn, lineno, lineno + len(source) - 1 + +# if domain != 'py' or not info['module']: +# return None +# try: +# filename = 'pystella/%s#L%d-L%d' % find_source() +# except Exception: +# filename = info['module'].replace('.', '/') + '.py' +# tag = 'master' # if 'dev' in release else ('v' + release) +# return "https://github.com/zachjweiner/pystella/blob/%s/%s" % (tag, filename) + + +def linkcode_resolve(domain, info): + """ + Determine the URL corresponding to Python 
object + copied from numpy's conf.py + """ + if domain != 'py': + return None + + import sys + import inspect + from os.path import relpath, dirname + + modname = info['module'] + fullname = info['fullname'] + + submod = sys.modules.get(modname) + if submod is None: + return None + + obj = submod + for part in fullname.split('.'): + try: + obj = getattr(obj, part) + except Exception: + return None + + # strip decorators, which would resolve to the source of the decorator + # possibly an upstream bug in getsourcefile, bpo-1764286 + try: + unwrap = inspect.unwrap + except AttributeError: + pass + else: + obj = unwrap(obj) + + try: + fn = inspect.getsourcefile(obj) + except Exception: + fn = None + if not fn: + return None + + try: + source, lineno = inspect.getsourcelines(obj) + except Exception: + lineno = None + + if lineno: + linespec = "#L%d-L%d" % (lineno, lineno + len(source) - 1) + else: + linespec = "" + + import pystella + fn = relpath(fn, start=dirname(pystella.__file__)) + + return "https://github.com/zachjweiner/pystella/blob/master/pystella/%s%s" % ( + fn, linespec) diff --git a/doc/faq.rst b/doc/faq.rst new file mode 100644 index 0000000..c296e61 --- /dev/null +++ b/doc/faq.rst @@ -0,0 +1,4 @@ +Frequently Asked Questions +========================== + +Coming soon. diff --git a/doc/index.rst b/doc/index.rst new file mode 100644 index 0000000..b30aa8c --- /dev/null +++ b/doc/index.rst @@ -0,0 +1,46 @@ +Welcome to pystella's documentation! +==================================== + +:mod:`pystella` is a package allowing the easy expression and evolution of PDE +systems with finite-difference methods. +Here's a simple example which evolves the scalar wave equation (without doing +anything interesting): + +.. literalinclude:: ../examples/wave-equation.py + +:mod:`pystella` uses :mod:`loopy` for code generation and :mod:`pyopencl` for +execution on CPUs and GPUs. 
+The above example can even be run in a jupyter notebook, but :mod:`pystella` also +supports MPI parallelization across multiple GPUs (or devices, more generally) via +:mod:`mpi4py`. + +For a more detailed tutorial on the tools to generate OpenCL kernels provided by +:mod:`loopy` and :mod:`pystella`, see +`codegen-tutorial.ipynb `_. + +Table of Contents +----------------- + +Please check :ref:`installation` to get started. + +.. toctree:: + :maxdepth: 2 + + installation + ref_codegen + ref_numerics + ref_fourier + ref_other + ref_multigrid + changes + license + faq + citing + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/doc/installation.rst b/doc/installation.rst new file mode 100644 index 0000000..087a16f --- /dev/null +++ b/doc/installation.rst @@ -0,0 +1,108 @@ +.. highlight:: sh + +.. _installation: + +Installation +============ + +At the bare minimum, :mod:`pystella` requires :mod:`numpy`, +:mod:`loopy` for code generation, and :mod:`pyopencl` +(plus an OpenCL implementation) for kernel execution. +Optional dependencies (and what they are needed for) are: + +* :mod:`mpi4py` (and an MPI implementation) for distributed, multi-device execution + +* :mod:`gpyfft` (and :mod:`clfft` and :mod:`Cython`) for OpenCL + Fast Fourier Transforms (:class:`pystella.fourier.gDFT`) (e.g., to run on a GPU), + and/or :mod:`mpi4py_fft` (and :mod:`fftw`) for distributed, CPU FFTs + (:class:`pystella.fourier.pDFT`) + +* :mod:`h5py` (and :mod:`hdf5`) to use the convenience class + :class:`pystella.output.OutputFile` + +* :mod:`sympy`, for interoperability between :mod:`pymbolic` and :mod:`sympy` + +Fortunately, :mod:`conda` greatly simplifies the installation process with any +of these dependencies. +The included :file:`environment.yml` file provides a complete +installation by default, but one can delete any optional dependencies. 
+ +Note that installation has only been tested on Linux, but similar steps should work +on macOS. + +Installation steps +------------------ + +Install via the following steps +(first modifying :file:`environment.yml` as desired): + +1. Install `miniconda `_ (if you + haven't already installed :mod:`conda`). + +2. Clone the repository:: + + git clone https://github.com/zachjweiner/pystella.git + +3. Create a :mod:`pystella` environment as specified by :file:`environment.yml`:: + + conda env create --file pystella/environment.yml + + - This will clone and install (i.e., as if via + :command:`python setup.py develop`) :mod:`gpyfft` and :mod:`loopy` into + :command:`src/`. To change this, first define the environment variable + :command:`PIP_SRC` to be your desired directory, + e.g., to your home directory with:: + + export PIP_SRC=~ + + Alternatively, update your active environment via:: + + conda env update --file pystella/environment.yml + +4. Activate the environment (if you created a new one):: + + conda activate pystella + + and set up :mod:`pystella`:: + + cd pystella/ && python setup.py develop + +To test that installation was successful, try running an example +(e.g., :code:`python examples/scalar-preheating.py`) or run the tests with :mod:`pytest`. + +Running on other devices (GPUs, etc.) +------------------------------------- + +The included :file:`environment.yml` installs `pocl `__, +which provides an OpenCL implementation on most CPUs. +Enabling execution on other hardware (e.g., GPUs) requires making :mod:`pyopencl` +aware of the corresponding OpenCL driver. +See :mod:`pyopencl`'s +`instructions `__ +(specifically, +`here `__). 
+For example, installing `CUDA `__ +installs the driver for NVIDIA GPUs; one must then merely copy +the :file:`nvidia.icd` file via:: + + cp /etc/OpenCL/vendors/nvidia.icd $CONDA_PREFIX/etc/OpenCL/vendors + +Using an existing MPI implementation +------------------------------------ + +To enable MPI support without :mod:`conda` installing its own MPI implementation +(e.g., to use the optimized implementation already provided on a cluster, etc.), +simply move :mod:`mpi4py` (and :mod:`mpi4py_fft`) below the :code:`pip` line +in :file:`environment.yml`:: + + ... + - pip: + - mpi4py + - mpi4py-fft + ... + +:mod:`pip`-installing :mod:`mpi4py` assumes that :code:`mpicc` is available +(check the output of :code:`which mpicc`). +See :mod:`mpi4py`'s +`instructions `__ for more +details. diff --git a/doc/license.rst b/doc/license.rst new file mode 100644 index 0000000..65e8756 --- /dev/null +++ b/doc/license.rst @@ -0,0 +1,24 @@ +Licensing +========= + +pystella is licensed to you under the MIT/X Consortium license: + +Copyright (C) 2019 Zachary J Weiner. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/doc/make.bat b/doc/make.bat new file mode 100644 index 0000000..27f573b --- /dev/null +++ b/doc/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/doc/ref_codegen.rst b/doc/ref_codegen.rst new file mode 100644 index 0000000..24710b5 --- /dev/null +++ b/doc/ref_codegen.rst @@ -0,0 +1,23 @@ +.. currentmodule:: pystella + +Reference: Code Generation +========================== + +Kernel creation +--------------- + +.. automodule:: pystella.elementwise + +.. automodule:: pystella.stencil + +.. automodule:: pystella.reduction + +Fields +------ + +.. automodule:: pystella.field + +Sectors +------- + +.. automodule:: pystella.sectors diff --git a/doc/ref_fourier.rst b/doc/ref_fourier.rst new file mode 100644 index 0000000..5e38e09 --- /dev/null +++ b/doc/ref_fourier.rst @@ -0,0 +1,29 @@ +.. currentmodule:: pystella + +Reference: Fourier Space +======================== + +Fast Fourier transforms +----------------------- + +.. 
automodule:: pystella.fourier.dft + +Field power spectra +------------------- + +.. autoclass:: PowerSpectra + +Generating Gaussian-random fields +--------------------------------- + +.. automodule:: pystella.fourier.rayleigh + +Vector and tensor projections +----------------------------- + +.. automodule:: pystella.fourier.projectors + +Spectral solvers +---------------- + +.. automodule:: pystella.fourier.derivs diff --git a/doc/ref_multigrid.rst b/doc/ref_multigrid.rst new file mode 100644 index 0000000..1ce880d --- /dev/null +++ b/doc/ref_multigrid.rst @@ -0,0 +1,19 @@ +.. currentmodule:: pystella.multigrid + +Reference: Multigrid +==================== + +Multigrid schemes +----------------- + +.. automodule:: pystella.multigrid + +Relaxation methods +------------------ + +.. automodule:: pystella.multigrid.relax + +Grid transfer operations +------------------------ + +.. automodule:: pystella.multigrid.transfer \ No newline at end of file diff --git a/doc/ref_numerics.rst b/doc/ref_numerics.rst new file mode 100644 index 0000000..81b0470 --- /dev/null +++ b/doc/ref_numerics.rst @@ -0,0 +1,14 @@ +.. currentmodule:: pystella + +Reference: Numerical Methods +============================ + +Time stepping +------------- + +.. automodule:: pystella.step + +Spatial derivatives +------------------- + +.. automodule:: pystella.derivs diff --git a/doc/ref_other.rst b/doc/ref_other.rst new file mode 100644 index 0000000..bf315ab --- /dev/null +++ b/doc/ref_other.rst @@ -0,0 +1,23 @@ +.. currentmodule:: pystella + +Reference: Other Functionality +============================== + +MPI parallelization +------------------- + +.. autoclass:: DomainDecomposition + +Expansion +--------- + +.. autoclass:: Expansion + +Utilities +--------- + +.. autoclass:: FieldStatistics + +.. autoclass:: pystella.output.OutputFile + +.. 
autofunction:: choose_device_and_make_context diff --git a/doc/requirements.txt b/doc/requirements.txt new file mode 100644 index 0000000..a626eaa --- /dev/null +++ b/doc/requirements.txt @@ -0,0 +1 @@ +sphinx==2.2.0 \ No newline at end of file diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..ca7bbaa --- /dev/null +++ b/environment.yml @@ -0,0 +1,21 @@ +name: pystella + +channels: + - conda-forge + +dependencies: + - pip + - numpy + - sympy + - h5py + - pyopencl + - pocl + - fftw + - clfft + - Cython + - mpi4py + - mpi4py-fft + - pip: + - -e git+ssh://git@github.com/inducer/pymbolic.git@master#egg=pymbolic + - -e git+ssh://git@github.com/inducer/loopy.git@master#egg=loo.py + - -e git+ssh://git@gitlab.com/zachjweiner/gpyfft.git#egg=gpyfft diff --git a/examples/codegen-tutorial.ipynb b/examples/codegen-tutorial.ipynb new file mode 100644 index 0000000..9b9686c --- /dev/null +++ b/examples/codegen-tutorial.ipynb @@ -0,0 +1,1634 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Code generation with `pystella` and `loopy`" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pyopencl as cl\n", + "import pyopencl.array as cla\n", + "import pyopencl.clrandom as clr\n", + "import loopy as lp\n", + "from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Four ways to an OpenCL kernel\n", + "\n", + "We're going to create (and run!) an OpenCL kernel that computes\n", + "\n", + "$$\n", + "a(\\mathbf{x}) = b(\\mathbf{x})^2 \\cdot c(\\mathbf{x}) + z\n", + "$$\n", + "\n", + "in four different ways.\n", + "\n", + "First, we'll generate data and expected results with `numpy`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "n = 64 # the grid size in each dimension\n", + "\n", + "b_h = np.random.rand(n, n, n).astype(np.float64)\n", + "c_h = np.random.rand(n, n, n).astype(np.float64)\n", + "z = np.array(3.2)\n", + "\n", + "a_true_h = b_h**2 * c_h + z" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. `pyopencl` arrays methods" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we need an OpenCL \"context\" (the umbrella construct for running programs with OpenCL) and a \"queue\" (to which kernels will be submitted to execute on a device).\n", + "\n", + "Check out `pyopencl`'s [docs](https://documen.tician.de/pyopencl/) for examples and details." + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [ + "ctx = cl.create_some_context()\n", + "queue = cl.CommandQueue(ctx)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`pyopencl` has a very convenient `Array` construct, which emulates `numpy` arrays---but with memory residing on the device.\n", + "We'll copy the data to the device and try it out." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "b = cla.to_device(queue, b_h)\n", + "c = cla.to_device(queue, c_h)\n", + "a = cla.zeros_like(b)\n", + "a_true = cla.to_device(queue, a_true_h)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "a[:] = b**2 * c + z" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To compare results, compute the maximum of `a - a_true`:" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "difference = a - a_true" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.881784197001252e-16" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.max(difference.get())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that we had to call `difference.get()`, which returns a `numpy.ndarray` on the \"host\" (the CPU) with data copied from `difference` (on the GPU).\n", + "We can also use `pyopencl`'s `max` method:" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(8.8817842e-16)" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cla.max(difference)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 
OpenCL kernel generation with `loopy`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Refer to `loopy`'s [tutorial](https://documen.tician.de/loopy/tutorial.html) to get started.\n", + "\n", + "Let's create a kernel which computes the above for $i \\in [0, N_x)$, $j \\in [0, N_y)$, and $k \\in [0, N_z)$:" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [], + "source": [ + "knl = lp.make_kernel(\n", + " \"{[i, j, k]: 0 <= i < Nx and 0 <= j < Ny and 0 <= k < Nz}\",\n", + " \"\"\"\n", + " a[i, j, k] = b[i, j, k]**2 * c[i, j, k] + z\n", + " \"\"\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Inspect your kernel to see if it appears correct by printing it.\n", + "How did `make_kernel` interpret the un-indexed scalar variable `z`?" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------------------------------------------\n", + "KERNEL: loopy_kernel\n", + "---------------------------------------------------------------------------\n", + "ARGUMENTS:\n", + "Nx: ValueArg, type: \n", + "Ny: ValueArg, type: \n", + "Nz: ValueArg, type: \n", + "a: type: , shape: (Nx, Ny, Nz), dim_tags: (N2:stride:Nz*Ny, N1:stride:Nz, N0:stride:1) aspace: global\n", + "b: type: , shape: (Nx, Ny, Nz), dim_tags: (N2:stride:Nz*Ny, N1:stride:Nz, N0:stride:1) aspace: global\n", + "c: type: , shape: (Nx, Ny, Nz), dim_tags: (N2:stride:Nz*Ny, N1:stride:Nz, N0:stride:1) aspace: global\n", + "z: ValueArg, type: \n", + "---------------------------------------------------------------------------\n", + "DOMAINS:\n", + "[Nx, Ny, Nz] -> { [i, j, k] : 0 <= i < Nx and 0 <= j < Ny and 0 <= k < Nz }\n", + "---------------------------------------------------------------------------\n", + "INAME IMPLEMENTATION TAGS:\n", + "i: None\n", + "j: None\n", + "k: None\n", 
+ "---------------------------------------------------------------------------\n", + "INSTRUCTIONS:\n", + "for k, j, i\n", + " \u001b[36ma[i, j, k]\u001b[0m = \u001b[35mb[i, j, k]**2*c[i, j, k] + z\u001b[0m {id=\u001b[32minsn\u001b[0m}\n", + "end k, j, i\n", + "---------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "print(knl)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now test our kernel by directly calling the `knl` we created above:" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "evt, _ = knl(queue, a=a, b=b, c=c, z=z)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that `z` needs to be a `numpy.array` so that `loopy` can infer its datatype." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To compare results, compute the maximum of `a - a_true`:" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [], + "source": [ + "difference = a - a_true" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.881784197001252e-16" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.max(difference.get())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Parallelization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, GPUs are parallel, and the kernel we just wrote isn't making use of any parallelism.\n", + "First, let's check what OpenCL code was produced by setting the kernel option `write_cl` to `True`:" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "knl = lp.set_options(knl, write_cl=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "If we run the kernel now, it will print OpenCL code:" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n", + "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n", + "\u001b[36m#\u001b[39;49;00m\u001b[36mif __OPENCL_C_VERSION__ < 120\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n", + "\u001b[36m#\u001b[39;49;00m\u001b[36mpragma OPENCL EXTENSION cl_khr_fp64: enable\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n", + "\u001b[36m#\u001b[39;49;00m\u001b[36mendif\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n", + "\n", + "__kernel \u001b[36mvoid\u001b[39;49;00m \u001b[32m__attribute__\u001b[39;49;00m ((reqd_work_group_size(\u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(\u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m Nx, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m Ny, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m Nz, __global \u001b[36mdouble\u001b[39;49;00m *__restrict__ a, __global \u001b[36mdouble\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ b, __global \u001b[36mdouble\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ c, \u001b[36mdouble\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m z)\n", + "{\n", + " \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m k = \u001b[34m0\u001b[39;49;00m; k <= -\u001b[34m1\u001b[39;49;00m + Nz; ++k)\n", + " \u001b[34mif\u001b[39;49;00m (-\u001b[34m1\u001b[39;49;00m + Nx >= \u001b[34m0\u001b[39;49;00m && -\u001b[34m1\u001b[39;49;00m + Ny >= \u001b[34m0\u001b[39;49;00m)\n", + " \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m j = \u001b[34m0\u001b[39;49;00m; j <= -\u001b[34m1\u001b[39;49;00m 
+ Ny; ++j)\n", + " \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m i = \u001b[34m0\u001b[39;49;00m; i <= -\u001b[34m1\u001b[39;49;00m + Nx; ++i)\n", + " a[Nz * Ny * i + Nz * j + k] = c[Nz * Ny * i + Nz * j + k] * b[Nz * Ny * i + Nz * j + k] * b[Nz * Ny * i + Nz * j + k] + z;\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "evt, _ = knl(queue, a=a, b=b, c=c, z=z)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It looks correct, array indexing and all.\n", + "But that's a lot of sequential loops!\n", + "`loopy` enables *code transformations* that (aim to) optimize the performance of a given kernel.\n", + "For instance, mapping the `k` index to the \"0\" index of the local and global OpenCL thread dimensions is accomplished via `loopy.split_iname`:" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "knl = lp.split_iname(knl, \"k\", 32, outer_tag=\"g.0\", inner_tag=\"l.0\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see what this did to the kernel:" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------------------------------------------\n", + "KERNEL: loopy_kernel\n", + "---------------------------------------------------------------------------\n", + "ARGUMENTS:\n", + "Nx: ValueArg, type: \n", + "Ny: ValueArg, type: \n", + "Nz: ValueArg, type: \n", + "a: type: , shape: (Nx, Ny, Nz), dim_tags: (N2:stride:Nz*Ny, N1:stride:Nz, N0:stride:1) aspace: global\n", + "b: type: , shape: (Nx, Ny, Nz), dim_tags: (N2:stride:Nz*Ny, N1:stride:Nz, N0:stride:1) aspace: global\n", + "c: type: , shape: (Nx, Ny, Nz), dim_tags: (N2:stride:Nz*Ny, N1:stride:Nz, N0:stride:1) aspace: global\n", + "z: ValueArg, type: \n", + "---------------------------------------------------------------------------\n", + 
"DOMAINS:\n", + "[Nx, Ny, Nz] -> { [i, j, k_outer, k_inner] : 0 <= i < Nx and 0 <= j < Ny and k_inner >= 0 and -32k_outer <= k_inner <= 31 and k_inner < Nz - 32k_outer }\n", + "---------------------------------------------------------------------------\n", + "INAME IMPLEMENTATION TAGS:\n", + "i: None\n", + "j: None\n", + "k_inner: l.0\n", + "k_outer: g.0\n", + "---------------------------------------------------------------------------\n", + "INSTRUCTIONS:\n", + "for j, k_inner, k_outer, i\n", + " \u001b[36ma[i, j, k_inner + k_outer*32]\u001b[0m = \u001b[35mb[i, j, k_inner + k_outer*32]**2*c[i, j, k_inner + k_outer*32] + z\u001b[0m {id=\u001b[32minsn\u001b[0m}\n", + "end j, k_inner, k_outer, i\n", + "---------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "print(knl)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above splits the loop over the \"iname\" `k` into (a yet-undetermined number of) blocks of 32 threads each." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The \"iname\" (index name) `k` is gone, repalced by the combination `k_inner + k_outer * 32`.\n", + "Observe also that the \"implementation\" of these new inames has been tagged to map to axes of global and local parallelization (as we specified).\n", + "If we run the kernel now (enabling `write_cl` again), we see that the sequential loop over `k` is gone, and the indexing of `k` has been replaced by `32 * gid(0) + lid(0)`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n", + "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n", + "\u001b[36m#\u001b[39;49;00m\u001b[36mif __OPENCL_C_VERSION__ < 120\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n", + "\u001b[36m#\u001b[39;49;00m\u001b[36mpragma OPENCL EXTENSION cl_khr_fp64: enable\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n", + "\u001b[36m#\u001b[39;49;00m\u001b[36mendif\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n", + "\n", + "__kernel \u001b[36mvoid\u001b[39;49;00m \u001b[32m__attribute__\u001b[39;49;00m ((reqd_work_group_size(\u001b[34m32\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(\u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m Nx, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m Ny, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m Nz, __global \u001b[36mdouble\u001b[39;49;00m *__restrict__ a, __global \u001b[36mdouble\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ b, __global \u001b[36mdouble\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ c, \u001b[36mdouble\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m z)\n", + "{\n", + " \u001b[34mif\u001b[39;49;00m (-\u001b[34m1\u001b[39;49;00m + Nx >= \u001b[34m0\u001b[39;49;00m && -\u001b[34m1\u001b[39;49;00m + Ny >= \u001b[34m0\u001b[39;49;00m && -\u001b[34m1\u001b[39;49;00m + -\u001b[34m32\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + Nz >= \u001b[34m0\u001b[39;49;00m)\n", + " \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m j = \u001b[34m0\u001b[39;49;00m; j <= -\u001b[34m1\u001b[39;49;00m + Ny; ++j)\n", 
+ " \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m i = \u001b[34m0\u001b[39;49;00m; i <= -\u001b[34m1\u001b[39;49;00m + Nx; ++i)\n", + " a[Nz * Ny * i + Nz * j + \u001b[34m32\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)] = c[Nz * Ny * i + Nz * j + \u001b[34m32\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)] * b[Nz * Ny * i + Nz * j + \u001b[34m32\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)] * b[Nz * Ny * i + Nz * j + \u001b[34m32\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)] + z;\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "knl = lp.set_options(knl, write_cl=True)\n", + "evt, _ = knl(queue, a=a, b=b, c=c, z=z)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that no loops over `k_inner` nor `k_outer` appear. They have been mapped to the \"hardware\" axes of parallelization: the kernel implicitly runs over a bunch of work groups (one for each value of `gid(0)`), each with 32 work items (each with their own index `lid(0)`)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And the result is still correct!" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.881784197001252e-16" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "difference = a - a_true\n", + "np.max(difference.get())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can achieve more parallelism by \"tagging\" `j` and `i` as, say, global indices 1 and 2." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "knl = lp.tag_inames(knl, {'j': 'g.1', 'i': 'g.2'})" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n", + "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n", + "\u001b[36m#\u001b[39;49;00m\u001b[36mif __OPENCL_C_VERSION__ < 120\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n", + "\u001b[36m#\u001b[39;49;00m\u001b[36mpragma OPENCL EXTENSION cl_khr_fp64: enable\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n", + "\u001b[36m#\u001b[39;49;00m\u001b[36mendif\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n", + "\n", + "__kernel \u001b[36mvoid\u001b[39;49;00m \u001b[32m__attribute__\u001b[39;49;00m ((reqd_work_group_size(\u001b[34m32\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(\u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m Nx, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m Ny, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m Nz, __global \u001b[36mdouble\u001b[39;49;00m *__restrict__ a, __global \u001b[36mdouble\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ b, __global \u001b[36mdouble\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ c, \u001b[36mdouble\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m z)\n", + "{\n", + " \u001b[34mif\u001b[39;49;00m (-\u001b[34m1\u001b[39;49;00m + -\u001b[34m32\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + Nz >= \u001b[34m0\u001b[39;49;00m)\n", + " a[Nz * Ny * gid(\u001b[34m2\u001b[39;49;00m) + Nz * gid(\u001b[34m1\u001b[39;49;00m) + 
 "If we are *sure* that this won't happen (namely, that `Nz` is divisible by 32), we can add this as an assumption:"
get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n", + "\u001b[36m#\u001b[39;49;00m\u001b[36mif __OPENCL_C_VERSION__ < 120\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n", + "\u001b[36m#\u001b[39;49;00m\u001b[36mpragma OPENCL EXTENSION cl_khr_fp64: enable\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n", + "\u001b[36m#\u001b[39;49;00m\u001b[36mendif\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n", + "\n", + "__kernel \u001b[36mvoid\u001b[39;49;00m \u001b[32m__attribute__\u001b[39;49;00m ((reqd_work_group_size(\u001b[34m32\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(\u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m Nx, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m Ny, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m Nz, __global \u001b[36mdouble\u001b[39;49;00m *__restrict__ a, __global \u001b[36mdouble\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ b, __global \u001b[36mdouble\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ c, \u001b[36mdouble\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m z)\n", + "{\n", + " a[Nz * Ny * gid(\u001b[34m2\u001b[39;49;00m) + Nz * gid(\u001b[34m1\u001b[39;49;00m) + \u001b[34m32\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)] = c[Nz * Ny * gid(\u001b[34m2\u001b[39;49;00m) + Nz * gid(\u001b[34m1\u001b[39;49;00m) + \u001b[34m32\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)] * b[Nz * Ny * gid(\u001b[34m2\u001b[39;49;00m) + Nz * gid(\u001b[34m1\u001b[39;49;00m) + \u001b[34m32\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)] * b[Nz * Ny * gid(\u001b[34m2\u001b[39;49;00m) + Nz * gid(\u001b[34m1\u001b[39;49;00m) + \u001b[34m32\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)] + z;\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "knl = lp.set_options(knl, write_cl=True)\n", + 
"evt, (x,) = knl(queue, a=a, b=b, c=c, z=z)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.881784197001252e-16" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "difference = a - a_true\n", + "np.max(difference.get())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Symbolic representation of code" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, let's inspect the actual instruction `knl` is executing as represented by `loopy` kernel objects:" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Assignment(groups=frozenset(), within_inames=frozenset({'j', 'k_inner', 'k_outer', 'i'}), expression=Sum((Product((Power(Subscript(..., (..., ..., ...)), 2), Subscript(Variable('c'), (Variable('i'), Variable('j'), Sum((..., ...)))))), Variable('z'))), priority=0, depends_on=frozenset(), id='insn', within_inames_is_final=False, temp_var_type=Optional(), depends_on_is_final=False, conflicts_with_groups=frozenset(), boostable=None, atomicity=(), no_sync_with=frozenset(), assignee=Subscript(Variable('a'), (Variable('i'), Variable('j'), Sum((Variable('k_inner'), Product((Variable('k_outer'), 32)))))), tags=frozenset(), boostable_into=None, predicates=frozenset())]" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knl.instructions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This has quite a lot of details - but we're interested in the \"assignee\" and the \"expression\" of the first (and only) instruction:" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Subscript(Variable('a'), (Variable('i'), Variable('j'), 
Sum((Variable('k_inner'), Product((Variable('k_outer'), 32))))))" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knl.instructions[0].assignee" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Sum((Product((Power(Subscript(..., (..., ..., ...)), 2), Subscript(Variable('c'), (Variable('i'), Variable('j'), Sum((..., ...)))))), Variable('z')))" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knl.instructions[0].expression" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is an *expression tree*. We can actually see what's going on if we `print(assignee, '=', expression)`:" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a[i, j, k_inner + k_outer*32] = b[i, j, k_inner + k_outer*32]**2*c[i, j, k_inner + k_outer*32] + z\n" + ] + } + ], + "source": [ + "print(knl.instructions[0].assignee, '=', knl.instructions[0].expression)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is exactly the statement that appears if we print `knl` itself:" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------------------------------------------\n", + "KERNEL: loopy_kernel\n", + "---------------------------------------------------------------------------\n", + "ARGUMENTS:\n", + "Nx: ValueArg, type: \n", + "Ny: ValueArg, type: \n", + "Nz: ValueArg, type: \n", + "a: type: , shape: (Nx, Ny, Nz), dim_tags: (N2:stride:Nz*Ny, N1:stride:Nz, N0:stride:1) aspace: global\n", + "b: type: , shape: (Nx, Ny, Nz), dim_tags: (N2:stride:Nz*Ny, N1:stride:Nz, N0:stride:1) aspace: global\n", 
+ "c: type: , shape: (Nx, Ny, Nz), dim_tags: (N2:stride:Nz*Ny, N1:stride:Nz, N0:stride:1) aspace: global\n", + "z: ValueArg, type: \n", + "---------------------------------------------------------------------------\n", + "DOMAINS:\n", + "[Nx, Ny, Nz] -> { [i, j, k_outer, k_inner] : 0 <= i < Nx and 0 <= j < Ny and k_inner >= 0 and -32k_outer <= k_inner <= 31 and k_inner < Nz - 32k_outer }\n", + "---------------------------------------------------------------------------\n", + "INAME IMPLEMENTATION TAGS:\n", + "i: g.2\n", + "j: g.1\n", + "k_inner: l.0\n", + "k_outer: g.0\n", + "---------------------------------------------------------------------------\n", + "INSTRUCTIONS:\n", + "for j, k_inner, k_outer, i\n", + " \u001b[36ma[i, j, k_inner + k_outer*32]\u001b[0m = \u001b[35mb[i, j, k_inner + k_outer*32]**2*c[i, j, k_inner + k_outer*32] + z\u001b[0m {id=\u001b[32minsn\u001b[0m}\n", + "end j, k_inner, k_outer, i\n", + "---------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "print(knl)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's reproduce this instruction using `pymbolic`. 
First we need to import all the \"primitive\" objects:" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "import pymbolic.primitives as p" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now create some named \"Variables\" (prepending their name with an underscore so they don't overwrite our `pyopencl` arrays):" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "_a = p.Variable('a')\n", + "_b = p.Variable('b')\n", + "_c = p.Variable('c')\n", + "_z = p.Variable('z')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We also need some index variables:" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [], + "source": [ + "i = p.Variable('i')\n", + "j = p.Variable('j')\n", + "k = p.Variable('k')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we index (or \"subscript\") a `Variable`, we get a `Subscript` object:" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Subscript(Variable('a'), (Variable('i'), Variable('j'), Variable('k')))" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "_a[i, j, k]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This matches the `assignee` of the instruction above. 
Let's try the `expression`:" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Sum((Product((Power(Subscript(..., (..., ..., ...)), 2), Subscript(Variable('c'), (Variable('i'), Variable('j'), Variable('k'))))), Variable('z')))" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "_b[i, j, k]**2 * _c[i, j, k] + _z" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "b[i, j, k]**2*c[i, j, k] + z\n" + ] + } + ], + "source": [ + "print(_b[i, j, k]**2 * _c[i, j, k] + _z)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looks good." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The lesson here is that `pymbolic` provides a symbolic way to generate code.\n", + "Rather than inputting strings of instructions to `loopy.make_kernel` (which, as we saw above, are parsed to `pymbolic` expressions by `loopy` behind the scenes!), we can work with the symbolic code directly.\n", + "This unlocks a lot of potential to actually use python as a scripting language to generate the code (which `loopy` uses to subsequently generate OpenCL code).\n", + "`pymbolic` can be thought of as a very simple computer algebra system (it can take derivatives, for instance), but geared toward manipulating and generating code." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. `pystella.ElementWiseMap`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see how `pystella` provides a simpler interface to `loopy` to turn `pymbolic` expressions into kernels.\n", + "The fundamental representation is python's dictionary type---key-value pairs which correspond to assignee-expression pairs." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we'll recreate our same kernel again." + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [], + "source": [ + "import pystella as ps" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "map_dict = {\n", + " _a[i, j, k]: _b[i, j, k]**2 * _c[i, j, k] + _z\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "ewmap = ps.ElementWiseMap(map_dict, dtype='float64', h=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------------------------------------------\n", + "KERNEL: loopy_kernel\n", + "---------------------------------------------------------------------------\n", + "ARGUMENTS:\n", + "Nx: ValueArg, type: np:dtype('int64')\n", + "Ny: ValueArg, type: np:dtype('int64')\n", + "Nz: ValueArg, type: np:dtype('int64')\n", + "a: type: np:dtype('float64'), shape: (Nx, Ny, Nz), dim_tags: (N2:stride:Nz*Ny, N1:stride:Nz, N0:stride:1), offset: aspace: global\n", + "b: type: np:dtype('float64'), shape: (Nx, Ny, Nz), dim_tags: (N2:stride:Nz*Ny, N1:stride:Nz, N0:stride:1), offset: aspace: global\n", + "c: type: np:dtype('float64'), shape: (Nx, Ny, Nz), dim_tags: (N2:stride:Nz*Ny, N1:stride:Nz, N0:stride:1), offset: aspace: global\n", + "z: ValueArg, type: np:dtype('float64')\n", + "---------------------------------------------------------------------------\n", + "DOMAINS:\n", + "[Nx, Ny, Nz] -> { [k_outer, k_inner, j_outer, j_inner, i_outer, i_inner] : i_inner = 0 and k_inner >= 0 and -16k_outer <= k_inner <= 15 and k_inner < Nz - 16k_outer and j_inner >= 0 and -4j_outer <= j_inner <= 3 and j_inner < Ny - 4j_outer and 0 <= i_outer < Nx }\n", + 
"---------------------------------------------------------------------------\n", + "INAME IMPLEMENTATION TAGS:\n", + "i_inner: unr\n", + "i_outer: g.2\n", + "j_inner: l.1\n", + "j_outer: g.1\n", + "k_inner: l.0\n", + "k_outer: g.0\n", + "---------------------------------------------------------------------------\n", + "INSTRUCTIONS:\n", + "for i_inner, j_outer, i_outer, j_inner, k_outer, k_inner\n", + " \u001b[36ma[i_inner + i_outer, j_inner + j_outer*4, k_inner + k_outer*16]\u001b[0m = \u001b[35mb[i_inner + i_outer, j_inner + j_outer*4, k_inner + k_outer*16]**2*c[i_inner + i_outer, j_inner + j_outer*4, k_inner + k_outer*16] + z\u001b[0m {id=\u001b[32minsn\u001b[0m}\n", + "end i_inner, j_outer, i_outer, j_inner, k_outer, k_inner\n", + "---------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "print(ewmap.knl)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's the same kernel! Already parallelized---`ElementWiseMap` implements a default parallelization that works well for these types of operations." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's check the results:" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [], + "source": [ + "evt, _ = ewmap(queue, a=a, b=b, c=c, z=z)" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.881784197001252e-16" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "difference = a - a_true\n", + "np.max(difference.get())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Using `pystella.Field`'s as input to `pystella.ElementWiseMap`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`pystella.Field`'s can make our life even easier.\n", + "Constantly indexing with `[i, j, k]` can get pretty annoying, and can be automated with `pymbolic`'s mapping methods." + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "_a = ps.Field('a')\n", + "_b = ps.Field('b')\n", + "_c = ps.Field('c')\n", + "_z = p.Variable('z')\n", + "\n", + "map_dict = {\n", + " _a: _b**2 * _c + _z\n", + "}\n", + "\n", + "ewmap = ps.ElementWiseMap(map_dict, dtype='float64', h=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------------------------------------------\n", + "KERNEL: loopy_kernel\n", + "---------------------------------------------------------------------------\n", + "ARGUMENTS:\n", + "Nx: ValueArg, type: np:dtype('int64')\n", + "Ny: ValueArg, type: np:dtype('int64')\n", + "Nz: ValueArg, type: np:dtype('int64')\n", + "a: type: np:dtype('float64'), shape: (Nx, Ny, Nz), dim_tags: (N2:stride:Nz*Ny, N1:stride:Nz, N0:stride:1), offset: aspace: global\n", + "b: type: np:dtype('float64'), shape: (Nx, Ny, Nz), dim_tags: (N2:stride:Nz*Ny, N1:stride:Nz, N0:stride:1), offset: aspace: global\n", + "c: type: np:dtype('float64'), shape: (Nx, Ny, Nz), dim_tags: (N2:stride:Nz*Ny, N1:stride:Nz, N0:stride:1), offset: aspace: global\n", + "z: ValueArg, type: np:dtype('float64')\n", + "---------------------------------------------------------------------------\n", + "DOMAINS:\n", + "[Nx, Ny, Nz] -> { [k_outer, k_inner, j_outer, j_inner, i_outer, i_inner] : i_inner = 0 and k_inner >= 0 and -16k_outer <= k_inner <= 15 and k_inner < Nz - 16k_outer and j_inner >= 0 and -4j_outer <= j_inner <= 3 and j_inner < Ny - 4j_outer and 0 <= i_outer < Nx }\n", + 
"---------------------------------------------------------------------------\n", + "INAME IMPLEMENTATION TAGS:\n", + "i_inner: unr\n", + "i_outer: g.2\n", + "j_inner: l.1\n", + "j_outer: g.1\n", + "k_inner: l.0\n", + "k_outer: g.0\n", + "---------------------------------------------------------------------------\n", + "INSTRUCTIONS:\n", + "for i_inner, j_outer, i_outer, j_inner, k_outer, k_inner\n", + " \u001b[36ma[i_inner + i_outer, j_inner + j_outer*4, k_inner + k_outer*16]\u001b[0m = \u001b[35mb[i_inner + i_outer, j_inner + j_outer*4, k_inner + k_outer*16]**2*c[i_inner + i_outer, j_inner + j_outer*4, k_inner + k_outer*16] + z\u001b[0m {id=\u001b[32minsn\u001b[0m}\n", + "end i_inner, j_outer, i_outer, j_inner, k_outer, k_inner\n", + "---------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "print(ewmap.knl)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We made no mention of indices or subscripts, yet the kernels are identical." + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [], + "source": [ + "evt, _ = ewmap(queue, a=a, b=b, c=c, z=z)" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.881784197001252e-16" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "difference = a - a_true\n", + "np.max(difference.get())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To further illustrate why `pystella.Field`'s are useful, consider the (extremely common) case where arrays are padded in each direction.\n", + "This is implemented by passing a value for `offset`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [], + "source": [ + "_a = ps.Field('a', offset='h')\n", + "_b = ps.Field('b', offset='h')\n", + "_c = ps.Field('c', offset='h')\n", + "_z = p.Variable('z')\n", + "\n", + "map_dict = {\n", + " _a: _b**2 * _c + _z\n", + "}\n", + "\n", + "ewmap = ps.ElementWiseMap(map_dict, dtype='float64', h=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------------------------------------------\n", + "KERNEL: loopy_kernel\n", + "---------------------------------------------------------------------------\n", + "ARGUMENTS:\n", + "Nx: ValueArg, type: np:dtype('int64')\n", + "Ny: ValueArg, type: np:dtype('int64')\n", + "Nz: ValueArg, type: np:dtype('int64')\n", + "a: type: np:dtype('float64'), shape: (Nx + 1, Ny + 1, Nz + 1), dim_tags: (N2:stride:(Nz + 1)*(Ny + 1), N1:stride:Nz + 1, N0:stride:1), offset: aspace: global\n", + "b: type: np:dtype('float64'), shape: (Nx + 1, Ny + 1, Nz + 1), dim_tags: (N2:stride:(Nz + 1)*(Ny + 1), N1:stride:Nz + 1, N0:stride:1), offset: aspace: global\n", + "c: type: np:dtype('float64'), shape: (Nx + 1, Ny + 1, Nz + 1), dim_tags: (N2:stride:(Nz + 1)*(Ny + 1), N1:stride:Nz + 1, N0:stride:1), offset: aspace: global\n", + "z: ValueArg, type: np:dtype('float64')\n", + "---------------------------------------------------------------------------\n", + "DOMAINS:\n", + "[Nx, Ny, Nz] -> { [k_outer, k_inner, j_outer, j_inner, i_outer, i_inner] : i_inner = 0 and k_inner >= 0 and -16k_outer <= k_inner <= 15 and k_inner < Nz - 16k_outer and j_inner >= 0 and -4j_outer <= j_inner <= 3 and j_inner < Ny - 4j_outer and 0 <= i_outer < Nx }\n", + "---------------------------------------------------------------------------\n", + "INAME IMPLEMENTATION TAGS:\n", + "i_inner: unr\n", + "i_outer: g.2\n", + "j_inner: l.1\n", + "j_outer: 
g.1\n", + "k_inner: l.0\n", + "k_outer: g.0\n", + "---------------------------------------------------------------------------\n", + "INSTRUCTIONS:\n", + "for i_inner, j_outer, i_outer, j_inner, k_outer, k_inner\n", + " \u001b[36ma[1 + i_inner + i_outer, 1 + j_inner + j_outer*4, 1 + k_inner + k_outer*16]\u001b[0m = \u001b[35mb[1 + i_inner + i_outer, 1 + j_inner + j_outer*4, 1 + k_inner + k_outer*16]**2*c[1 + i_inner + i_outer, 1 + j_inner + j_outer*4, 1 + k_inner + k_outer*16] + z\u001b[0m {id=\u001b[32minsn\u001b[0m}\n", + "end i_inner, j_outer, i_outer, j_inner, k_outer, k_inner\n", + "---------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "print(ewmap.knl)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This would get quite cumbersome to type out manually, and it's easy to forget which arrays should be padded.\n", + "From experience, it can be difficult to see errors in array indexing." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Indexer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Behind the scenes, `ElementWiseMap` is calling `Indexer`:" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "f\n", + "f[i + h, j + 3, k + 2]\n" + ] + } + ], + "source": [ + "_f = ps.Field('f', offset=('h', 3, 2))\n", + "print(_f)\n", + "print(ps.Indexer(_f))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exercise: computing spatial gradients with `pystella.Field` and the `pystella.Stencil` kernel generator" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`Field`'s also have a `shift` method, which does what it sounds like:" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "a[i + h + 1, j + h, k + h]\n" + ] + } + ], + "source": [ + "print(ps.Indexer(_a.shift((1, 0, 0))))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For this type of kernel, `pystella.Stencil` provides good parallelization (by allowing arrays to be *prefetched* into so-called \"shared\" memory)." + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [], + "source": [ + "f = ps.Field('f', offset=1)\n", + "\n", + "dfdx = ps.Field('dfdx', offset=0)\n", + "dfdy = ps.Field('dfdy', offset=0)\n", + "dfdz = ps.Field('dfdz', offset=0)\n", + "\n", + "dx = p.Variable('dx')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Fill in `map_dict` below to compute the second-order centered-difference approximation to the gradient of `f`:" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [], + "source": [ + "map_dict = {\n", + " dfdx: (f.shift((1, 0, 0)) - f.shift((-1, 0, 0))) / 2 / dx,\n", + " dfdy: (f.shift((0, 1, 0)) - f.shift((0, -1, 0))) / 2 / dx,\n", + " dfdz: (f.shift((0, 0, 1)) - f.shift((0, 0, -1))) / 2 / dx,\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [], + "source": [ + "stencil = ps.Stencil(map_dict, prefetch_args=['f'], h=1, dtype='float64')" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------------------------------------------\n", + "KERNEL: loopy_kernel\n", + "---------------------------------------------------------------------------\n", + "ARGUMENTS:\n", + "Nx: ValueArg, type: np:dtype('int64')\n", + "Ny: ValueArg, type: np:dtype('int64')\n", + "Nz: ValueArg, type: np:dtype('int64')\n", + "dfdx: type: np:dtype('float64'), shape: (Nx, Ny, Nz), dim_tags: (N2:stride:Nz*Ny, N1:stride:Nz, N0:stride:1), offset: aspace: 
global\n", + "dfdy: type: np:dtype('float64'), shape: (Nx, Ny, Nz), dim_tags: (N2:stride:Nz*Ny, N1:stride:Nz, N0:stride:1), offset: aspace: global\n", + "dfdz: type: np:dtype('float64'), shape: (Nx, Ny, Nz), dim_tags: (N2:stride:Nz*Ny, N1:stride:Nz, N0:stride:1), offset: aspace: global\n", + "dx: ValueArg, type: np:dtype('float64')\n", + "f: type: np:dtype('float64'), shape: (2 + Nx, 2 + Ny, 2 + Nz), dim_tags: (N2:stride:(2 + Nz)*(2 + Ny), N1:stride:2 + Nz, N0:stride:1), offset: aspace: global\n", + "---------------------------------------------------------------------------\n", + "DOMAINS:\n", + "[Nx, Ny, Nz] -> { [k_outer, k_inner, j_outer, j_inner, i_outer, i_inner, f_dim_0, f_dim_1, f_dim_2] : k_outer >= 0 and 0 <= k_inner <= 7 and k_inner < Nz - 8k_outer and j_outer >= 0 and 0 <= j_inner <= 7 and j_inner < Ny - 8j_outer and i_outer >= 0 and 0 <= i_inner <= 7 and i_inner < Nx - 8i_outer and 0 <= f_dim_0 <= 9 and f_dim_0 <= 1 + Nx - 8i_outer and 0 <= f_dim_1 <= 9 and f_dim_1 <= 1 + Ny - 8j_outer and 0 <= f_dim_2 <= 9 and f_dim_2 <= 1 + Nz - 8k_outer }\n", + "---------------------------------------------------------------------------\n", + "INAME IMPLEMENTATION TAGS:\n", + "f_dim_0: l.2\n", + "f_dim_1: l.1\n", + "f_dim_2: l.0\n", + "i_inner: l.2\n", + "i_outer: g.2\n", + "j_inner: l.1\n", + "j_outer: g.1\n", + "k_inner: l.0\n", + "k_outer: g.0\n", + "---------------------------------------------------------------------------\n", + "TEMPORARIES:\n", + "_f: type: np:dtype('float64'), shape: (f_dim_0:10, f_dim_1:10, f_dim_2:10), dim_tags: (N2:stride:100, N1:stride:10, N0:stride:1) scope:auto\n", + "---------------------------------------------------------------------------\n", + "INSTRUCTIONS:\n", + " for j_outer, k_outer, i_outer, f_dim_1, f_dim_0, f_dim_2\n", + "↱ \u001b[36m_f[f_dim_0, f_dim_1, f_dim_2]\u001b[0m = \u001b[35mf[f_dim_0 + 8*i_outer, f_dim_1 + 8*j_outer, f_dim_2 + 8*k_outer]\u001b[0m {id=\u001b[32mf_fetch_rule\u001b[0m}\n", + "│ end f_dim_1, f_dim_0, 
f_dim_2\n", + "│ for i_inner, k_inner, j_inner\n", + "├↱ \u001b[36mdfdx[i_inner + i_outer*8, j_inner + j_outer*8, k_inner + k_outer*8]\u001b[0m = \u001b[35m((_f[2 + i_inner, 1 + j_inner, 1 + k_inner] + (-1)*_f[i_inner, 1 + j_inner, 1 + k_inner]) / 2) / dx\u001b[0m {id=\u001b[32minsn\u001b[0m}\n", + "├└↱ \u001b[36mdfdy[i_inner + i_outer*8, j_inner + j_outer*8, k_inner + k_outer*8]\u001b[0m = \u001b[35m((_f[1 + i_inner, 2 + j_inner, 1 + k_inner] + (-1)*_f[1 + i_inner, j_inner, 1 + k_inner]) / 2) / dx\u001b[0m {id=\u001b[32minsn_0\u001b[0m}\n", + "└ └ \u001b[36mdfdz[i_inner + i_outer*8, j_inner + j_outer*8, k_inner + k_outer*8]\u001b[0m = \u001b[35m((_f[1 + i_inner, 1 + j_inner, 2 + k_inner] + (-1)*_f[1 + i_inner, 1 + j_inner, k_inner]) / 2) / dx\u001b[0m {id=\u001b[32minsn_1\u001b[0m}\n", + " end j_outer, k_outer, i_outer, i_inner, k_inner, j_inner\n", + "---------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "print(stencil.knl)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also do something more complicated by inputting a `tmp_dict`, which computes temporary values (that don't get stored in global arrays) before executing the assignments specified by `map_dict`:" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": {}, + "outputs": [], + "source": [ + "f = ps.Field('f', offset=1)\n", + "g = ps.Field('g', offset=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [], + "source": [ + "tmp = p.Variable('tmp')" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [], + "source": [ + "tmp_dict = {}\n", + "for i in range(3):\n", + " shift = [0, 0, 0]\n", + " shift[i] = 1\n", + " expr = f.shift(tuple(shift))\n", + " shift[i] = - 1\n", + " expr += f.shift(tuple(shift))\n", + " tmp_dict[tmp[i]] = expr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"Let's check what we just did:" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tmp[0] = f[i + 1 + 1, j + 1, k + 1] + f[i + 1 + -1, j + 1, k + 1]\n", + "tmp[1] = f[i + 1, j + 1 + 1, k + 1] + f[i + 1, j + 1 + -1, k + 1]\n", + "tmp[2] = f[i + 1, j + 1, k + 1 + 1] + f[i + 1, j + 1, k + 1 + -1]\n" + ] + } + ], + "source": [ + "for key, value in tmp_dict.items():\n", + " print(key, '=', ps.Indexer(value))" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [], + "source": [ + "map_dict = {\n", + " g: tmp[0] * tmp[1] * tmp[2]\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------------------------------------------\n", + "KERNEL: loopy_kernel\n", + "---------------------------------------------------------------------------\n", + "ARGUMENTS:\n", + "Nx: ValueArg, type: np:dtype('int64')\n", + "Ny: ValueArg, type: np:dtype('int64')\n", + "Nz: ValueArg, type: np:dtype('int64')\n", + "f: type: np:dtype('float64'), shape: (2 + Nx, 2 + Ny, 2 + Nz), dim_tags: (N2:stride:(2 + Nz)*(2 + Ny), N1:stride:2 + Nz, N0:stride:1), offset: aspace: global\n", + "g: type: np:dtype('float64'), shape: (1 + Nx, 1 + Ny, 1 + Nz), dim_tags: (N2:stride:(1 + Nz)*(1 + Ny), N1:stride:1 + Nz, N0:stride:1), offset: aspace: global\n", + "---------------------------------------------------------------------------\n", + "DOMAINS:\n", + "[Nx, Ny, Nz] -> { [k_outer, k_inner, j_outer, j_inner, i_outer, i_inner, f_dim_0, f_dim_1, f_dim_2] : k_outer >= 0 and 0 <= k_inner <= 7 and k_inner < Nz - 8k_outer and j_outer >= 0 and 0 <= j_inner <= 7 and j_inner < Ny - 8j_outer and i_outer >= 0 and 0 <= i_inner <= 7 and i_inner < Nx - 8i_outer and 0 <= f_dim_0 <= 9 and f_dim_0 <= 1 + Nx - 8i_outer and 0 <= f_dim_1 
<= 9 and f_dim_1 <= 1 + Ny - 8j_outer and 0 <= f_dim_2 <= 9 and f_dim_2 <= 1 + Nz - 8k_outer }\n", + "---------------------------------------------------------------------------\n", + "INAME IMPLEMENTATION TAGS:\n", + "f_dim_0: l.2\n", + "f_dim_1: l.1\n", + "f_dim_2: l.0\n", + "i_inner: l.2\n", + "i_outer: g.2\n", + "j_inner: l.1\n", + "j_outer: g.1\n", + "k_inner: l.0\n", + "k_outer: g.0\n", + "---------------------------------------------------------------------------\n", + "TEMPORARIES:\n", + "_f: type: np:dtype('float64'), shape: (f_dim_0:10, f_dim_1:10, f_dim_2:10), dim_tags: (N2:stride:100, N1:stride:10, N0:stride:1) scope:auto\n", + "tmp: type: , shape: (3), dim_tags: (N0:stride:1) scope:auto\n", + "---------------------------------------------------------------------------\n", + "INSTRUCTIONS:\n", + " for j_outer, k_outer, i_outer, f_dim_1, f_dim_0, f_dim_2\n", + "↱ \u001b[36m_f[f_dim_0, f_dim_1, f_dim_2]\u001b[0m = \u001b[35mf[f_dim_0 + 8*i_outer, f_dim_1 + 8*j_outer, f_dim_2 + 8*k_outer]\u001b[0m {id=\u001b[32mf_fetch_rule\u001b[0m}\n", + "│ end f_dim_1, f_dim_0, f_dim_2\n", + "│ for i_inner, k_inner, j_inner\n", + "├↱ \u001b[36mtmp[0]\u001b[0m = \u001b[35m_f[2 + i_inner, 1 + j_inner, 1 + k_inner] + _f[i_inner, 1 + j_inner, 1 + k_inner]\u001b[0m {id=\u001b[32minsn\u001b[0m}\n", + "├└↱ \u001b[36mtmp[1]\u001b[0m = \u001b[35m_f[1 + i_inner, 2 + j_inner, 1 + k_inner] + _f[1 + i_inner, j_inner, 1 + k_inner]\u001b[0m {id=\u001b[32minsn_0\u001b[0m}\n", + "└↱└ \u001b[36mtmp[2]\u001b[0m = \u001b[35m_f[1 + i_inner, 1 + j_inner, 2 + k_inner] + _f[1 + i_inner, 1 + j_inner, k_inner]\u001b[0m {id=\u001b[32minsn_1\u001b[0m}\n", + " └ \u001b[36mg[1 + i_inner + i_outer*8, 1 + j_inner + j_outer*8, 1 + k_inner + k_outer*8]\u001b[0m = \u001b[35mtmp[0]*tmp[1]*tmp[2]\u001b[0m {id=\u001b[32minsn_2\u001b[0m}\n", + " end j_outer, k_outer, i_outer, i_inner, k_inner, j_inner\n", + "---------------------------------------------------------------------------\n" + ] + } + ], + 
"source": [ + "stencil = ps.Stencil(map_dict, tmp_dict=tmp_dict, prefetch_args=['f'], h=1, dtype='float64')\n", + "\n", + "print(stencil.knl)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": false, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": false, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/phi_chi.py b/examples/phi_chi.py new file mode 100644 index 0000000..1a69f8d --- /dev/null +++ b/examples/phi_chi.py @@ -0,0 +1,182 @@ +import numpy as np +import pyopencl as cl +import pyopencl.array as cla +import pystella as ps +# pylint: disable=no-member + +# set parameters +grid_shape = (128, 128, 128) +proc_shape = (1, 1, 1) +rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape)) +grid_size = np.product(grid_shape) + +h = 2 +nscalars = 2 +pencil_shape = tuple(ni + 2 * h for ni in rank_shape) + +box_dim = (5, 5, 5) +volume = np.product(box_dim) +dx = tuple(Li / Ni for Li, Ni in zip(box_dim, grid_shape)) +dk = tuple(2 * np.pi / Li for Li in box_dim) +kappa = 1/10 +dt = kappa * min(dx) + +dtype = np.float64 +nscalars = 2 +mpl = 1 # change to np.sqrt(8 * np.pi) for reduced Planck mass units +mphi = 1.e-6 * mpl # units of mpl +mchi = [0, 0] # units of mpl +gsq = [2.5e-7 / mphi**2, 1.25e-7 / mphi**2] +f0 = [.193 * mpl, 0, 0] # units of mpl +df0 = [-.142231 * mpl, 
0, 0] # units of mpl +end_time = 1 +end_scale_factor = 20 +Stepper = ps.RungeKutta4 + +ctx = ps.choose_device_and_make_context() +queue = cl.CommandQueue(ctx) + +decomp = ps.DomainDecomposition(proc_shape, h, rank_shape) +derivs = ps.GradientLaplacian(decomp, h, dx) + + +def potential(f): + phi, chi = f[0], [f[i] for i in range(1, nscalars)] + return 1/2 * phi**2 \ + + 1/2 * sum([mchi[i]**2 * chi[i]**2 for i, _ in enumerate(chi)]) \ + + 1/2 * phi**2 * sum([gsq[i] * chi[i]**2 for i, _ in enumerate(chi)]) + + +scalar_sector = ps.ScalarSector(nscalars, potential=potential) + +# create energy computation function +from pystella.sectors import get_rho_and_p +reduce_energy = ps.Reduction(decomp, scalar_sector, h=h, + rank_shape=rank_shape, grid_size=grid_size, + callback=get_rho_and_p) + + +def compute_energy(f_s, df_s, lap_f, a): + derivs(queue, fx=f_s, lap=lap_f) + + return reduce_energy(queue, f=f_s, dfdt=df_s, lap_f=lap_f, a=np.array(a)) + + +stepper = Stepper(scalar_sector, h=h, dt=dt) + +# create output function +if decomp.rank == 0: + from pystella.output import OutputFile + out = OutputFile(ctx=ctx, runfile=__file__) +else: + out = None +statistics = ps.FieldStatistics(decomp, h, rank_shape=rank_shape, + grid_size=grid_size) +fft = ps.DFT(decomp, ctx, queue, grid_shape, dtype) +spectra = ps.PowerSpectra(decomp, fft, dk, volume) +projector = ps.Projector(fft, h) + + +def output(step_count, t, f, dfdt, lap_f, energy, expand): + if step_count % 2 == 0: + f_stats = statistics(f[0]) + f_stats['mean'] /= mpl + f_stats['variance'] /= mpl**2 + + true_energy = {} + for key, val in energy.items(): + true_energy[key] = val / expand.a[0]**2 / mpl**2 + + if decomp.rank == 0: + out.output('energy', t=t, a=expand.a[0], + adot=expand.adot[0]/expand.a[0], + hubble=expand.hubble[0]/expand.a[0], + **true_energy, + eos=energy['pressure']/energy['total'], + constraint=expand.constraint(energy['total']) + ) + + out.output('statistics/f', t=t, a=expand.a[0], **f_stats) + + if 
expand.a[0] / output.a_last_spec >= 1.02: + output.a_last_spec = expand.a[0] + + scalar_spectra = spectra(f[0]) + + if decomp.rank == 0: + out.output('spectra', t=t, a=expand.a[0], + spectra=scalar_spectra) + + +output.a_last_spec = .1 + +# create cl arrays +f = cla.empty(queue, (3, nscalars,)+pencil_shape, dtype=dtype) +dfdt = cla.empty(queue, (3, nscalars,)+pencil_shape, dtype=dtype) +lap_f = cla.empty(queue, (nscalars,)+rank_shape, dtype=dtype) + +# set field means +for i in range(nscalars): + f[0, i] = f0[i] + dfdt[0, i] = df0[i] + +# compute energy of background fields and initialize expansion +energy = compute_energy(f[0], dfdt[0], lap_f, 1.) +expand = ps.Expansion(energy['total'], Stepper, mpl=mpl) + +# compute hubble correction to scalar field effective mass +addot = expand.addot_friedmann_2(expand.a[0], energy['total'], energy['pressure']) +hubbleCorrection = - addot / expand.a[0] + +# effective masses of scalar fields +from pymbolic import var +from pymbolic.mapper.evaluator import evaluate_kw +fields = [var('f0')[i] for i in range(nscalars)] +d2Vd2f = [ps.diff(potential(fields), field, field) for field in fields] +eff_mass = [evaluate_kw(x, f0=f0) + hubbleCorrection for x in d2Vd2f] + +modes = ps.RayleighGenerator(ctx, fft, dk, volume, seed=13298*(decomp.rank+1)) + +for fld in range(nscalars): + modes.init_WKB_fields(f[0, fld], dfdt[0, fld], norm=mphi**2, + omega_k=lambda k: np.sqrt(k**2 + eff_mass[fld]), + hubble=expand.hubble[0]) + +for i in range(nscalars): + f[0, i] += f0[i] + dfdt[0, i] += df0[i] + +# re-initialize energy and expansion +energy = compute_energy(f[0], dfdt[0], lap_f, expand.a[0]) +expand = ps.Expansion(energy['total'], Stepper, mpl=mpl) + +# output first slice +output(0, 0., f, dfdt, lap_f, energy, expand) + +# evolution +t = 0. 
+step_count = 0 + +if decomp.rank == 0: + print(energy) + +from time import time +start = time() +last_out = time() + +while t < end_time and expand.a[0] < end_scale_factor: + for s in range(stepper.num_stages): + stepper(s, queue=queue, a=expand.a, hubble=expand.hubble, + f=f, dfdt=dfdt, lap_f=lap_f) + expand.step(s, energy['total'], energy['pressure'], dt) + q = 0 if s == 3 else 1 + energy = compute_energy(f[q], dfdt[q], lap_f, expand.a[q]) + + t += dt + step_count += 1 + output(step_count, t, f, dfdt, lap_f, energy, expand) + if time() - last_out > 6: + last_out = time() + ms_per_step = (last_out - start) * 1e3 / step_count + if decomp.rank == 0: + print(t, expand.a[0], ms_per_step, 1e3/ms_per_step, sep='\t') diff --git a/examples/wave-equation.py b/examples/wave-equation.py new file mode 100644 index 0000000..2bf507a --- /dev/null +++ b/examples/wave-equation.py @@ -0,0 +1,45 @@ +import pyopencl as cl +import pyopencl.array as cla +import pyopencl.clrandom as clr +import pystella as ps + +# set parameters +grid_shape = (128, 128, 128) +proc_shape = (1, 1, 1) +rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape)) +h = 1 +dtype = 'float64' +dx = tuple(10 / Ni for Ni in grid_shape) +dt = dx[0] / 10 + +# create pyopencl context, queue, and halo-sharer +ctx = ps.choose_device_and_make_context() +queue = cl.CommandQueue(ctx) +decomp = ps.DomainDecomposition(proc_shape, h, rank_shape) + +# initialize arrays with random data +f = clr.rand(queue, tuple(ni + 2 * h for ni in rank_shape), dtype) +dfdt = clr.rand(queue, tuple(ni + 2 * h for ni in rank_shape), dtype) +lap_f = cla.zeros(queue, rank_shape, dtype) +# temporary array for low-storage integrator +k_tmp = cla.zeros(queue, (2,) + rank_shape, dtype) + +# define system of equations +f_ = ps.DynamicField('f', offset='h') # don't overwrite f +rhs_dict = { + f_: f_.dot, # df/dt = \dot{f} + f_.dot: f_.lap # d\dot{f}/dt = \nabla^2 f +} +args = ps.get_field_args(rhs_dict) # infer argument information from 
rhs_dict + +# create time-stepping and derivative-computing kernels +stepper = ps.LowStorageRK54(rhs_dict, k_tmp, args=args, dt=dt, h=h) +derivs = ps.GradientLaplacian(decomp, h, dx) + +t = 0. +# loop over time +while t < 10.: + for s in range(stepper.num_stages): + derivs(queue, fx=f, lap=lap_f) + stepper(s, queue=queue, f=f, dfdt=dfdt, lap_f=lap_f) + t += dt diff --git a/pystella/__init__.py b/pystella/__init__.py new file mode 100644 index 0000000..c259bf7 --- /dev/null +++ b/pystella/__init__.py @@ -0,0 +1,136 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + + +from pystella.field import Field, DynamicField, Indexer, diff, get_field_args +from pystella.sectors import Sector, ScalarSector, TensorPerturbationSector +from pystella.elementwise import ElementWiseMap +from pystella.stencil import Stencil, StreamingStencil +from pystella.reduction import Reduction, FieldStatistics +from pystella.step import (RungeKutta4, RungeKutta3SSP, RungeKutta3Heun, + RungeKutta3Nystrom, RungeKutta3Ralston, + RungeKutta2Midpoint, RungeKutta2Ralston, LowStorageRK54, + LowStorageRK3Williamson, LowStorageRK3Inhomogeneous, + LowStorageRK3SSP) +from pystella.derivs import GradientLaplacian +from pystella.decomp import DomainDecomposition +from pystella.expansion import Expansion +from pystella.fourier import (DFT, RayleighGenerator, Projector, PowerSpectra, + SpectralGradientLaplacian) + +from loopy import set_caching_enabled +set_caching_enabled(True) + + +def choose_device_and_make_context(platform_choice=None, device_choice=None): + """ + A wrapper to choose a device and create a :class:`pyopencl.Context` on + a particular device. + + :arg platform_choice: An integer specifying which element of the + :class:`list` returned by :func:`pyopencl.get_platforms` to choose. + Defaults to *None*, in which case an NVIDIA platform is preferred. + If one is not found, then the first platform is chosen. + + :arg device_choice: An integer specifying which device to run on. + Defaults to *None*, in which case a device is chosen according to any + available environment variable defining the local MPI rank (defaulting to 0). + Currently only looks for OpenMPI and MVAPICH environment variables. + + :returns: A :class:`pyopencl.Context`. 
+ """ + + import pyopencl as cl + + # look for NVIDIA platform + platform = None + platforms = cl.get_platforms() + if platform_choice is None: + for i, plt in enumerate(platforms): + if 'NVIDIA' in plt.name: + platform = plt + platform = platform or platforms[0] + else: + platform = platforms[platform_choice] + + devices = platform.get_devices() + try: + # sort devices based on their unique pci bus id + devices = sorted(devices, key=lambda dev: dev.pci_bus_id_nv) + except: # noqa + pass + num_devices = len(devices) + + import os + local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK', + os.getenv('MV2_COMM_WORLD_LOCAL_RANK', 0))) + choice = device_choice or (local_rank % num_devices) + + return cl.Context([devices[choice]]) + + +class DisableLogging(): # silence logging warning + def __enter__(self): + import logging + self.original_level = logging.getLogger().getEffectiveLevel() + logging.disable(logging.CRITICAL) + + def __exit__(self, exception_type, exception_value, traceback): + import logging + logging.disable(self.original_level) + + +__all__ = [ + "Field", + "DynamicField", + "Indexer", + "diff", + "get_field_args", + "Sector", + "ScalarSector", + "TensorPerturbationSector", + "ElementWiseMap", + "RungeKutta4", + "RungeKutta3SSP", + "RungeKutta3Heun", + "RungeKutta3Nystrom", + "RungeKutta3Ralston", + "RungeKutta2Midpoint", + "RungeKutta2Ralston", + "LowStorageRK54", + "LowStorageRK3Williamson", + "LowStorageRK3Inhomogeneous", + "LowStorageRK3SSP", + "Stencil", + "StreamingStencil", + "GradientLaplacian", + "Reduction", + "FieldStatistics", + "DomainDecomposition", + "Expansion", + "DFT", + "RayleighGenerator", + "Projector", + "PowerSpectra", + "SpectralGradientLaplacian", + "choose_device_and_make_context", +] diff --git a/pystella/decomp.py b/pystella/decomp.py new file mode 100644 index 0000000..9f03fcb --- /dev/null +++ b/pystella/decomp.py @@ -0,0 +1,585 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is 
hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import numpy as np +import pyopencl.array as cla +import loopy as lp + + +class DomainDecomposition: + """ + Implements functions needed for the MPI domain decomposition of a 3D grid. + + If :mod:`mpi4py` is not installed, then only single-rank operation is supported. + + .. automethod:: __init__ + .. automethod:: share_halos + .. automethod:: remove_halos + .. automethod:: gather_array + .. automethod:: restore_halos + .. automethod:: scatter_array + .. automethod:: rankID + .. autoattribute:: rank_tuple + .. automethod:: bcast + .. automethod:: allreduce + + .. attribute:: comm + + An :class:`mpi4py.MPI.COMM_WORLD` if :mod:`mpi4py` is installed, else *None*. + + .. attribute:: rank + + The integral rank of the calling process, i.e., that returned by + :meth:`mpi4py.MPI.COMM_WORLD.Get_rank`. + + .. attribute:: nranks + + The total number of ranks, i.e., that returned by + :meth:`mpi4py.MPI.COMM_WORLD.Get_size`. + + .. attribute:: proc_shape + + .. 
attribute:: rank_shape + """ + + def __init__(self, proc_shape, h, rank_shape=None): + """ + :arg queue: The :class:`pyopencl.CommandQueue` to enqueue kernels and copies. + + :arg proc_shape: A 3-:class:`tuple` specifying the shape of the MPI + processor grid. + + .. note:: + + Currently, ``proc_shape[2]`` must be ``1``, i.e., only + two-dimensional domain decompositions are supported. + + :arg h: The number of halo padding layers on each face of the numerical grid. + + The following keyword arguments are recognized: + + :arg rank_shape: A 3-:class:`tuple` specifying the shape of the computational + sub-grid on the calling process. + Defaults to *None*, in which case the global size is not fixed (and + will be inferred when, e.g., :meth:`share_halos` is called, at a slight + performance penalty). + + :raises NotImplementedError: if ``proc_shape[2] != 1``. + + :raises ValueError: if the size of the processor grid + ``proc_shape[0] * proc_shape[1] * proc_shape[2]`` is not equal to the + total number of ranks the application was launched with + (i.e., that returned by :func:`mpi4py.MPI.COMM_WORLD.Get_size()`). 
+ """ + + self.proc_shape = proc_shape + self.h = h + self.buffer_arrays = {} + self.rank_shape = rank_shape + + try: + from mpi4py import MPI + self.comm = MPI.COMM_WORLD + self.rank = self.comm.Get_rank() + self.nranks = self.comm.Get_size() + except ModuleNotFoundError: + self.comm = None + self.rank = 0 + self.nranks = 1 + + if proc_shape[2] != 1: + raise NotImplementedError("decomposition in z not yet supported") + + if proc_shape[0] * proc_shape[1] * proc_shape[2] != self.nranks: + raise ValueError( + "%s is an invalid decomposition for %d ranks" + % (str(proc_shape), self.nranks)) + + self.rz = self.rank % proc_shape[2] + self.ry = (self.rank - self.rz) // proc_shape[2] % proc_shape[1] + self.rx = (self.rank - self.rz - proc_shape[2] * self.ry) // proc_shape[1] + + params_to_fix = {'h': self.h} + if self.rank_shape is not None: + for k, v in zip(('Nx', 'Ny', 'Nz'), self.rank_shape): + params_to_fix[k] = v + + pencil_shape_str = "(Nx+2*h, Ny+2*h, Nz+2*h)" + + def x_comm_knl(instructions): + knl = lp.make_kernel( + "[Ny, Nz, h] \ + -> { [i,j,k]: 0<=i { [i,j,k]: 0<=i { [i,j,k]: 0<=i { [i,j,k]: 0<=i>> f = Field('f', offset='h') + >>> coefs = {(1, 0, 0): 1, (-1, 0, 0): -1} + >>> stencil = expand_stencil(f, coefs) + >>> print(Indexer(stencil)) + f[i + h + 1, j + h, k + h] + (-1)*f[i + h + -1, j + h, k + h] + """ + + return sum([c * f.shift(offset) for offset, c in coefs.items()]) + + +def centered_diff(f, coefs, direction, order): + """ + A convenience wrapper to :func:`expand_stencil` for computing centered + differences. By assuming the symmetry of the stencil (which has parity given + by the parity of ``order``), no redundant coefficients need to be supplied. + Further, by supplying the ``direction`` parameter, the input offset (the keys + of ``coefs``) need only be integers. + + :arg f: A :class:`~pystella.Field`. + + :arg coefs: A :class:`dict` whose values are the coefficients of the stencil + at an offset given by the key. 
The keys must be integers, and the + values may be :mod:`pymbolic` expressions or constants. Only + non-redundant ``(offset, coefficient)`` pairs are needed. + + :arg direction: An integer in ``(0, 1, 2)`` denoting the direction over which + to expand the stencil (i.e., to apply the offset). + + :arg order: The order of the derivative being computed, which determines + whether coefficients at the opposite offset have the same or opposite + sign. + + Example:: + + >>> f = Field('f', offset='h') + >>> coefs = {1: 1} + >>> stencil = centered_diff(f, coefs, 0, 1) + >>> print(Indexer(stencil)) + f[i + h + 1, j + h, k + h] + (-1)*f[i + h + -1, j + h, k + h] + """ + + all_coefs = {} + for s, c in coefs.items(): + offset = [0, 0, 0] + + # skip central point (s == 0) for odd order + if s != 0 or order % 2 == 0: + offset[direction-1] = s + all_coefs[tuple(offset)] = c + + # add the opposite point + if s != 0: + offset[direction-1] = - s + all_coefs[tuple(offset)] = (-1)**order * c + + return expand_stencil(f, all_coefs) + + +class FiniteDifferenceStencil: + coefs = NotImplemented + truncation_order = NotImplemented + order = NotImplemented + is_centered = NotImplemented + + def __call__(self, f, direction): + if self.is_centered: + return centered_diff(f, self.coefs, direction, self.order) + else: + return expand_stencil(f, self.coefs) + + def get_eigenvalues(self, k, dx): + raise NotImplementedError + + +_grad_coefs = {} +_grad_coefs[1] = {1: 1/2} +_grad_coefs[2] = {1: 8/12, 2: -1/12} +_grad_coefs[3] = {1: 45/60, 2: -9/60, 3: 1/60} +_grad_coefs[4] = {1: 672/840, 2: -168/840, 3: 32/840, 4: -3/840} + + +class FirstCenteredDifference(FiniteDifferenceStencil): + def __init__(self, h): + self.coefs = _grad_coefs[h] + self.truncation_order = 2 * h + self.order = 1 + self.is_centered = True + + def get_eigenvalues(self, k, dx): + import numpy as np + th = k * dx + if self.truncation_order == 2: + return np.sin(th) / dx + if self.truncation_order == 4: + return (8 * np.sin(th) - 
np.sin(2 * th)) / (6 * dx) + if self.truncation_order == 6: + return (45 * np.sin(th) - 9 * np.sin(2 * th) + + np.sin(3 * th) + ) / (30 * dx) + if self.truncation_order == 8: + return (672 * np.sin(th) - 168 * np.sin(2 * th) + + 32 * np.sin(3 * th) - 3 * np.sin(4 * th) + ) / (420 * dx) + else: + return k + + +_lap_coefs = {} +_lap_coefs[1] = {0: -2, 1: 1} +_lap_coefs[2] = {0: -30/12, 1: 16/12, 2: -1/12} +_lap_coefs[3] = {0: -490/180, 1: 270/180, 2: -27/180, 3: 2/180} +_lap_coefs[4] = {0: -14350/5040, 1: 8064/5040, 2: -1008/5040, + 3: 128/5040, 4: -9/5040} + + +class SecondCenteredDifference(FiniteDifferenceStencil): + def __init__(self, h): + self.coefs = _lap_coefs[h] + self.truncation_order = 2 * h + self.order = 2 + self.is_centered = True + + def get_eigenvalues(self, k, dx): + import numpy as np + th = k * dx + if self.truncation_order == 2: + return (2 * np.cos(th) - 2) / dx**2 + elif self.truncation_order == 4: + return (32 * np.cos(th) - 2 * np.cos(2 * th) - 30) / (12 * dx**2) + elif self.truncation_order == 6: + return (90 * np.cos(th) - 9 * np.cos(2 * th) + + 2/3 * np.cos(3 * th) - 245/3 + ) / (30 * dx**2) + elif self.truncation_order == 8: + return (1344 * np.cos(th) - 168 * np.cos(2 * th) + + 64/3 * np.cos(3 * th) - 3/2 * np.cos(4 * th) - 7175/6 + ) / (420 * dx**2) + else: + return - k**2 + + +class GradientLaplacian: + """ + A convenience class for generating kernels which compute spatial gradients, + Laplacians, and combinations thereof. + + See :class:`SpectralGradientLaplacian` for a version of this + class implementing spectral collocation. + + .. automethod:: __init__ + .. automethod:: __call__ + """ + + def __init__(self, decomp, h, dx, **kwargs): + """ + The following arguments are required: + + :arg decomp: An instance of :class:`DomainDecomposition`. + + :arg h: The number of halo padding layers on each face of the numerical grid. + + :arg dx: A 3-:class:`tuple` specifying the grid spacing of each axis. 
+ + The following keyword-only arguments are recognized: + + :arg first_stencil: A :class:`callable` with signature + ``(f, direction)`` where f is a :class:`Field` and ``direction`` + indicates the spatial axis (1, 2, or 3) along which the stencil is taken, + returning the (symbolic) first-order stencil. + Defaults to the centered-difference of the highest order allowed + by the amount of array padding (set by :attr:`h`). + See :func:`~pystella.derivs.expand_stencil`. + + :arg second_stencil: Like ``first_stencil``, but for the second-order + differences. + + :arg rank_shape: A 3-:class:`tuple` specifying the global size of every + kernel call. + Defaults to *None*, in which case the global size is not fixed (and + will be inferred when the kernel is called, at a slight performance + penalty). + + .. ifconfig:: not on_rtd + + :arg stream: Whether to use :class:`StreamingStencil`. + Defaults to *False*. + """ + + self.decomp = decomp + stream = kwargs.pop('stream', False) + first_stencil = kwargs.pop('first_stencil', FirstCenteredDifference(h)) + second_stencil = kwargs.pop('second_stencil', SecondCenteredDifference(h)) + rank_shape = kwargs.pop('rank_shape', None) + + args = [lp.GlobalArg('fx', shape="(Nx+2*h, Ny+2*h, Nz+2*h)", offset=lp.auto)] + + fx = Field('fx', offset='h') + pd = tuple(Field(pdi) for pdi in ('pdx', 'pdy', 'pdz')) + pdx, pdy, pdz = ({pdi: first_stencil(fx, i+1) * (1/dxi)} + for i, (pdi, dxi) in enumerate(zip(pd, dx))) + lap = {Field('lap'): sum(second_stencil(fx, i+1) * dxi**-2 + for i, dxi in enumerate(dx))} + + self.pdx_knl = Stencil(pdx, args=args, prefetch_args=['fx'], + lsize=(16, 2, 16), h=h, rank_shape=rank_shape) + self.pdy_knl = Stencil(pdy, args=args, prefetch_args=['fx'], + lsize=(16, 16, 2), h=h, rank_shape=rank_shape) + self.pdz_knl = Stencil(pdz, args=args, prefetch_args=['fx'], + lsize=(64, 2, 2), h=h, rank_shape=rank_shape) + + if stream: + lsize = {h_: (16, 4, 8) for h_ in range(1, 5)} + else: + lsize = {1: (8, 8, 8), 2: (8, 4, 
4), 3: (4, 4, 4), 4: (2, 2, 2)} + + SS = StreamingStencil if stream else Stencil + self.lap_knl = SS(lap, args=args, prefetch_args=['fx'], + lsize=lsize[h], h=h, rank_shape=rank_shape) + + self.grad_knl = SS({**pdx, **pdy, **pdz}, args=args, prefetch_args=['fx'], + lsize=lsize[h], h=h, rank_shape=rank_shape) + + self.grad_lap_knl = SS({**pdx, **pdy, **pdz, **lap}, + args=args, prefetch_args=['fx'], + lsize=lsize[h], h=h, rank_shape=rank_shape) + + def __call__(self, queue, fx, *, + lap=None, pdx=None, pdy=None, pdz=None, grd=None): + """ + Computes requested derivatives of the input ``fx``. + + :arg fx: The array to compute derivatives of. Halos are shared using + :meth:`DomainDecomposition.share_halos`, and a kernel is called + based on what combination of the remainin input arguments are not *None*. + + Valid combinations are + + * all of ``lap``, ``pdx``, ``pdy``, and ``pdz`` + (or equivalently ``lap`` and ``grd``) + + * any single one of ``lap``, ``pdx``, ``pdy``, or ``pdz`` + + * only ``pdx``, ``pdy``, and ``pdz`` + (or equivalently only ``grd``) + + If ``fx`` has shape ``(...,) + (rank_shape+2*h)``, all the + outermost indices (i.e., in place of ``...``) are looped over. + As an example, with ``h=1``:: + + >>> fx.shape, lap.shape + ((2, 3, 130, 130, 130), (2, 3, 128, 128, 128)) + >>> derivs(queue, fx=fx, lap=lap) + + would loop over the outermost two axes with shape ``(2, 3)``. + Note that the shapes of ``fx`` and ``lap`` (or in general all input + arrays) must match on these outer axes. + + :arg lap: The array which will store the Laplacian of ``fx``. + Defaults to *None*. + + :arg pdx: The array which will store the :math:`x`-derivative of ``fx``. + Defaults to *None*. + + :arg pdy: The array which will store the :math:`y`-derivative of ``fx``. + Defaults to *None*. + + :arg pdz: The array which will store the :math:`z`-derivative of ``fx``. + Defaults to *None*. 
+ + :arg grd: The array containing the gradient of ``fx``, i.e., all three of + ``pdx``, ``pdy``, and ``pdz``. + If supplied, any input values to ``pdx``, ``pdy``, or ``pdz`` are + ignored and replaced via :: + + pdx = grd[..., 0, :, :, :] + pdy = grd[..., 1, :, :, :] + pdz = grd[..., 2, :, :, :] + + Defaults to *None*. + + :returns: The :class:`pyopencl.Event` associated with the kernel + invocation (i.e., of the last called kernel if multiple axes are + being looped over). + """ + + from itertools import product + slices = list(product(*[range(n) for n in fx.shape[:-3]])) + + if grd is not None: + pdx = grd[..., 0, :, :, :] + pdy = grd[..., 1, :, :, :] + pdz = grd[..., 2, :, :, :] + + for s in slices: + self.decomp.share_halos(queue, fx[s]) + if (lap is not None and pdx is not None + and pdy is not None and pdz is not None): + evt, _ = self.grad_lap_knl(queue, fx=fx[s], lap=lap[s], + pdx=pdx[s], pdy=pdy[s], pdz=pdz[s]) + elif pdx is not None and pdy is not None and pdz is not None: + evt, _ = self.grad_knl(queue, fx=fx[s], + pdx=pdx[s], pdy=pdy[s], pdz=pdz[s]) + elif lap is not None: + evt, _ = self.lap_knl(queue, fx=fx[s], lap=lap[s]) + elif pdx is not None: + evt, _ = self.pdx_knl(queue, fx=fx[s], pdx=pdx[s]) + elif pdy is not None: + evt, _ = self.pdy_knl(queue, fx=fx[s], pdy=pdy[s]) + elif pdz is not None: + evt, _ = self.pdz_knl(queue, fx=fx[s], pdz=pdz[s]) + + return evt diff --git a/pystella/elementwise.py b/pystella/elementwise.py new file mode 100644 index 0000000..7f17b78 --- /dev/null +++ b/pystella/elementwise.py @@ -0,0 +1,220 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit 
persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import loopy as lp +from pystella.field import Indexer +import pymbolic.primitives as pp + +__doc__ = """ +.. currentmodule:: pystella +.. autoclass:: ElementWiseMap +""" + + +class ElementWiseMap: + """ + An interface to :func:`loopy.make_kernel`, which creates a kernel + with parallelization suitable for operations which are "local"--namely, + element-wise maps where each workitem (thread) only accesses one element + of global arrays. + + .. automethod:: __init__ + .. 
automethod:: __call__ + """ + + def parallelize(self, knl, lsize): + knl = lp.split_iname(knl, "k", lsize[0], outer_tag="g.0", inner_tag="l.0") + knl = lp.split_iname(knl, "j", lsize[1], outer_tag="g.1", inner_tag="l.1") + knl = lp.split_iname(knl, "i", lsize[2], outer_tag="g.2", inner_tag="unr") + return knl + + def _assignment(self, assignee, expression, **kwargs): + no_sync_with = kwargs.pop('no_sync_with', [('*', 'any')]) + return lp.Assignment(assignee, expression, + no_sync_with=no_sync_with, + **kwargs) + + def make_kernel(self, map_dict, tmp_dict, args, **kwargs): + temp_statements = [] + temp_vars = [] + for assignee, expression in tmp_dict.items(): + # only declare temporary variables once + if isinstance(assignee, pp.Variable): + current_tmp = assignee + elif isinstance(assignee, pp.Subscript): + current_tmp = assignee.aggregate + else: + current_tmp = None + if current_tmp is not None and current_tmp not in temp_vars: + temp_vars += [current_tmp] + temp_var_type = lp.Optional(None) + else: + temp_var_type = lp.Optional() + + stmnt = self._assignment(Indexer(assignee), Indexer(expression), + temp_var_type=temp_var_type) + temp_statements += [stmnt] + + output_statements = [] + for assignee, expression in map_dict.items(): + stmnt = self._assignment(Indexer(assignee), Indexer(expression)) + output_statements += [stmnt] + + options = kwargs.pop('options', lp.Options()) + # ignore lack of supposed dependency for single-instruction kernels + if len(map_dict) + len(tmp_dict) == 1: + setattr(options, 'check_dep_resolution', False) + + knl = lp.make_kernel( + "[Nx, Ny, Nz] -> {[i,j,k]: 0<=i>> f = Field('f', offset='h') + >>> print(Indexer(f)) + f[i + h, j + h, k + h] + >>> print(Indexer(f[0])) + f[0, i + h, j + h, k + h] + + See `test_field.py + `_ + for more examples of + the intended functionality. + + .. attribute:: child + + The child expression representing the un-subscripted field. Can be input + as a string or a :class:`pymbolic.primitives.Expression`. 
+ + .. attribute:: name + + The name of the :class:`Field` instance, i.e., as would appear in + a generated kernel. Defaults to ``str(child)``. + + .. attribute:: indices + + A tuple of (symbolic) array indices that will subscript the array. Each + entry may be a :class:`pymbolic.primitives.Variable` or a string which + parses to one. Defaults to ``('i', 'j', 'k')`` + + .. attribute:: offset + + The amount of padding by which to offset the array axes corresponding to + the elements of :attr:`indices`. May be a tuple with the same length as + :attr:`indices` or a single value. In the latter case, the input is + transformed into a tuple with the same length as :attr:`indices`, each with + the same value. Defaults to ``0``. + + .. attribute:: ignore_prepends + + Whether to ignore array subscripts prepended when processed with + :func:`Indexer`. Useful for timestepping kernels which prepend array indices + corresponding to extra storage axes (to specify that an array does not have + this axis). Defaults to *False*. + """ + + def __init__(self, child, name=None, offset=0, indices=('i', 'j', 'k'), + ignore_prepends=False): + self.child = parse_if_str(child) + self.name = name if isinstance(name, str) else str(child) + + if not isinstance(offset, (list, tuple)): + offset = (offset,)*len(indices) + if len(offset) != len(indices): + raise ValueError('offset and indices must have same length') + + self.offset = tuple(parse_if_str(o) for o in offset) + self.indices = tuple(parse_if_str(i) + off + for i, off in zip(indices, self.offset)) + + self.ignore_prepends = ignore_prepends + + def __getinitargs__(self): + return (self.child, self.indices, self.name, self.ignore_prepends) + + mapper_method = "map_field" + + def make_stringifier(self, originating_stringifier=None): + # FIXME: do something with originating_stringifier? 
+ return FieldStringifyMapper() + + def shift(self, vec): + return Field(self.child, self.name, offset=vec, indices=self.indices, + ignore_prepends=self.ignore_prepends) + + +class FieldStringifyMapper(StringifyMapper): + def map_field(self, expr, enclosing_prec, *args, **kwargs): + if expr.name is not None: + return self.rec(parse(expr.name), enclosing_prec, *args, **kwargs) + else: + return self.rec(expr.child, enclosing_prec, *args, **kwargs) + + map_dynamic_field = map_field + + +class DynamicField(Field): + """ + A subclass of :class:`Field` which also contains associated :class:`Field` + instances representing various derivatives of the base :class:`Field`. + + .. attribute:: dot + + A :class:`Field` representing the time derivative of the base + :class:`Field`. It shares the same :attr:`indices` and :attr:`offset` + as the base :class:`Field`. Its name defaults to ``d{self.child}dt``, + but may be specified via the argument ``dot_child``. + + .. attribute:: lap + + A :class:`Field` representing the Laplacian of the base + :class:`Field`. It shares the same :attr:`indices` as the base + :class:`Field` but with ``offset = 0``. Its name defaults to + ``lap_{self.child}``, but may be specified via the argument + ``lap_child``. + + .. attribute:: pd + + A :class:`Field` representing the spatial derivative(s) of the base + :class:`Field`. It shares the same :attr:`indices` as the base + :class:`Field` but with ``offset = 0``. Its name defaults to + ``d{self.child}dx``, but may be specified via the argument + ``pd_child``. + + .. 
automethod:: d + + """ + + def __init__(self, child, name=None, offset='0', indices=('i', 'j', 'k'), + dot_child=None, lap_child=None, pd_child=None): + super().__init__(child, name, offset, indices) + + self.dot = Field(dot_child if dot_child is not None else 'd' + child + 'dt', + 'd' + self.name + 'dt', + offset, indices=indices) + + self.lap = Field(lap_child if lap_child is not None else 'lap_' + child, + 'lap_' + self.name, + offset='0', indices=indices, ignore_prepends=True) + + self.pd = Field(pd_child if pd_child is not None else 'd' + child + 'dx', + 'd' + self.name + 'dx', + offset='0', indices=indices, ignore_prepends=True) + + def d(self, *args): + """ + Returns the (subscripted) derivative of the base :class:`Field`, i.e., + either :attr:`dot` or :attr:`pd` with the appropriate index. + + For example, the "time" derivative of a field would be + + >>> f = DynamicField('f') + >>> print(f.d(0)) # x^0 = "time" + dfdt + + Additional arguments are interpreted as subscripts to the resulting array; + the final argument corresponds to the coordinate being differentiated with + respect to. + + >>> print(f.d(1, 2, 0)) + dfdt[1, 2] + + Spatial indices ``1`` through ``3`` denote spatial derivatives (whose + array subscripts are ``0`` through ``2``). 
+ + >>> print(f.d(2)) # x^2 = y + dfdx[1] + >>> print(f.d(0, 1, 3)) # x^3 = z + dfdx[0, 1, 2] + + """ + mu = args[-1] + indices = args[:-1]+(mu-1,) + return self.dot[args[:-1]] if mu == 0 else self.pd[indices] + + def __getinitargs__(self): + return (self.child, self.indices, self.name, self.dot, self.lap, self.pd) + + mapper_method = "map_dynamic_field" + + +class IndexMapper(IdentityMapper): + def parse_prepend(self, pre_index): + if isinstance(pre_index, str): + pre_index = (parse(pre_index),) + if isinstance(pre_index, pp.Variable): + pre_index = (pre_index,) + return pre_index + + def map_field(self, expr, *args, **kwargs): + if expr.ignore_prepends: + pre_index = () + else: + pre_index = self.parse_prepend(kwargs.pop('prepend_with', ())) + + if isinstance(expr.child, pp.Subscript): + x = pp.Subscript(expr.child.aggregate, + pre_index + expr.child.index_tuple + expr.indices) + elif isinstance(expr.child, pp.Variable): + full_index = pre_index + expr.indices + if full_index == (): + x = expr.child + else: + x = pp.Subscript(expr.child, pre_index + expr.indices) + else: + x = expr + return self.rec(x) + + map_dynamic_field = map_field + + def map_subscript(self, expr, *args, **kwargs): + if isinstance(expr.aggregate, Field): + pre_index = () if expr.aggregate.ignore_prepends \ + else self.parse_prepend(kwargs.pop('prepend_with', ())) + + a = self.rec(expr.aggregate) + if isinstance(a, pp.Subscript): + agg = a.aggregate + full_index = pre_index + expr.index_tuple + a.index_tuple + else: + agg = a + full_index = pre_index + expr.index_tuple + + if full_index == (): + x = agg + else: + x = pp.Subscript(agg, full_index) + return self.rec(x) + else: + return IdentityMapper.map_subscript(self, expr, *args, **kwargs) + + def map_lookup(self, expr, *args, **kwargs): + return self.rec(pp.Variable(expr.name)) + + +#: An instance of :class:`IndexMapper` which appends indices to :class:`Field` +#: instances in an expression, turning them into ordinary +#: 
:class:`pymbolic.primitives.Subscript`'s. +#: See the documentation of :class:`Field` for examples. +#: +#: :arg expr: The :mod:`pymbolic` expression to be mapped. +#: +#: :arg prepend_with: A :class:`tuple` of indices to prepend to the subscript +#: of any :class:`Field`'s in ``expr`` (unless a given :class:`Field` has +#: :attr:ignore_prepends` set to *False*. Defaults to an empty :class:`tuple`. +Indexer = IndexMapper() + +from pymbolic.mapper import Collector + + +class FieldCollector(Collector): + def map_field(self, expr): + return set([expr]) + + map_dynamic_field = map_field + + +def get_field_args(expressions, unpadded_shape=None): + """ + A :class:`pymbolic.Collector` which collects all :class:`~pystella.Field`'s + from ``expressions`` and returns a corresponding list of + :class:`loopy.ArrayArg`'s, using information about array indexing offsets + to produce + + .. warning:: + + This method currently does not correctly process + :class:`~pystella.Field`'s which are subscripted (i.e., nested + inside a :class:`pymbolic.primitives.Subscript`). + That is, it disregards any information about outer axes as represented + by subscripting. + + :arg expressions: The expressions from which to collect + :class:`~pystella.Field`'s. + May be one of the following: + + * A :class:`dict`, in which case all keys and values are iterated over. + + * A :class:`list`, in which case all elements are iterated over. + + * A :class:`pymbolic.primitives.Expression`. + + The following keyword arguments are recognized: + + :arg unpadded_shape: The shape of :class:`~pystella.Field`'s in ``expressions`` + (sans padding). + Defaults to ``(Nx, Ny, Nz)``. + + :returns: A :class:`list` of :class:`loopy.ArrayArg`'s. 
+ + Example:: + + >>> f = Field('f', offset='h) + >>> get_field_args(f) + [, shape: (Nx + 2*h, Ny + 2*h, Nz + 2*h) + aspace: global>] + """ + + all_exprs = [] + if isinstance(expressions, dict): + for k, v in expressions.items(): + all_exprs.append(k) + all_exprs.append(v) + elif isinstance(expressions, list): + all_exprs = expressions + else: + all_exprs = [expressions] + + if unpadded_shape is None: + unpadded_shape = parse('Nx, Ny, Nz') + + from loopy import GlobalArg + + fields = FieldCollector()(all_exprs) + args = [] + for f in fields: + shape = tuple(N + 2 * h for N, h in zip(unpadded_shape, f.offset)) + args.append(GlobalArg(f.child.name, shape=shape)) + + return sorted(args, key=lambda f: f.name) + + +__all__ = [ + "Field", + "DynamicField", + "Indexer", + "diff", + "get_field_args", + # "pymbolic_to_sympy", + # "sympy_to_pymbolic", + # "simplify", +] diff --git a/pystella/field/diff.py b/pystella/field/diff.py new file mode 100644 index 0000000..a03ce47 --- /dev/null +++ b/pystella/field/diff.py @@ -0,0 +1,89 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import pymbolic.primitives as pp +from pymbolic.mapper.differentiator import DifferentiationMapper +from pymbolic import var + + +class FieldDifferentiationMapper(DifferentiationMapper): + def __init__(self, variable, xmu=None): + if xmu is not None: + self.xmu = xmu + else: + self.xmu = {var('t'): 0, var('x'): 1, var('y'): 2, var('z'): 3} + super().__init__(variable) + + map_field = DifferentiationMapper.map_variable + + def map_dynamic_field(self, expr, *args): + if self.variable in self.xmu: + return expr.d(*args, self.xmu[self.variable]) + else: + return self.map_field(expr, *args) + + def map_subscript(self, expr, *args): + from pystella.field import DynamicField + if isinstance(expr.aggregate, DynamicField) and self.variable in self.xmu: + return self.rec(expr.aggregate, *expr.index_tuple) + else: + return super().map_subscript(expr, *args) + + def map_if(self, expr, *args): + from pymbolic.primitives import If + return If(expr.condition, self.rec(expr.then), self.rec(expr.else_)) + + +def diff(f, *x): + """ + A differentiator which computes ``\\partial f / \\partial x`` and understands + :class:`Field`'s. If ``x`` is one of ``t``, ``x``, ``y``, or ``z`` and ``f`` + is a :class:`DynamicField`, the corresponding derivative :class:`Field` is + returned. + + Examples:: + + >>> f = DynamicField('f') + >>> print(diff(f**3, f)) + 3*f**2 + >>> print(diff(f**3, f, f)) + 3*2*f + >>> print(diff(f**3, 't')) + 3*f**2*dfdt + >>> print(diff(f**3, f, 't')) + 3*2*f*dfdt + >>> print(diff(f + 2, 'x')) + dfdx[0] + + :arg f: A :mod:`pymbolic` expression to be differentiated. + + :arg x: A :class:`pymbolic.primitives.Expression` or a string to be parsed + (or multiple thereof). 
If multiple positional arguments are provided, + derivatives are taken with respect to each in order. + (See the examples above.) + """ + + if len(x) > 1: + return diff(diff(f, x[0]), *x[1:]) + else: + return FieldDifferentiationMapper(pp.make_variable(x[0]))(f) diff --git a/pystella/field/sympy.py b/pystella/field/sympy.py new file mode 100644 index 0000000..dd1b523 --- /dev/null +++ b/pystella/field/sympy.py @@ -0,0 +1,143 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import sympy as sym +import pymbolic.primitives as pp +from pymbolic.interop.sympy import PymbolicToSympyMapper, SympyToPymbolicMapper + +__doc__ = """ +.. currentmodule:: pystella.field.sympy + +Sympy interoperability +^^^^^^^^^^^^^^^^^^^^^^ + +.. autofunction:: pymbolic_to_sympy +.. autofunction:: sympy_to_pymbolic +.. 
autofunction:: simplify +""" + + +class SympyField(sym.Symbol): + def __new__(cls, field, **assumptions): + symb = super().__new__(cls, field.child.name, **assumptions) + symb.field = field + return symb + + +class PymbolicToSympyMapperWithField(PymbolicToSympyMapper): + def map_lookup(self, expr, *args, **kwargs): + return pp.Variable(expr.name) + + def map_call(self, expr): + function = self.rec(expr.function) + if isinstance(function, pp.Variable): + func_name = function.name + try: + func = getattr(self.sym.functions, func_name) + except AttributeError: + func = self.sym.Function(func_name) + return func(*[self.rec(par) for par in expr.parameters]) + else: + self.raise_conversion_error(expr) + + def map_field(self, expr): + return SympyField(expr) + + map_dynamic_field = map_field + + +class SympyToPymbolicMapperMathLookup(SympyToPymbolicMapper): + functions = {'exp', 'expm1', 'log', + 'sin', 'cos', 'tan', + 'sinh', 'cosh', 'tanh', + 'fabs', 'Abs', 'sign'} + + def map_Function(self, expr): + name = self.function_name(expr) + if name in self.functions: + args = tuple(self.rec(arg) for arg in expr.args) + + from pymbolic.primitives import Variable, Lookup + if name == 'Abs': + call = Lookup(Variable('math'), 'fabs') + elif name == 'sign': + call = Lookup(Variable('math'), 'copysign') + args = (1,)+args + else: + call = Lookup(Variable('math'), name) + return call(*args) + else: + return self.not_supported(expr) + + +class SympyToPymbolicMapperWithField(SympyToPymbolicMapperMathLookup): + def map_SympyField(self, expr): + return expr.field + + +#: A mapper which converts :class:`pymbolic.primitives.Expression`'s into +#: :mod:`sympy` expressions and understands :class:`~pystella.Field`'s. +#: The result can be converted back to a :class:`pymbolic.primitives.Expression` +#: with all :class:`~pystella.Field`'s in place, accomplished via a subclass +#: of :class:`sympy.Symbol` which retains a copy of the :class:`~pystella.Field`. 
+#: +#: :arg expr: The :mod:`pymbolic` expression to be mapped. +#: +pymbolic_to_sympy = PymbolicToSympyMapperWithField() + +#: A mapper which converts :mod:`sympy` expressions into +#: :class:`pymbolic.primitives.Expression`'s and understands the custom :mod:`sympy` +#: type used to represent :class:`~pystella.Field`'s by :func:`pymbolic_to_sympy`. +#: +#: :arg expr: The :mod:`sympy` expression to be mapped. +#: +sympy_to_pymbolic = SympyToPymbolicMapperWithField() + + +def simplify(expr, sympy_out=False): + """ + A wrapper to :func:`sympy.simplify`. + + :arg expr: The expression to be simplified. May either be a + :class:`pymbolic.primitives.Expression` or a :mod:`sympy` expression. + + The following keyword arguments are recognized: + + :arg sympy_out: A :class:`bool` determining whether to return the simplified + :mod:`sympy` expression or to first convert it to a + :class:`pymbolic.primitives.Expression`. + Defaults to *False*. + + :returns: A :class:`pymbolic.primitives.Expression` containing the + simplified form of ``expr`` if ``sympy_out`` is *False*, else a + :mod:`sympy` expression.
+ """ + + if isinstance(expr, pp.Expression): + expr = pymbolic_to_sympy(expr) + expr = sym.simplify(expr) + + if sympy_out: + return expr + else: + return sympy_to_pymbolic(expr) diff --git a/pystella/fourier/__init__.py b/pystella/fourier/__init__.py new file mode 100644 index 0000000..021c6a7 --- /dev/null +++ b/pystella/fourier/__init__.py @@ -0,0 +1,37 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +from pystella.fourier.dft import DFT, gDFT, pDFT +from pystella.fourier.rayleigh import RayleighGenerator +from pystella.fourier.projectors import Projector +from pystella.fourier.spectra import PowerSpectra +from pystella.fourier.derivs import SpectralGradientLaplacian + +__all__ = [ + "DFT", + "gDFT", + "pDFT", + "RayleighGenerator", + "Projector", + "PowerSpectra", + "SpectralGradientLaplacian", +] diff --git a/pystella/fourier/derivs.py b/pystella/fourier/derivs.py new file mode 100644 index 0000000..ba07e3e --- /dev/null +++ b/pystella/fourier/derivs.py @@ -0,0 +1,161 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import loopy as lp +import pyopencl.array as cla + +__doc__ = """ +.. currentmodule:: pystella +.. 
autoclass:: SpectralGradientLaplacian +""" + + +class SpectralGradientLaplacian: + """ + Interface (analogous to :class:`~pystella.GradientLaplacian`) + for computing spatial gradients via spectral collocation. + + .. automethod:: __init__ + .. automethod:: __call__ + """ + + def __init__(self, fft, dk): + """ + The following arguments are required: + + :arg fft: An FFT object as returned by :func:`~pystella.DFT`. + ``grid_shape`` and ``dtype`` are determined by ``fft``'s attributes. + + :arg dk: A 3-:class:`tuple` of the momentum-space grid spacing of each + axis (i.e., the infrared cutoff of the grid in each direction). + """ + + self.fft = fft + grid_size = fft.grid_shape[0] * fft.grid_shape[1] * fft.grid_shape[2] + + queue = self.fft.sub_k['momenta_x'].queue + sub_k = list(x.get().astype('int') for x in self.fft.sub_k.values()) + k_names = ('k_x', 'k_y', 'k_z') + self.momenta = {} + self.momenta = {} + for mu, (name, kk) in enumerate(zip(k_names, sub_k)): + kk_mu = dk[mu] * kk.astype(fft.dtype) + self.momenta[name+'_2'] = cla.to_device(queue, kk_mu) + + # zero Nyquist mode for first derivatives + kk_mu[abs(sub_k[mu]) == fft.grid_shape[mu]//2] = 0. + kk_mu[sub_k[mu] == 0] = 0.
+ self.momenta[name+'_1'] = cla.to_device(queue, kk_mu) + + args = [ + lp.GlobalArg('fk', shape="(Nx, Ny, Nz)"), + lp.GlobalArg("k_x_1, k_x_2", self.fft.dtype, shape=('Nx',)), + lp.GlobalArg("k_y_1, k_y_2", self.fft.dtype, shape=('Ny',)), + lp.GlobalArg("k_z_1, k_z_2", self.fft.dtype, shape=('Nz',)), + ] + + from pystella.field import Field + fk = Field('fk') + pd = tuple(Field(pdi) for pdi in ('pdx_k', 'pdy_k', 'pdz_k')) + + indices = fk.indices + + from pymbolic import var + mom_vars = tuple(var(name+'_1') for name in k_names) + + pdx, pdy, pdz = \ + ({pdi: kk_i[indices[i]] * 1j * fk * (1/grid_size)} + for i, (pdi, kk_i) in enumerate(zip(pd, mom_vars))) + + mom_vars = tuple(var(name+'_2') for name in k_names) + kmag_sq = sum(kk_i[x_i]**2 for kk_i, x_i in zip(mom_vars, indices)) + lap = {Field('lap_k'): - kmag_sq * fk * (1/grid_size)} + + from pystella.elementwise import ElementWiseMap + options = lp.Options(return_dict=True) + self.pdx_knl = ElementWiseMap(pdx, args=args, h=0, options=options) + self.pdy_knl = ElementWiseMap(pdy, args=args, h=0, options=options) + self.pdz_knl = ElementWiseMap(pdz, args=args, h=0, options=options) + self.lap_knl = ElementWiseMap(lap, args=args, h=0, options=options) + self.grad_knl = ElementWiseMap({**pdx, **pdy, **pdz}, args=args, h=0, + options=options) + self.grad_lap_knl = ElementWiseMap({**pdx, **pdy, **pdz, **lap}, args=args, + h=0, options=options) + + self.pool = None + + def __call__(self, queue, fx, *, + lap=None, pdx=None, pdy=None, pdz=None, grd=None): + """ + Computes requested derivatives of the input ``fx``. + Provides the same interface as + :meth:`pystella.GradientLaplacian.__call__`. 
+ """ + + if self.pool is None: + import pyopencl.tools as clt + self.pool = clt.MemoryPool(clt.ImmediateAllocator(queue)) + + from itertools import product + slices = list(product(*[range(n) for n in fx.shape[:-3]])) + + if grd is not None: + pdx = grd[..., 0, :, :, :] + pdy = grd[..., 1, :, :, :] + pdz = grd[..., 2, :, :, :] + + for s in slices: + fk = self.fft.dft(fx[s]) + if (lap is not None and pdx is not None + and pdy is not None and pdz is not None): + evt, out = \ + self.grad_lap_knl(queue, fk=fk, **self.momenta, + allocator=self.pool) + self.fft.idft(out['lap_k'], lap[s]) + self.fft.idft(out['pdx_k'], pdx[s]) + self.fft.idft(out['pdy_k'], pdy[s]) + self.fft.idft(out['pdz_k'], pdz[s]) + elif pdx is not None and pdy is not None and pdz is not None: + evt, out = self.grad_knl(queue, fk=fk, **self.momenta, + allocator=self.pool, filter_args=True) + self.fft.idft(out['pdx_k'], pdx[s]) + self.fft.idft(out['pdy_k'], pdy[s]) + self.fft.idft(out['pdz_k'], pdz[s]) + elif lap is not None: + evt, out = self.lap_knl(queue, fk=fk, **self.momenta, + allocator=self.pool, filter_args=True) + self.fft.idft(out['lap_k'], lap[s]) + elif pdx is not None: + evt, out = self.pdx_knl(queue, fk=fk, **self.momenta, + allocator=self.pool, filter_args=True) + self.fft.idft(out['pdx_k'], pdx[s]) + elif pdy is not None: + evt, out = self.pdy_knl(queue, fk=fk, **self.momenta, + allocator=self.pool, filter_args=True) + self.fft.idft(out['pdy_k'], pdy[s]) + elif pdz is not None: + evt, out = self.pdz_knl(queue, fk=fk, **self.momenta, + allocator=self.pool, filter_args=True) + self.fft.idft(out['pdz_k'], pdz[s]) + + return None diff --git a/pystella/fourier/dft.py b/pystella/fourier/dft.py new file mode 100644 index 0000000..ca4a1a9 --- /dev/null +++ b/pystella/fourier/dft.py @@ -0,0 +1,440 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated 
documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import numpy as np +import pyopencl.array as cla + +__doc__ = """ +.. currentmodule:: pystella +.. autofunction:: DFT +.. currentmodule:: pystella.fourier +.. autoclass:: pystella.fourier.dft.BaseDFT +.. autoclass:: pDFT +.. autoclass:: gDFT +.. currentmodule:: pystella +""" + + +def DFT(decomp, context, queue, grid_shape, dtype, **kwargs): + """ + A wrapper to the creation of various FFT class options which determines + whether to use :class:`pystella.fourier.gDFT` (for single-GPU FFTs via + :mod:`gpyfft`) or :class:`pystella.fourier.pDFT` + (for distributed, CPU FFTs via :class:`mpi4py_fft.PFFT`), + based on the processor shape ``proc_shape`` and a flag ``use_gpu``. + + :arg decomp: A :class:`DomainDecomposition`. + + :arg context: A :class:`pyopencl.Context`. + + :arg queue: A :class:`pyopencl.CommandQueue`. + + :arg grid_shape: A 3-:class:`tuple` specifying the shape of position-space arrays + to be transformed. + + :arg dtype: The datatype of real arrays to be transformed. The complex + datatype is chosen to have the same precision. 
+ + The following keyword-only arguments are recognized: + + :arg use_gpu: A :class:`bool` dictating whether to use + :class:`pystella.fourier.gDFT`. + Defaults to *True*, i.e., this flag must be set to *False* to override the + default choice to use :class:`pystella.fourier.gDFT` on a single rank. + + Any remaining keyword arguments are passed to :class:`pystella.fourier.pDFT`, + should this function return such an object. + """ + + use_gpu = kwargs.pop('use_gpu', True) + proc_shape = decomp.proc_shape + if proc_shape == (1, 1, 1) and use_gpu: + return gDFT(decomp, context, queue, grid_shape, dtype) + else: + # local_shape = tuple(N//P for N, P in zip(grid_shape, proc_shape)) + # tmp = cla.zeros(queue, local_shape, dtype) + return pDFT(decomp, queue, grid_shape, proc_shape, dtype, **kwargs) + + +def _transfer_array(a, b): + # set a = b + if isinstance(a, np.ndarray) and isinstance(b, cla.Array): + b.get(ary=a) + elif isinstance(a, cla.Array) and isinstance(b, np.ndarray): + a.set(b) + return a + + +class BaseDFT: + """ + Base class for all FFT options. + + .. automethod:: shape + .. automethod:: dft + .. automethod:: idft + .. automethod:: zero_corner_modes + """ + + # pylint: disable=no-member + def shape(self, forward_output=True): + """ + :arg forward_output: A :class:`bool` specifying whether to output the + shape for the result of the forward Fourier transform. + + :returns: A 3-:class:`tuple` of the (per--MPI-rank) shape of the requested + array (as specified by ``forward_output``). + """ + + raise NotImplementedError + + def forward_transform(self, fx, fk, **kwargs): + raise NotImplementedError + + def backward_transform(self, fk, fx, **kwargs): + raise NotImplementedError + + def dft(self, fx=None, fk=None, **kwargs): + """ + Computes the forward Fourier transform. + + :arg fx: The array to be transformed. 
+ Can be a :class:`pyopencl.array.Array` with or without halo padding + (which will be removed by + :meth:`pystella.DomainDecomposition.remove_halos` + if needed) or a :class:`numpy.ndarray` without halo padding. + Arrays are copied as necessary. + Defaults to *None*, in which case :attr:`fx` (attached + to the transform) is used. + + :arg fk: The array in which to output the result of the transform. + Can be a :class:`pyopencl.array.Array` or a :class:`numpy.ndarray`. + Arrays are copied as necessary. + Defaults to *None*, in which case :attr:`fk` (attached + to the transform) is used. + + :returns: The forward Fourier transform of ``fx``. + Either ``fk`` if supplied or :attr:`fk`. + + Any remaining keyword arguments are passed to :meth:`forward_transform`. + + .. note:: + If you need the result of multiple Fourier transforms, you must + either supply an ``fk`` array or copy the output. + Namely, without passing ``fk`` the same memory (attached to the + transform object) will be used as output, overwriting any prior + results. + """ + + if fx is not None: + if fx.shape != self.shape(False): + if isinstance(fx, cla.Array): + queue = fx.queue + elif isinstance(self.fx, cla.Array): + queue = self.fx.queue + else: + queue = None + self.decomp.remove_halos(queue, fx, self.fx) + _fx = self.fx + elif not isinstance(fx, type(self.fx)): + _fx = _transfer_array(self.fx, fx) + else: + _fx = fx + else: + _fx = self.fx + + if fk is not None: + if not isinstance(fk, type(self.fk)): + _fk = self.fk + else: + _fk = fk + else: + _fk = self.fk + + _fk = self.forward_transform(_fx, _fk, **kwargs) + + if fk is not None: + if not isinstance(fk, type(self.fk)): + _fk = _transfer_array(fk, _fk) + else: + _fk = fk + else: + _fk = _fk + + return _fk + + def idft(self, fk=None, fx=None, **kwargs): + """ + Computes the backward Fourier transform. + + :arg fk: The array to be transformed. + Can be a :class:`pyopencl.array.Array` or a :class:`numpy.ndarray`. + Arrays are copied as necessary. 
+ Defaults to *None*, in which case :attr:`fk` (attached + to the transform) is used. + + :arg fx: The array in which to output the result of the transform. + Can be a :class:`pyopencl.array.Array` with or without halo padding + (which will be restored by + :meth:`pystella.DomainDecomposition.restore_halos` + if needed) or a :class:`numpy.ndarray` without halo padding. + Arrays are copied as necessary. + Defaults to *None*, in which case :attr:`fx` (attached + to the transform) is used. + + :returns: The backward Fourier transform of ``fk``. + Either ``fx`` if supplied or :attr:`fx`. + + Any remaining keyword arguments are passed to :meth:`backward_transform`. + + .. note:: + If you need the result of multiple Fourier transforms, you must + either supply an ``fx`` array or copy the output. + Namely, without passing ``fx`` the same memory (attached to the + transform object) will be used as output, overwriting any prior + results. + """ + + if fk is not None: + if not isinstance(fk, type(self.fk)): + _fk = _transfer_array(self.fk, fk) + else: + _fk = fk + else: + _fk = self.fk + + if fx is not None: + if fx.shape == self.shape(False) and isinstance(fx, type(self.fx)): + _fx = fx + else: + _fx = self.fx + else: + _fx = self.fx + + _fx = self.backward_transform(_fk, _fx, **kwargs) + + if fx is not None: + if fx.shape != self.shape(False): + if isinstance(fx, cla.Array): + queue = fx.queue + elif isinstance(self.fx, cla.Array): + queue = self.fx.queue + else: + queue = None + self.decomp.restore_halos(queue, _fx, fx) + _fx = fx + elif not isinstance(fx, type(self.fx)): + _fx = _transfer_array(fx, _fx) + else: + _fx = _fx + else: + _fx = _fx + + return _fx + + def zero_corner_modes(self, array, only_imag=False): + """ + Zeros the "corner" modes (modes where each component of its + integral wavenumber is either zero or the Nyquist along + that axis) of ``array`` (or just the imaginary part). + + :arg array: The array to operate on.
+ May be a :class:`pyopencl.array.Array` or a :class:`numpy.ndarray`. + + :arg only_imag: A :class:`bool` determining whether to only + set the imaginary part of the array to zero. + Defaults to *False*, i.e., setting the mode to ``0+0j``. + """ + + sub_k = list(x.get().astype('int') for x in self.sub_k.values()) + shape = self.grid_shape + + where_to_zero = [] + for mu in range(3): + kk = sub_k[mu] + where_0 = np.argwhere(abs(kk) == 0).reshape(-1) + where_N2 = np.argwhere(abs(kk) == shape[mu]//2).reshape(-1) + where_to_zero.append(np.concatenate([where_0, where_N2])) + + from itertools import product + for i, j, k in product(*where_to_zero): + if only_imag: + array[i, j, k] = array[i, j, k].real + else: + array[i, j, k] = 0. + + return array + + +_c_dtype_mapping = {'float32': 'complex64', 'float64': 'complex128', + np.float32: 'complex64', np.float64: 'complex128'} + + +class pDFT(BaseDFT): + """ + A wrapper to :class:`mpi4py_fft.PFFT` to compute distributed Fast Fourier + transforms. + + See :class:`pystella.fourier.dft.BaseDFT`. + + .. automethod:: __init__ + """ + + def __init__(self, decomp, queue, grid_shape, proc_shape, dtype, **kwargs): + """ + :arg decomp: A :class:`pystella.DomainDecomposition`. + + :arg queue: A :class:`pyopencl.CommandQueue`. + + :arg grid_shape: A 3-:class:`tuple` specifying the shape of position-space + arrays to be transformed. + + :arg proc_shape: A 3-:class:`tuple` specifying the shape of the MPI + processor grid. + + :arg dtype: The datatype of real arrays to be transformed. The complex + datatype is chosen to have the same precision. + + Any keyword arguments are passed to :meth:`mpi4py_fft.PFFT.__init__()`. 
+ """ + + self.decomp = decomp + self.grid_shape = grid_shape + self.proc_shape = proc_shape + self.dtype = dtype + cdtype = _c_dtype_mapping[dtype] + self.cdtype = cdtype + + if proc_shape[0] > 1 and proc_shape[1] == 1: + slab = True + else: + slab = False + + from mpi4py_fft.pencil import Subcomm + default_kwargs = dict( + # FIXME: this is weird + axes=([0], [1], [2]), threads=16, backend='fftw', collapse=True, + ) + default_kwargs.update(kwargs) + comm = decomp.comm if slab else Subcomm(decomp.comm, proc_shape) + + from mpi4py_fft import PFFT + self.fft = PFFT(comm, grid_shape, dtype=dtype, slab=slab, **default_kwargs) + + for transform in self.fft.xfftn: + transform.M = 1 # ensure normalization is not applied + + self.fx = self.fft.forward.input_array + self.fk = self.fft.forward.output_array + + from numpy.fft import fftfreq + k = [fftfreq(n, 1/n).astype(dtype) for n in grid_shape] + + if dtype in ('float32', 'float64', np.float32, np.float64): + from numpy.fft import rfftfreq + k[-1] = rfftfreq(grid_shape[-1], 1/grid_shape[-1]).astype(dtype) + + slc = self.fft.local_slice(True) + names = ('momenta_x', 'momenta_y', 'momenta_z') + self.sub_k = {direction: cla.to_device(queue, k_i[s_i]) + for direction, k_i, s_i in zip(names, k, slc)} + + def shape(self, forward_output=True): + return self.fft.shape(forward_output=forward_output) + + def forward_transform(self, fx, fk, **kwargs): + return self.fft.forward(input_array=fx, output_array=fk, **kwargs) + + def backward_transform(self, fk, fx, **kwargs): + return self.fft.backward(input_array=fk, output_array=fx, **kwargs) + + +class gDFT(BaseDFT): + """ + A wrapper to :mod:`gpyfft` to compute real-to-complex and complex-to-real + Fast Fourier transforms. + + See :class:`pystella.fourier.dft.BaseDFT`. + + .. automethod:: __init__ + """ + + def __init__(self, decomp, context, queue, grid_shape, dtype): + """ + :arg decomp: A :class:`pystella.DomainDecomposition`. + + :arg context: A :class:`pyopencl.Context`. 
+ + :arg queue: A :class:`pyopencl.CommandQueue`. + + :arg grid_shape: A 3-:class:`tuple` specifying the shape of position-space + arrays to be transformed. + + :arg dtype: The datatype of real arrays to be transformed. The complex + datatype is chosen to have the same precision. + """ + + self.decomp = decomp + self.grid_shape = grid_shape + self.dtype = dtype + cdtype = _c_dtype_mapping[dtype] + self.cdtype = cdtype + + self.fx = cla.zeros(queue, grid_shape, dtype) + self.fk = cla.zeros(queue, self.shape(True), cdtype) + from gpyfft import FFT + self.forward = FFT(context, queue, self.fx, out_array=self.fk, real=True, + scale_forward=1, scale_backward=1) + self.backward = FFT(context, queue, self.fk, out_array=self.fx, real=True, + scale_forward=1, scale_backward=1) + + from numpy.fft import fftfreq, rfftfreq + names = ('momenta_x', 'momenta_y', 'momenta_z') + + slc = ((), (), (),) + k = [fftfreq(n, 1/n).astype(dtype) for n in grid_shape] + self.sub_k_c = {direction: cla.to_device(queue, k_i[s_i]) + for direction, k_i, s_i in zip(names, k, slc)} + + k[-1] = rfftfreq(grid_shape[-1], 1/grid_shape[-1]).astype(dtype) + self.sub_k = {direction: cla.to_device(queue, k_i[s_i]) + for direction, k_i, s_i in zip(names, k, slc)} + + def shape(self, forward_output=True): + if forward_output: + shape = list(self.grid_shape) + shape[-1] = shape[-1]//2+1 + return tuple(shape) + else: + return self.grid_shape + + def forward_transform(self, fx, fk, **kwargs): + event, = self.forward.enqueue_arrays(data=fx, result=fk, forward=True) + fx.add_event(event) + fk.add_event(event) + return fk + + def backward_transform(self, fk, fx, **kwargs): + event, = self.backward.enqueue_arrays(data=fk, result=fx, forward=False) + fx.add_event(event) + fk.add_event(event) + return fx diff --git a/pystella/fourier/projectors.py b/pystella/fourier/projectors.py new file mode 100644 index 0000000..a5d7600 --- /dev/null +++ b/pystella/fourier/projectors.py @@ -0,0 +1,343 @@ +__copyright__ = "Copyright 
(C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import pyopencl.array as cla +import loopy as lp + +__doc__ = """ +.. currentmodule:: pystella +.. autoclass:: Projector +""" + + +class Projector: + """ + Constructs kernels to project vectors to and from their polarization basis + and to project out longitudinal modes, and to project a tensor field to its + transverse and traceless component. + + .. automethod:: __init__ + .. automethod:: transversify + .. automethod:: pol_to_vec + .. automethod:: vec_to_pol + ..
automethod:: transverse_traceless + """ + + def get_pol_to_vec_knl(self): + return lp.make_kernel( + "[Nx, Ny, Nz] -> \ + { [i,j,k,mu]: 0<=i eps[mu] = 0 + end + <> kx = eff_mom_x[i] + <> ky = eff_mom_y[j] + <> kz = eff_mom_z[k] + + if fabs(kx) < 1.e-10 and fabs(ky) < 1.e-10 + if fabs(kz) > 1.e-10 + eps[0] = 1 / sqrt2 + eps[1] = 1j / sqrt2 + end + else + <> Kappa = sqrt(kx**2 + ky**2) + <> kmag = sqrt(kx**2 + ky**2 + kz**2) + + eps[0] = (kx * kz / kmag - 1j * ky) / Kappa / sqrt2 + eps[1] = (ky * kz / kmag + 1j * kx) / Kappa / sqrt2 + eps[2] = - Kappa / kmag / sqrt2 + end + + vector[mu, i, j, k] = eps[mu] * plus[i, j, k] \ + + conj(eps[mu]) * minus[i, j, k] {dup=mu} + end + + """, + seq_dependencies=True, + default_offset=lp.auto, + lang_version=(2018, 2), + ) + + def get_vec_to_pol_knl(self): + return lp.make_kernel( + "[Nx, Ny, Nz] -> \ + { [i,j,k,mu]: 0<=i eps[mu] = 0 + end + <> kx = eff_mom_x[i] + <> ky = eff_mom_y[j] + <> kz = eff_mom_z[k] + + if fabs(kx) < 1.e-10 and fabs(ky) < 1.e-10 + if fabs(kz) > 1.e-10 + eps[0] = 1 / sqrt2 + eps[1] = 1j / sqrt2 + end + else + <> Kappa = sqrt(kx**2 + ky**2) + <> kmag = sqrt(kx**2 + ky**2 + kz**2) + + eps[0] = (kx * kz / kmag - 1j * ky) / Kappa / sqrt2 + eps[1] = (ky * kz / kmag + 1j * kx) / Kappa / sqrt2 + eps[2] = - Kappa / kmag / sqrt2 + end + + plus[i, j, k] = sum(mu, conj(eps[mu]) * vector[mu, i, j, k]) {dup=mu} + minus[i, j, k] = sum(mu, eps[mu] * vector[mu, i, j, k]) {dup=mu} + end + """, + seq_dependencies=True, + default_offset=lp.auto, + lang_version=(2018, 2), + ) + + def get_transversify_knl(self): + return lp.make_kernel( + "[Nx, Ny, Nz] -> \ + { [i,j,k,mu]: 0<=i kvec[0] = eff_mom_x[i] + kvec[1] = eff_mom_y[j] + kvec[2] = eff_mom_z[k] + if fabs(kvec[0]) < 1.e-14 \ + and fabs(kvec[1]) < 1.e-14 \ + and fabs(kvec[2]) < 1.e-14 + vectorT[mu, i, j, k] = 0 + else + <> kmag = sqrt(sum(mu, kvec[mu]**2)) {dup=mu} + <> div = sum(mu, kvec[mu] * vector[mu, i, j, k]) {dup=mu} + + vectorT[mu, i, j, k] = vector[mu, i, j, k] \ + 
- kvec[mu] / kmag**2 * div {dup=mu,nosync=*} + end + end + """, + seq_dependencies=True, + default_offset=lp.auto, + lang_version=(2018, 2), + ) + + def get_tt_knl(self): + knl = lp.make_kernel( + "[Nx, Ny, Nz] -> \ + { [i,j,k,a,b,c,d]: \ + 0<=i kvec[0] = eff_mom_x[i] + kvec[1] = eff_mom_y[j] + kvec[2] = eff_mom_z[k] + <> kmag = sqrt(kvec[0]**2 + kvec[1]**2 + kvec[2]**2) + kvec[0] = kvec[0] / kmag + kvec[1] = kvec[1] / kmag + kvec[2] = kvec[2] / kmag + + id(a, b) := ((7 - if(a <= b, a, b)) * if(a <= b, a, b)) // 2 \ + - 4 + if(a <= b, b, a) + P(a, b) := if(a == b, 1, 0) - kvec[a-1] * kvec[b-1] + + for a, b + if a <= b + hTT[id(a, b)] = sum((c, d), \ + (P(a, c) * P(d, b) \ + - .5 * P(a, b) * P(c, d)) \ + * hij[id(c, d), i, j, k]) + end + end + + for a, b + if a <= b + hijTT[id(a, b), i, j, k] = hTT[id(a, b)] {dup=a,dup=b} + end + end + end + """, + [ + lp.GlobalArg('hij', shape='(6, Nx, Ny, Nz)'), + lp.GlobalArg('hijTT', shape='(6, Nx, Ny, Nz)'), + lp.TemporaryVariable('hTT', shape='(6,)'), + '...' + ], + seq_dependencies=True, + default_offset=lp.auto, + lang_version=(2018, 2), + ) + return lp.expand_subst(knl) + + def __init__(self, fft, effective_k): + """ + :arg fft: An FFT object as returned by :func:`DFT`. + ``grid_shape`` and ``dtype`` are determined by ``fft``'s attributes. + + :arg effective_k: A :class:`callable` with signature ``(k, dx)`` returning + the effective momentum (eigenvalue) of the corresponding stencil. + That is, projections are implemented relative to the stencil + whose eigenvalues are returned by this function. 
+ """ + + self.fft = fft + + if not callable(effective_k): + if effective_k != 0: + from pystella.derivs import FirstCenteredDifference + h = effective_k + effective_k = FirstCenteredDifference(h).get_eigenvalues + else: + def effective_k(k, dx): # pylint: disable=function-redefined + return k + + from math import pi + grid_shape = fft.grid_shape + # since projectors only need the unit momentum vectors, can pass + # k = k_hat * dk * dx = k_hat * 2 * pi * grid_shape and dx = 1, + # where k_hat is the integer momentum gridpoint + dk_dx = tuple(2 * pi / Ni for Ni in grid_shape) + + queue = self.fft.sub_k['momenta_x'].queue + sub_k = list(x.get().astype('int') for x in self.fft.sub_k.values()) + eff_mom_names = ('eff_mom_x', 'eff_mom_y', 'eff_mom_z') + self.eff_mom = {} + for mu, (name, kk) in enumerate(zip(eff_mom_names, sub_k)): + eff_k = effective_k(kk.astype(fft.dtype) * dk_dx[mu], 1) + eff_k[abs(sub_k[mu]) == fft.grid_shape[mu]//2] = 0. + eff_k[sub_k[mu] == 0] = 0. + self.eff_mom[name] = cla.to_device(queue, eff_k) + + def process(knl): + knl = lp.fix_parameters(knl, sqrt2=2**.5) + knl = lp.split_iname(knl, "k", 32, outer_tag="g.0", inner_tag="l.0") + knl = lp.split_iname(knl, "j", 1, outer_tag="g.1", inner_tag="unr") + knl = lp.split_iname(knl, "i", 1, outer_tag="g.2", inner_tag="unr") + knl = lp.set_options(knl, enforce_variable_access_ordered="no_check") + return knl + + self.pol_to_vec_knl = process(self.get_pol_to_vec_knl()) + self.vec_to_pol_knl = process(self.get_vec_to_pol_knl()) + self.transversify_knl = process(self.get_transversify_knl()) + self.tt_knl = process(self.get_tt_knl()) + + def transversify(self, queue, vector, vector_T=None): + """ + Projects out longitudinal modes of a vector field. + + :arg queue: A :class:`pyopencl.CommandQueue`. + + :arg vector: The array containing the + momentum-space vector field to be projected. + Must have shape ``(3,)+k_shape``, where + ``k_shape`` is the shape of a single momentum-space field array. 
+ + :arg vector_T: The array in which the resulting + projected vector field will be stored. + Must have the same shape as ``vector``. + Defaults to *None*, in which case the projection is performed in-place. + + :returns: The :class:`pyopencl.Event` associated with the kernel invocation. + """ + + vector_T = vector_T or vector + evt, _ = self.transversify_knl(queue, **self.eff_mom, + vector=vector, vectorT=vector) + return evt + + def pol_to_vec(self, queue, plus, minus, vector): + """ + Projects the plus and minus polarizations of a vector field onto the + vector components. + + :arg queue: A :class:`pyopencl.CommandQueue`. + + :arg plus: The array containing the + momentum-space field of the plus polarization. + + :arg minus: The array containing the + momentum-space field of the minus polarization. + + :arg vector: The array into which the vector + field will be stored. + Must have shape ``(3,)+k_shape``, where ``k_shape`` is the shape of a + single momentum-space field array. + + :returns: The :class:`pyopencl.Event` associated with the kernel invocation. + """ + + evt, _ = self.pol_to_vec_knl(queue, **self.eff_mom, + vector=vector, plus=plus, minus=minus) + return evt + + def vec_to_pol(self, queue, plus, minus, vector): + """ + Projects the components of a vector field onto the basis of plus and + minus polarizations. + + :arg queue: A :class:`pyopencl.CommandQueue`. + + :arg plus: The array into which will be stored the + momentum-space field of the plus polarization. + + :arg minus: The array into which will be stored the + momentum-space field of the minus polarization. + + :arg vector: The array whose polarization + components will be computed. + Must have shape ``(3,)+k_shape``, where ``k_shape`` is the shape of a + single momentum-space field array. + + :returns: The :class:`pyopencl.Event` associated with the kernel invocation.
+ """ + + evt, _ = self.vec_to_pol_knl(queue, **self.eff_mom, + vector=vector, plus=plus, minus=minus) + return evt + + def transverse_traceless(self, queue, hij, hij_TT=None): + """ + Projects a tensor field to be transverse and traceless. + + :arg queue: A :class:`pyopencl.CommandQueue`. + + :arg hij: The array containing the + momentum-space tensor field to be projected. + Must have shape ``(6,)+k_shape``, where + ``k_shape`` is the shape of a single momentum-space field array. + + :arg hij_TT: The array in wihch the resulting projected + tensor field will be stored. + Must have the same shape as ``hij``. + Defaults to *None*, in which case the projection is performed in-place. + + :returns: The :class:`pyopencl.Event` associated with the kernel invocation. + """ + + hij_TT = hij_TT or hij + evt, _ = self.tt_knl(queue, hij=hij, hijTT=hij_TT, **self.eff_mom) + + # re-set to zero + for mu in range(6): + self.fft.zero_corner_modes(hij_TT[mu]) diff --git a/pystella/fourier/rayleigh.py b/pystella/fourier/rayleigh.py new file mode 100644 index 0000000..c3b4215 --- /dev/null +++ b/pystella/fourier/rayleigh.py @@ -0,0 +1,395 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import numpy as np +import pyopencl.array as cla +import pyopencl.clrandom as clr +import loopy as lp + +__doc__ = """ +.. currentmodule:: pystella +.. autoclass:: RayleighGenerator +""" + + +def make_hermitian(fk): + grid_shape = list(fk.shape) + grid_shape[-1] = 2 * (grid_shape[-1] - 1) + pos = [np.arange(0, Ni//2+1) for Ni in grid_shape] + neg = [np.concatenate([np.array([0]), np.arange(Ni-1, Ni//2-1, -1)]) + for Ni in grid_shape] + + for k in [0, grid_shape[-1]//2]: + for n, p in zip(neg[0], pos[0]): + fk[n, neg[1], k] = np.conj(fk[p, pos[1], k]) + fk[p, neg[1], k] = np.conj(fk[n, pos[1], k]) + for n, p in zip(neg[1], pos[1]): + fk[neg[0], n, k] = np.conj(fk[pos[0], p, k]) + fk[neg[0], p, k] = np.conj(fk[pos[0], n, k]) + + for i in [0, grid_shape[0]//2]: + for j in [0, grid_shape[1]//2]: + for k in [0, grid_shape[2]//2]: + fk[i, j, k] = np.real(fk[i, j, k]) + return fk + + +class RayleighGenerator: + """ + Constructs kernels to generate Gaussian-random fields with a chosen power + spectrum in Fourier space by drawing modes according to the corresponding + Rayleigh distribution. + + .. automethod:: __init__ + .. automethod:: generate + .. automethod:: init_field + .. automethod:: init_transverse_vector + .. automethod:: init_vector_from_pol + + In addition, the following methods apply the WKB approximation to + initialize a field and its (conformal-) time derivative in FLRW spacetime. + + .. automethod:: generate_WKB + .. automethod:: init_WKB_fields + """ + + def get_wkb_knl(self): + knl = lp.make_kernel( + "[Nx, Ny, Nz] -> { [i,j,k]: 0<=i amp_1 = sqrt(- log(rands[0, i, j, k])) + <> amp_2 = sqrt(- log(rands[2, i, j, k])) + <> phs_1 = exp(1j * 2. 
* pi * rands[1, i, j, k]) + <> phs_2 = exp(1j * 2. * pi * rands[3, i, j, k]) + <> power = f_power[i, j, k] + <> Lmode = phs_1 * amp_1 * sqrt(power) + <> Rmode = phs_2 * amp_2 * sqrt(power) + <> fk_ = (Lmode + Rmode) / sqrt2 + fk[i, j, k] = fk_ + dfk[i, j, k] = 1j * wk[i, j, k] * (Lmode - Rmode) / sqrt2 - hubble * fk_ + """, + [ + lp.ValueArg("hubble", self.dtype), + lp.GlobalArg('fk, dfk', shape=lp.auto, dtype=self.cdtype), + "..." + ], + seq_dependencies=True, + silenced_warnings=['inferred_iname'], + lang_version=(2018, 2), + ) + knl = lp.set_options(knl, return_dict=True) + return knl + + def get_non_wkb_knl(self): + knl = lp.make_kernel( + "[Nx, Ny, Nz] -> { [i,j,k]: 0<=i amp = sqrt(- log(rands[0, i, j, k])) + <> phs = exp(1j * 2. * pi * rands[1, i, j, k]) + fk[i, j, k] = phs * amp * sqrt(f_power[i, j, k]) + """, + [lp.GlobalArg('fk', shape=lp.auto, dtype=self.cdtype), "..."], + seq_dependencies=True, + lang_version=(2018, 2), + ) + return knl + + def __init__(self, context, fft, dk, volume, **kwargs): + """ + :arg context: A :class:`pyopencl.Context`. + + :arg fft: An FFT object as returned by :func:`DFT`. + + :arg dk: A 3-:class:`tuple` of the momentum-space grid spacing of each + axis (i.e., the infrared cutoff of the grid in each direction). + + :arg volume: The physical volume of the grid. + + The following keyword-only arguments are recognized: + + :arg seed: The seed to the random number generator. + Defaults to ``13298``. 
+ """ + + self.fft = fft + self.dtype = fft.dtype + self.cdtype = fft.cdtype + self.volume = volume + + sub_k = list(x.get() for x in self.fft.sub_k.values()) + kvecs = np.meshgrid(*sub_k, indexing='ij', sparse=False) + self.rkmags = np.sqrt(sum((dki * ki)**2 for dki, ki in zip(dk, kvecs))) + + seed = kwargs.pop('seed', 13298) + self.rng = clr.ThreefryGenerator(context, seed=seed) + + def parallelize(knl): + knl = lp.fix_parameters(knl, pi=np.pi, sqrt2=np.sqrt(2.)) + knl = lp.split_iname(knl, 'k', 32, inner_tag='l.0', outer_tag='g.0') + knl = lp.split_iname(knl, 'j', 1, inner_tag='unr', outer_tag='g.1') + knl = lp.split_iname(knl, 'i', 1, inner_tag='unr', outer_tag='g.2') + return knl + + self.wkb_knl = parallelize(self.get_wkb_knl()) + self.non_wkb_knl = parallelize(self.get_non_wkb_knl()) + + def post_process(self, fk, is_real): + from pystella.fourier import gDFT + if is_real and isinstance(self.fft, gDFT): + # real fields must be Hermitian-symmetric, and it seems we + # need to do this manually when FFT'ing with gpyfft + fk = make_hermitian(fk) + # can at least do this in general + self.fft.zero_corner_modes(fk, only_imag=True) + return fk + + # wrapper to remove 1/0 and set homogeneous power to zero + def _ps_wrapper(self, ps_func, wk, kmags): + if kmags[0, 0, 0] == 0.: + wk0 = wk[0, 0, 0] + wk[0, 0, 0] = 1. + power = ps_func(wk) + if kmags[0, 0, 0] == 0.: + power[0, 0, 0] = 0. + wk[0, 0, 0] = wk0 + return power + + def generate(self, queue, random=True, field_ps=lambda kmag: 1/2/kmag, + norm=1, is_real=True, window=lambda kmag: 1.): + """ + Generate a 3-D array of Fourier modes with a given power spectrum and + random phases. + + :arg queue: A :class:`pyopencl.CommandQueue`. + + :arg random: Whether to randomly sample the Rayleigh distribution + of mode amplitudes. + Defaults to *True*. + + :arg field_ps: A :class:`callable` returning the desired + power spectrum of the field as a function of momentum ``kmag``. 
+ Defaults to the Bunch-Davies vacuum, + ``lambda kmag: 1/2/kmag``. + + :arg norm: A constant normalization factor by which to multiply all + power spectra. + Defaults to ``1``. + + :arg is_real: Whether the fields to be generated are real or complex + (in position space). + Defaults to *True*. + + .. note:: + + Currently, only ``is_real=True`` is supported. + + :arg window: A :class:`callable` window function filtering initial mode + amplitudes. + Defaults to ``lambda kmag: 1``, i.e., no filter. + + :returns: An :class:`numpy.ndarray` containing the generated Fourier modes + of the field. + """ + + amplitude_sq = norm / self.volume + kmags = self.rkmags # if is_real else self.ckmags + + rands = self.rng.uniform(queue, (2,)+kmags.shape, self.dtype) + if not random: + rands[0] = np.exp(-1) + + f_power = (amplitude_sq * window(kmags)**2 + * self._ps_wrapper(field_ps, kmags, kmags)) + + evt, (fk,) = self.non_wkb_knl(queue, rands=rands, f_power=f_power, + out_host=True) + + return self.post_process(fk, is_real) + + def init_field(self, fx, queue=None, **kwargs): + """ + A wrapper which calls :meth:`generate` to initialize a field + in Fourier space and returns its inverse Fourier transform. + + :arg fx: The array in which the field will be stored. + + The following keyword arguments are recognized: + + :arg queue: A :class:`pyopencl.CommandQueue`. + Defaults to ``fx.queue``. + + Any additional keyword arguments are passed to :meth:`generate`. + """ + + queue = queue or fx.queue + fk = self.generate(queue, **kwargs) + self.fft.idft(fk, fx) + + def init_transverse_vector(self, projector, vector, queue=None, **kwargs): + """ + A wrapper which calls :meth:`generate` to initialize a transverse + three-vector field in Fourier space and returns its inverse Fourier + transform. + Each component will have the same power spectrum. + + :arg projector: A :class:`Projector` used to project out + longitudinal components of the vector field. 
+ + :arg vector: The array in which the vector field will be stored. + + The following keyword arguments are recognized: + + :arg queue: A :class:`pyopencl.CommandQueue`. + Defaults to ``vector.queue``. + + Any additional keyword arguments are passed to :meth:`generate`. + """ + + queue = queue or vector.queue + + vector_k = cla.empty(queue, (3,)+self.fft.shape(True), self.cdtype) + + for mu in range(3): + fk = self.generate(queue, **kwargs) + vector_k[mu].set(fk) + + projector.transversify(queue, vector_k) + + for mu in range(3): + self.fft.idft(vector_k[mu], vector[mu]) + + def init_vector_from_pol(self, projector, vector, plus_ps, minus_ps, + queue=None, **kwargs): + """ + A wrapper which calls :meth:`generate` to initialize a transverse + three-vector field in Fourier space and returns its inverse Fourier + transform. + In contrast to :meth:`init_transverse_vector`, modes are generated + for the plus and minus polarizations of the vector field, from which + the vector field itself is constructed. + + :arg projector: A :class:`Projector` used to project out + longitudinal components of the vector field. + + :arg vector: The array in which the vector field will be stored. + + :arg plus_ps: A :class:`callable` returning the power spectrum of the + plus polarization as a function of momentum ``kmag``. + + :arg minus_ps: A :class:`callable` returning the power spectrum of the + minus polarization as a function of momentum ``kmag``. + + The following keyword arguments are recognized: + + :arg queue: A :class:`pyopencl.CommandQueue`. + Defaults to ``vector.queue``. + + Any additional keyword arguments are passed to :meth:`generate`. 
+ """ + + queue = queue or vector.queue + + fk = self.generate(queue, field_ps=plus_ps, **kwargs) + plus_k = cla.to_device(queue, fk) + + fk = self.generate(queue, field_ps=minus_ps, **kwargs) + minus_k = cla.to_device(queue, fk) + + vector_k = cla.empty(queue, (3,)+self.fft.shape(True), self.cdtype) + projector.pol_to_vec(queue, plus_k, minus_k, vector_k) + + for mu in range(3): + self.fft.idft(vector_k[mu], vector[mu]) + + def generate_WKB(self, queue, random=True, + field_ps=lambda wk: 1/2/wk, + norm=1, omega_k=lambda kmag: kmag, + hubble=0., is_real=True, window=lambda kmag: 1.): + """ + Generate a 3-D array of Fourier modes with a given power spectrum and + random phases, along with that of its time derivative + according to the WKB approximation (for Klein-Gordon fields in + conformal FLRW spacetime). + + Arguments match those of :meth:`generate`, with the following + exceptions/additions: + + :arg field_ps: A :class:`callable` returning the desired + power spectrum of the field as a function of :math:`\\omega(k)``. + Defaults to the Bunch-Davies vacuum, ``lambda wk: 1/2/wk``, + where ``wk=omega_k(kmag)``. + + :arg omega_k: A :class:`callable` defining the dispersion relation + of the field. + Defaults to ``lambda kmag: kmag``. + + :arg hubble: The value of the (conformal) Hubble parameter to use in + generating modes for the field's time derivative. + Only used when ``WKB=True``. + Defaults to ``0``. + + :returns: A tuple ``(fk, dfk)`` containing the generated Fourier modes + of the field and its time derivative. 
+ """ + + amplitude_sq = norm / self.volume + kmags = self.rkmags # if is_real else self.ckmags + kshape = kmags.shape + + rands = self.rng.uniform(queue, (4,)+kshape, self.dtype) + if not random: + rands[0] = rands[2] = np.exp(-1) + + wk = omega_k(kmags) + f_power = (amplitude_sq * window(kmags)**2 + * self._ps_wrapper(field_ps, wk, kmags)) + + evt, out = self.wkb_knl(queue, rands=rands, hubble=hubble, + wk=wk, f_power=f_power, out_host=True) + + fk = self.post_process(out['fk'], is_real) + dfk = self.post_process(out['dfk'], is_real) + + return fk, dfk + + def init_WKB_fields(self, fx, dfx, queue=None, **kwargs): + """ + A wrapper which calls :meth:`generate_WKB` to initialize a field and + its dime derivative in Fourier space and inverse Fourier transform + the results. + + :arg fx: The array in which the field will be stored. + + :arg dfx: The array in which the field's time derivative will + be stored. + + :arg queue: A :class:`pyopencl.CommandQueue`. + Defaults to ``fx.queue``. + + Any additional keyword arguments are passed to :meth:`generate_WKB`. 
+ """ + + queue = queue or fx.queue + fk, dfk = self.generate_WKB(queue, **kwargs) + self.fft.idft(fk, fx) + self.fft.idft(dfk, dfx) diff --git a/pystella/fourier/spectra.py b/pystella/fourier/spectra.py new file mode 100644 index 0000000..c19ac3a --- /dev/null +++ b/pystella/fourier/spectra.py @@ -0,0 +1,325 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import numpy as np +import pyopencl.array as cla +import loopy as lp + +from warnings import filterwarnings +filterwarnings('ignore', category=lp.diagnostic.LoopyAdvisory, + message="could not find a conflict-free mem layout") +from pyopencl.characterize import CLCharacterizationWarning +filterwarnings('ignore', category=CLCharacterizationWarning) + + +class PowerSpectra: + """ + A class for computing power spectra of fields. + + .. automethod:: __init__ + .. automethod:: __call__ + .. automethod:: bin_power + .. automethod:: polarization + .. 
automethod:: gw + """ + + def __init__(self, decomp, fft, dk, volume, **kwargs): + """ + :arg decomp: A :class:`DomainDecomposition`. + + :arg fft: An FFT object as returned by :func:`DFT`. + + :arg dk: A 3-:class:`tuple` of the momentum-space grid spacing of each + axis (i.e., the infrared cutoff of the grid in each direction). + + :arg volume: The physical volume of the grid. + + The following keyword-only arguments are also recognized: + + :arg bin_with: A :class:`float` specifying the bin width to use. + Defaults to ``min(dk)``. + """ + + self.decomp = decomp + self.fft = fft + self.grid_shape = fft.grid_shape + self.proc_shape = decomp.proc_shape + + self.dtype = fft.dtype + self.cdtype = fft.cdtype + self.kshape = self.fft.shape(True) + + self.dk = dk + self.bin_width = kwargs.pop('bin_width', min(dk)) + + d3x = volume / np.product(self.grid_shape) + self.norm = (1 / 2 / np.pi**2 / volume) * d3x**2 + + sub_k = list(x.get() for x in self.fft.sub_k.values()) + kvecs = np.meshgrid(*sub_k, indexing='ij', sparse=False) + rkmags = np.sqrt(sum((dki * ki)**2 for dki, ki in zip(self.dk, kvecs))) + + counts = 2. * np.ones_like(rkmags) + counts[kvecs[2] == 0] = 1. + counts[kvecs[2] == self.grid_shape[-1]//2] = 1. 
+ + from mpi4py import MPI + max_k = self.decomp.allreduce(np.max(rkmags), MPI.MAX) + self.num_bins = int(max_k / self.bin_width + .5) + 1 + bins = np.arange(-.5, self.num_bins + .5) * self.bin_width + + sub_bin_counts = np.histogram(rkmags, weights=counts, bins=bins)[0] + self.bin_counts = self.decomp.allreduce(sub_bin_counts) + + self.real_spectra_knl = self.make_spectra_knl(True, self.kshape[-1]) + # FIXME: get complex Nz better + _Nz = self.grid_shape[-1] // self.proc_shape[1] + self.complex_spectra_knl = self.make_spectra_knl(False, _Nz) + + self.pool = None + + def make_spectra_knl(self, is_real, Nz): + knl = lp.make_kernel( + "[NZ, Nx, Ny, Nz, num_bins, is_real] -> \ + { [i,j,k,b]: 0<=i k_i = momenta_x[i] + <> k_j = momenta_y[j] + <> k_k = momenta_z[k] + <> kmag = sqrt((dki * k_i)**2 + (dkj * k_j)**2 + (dkk * k_k)**2) + bin = round(kmag / bin_width) + <> count = if(is_real and k_k > 0 and k_k < NZ/2, 2., 1.) + <> power = abs(fk[i, j, k])**2 * kmag**k_power * count + temp[bin] = temp[bin] + power {id=tmp, dep=init, atomic} + end + for b + spectrum[b] = spectrum[b] + temp[b] {id=glb, dep=tmp, atomic} + end + end + """, + [ + lp.GlobalArg("spectrum", self.dtype, shape=(self.num_bins,), + for_atomic=True), + lp.GlobalArg("momenta_x", self.dtype, shape=('Nx',)), + lp.GlobalArg("momenta_y", self.dtype, shape=('Ny',)), + lp.GlobalArg("momenta_z", self.dtype, shape=('Nz',)), + lp.TemporaryVariable("temp", self.dtype, shape=(self.num_bins,), + for_atomic=True, + address_space=lp.AddressSpace.LOCAL), + lp.ValueArg("k_power, bin_width, dki, dkj, dkk", self.dtype), + "..." + ], + default_offset=lp.auto, + silenced_warnings=['write_race(tmp)', 'write_race(glb)'], + seq_dependencies=True, + lang_version=(2018, 2), + ) + # FIXME: count incorrect for complex? 
+ + knl = lp.fix_parameters(knl, NZ=self.grid_shape[-1], num_bins=self.num_bins, + dki=self.dk[0], dkj=self.dk[1], dkk=self.dk[2], + Nz=Nz, is_real=is_real) + knl = lp.split_iname(knl, "k", Nz, outer_tag="g.0", inner_tag="l.0") + knl = lp.split_iname(knl, "b", min(1024, self.num_bins), + outer_tag="g.0", inner_tag="l.0") + knl = lp.tag_inames(knl, "j:g.1") + return knl + + def bin_power(self, fk, queue=None, k_power=3, is_real=True): + """ + Computes the binned power spectrum of a momentum-space field, + + .. math:: + + \\Delta_f^2(k) + = \\frac{1}{2 \\pi^2 V} \\int \\mathrm{d} \\Omega \\, + \\left\\vert \\mathbf{k} \\right\\vert^n + \\left\\vert f(\\mathbf{k}) \\right\\vert^2 + + where ``k_power`` specifies the value of :math:`n`. + + :arg fk: The array containing the complex-valued, + momentum-space field whose power spectrum is to be computed. + + The following keyword arguments are recognized: + + :arg queue: A :class:`pyopencl.CommandQueue`. + Defaults to ``fk.queue``. + + :arg k_power: The exponent :math:`n` to use on :math:`\\vert k \\vert`. + Defaults to 3 (to compute the "dimensionless" power spectrum). + """ + + queue = queue or fk.queue + + if self.pool is None: + import pyopencl.tools as clt + self.pool = clt.MemoryPool(clt.ImmediateAllocator(queue)) + + if is_real: + evt, (spectrum,) = \ + self.real_spectra_knl(queue, allocator=self.pool, fk=fk, + k_power=k_power, **self.fft.sub_k, + bin_width=self.bin_width) + else: + raise NotImplementedError('complex spectra, at least distributed') + evt, (spectrum,) = \ + self.complex_spectra_knl(queue, allocator=self.pool, fk=fk, + k_power=k_power, **self.fft.sub_k_c, + bin_width=self.bin_width) + + full_spectrum = self.decomp.allreduce(spectrum.get()) + return full_spectrum / self.bin_counts + + def __call__(self, fx, queue=None, k_power=3): + """ + Computes the power spectrum of the position-space field ``fx`` by first + Fourier transforming ``fx`` and then calling :meth:`bin_power`. 
+ + :arg fx: The array containing the position-space field + whose power spectrum is to be computed. + If ``fx`` has more than three axes, all the outer axes are looped over. + As an example, if ``f`` has shape ``(2, 3, 130, 130, 130)``, + this method loops over the outermost two axes with shape ``(2, 3)``, and + the resulting output data would have the shape ``(2, 3, num_bins)``. + + The following keyword arguments are recognized: + + :arg queue: A :class:`pyopencl.CommandQueue`. + Defaults to ``fx.queue``. + + :arg k_power: The exponent :math:`n` to use on :math:`\\vert k \\vert`. + Defaults to 3 (to compute the "dimensionless" power spectrum). + """ + + queue = queue or fx.queue + is_real = fx.dtype == np.float64 or fx.dtype == np.float32 + + outer_shape = fx.shape[:-3] + from itertools import product + slices = list(product(*[range(n) for n in outer_shape])) + + result = np.zeros(outer_shape+(self.num_bins,), self.dtype) + for s in slices: + fk = self.fft.dft(fx[s]) + result[s] = self.bin_power(fk, queue, k_power, is_real) + + return self.norm * result + + def polarization(self, vector, projector, queue=None, k_power=3): + """ + Computes the power spectra of the plus and minus polarizations of a vector + field. + + :arg vector: The array containing the position-space vector field + whose power spectrum is to be computed. + If ``vector`` has more than four axes, all the outer axes are + looped over. + As an example, if ``vector`` has shape ``(2, 3, 3, 130, 130, 130)`` + (where the fourth-to-last axis is the vector-component axis), + this method loops over the outermost two axes with shape ``(2, 3)``, and + the resulting output data would have the shape ``(2, 3, 2, num_bins)`` + (where the second-to-last axis is the polarization axis). + + :arg projector: A :class:`Projector`. + + The remaining arguments are the same as those to :meth:`__call__`. 
+ """ + + queue = queue or vector.queue + + vec_k = cla.empty(queue, (3,)+self.kshape, self.cdtype) + # overwrite vec_k + plus = vec_k[0] + minus = vec_k[1] + + outer_shape = vector.shape[:-4] + from itertools import product + slices = list(product(*[range(n) for n in outer_shape])) + + result = np.zeros(outer_shape+(2, self.num_bins,), self.dtype) + for s in slices: + for mu in range(3): + self.fft.dft(vector[s][mu], vec_k[mu]) + + projector.vec_to_pol(queue, plus, minus, vec_k) + result[s][0] = self.bin_power(plus, queue=queue, k_power=k_power) + result[s][1] = self.bin_power(minus, queue=queue, k_power=k_power) + + return self.norm * result + + def gw(self, hij, projector, hubble, queue=None, k_power=3): + """ + Computes the present, transverse-traceless gravitational wave power spectrum. + + .. math:: + + \\Delta_t^2(k) + = \\frac{1}{24 \\pi^{2} \\mathcal{H}^{2}} + \\frac{1}{V} + \\sum_{i, j} \\int \\mathrm{d} \\Omega \\, + \\left\\vert \\mathbf{k} \\right\\vert^3 + \\left\\vert h_{i j}^{\\prime}(k) \\right \\vert^{2} + + :arg hij: The array containing the + position-space tensor field whose power spectrum is to be computed. + Must be 4-dimensional, with the first axis being length-6. + + :arg projector: A :class:`Projector`. + + :arg hubble: The current value of the conformal Hubble parameter. + + The remaining arguments are the same as those to :meth:`__call__`. 
+ """ + + queue = queue or hij.queue + + hij_k = cla.empty(queue, (6,)+self.kshape, dtype=self.cdtype) + + for mu in range(6): + self.fft.dft(hij[mu], hij_k[mu]) + + def tensor_id(i, j): + a = i if i <= j else j + b = j if i <= j else i + return (7 - a) * a // 2 - 4 + b + + gw_spec = [] + projector.transverse_traceless(queue, hij_k) + for mu in range(6): + spec = self.bin_power(hij_k[mu], queue=queue, k_power=k_power) + gw_spec.append(spec) + + gw_tot = sum(gw_spec[tensor_id(i, j)] + for i in range(1, 4) for j in range(1, 4)) + + return self.norm / 12 / hubble**2 * gw_tot diff --git a/pystella/multigrid/__init__.py b/pystella/multigrid/__init__.py new file mode 100644 index 0000000..61cdcd4 --- /dev/null +++ b/pystella/multigrid/__init__.py @@ -0,0 +1,493 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + + +import numpy as np +import pyopencl.array as cla +from pystella.multigrid.transfer import (Injection, FullWeighting, + LinearInterpolation, CubicInterpolation) +from pystella.multigrid.relax import JacobiIterator, NewtonIterator + +__doc__ = """ +.. currentmodule:: pystella.multigrid +.. autoclass:: FullApproximationScheme +.. autoclass:: MultiGridSolver + +.. _multigrid-cycles: + +Multigrid cycles +^^^^^^^^^^^^^^^^ + +Multigrid cycles are represnted as a sequence of levels to visit and how many +smoothing iterations to perform on each. +Level ``i`` denotes the level with a factor ``2**i`` fewer gridpoints +in each dimension (relative to the finest grid). +The following utilities can be used to generate particular types ofcycles +by specifying, e.g., the coarsest level to visit and how many iterations +to perform on these levels. + +.. autofunction:: mu_cycle +.. autofunction:: v_cycle +.. autofunction:: w_cycle +.. autofunction:: f_cycle +""" + + +def mu_cycle(mu, i, nu1, nu2, max_depth): + """ + A utility for generating a generic :math:`\\mu`-cycle. + + :arg mu: The order of the cycle. See... + + :arg i: The initial and final (i.e., finest) level to traverse from/to. + + :arg nu1: The number of iterations to perform on each level after a + transition to the next coarser level. + + :arg nu2: The number of iterations to perform on each level after a + transition to the next finer level. + + :arg max_depth: The lowest level to traverse to. + + :returns: A generic multigrid cycle in the form of a :class:`list` of + :class:`tuple`'s ``(level, iterations)``, representing the order of levels + to visit and how many smoothing iterations to perform on each. + """ + + if i == max_depth: + return [(i, nu2)] + else: + x = mu_cycle(mu, i+1, nu1, nu2, max_depth) + return [(i, nu1)] + x + x[1:]*(mu-1) + [(i, nu2)] + + +def v_cycle(nu1, nu2, max_depth): + """ + A utility for generating a V-cycle. 
+ + Example:: + + >>> v_cycle(10, 20, 3) + [(0, 10), (1, 10), (2, 10), (3, 20), (2, 20), (1, 20), (0, 20)] + + :arg nu1: The number of iterations to perform on each level after a + transition to the next coarser level. + + :arg nu2: The number of iterations to perform on each level after a + transition to the next finer level. + + :arg max_depth: The lowest level to traverse to. + + :returns: A V-cycle in the form of a :class:`list` of + :class:`tuple`'s ``(level, iterations)``, representing the order of levels + to visit and how many smoothing iterations to perform on each. + """ + + return mu_cycle(1, 0, nu1, nu2, max_depth) + + +def w_cycle(nu1, nu2, max_depth): + """ + A utility for generating a W-cycle. + + Example:: + + >>> w_cycle(10, 20, 3) + [(0, 10), (1, 10), (2, 10), (3, 20), (2, 20), (3, 20), (2, 20), (1, 20), + (2, 10), (3, 20), (2, 20), (3, 20), (2, 20), (1, 20), (0, 20)] + + :arg nu1: The number of iterations to perform on each level after a + transition to the next coarser level. + + :arg nu2: The number of iterations to perform on each level after a + transition to the next finer level. + + :arg max_depth: The lowest level to traverse to. + + :returns: A W-cycle in the form of a :class:`list` of + :class:`tuple`'s ``(level, iterations)``, representing the order of levels + to visit and how many smoothing iterations to perform on each. + """ + + return mu_cycle(2, 0, nu1, nu2, max_depth) + + +def _cycle(i, j, k, nu1, nu2): + down = [(a, nu1) for a in range(i, j)] + up = [(a, nu2) for a in range(j, k-1, -1)] + return down + up + + +def f_cycle(nu1, nu2, max_depth): + """ + A utility for generating a F-cycle. + + Example:: + + >>> f_cycle(10, 20, 3) + [(0, 10), (1, 10), (2, 10), (3, 20), (2, 20), (3, 20), (2, 20), (1, 20), + (2, 10), (3, 20), (2, 20), (1, 20), (0, 20)] + + :arg nu1: The number of iterations to perform on each level after a + transition to the next coarser level. 
+ + :arg nu2: The number of iterations to perform on each level after a + transition to the next finer level. + + :arg max_depth: The lowest level to traverse to. + + :returns: An F-cycle in the form of a :class:`list` of + :class:`tuple`'s ``(level, iterations)``, representing the order of levels + to visit and how many smoothing iterations to perform on each. + """ + + cycle = _cycle(0, max_depth, max_depth-1, nu1, nu2) + for top in range(max_depth-1, 0, -1): + cycle += _cycle(top+1, max_depth, top-1, nu1, nu2) + return cycle + + +class FullApproximationScheme: + """ + A class for solving generic systems of boundary-value problems using the + Full Approximation Scheme. + + .. automethod:: __init__ + .. automethod:: __call__ + + The below methods are documented for development's sake, but are not + intended to be called by the user. + + .. automethod:: coarse_array_like + .. automethod:: transfer_down + .. automethod:: transfer_up + .. automethod:: smooth + .. automethod:: coarse_level_like + .. automethod:: setup + """ + + def __init__(self, solver, h, **kwargs): + """ + :arg solver: A instance of a subclass of :class:`relax.RelaxationBase` + (e.g., :class:`JacobiIterator` or :class:`NewtonIterator`). + + :arg h: The number of halo padding layers on each face of the numerical grid. + + The following keyword-only arguments are recognized: + + :arg Restrictor: A mapper which restricts arrays from a fine + to a coarser level. + Defaults to :class:`FullWeighting`. + + :arg Interpolator: A mapper which interpolates arrays from a coarse + to a finer level. + Defaults to :class:`LinearInterpolation`. 
+ """ + + self.solver = solver + self.h = h + + Restrictor = kwargs.pop('Restrictor', FullWeighting) + self.restrict = Restrictor(h=h) + self.restrict_and_correct = Restrictor(h=h, correct=True) + + Interpolator = kwargs.pop('Interpolator', LinearInterpolation) + self.interpolate = Interpolator(h=h) + self.interpolate_and_correct = Interpolator(h=h, correct=True) + + self.unknowns = {} + self.rhos = {} + self.auxiliaries = {} + self.tmp = {} + self.resid = {} + self.dx = {} + self.decomp = {} + self.smooth_args = {} + self.resid_args = {} + + def coarse_array_like(self, f1h): + """ + :arg f1h: A :class:`pyopencl.array.Array`. + Its unpadded shape will be inferred by subtracting ``2 * self.h`` + from each axis of its shape. + + :returns: A :class:`pyopencl.array.Array` with padded shape for a + grid with half as many points in each dimension of ``f1h``. + """ + + def halve_and_pad(i): + return (i - 2 * self.h)//2 + 2 * self.h + + coarse_shape = tuple(map(halve_and_pad, f1h.shape)) + f2h = cla.zeros(f1h.queue, shape=coarse_shape, dtype=f1h.dtype) + return f2h + + def transfer_down(self, queue, i): + """ + Transfers all arrays from a fine to the next-coarser level. + + :arg queue: A :class:`pyopencl.CommandQueue`. + + :arg i: The level from to transfer to. + """ + + for k, f1 in self.unknowns[i-1].items(): + f2 = self.unknowns[i][k] + self.restrict(queue, f1=f1, f2=f2) + self.decomp[i].share_halos(queue, f2) + + self.solver.residual(queue, **self.resid_args[i-1]) + + for k, r1 in self.resid[i-1].items(): + r2 = self.resid[i][k] + self.decomp[i-1].share_halos(queue, r1) + self.restrict(queue, f1=r1, f2=r2) + + self.solver.lhs_correction(queue, **self.resid_args[i]) + for k, rho in self.rhos[i].items(): + self.decomp[i].share_halos(queue, rho) + + def transfer_up(self, queue, i): + """ + Transfers all arrays from a coarse to the next-finer level. + + :arg queue: A :class:`pyopencl.CommandQueue`. + + :arg i: The level from to transfer to. 
+ """ + + for k, f1 in self.unknowns[i].items(): + f2 = self.unknowns[i+1][k] + self.restrict_and_correct(queue, f1=f1, f2=f2) + self.decomp[i+1].share_halos(queue, f2) + self.interpolate_and_correct(queue, f1=f1, f2=f2) + self.decomp[i].share_halos(queue, f1) + + def smooth(self, queue, i, nu): + """ + Invokes the relaxation solver, computing the error before and after. + + :arg queue: A :class:`pyopencl.CommandQueue`. + + :arg i: On which level to perform the smoothing. + + :arg nu: The number of smoothing iterations to perform. + + :returns: A list containing the errors before and after of the form + ``[(i, error_before), (i, error_after)]``. + """ + + errs1 = self.solver.get_error(queue, **self.resid_args[i]) + self.solver(self.decomp[i], queue, iterations=nu, **self.smooth_args[i]) + errs2 = self.solver.get_error(queue, **self.resid_args[i]) + return [(i, errs1), (i, errs2)] + + def coarse_level_like(self, dict_1): + """ + A wrapper to :meth:`coarse_array_like` with returns a :class:`dict` + like ``dict_1`` whose values are new :class:`pyopencl.array.Array`'s + with shape appropriate for the next-coarser level. + """ + + dict_2 = {} + for k, f1 in dict_1.items(): + dict_2[k] = self.coarse_array_like(f1) + return dict_2 + + def setup(self, decomp0, queue, dx0, depth, **kwargs): + """ + Performs the inital setup and array allocation for each required level. + Creates instances of :class:`~pystella.DomainDecomposition` for each level + and all arrays needed on each level. + Called automatically by :meth:`__call__`. + + :arg decomp0: An instance of :class:`~pystella.DomainDecomposition` + constructed for the finest level. + + :arg queue: A :class:`pyopencl.CommandQueue`. + + :arg dx0: The grid-spacing on the finest level. + + :arg depth: The coarsest level to traverse to. That is, the deepest level + which will be used has a factor ``2**depth`` fewer gridpoints than the + finest level. + + All unknowns and ``rho`` arrays must be passed by keyword. 
Any additional keyword arguments are interpreted as auxiliary arrays
+ + :arg decomp0: An instance of :class:`~pystella.DomainDecomposition` + constructed for the finest level. + + :arg queue: A :class:`pyopencl.CommandQueue`. + + :arg dx0: The grid-spacing on the finest level. + + :arg cycle: The multigrid cycle to execute. + See :ref:`multigrid-cycles` for details on how these are specified + and for utilities to generate them. + + All required arrays must be passed by keyword. + """ + + if cycle is None: + grid_shape = tuple(ni * pi + for ni, pi in zip(decomp0.rank_shape, + decomp0.proc_shape)) + depth = int(np.log2(min(grid_shape) / 8)) + cycle = v_cycle(25, 50, depth) + + depth = max([i for i, nu in cycle]) + self.setup(decomp0, queue, dx0, depth, **kwargs) + + nu0 = cycle[0][1] + level_errors = self.smooth(queue, 0, nu0) + + previous = 0 + for i, nu in cycle[1:]: + if i == previous + 1: + self.transfer_down(queue, i) + elif i == previous - 1: + self.transfer_up(queue, i) + else: + raise ValueError('consecutive levels must be spaced by one') + level_errors += self.smooth(queue, i, nu) + previous = i + + return level_errors + + +class MultiGridSolver(FullApproximationScheme): + """ + A class for solving systems of linear boundary-value problems using linear + Multigrid. + Usage is identical to :class:`FullApproximationScheme`. + + .. warning:: + + Convergence is currently slower than expected, suggesting a possible + problem with the lower levels. + :class:`FullApproximationScheme` is perfectly suited to solve linear problems + as well. + + The scheme is implemented by subclassing :class:`FullApproximationScheme`, with + the only differences in the level transfer functionality (which are not intended + to be called by the user). + + .. automethod transfer_down + .. automethod transfer_up + """ + + # FIXME: convergence slow, possible issue with coarse levels? 
+ def transfer_down(self, queue, i): + self.solver.residual(queue, **self.resid_args[i-1]) + + for f, rho in self.solver.f_to_rho_dict.items(): + r1 = self.resid[i-1]['r_'+f] + self.decomp[i-1].share_halos(queue, r1) + r2 = self.rhos[i][rho] + self.restrict(queue, f1=r1, f2=r2) + self.decomp[i].share_halos(queue, r2) + + def transfer_up(self, queue, i): + for k, f1 in self.unknowns[i].items(): + f2 = self.unknowns[i+1][k] + self.interpolate_and_correct(queue, f1=f1, f2=f2) + self.decomp[i].share_halos(queue, f1) + + +__all__ = [ + 'Injection', + 'FullWeighting', + 'LinearInterpolation', + 'CubicInterpolation', + 'JacobiIterator', + 'NewtonIterator', + 'FullApproximationScheme', + 'MultiGridSolver', + 'v_cycle', + 'w_cycle', + 'f_cycle', +] diff --git a/pystella/multigrid/relax.py b/pystella/multigrid/relax.py new file mode 100644 index 0000000..e44739e --- /dev/null +++ b/pystella/multigrid/relax.py @@ -0,0 +1,376 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import numpy as np +import loopy as lp +from pystella import Field, get_field_args, Stencil + +__doc__ = """ +.. currentmodule:: pystella.multigrid +.. autoclass:: pystella.multigrid.relax.RelaxationBase +.. autoclass:: JacobiIterator +.. autoclass:: NewtonIterator +""" + + +class RelaxationBase: + """ + Base class for relaxation-based iterative solvers to solve + boundary-value problems of the form + + .. math:: + + L(f) = \\rho. + + Here :math:`\\rho` is not a function of :math:`f`, but :math:`L(f)` + may in principle be an arbitrary (nonlinear differential) function + of :math:`f` (assuming a subclass's implemented solver is appropriate + for such an equation). + + .. automethod:: __init__ + .. automethod:: __call__ + .. automethod:: get_error + + A subclass implements a particular iterative solver by providing + a :meth:`step_operator` method. + + .. automethod:: step_operator + + The below methods are documented for development's sake, but are not + intended to be called by the user: + + .. automethod:: make_stepper + .. automethod:: make_lhs_kernel + .. automethod:: make_residual_kernel + .. automethod:: make_resid_stats + + The following methods related to solving additional constraints on + systems with periodic boundary conditions are incomplete: + + .. automethod:: make_shift_kernel + .. automethod:: eval_constraint + .. automethod:: solve_constraint + """ + + def __init__(self, decomp, queue, lhs_dict, MapKernel=Stencil, **kwargs): + """ + :arg decomp: A :class:`~pystella.DomainDecomposition`. + + :arg queue: A :class:`pyopencl.CommandQueue`. 
+ + :arg lhs_dict: A :class:`dict` representing the set of equations to be + solved, whose keys must be :class:`~pystella.Field`'s representing the + unknown degrees of freedom and values are :class:`tuple`'s + ``(lhs, rho)`` representing the left-hand side :math:`L(f)` + and right-hand side :math:`\\rho` of that unknown's equation. + + The following keyword arguments are recognized: + + :arg MapKernel: The kernel class which the required mapping kernels will + be instances of---i.e., one of :class:`~pystella.ElementWiseMap` or its + subclasses. Defaults to :class:`~pystella.Stencil`. + + :arg unknown_args: A list of :class:`loopy.ArrayArg`'s representing + the unknown degrees of freedom. + Defaults to *None*, in which case the correct arguments + (in particular, their shapes) are (attempted to be) inferred + from the keys of ``lhs_dict``. + + :arg rho_args: A list of :class:`loopy.ArrayArg`'s representing + the static right-hand side arrays (i.e., those independent + of the degrees of freedom). + Defaults to *None*, in which case the correct arguments + (in particular, their shapes) are (attempted to be) inferred + from the values of ``lhs_dict``. + + Any remaining keyword arguments are passed to each of the kernel + creation routines. 
+ """ + + self.decomp = decomp + self.lhs_dict = lhs_dict + self.h = kwargs.get('h') + + # get GlobalArgs of unknowns, or infer from lhs_dict.keys() + self.unknown_args = kwargs.pop('unknown_args', None) + if self.unknown_args is None: + self.unknown_args = get_field_args(list(lhs_dict.keys())) + + def array_args_like(args, prefix='', suffix=''): + return [lp.GlobalArg(prefix+arg.name+suffix, + shape=arg.shape, dtype=arg.dtype) + for arg in args] + + self.temp_args = array_args_like(self.unknown_args, prefix='tmp_') + self.residual_args = array_args_like(self.unknown_args, prefix='r_') + + # get GlobalArgs of unknowns, or infer from lhs_dict.keys() + self.rho_args = kwargs.pop('rho_args', None) + if self.rho_args is None: + rho_list = [lhs[1] for lhs in lhs_dict.values()] + self.rho_args = get_field_args(rho_list) + + self.f_to_rho_dict = {} + for f, (lhs, rho) in self.lhs_dict.items(): + self.f_to_rho_dict[f.child.name] = rho.child.name + + self.make_stepper(MapKernel, **kwargs) + self.make_lhs_kernel(MapKernel, **kwargs) + self.make_residual_kernel(MapKernel, **kwargs) + self.make_resid_stats(decomp, queue, **kwargs) + self.make_shift_kernel(**kwargs) + + def step_operator(self, f, lhs, rho): + """ + :arg f: The unknown field for which a relaxation step instruction + will be generated. + + :arg lhs: :math:`L(f)` for the unknown ``f``'s equation. + + :arg rho: :math:`\\rho` for the unknown ``f``'s equation. 
+ """ + + raise NotImplementedError + + def make_stepper(self, MapKernel, **kwargs): + self.step_dict = {} + for f, (lhs, rho) in self.lhs_dict.items(): + tmp = Field('tmp_'+f.child.name, offset=f.offset) + self.step_dict[tmp] = self.step_operator(f, lhs, rho) + + args = self.unknown_args + self.rho_args + self.temp_args + self.stepper = MapKernel(self.step_dict, args=args, **kwargs) + + def step(self, queue, **kwargs): + self.stepper(queue, **kwargs) + + def __call__(self, decomp, queue, iterations=100, **kwargs): + """ + Executes a number of iterations of relaxation. + + :arg decomp: A :class:`~pystella.DomainDecomposition`. + + .. note:: + + ``decomp`` is intended to (and should) be different from the + :attr:`decomp` passed to :meth:`__init__`, as each multigrid level + requires a different :class:`~pystella.DomainDecomposition`. + + :arg queue: A :class:`pyopencl.CommandQueue`. + + The following keyword arguments are recognized: + + :arg iterations: The number of iterations to execute. + Defaults to ``100``. + + :arg solve_constraint: + Defaults to *False*. + + All arrays required for the relaxation step must be passed by keyword. 
+ """ + + solve_constraint = kwargs.pop('solve_constraint', False) + + even_iterations = iterations if iterations % 2 == 0 else iterations + 1 + for i in range(even_iterations): + self.stepper(queue, **kwargs) + for arg in self.unknown_args: + f = arg.name + kwargs[f], kwargs['tmp_'+f] = kwargs['tmp_'+f], kwargs[f] + decomp.share_halos(queue, kwargs[f]) + + if solve_constraint: + self.solve_constraint(queue, **kwargs) + + def make_lhs_kernel(self, MapKernel, **kwargs): + tmp_dict = {} + lhs_dict = {} + from pymbolic import var + tmp_lhs = var('tmp_lhs') + for i, (f, (lhs, rho)) in enumerate(self.lhs_dict.items()): + tmp_dict[tmp_lhs[i]] = lhs + resid = Field('r_'+f.child.name, offset='h') + lhs_dict[rho] = resid + tmp_lhs[i] + + args = self.unknown_args + self.rho_args + self.residual_args + self.lhs_correction = MapKernel(lhs_dict, tmp_dict=tmp_dict, args=args, + **kwargs) + + def make_residual_kernel(self, MapKernel, **kwargs): + residual_dict = {} + for f, (lhs, rho) in self.lhs_dict.items(): + resid = Field('r_'+f.child.name, offset='h') + residual_dict[resid] = rho - lhs + + args = self.unknown_args + self.rho_args + self.residual_args + self.residual = MapKernel(residual_dict, args=args, **kwargs) + + def make_resid_stats(self, decomp, queue, dtype, **kwargs): + reducers = {} + avg_reducers = {} + # from pymbolic.functions import fabs + from pymbolic import var + fabs = var('fabs') + for arg in self.unknown_args: + f = arg.name + resid = Field('r_'+f, offset='h') + reducers[f] = [(fabs(resid), 'max'), (resid**2, 'sum')] + avg_reducers[f] = [(resid, 'sum')] + + args = self.residual_args + from pystella import Reduction + self.resid_stats = Reduction(decomp, reducers, args=args, **kwargs) + self.avg_resid = Reduction(decomp, avg_reducers, args=args, **kwargs) + + def get_error(self, queue, **kwargs): + """ + Computes statistics of the current residual, :math:`L(f) - \\rho`. + + :arg queue: A :class:`pyopencl.CommandQueue`. 
+ + All required arrays must be passed by keyword. + + :returns: A :class:`dict` whose values are :class:`list`'s of the + :math:`L_\\infty` (maximum absolute) and :math:`L_2` (Euclidean) + norms of the residual equation corresponding to the unknown denoted + by the keys of the dictionary. + """ + + self.residual(queue, **kwargs, filter_args=True) + + padded_shape = kwargs.get(self.unknown_args[0].name).shape + rank_shape = tuple(i - 2 * self.h for i in padded_shape) + grid_size = np.product(self.decomp.proc_shape) * np.product(rank_shape) + errs = self.resid_stats(queue, **kwargs, filter_args=True, + rank_shape=rank_shape, grid_size=grid_size) + for k, v in errs.items(): + errs[k][1] = v[1]**.5 + + return errs + + def make_shift_kernel(self, **kwargs): + f = Field('f', offset=0) + tmp = Field('tmp', offset=0) + from pymbolic import var + shift = var('shift') + scale = var('scale') + self.shift_dict = {tmp: scale * f + shift} + + args = ['...'] + from pystella import ElementWiseMap + self.shifter = ElementWiseMap(self.shift_dict, args=args, **kwargs) + + def eval_constraint(self, queue, shifts, scales, **kwargs): + for arg, shift, scale in zip(self.unknown_args, shifts, scales): + f = arg.name + self.shifter(queue, f=kwargs[f], tmp=kwargs['tmp_'+f], + shift=np.array(shift), scale=np.array(scale)) + + padded_shape = kwargs.get(self.unknown_args[0].name).shape + rank_shape = tuple(i - 2 * self.h for i in padded_shape) + grid_size = np.product(self.decomp.proc_shape) * np.product(rank_shape) + + args_to_avg_resid = kwargs.copy() + for arg in self.unknown_args: + f = arg.name + args_to_avg_resid[f] = kwargs['tmp_'+f] + + result = self.avg_resid(queue, **args_to_avg_resid, filter_args=True, + rank_shape=rank_shape, grid_size=grid_size) + return result['avg'] + + def solve_constraint(self, queue, **kwargs): + raise NotImplementedError('constraint solving untested') + + def integral_condition(shifts): + scales = np.ones_like(shifts) + avg = self.eval_constraint(queue, 
**kwargs, shifts=shifts, scales=scales) + return np.sum(avg) + + from scipy.optimize import root_scalar + x0 = np.zeros(len(self.unknown_args)) + x1 = x0 + 1.e-3 + x0 += - 1.e-3 + sol = root_scalar(integral_condition, x0=x0, x1=x1, method='secant') + if not sol.converged: + print(sol) + else: + shifts = sol.root + scales = np.ones_like(shifts) + for arg, shift, scale in zip(self.unknown_args, shifts, scales): + f = arg.name + self.shifter(queue, f=kwargs[f], tmp=kwargs[f], + shift=np.array(shift), scale=np.array(scale)) + + +class JacobiIterator(RelaxationBase): + """ + A subclass of :class:`RelaxationBase` which implements (damped) Jacobi iteration + for linear systems of the form :math:`L f = \\rho`, where :math:`L` is a linear + operator. + A step of Jacobi iteration takes the form + + .. math:: + + f \\leftarrow (1 - \\omega) f + + \\omega D^{-1} \\left( \\rho - (L - D) f \\right) + + where :math:`D` is the diagonal part of :math:`L`. + In practice :math:`D` is computed by differentiating :math:`L f` with respect to + :math:`f`, which is inappropriate for nonlinear system (which Jacobi + iteration is not intended for). + """ + + def step_operator(self, f, lhs, rho): + from pystella import diff + D = diff(lhs, f) + R_y = lhs - D * f # FIXME: only valid for linear equations + + from pymbolic import var + omega = var('omega') + + return (1 - omega) * f + omega * (rho - R_y) / D + + +class NewtonIterator(RelaxationBase): + """ + A subclass of :class:`RelaxationBase` which implements Newton iteration + for arbitrary systems of the form :math:`L(f) = \\rho`, where :math:`L` + is a generic function of :math:`f`. + A step of Newton iteration takes the form + + .. 
math:: + + f \\leftarrow f + - \\omega \\frac{L(f) - \\rho}{\\partial L(f) / \\partial f} + + """ + + def step_operator(self, f, lhs, rho): + from pystella import diff + D = diff(lhs, f) + + from pymbolic import var + omega = var('omega') + + return f - omega * (lhs - rho) / D diff --git a/pystella/multigrid/transfer.py b/pystella/multigrid/transfer.py new file mode 100644 index 0000000..7e495d0 --- /dev/null +++ b/pystella/multigrid/transfer.py @@ -0,0 +1,265 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import loopy as lp +from pystella import Field +from pystella import Stencil, ElementWiseMap +from pystella.derivs import expand_stencil + +__doc__ = """ +.. currentmodule:: pystella.multigrid +.. autofunction:: pystella.multigrid.transfer.RestrictionBase +.. autofunction:: FullWeighting +.. autofunction:: Injection +.. autofunction:: pystella.multigrid.transfer.InterpolationBase +.. 
autofunction:: LinearInterpolation +.. autofunction:: CubicInterpolation +""" + + +def RestrictionBase(coefs, StencilKernel, h, **kwargs): + """ + A base function for generating a restriction kernel. + + :arg coefs: The coefficients representing the restriction formula. + Follows the convention of :func:`pystella.derivs.centered_diff` + (since the restriction is applied recursively in each dimension). + + :arg StencilKernel: The stencil mapper to create an instance of. + Defaults to :class:`~pystella.Stencil`. + + :arg h: The number of halo padding layers on each face of the numerical grid. + + :arg lsize: The shape of prefetched arrays in shared memory. + See :class:`~pystella.ElementWiseMap`. + Defaults to ``(4, 4, 4)``. + + :arg correct: A :class:`bool` determining whether to produce a kernel which + corrects an output array by the restricted array, or to only perform + strict restriction. + Defaults to *False*. + + :returns: An instance of ``StencilKernel`` which executes the requested + restriction. 
+ """ + + lsize = kwargs.pop('lsize', (4, 4, 4)) + + # ensure grid dimensions are *not* passed, as they will be misinterpreted + for N in ['Nx', 'Ny', 'Nz']: + _ = kwargs.pop(N, None) + + restrict_coefs = {} + for a, c_a in coefs.items(): + for b, c_b in coefs.items(): + for c, c_c in coefs.items(): + restrict_coefs[(a, b, c)] = c_a * c_b * c_c + + from pymbolic import parse, var + i, j, k = parse('i, j, k') + f1 = Field('f1', offset='h', indices=(2*i, 2*j, 2*k)) + f2 = Field('f2', offset='h') + tmp = var('tmp') + + tmp_dict = {tmp: expand_stencil(f1, restrict_coefs)} + + if kwargs.pop('correct', False): + restrict_dict = {f2: f2 - tmp} + else: + restrict_dict = {f2: tmp} + + args = [lp.GlobalArg('f1', shape='(2*Nx+2*h, 2*Ny+2*h, 2*Nz+2*h)'), + lp.GlobalArg('f2', shape='(Nx+2*h, Ny+2*h, Nz+2*h)')] + + if isinstance(StencilKernel, Stencil): + return StencilKernel(restrict_dict, tmp_dict=tmp_dict, args=args, + prefetch_args=['f1'], h=h, lsize=lsize, + **kwargs) + else: + return StencilKernel(restrict_dict, tmp_dict=tmp_dict, args=args, + h=h, lsize=lsize, **kwargs) + + +def FullWeighting(StencilKernel=Stencil, **kwargs): + """ + Creates a full-weighting restriction kernel, which restricts in input array + :math:`f^{(h)}` on the fine grid into an array :math:`f^{(2 h)}` on the + coarse grid by applying + + .. math:: + + f^{(2 h)}_i + = \\frac{1}{4} f^{(h)}_{2 i - 1} + + \\frac{1}{2} f^{(h)}_{2 i} + + \\frac{1}{4} f^{(h)}_{2 i + 1} + + in each dimension. + + See :class:`transfer.RestrictionBase`. + """ + + from pymbolic.primitives import Quotient + coefs = {-1: Quotient(1, 4), 0: Quotient(1, 2), 1: Quotient(1, 4)} + return RestrictionBase(coefs, StencilKernel, **kwargs) + + +def Injection(StencilKernel=ElementWiseMap, **kwargs): + """ + Creates an injection kernel, which restricts in input array + :math:`f^{(h)}` on the fine grid into an array :math:`f^{(2 h)}` on the + coarse grid by direct injection: + + .. 
math:: + + f^{(2 h)}_{i, j ,k} + = f^{(h)}_{2 i, 2 j, 2 k} + + See :class:`transfer.RestrictionBase`. + """ + + coefs = {0: 1} + return RestrictionBase(coefs, StencilKernel, **kwargs) + + +def InterpolationBase(even_coefs, odd_coefs, StencilKernel, h, **kwargs): + """ + A base function for generating a restriction kernel. + + :arg even_coefs: The coefficients representing the interpolation formula + for gridpoints on the coarse and fine grid which coincide in space. + Follows the convention of :func:`pystella.derivs.centered_diff` + (since the restriction is applied recursively in each dimension). + + :arg odd_coefs: Same as ``even_coefs``, but for points on the fine grid which + lie between points on the coarse grid. + + :arg StencilKernel: The stencil mapper to create an instance of. + Defaults to :class:`~pystella.Stencil`. + + :arg h: The number of halo padding layers on each face of the numerical grid. + + :arg correct: A :class:`bool` determining whether to produce a kernel which + corrects an output array by the interpolated array, or to only perform + strict interpolation. + Defaults to *False*. + + :returns: An instance of ``StencilKernel`` which executes the requested + interpolation. 
+ """ + + from pymbolic import parse, var + i, j, k = parse('i, j, k') + f1 = Field('f1', offset='h') + + tmp_dict = {} + tmp = var('tmp') + + import itertools + for parity in tuple(itertools.product((0, 1), (0, 1), (0, 1))): + result = 0 + for a, c_a in odd_coefs.items() if parity[0] else even_coefs.items(): + for b, c_b in odd_coefs.items() if parity[1] else even_coefs.items(): + for c, c_c in odd_coefs.items() if parity[2] else even_coefs.items(): + f2 = Field('f2', offset='h', + indices=((i+a)//2, (j+b)//2, (k+c)//2)) + result += c_a * c_b * c_c * f2 + + tmp_dict[tmp[parity]] = result + + def is_odd(expr): + from pymbolic.primitives import If, Comparison, Remainder + return If(Comparison(Remainder(expr, 2), '==', 1), 1, 0) + + a, b, c = parse('a, b, c') + for ind, val in zip((i, j, k), (a, b, c)): + tmp_dict[val] = is_odd(ind) + + if kwargs.pop('correct', False): + interp_dict = {f1: f1 + tmp[a, b, c]} + else: + interp_dict = {f1: tmp[a, b, c]} + + args = [lp.GlobalArg('f1', shape='(Nx+2*h, Ny+2*h, Nz+2*h)'), + lp.GlobalArg('f2', shape='(Nx//2+2*h, Ny//2+2*h, Nz//2+2*h)')] + + return StencilKernel(interp_dict, tmp_dict=tmp_dict, args=args, + prefetch_args=['f2'], h=h, **kwargs) + + +def LinearInterpolation(StencilKernel=Stencil, **kwargs): + """ + Creates an linear interpolation kernel, which interpolates in input array + :math:`f^{(h)}` on the fine grid into an array :math:`f^{(2 h)}` on the + coarse grid via + + .. math:: + + f^{(h)}_{2 i} + &= f^{(2 h)}_{i} + + f^{(h)}_{2 i + 1} + &= \\frac{1}{2} f^{(2 h)}_{i} + \\frac{1}{2} f^{(2 h)}_{i + 1} + + in each dimension. + + See :class:`transfer.InterpolationBase`. 
+ """ + + from pymbolic.primitives import Quotient + odd_coefs = {-1: Quotient(1, 2), 1: Quotient(1, 2)} + even_coefs = {0: 1} + + return InterpolationBase(even_coefs, odd_coefs, StencilKernel, **kwargs) + + +def CubicInterpolation(StencilKernel=Stencil, **kwargs): + """ + Creates an cubic interpolation kernel, which interpolates in input array + :math:`f^{(h)}` on the fine grid into an array :math:`f^{(2 h)}` on the + coarse grid via + + .. math:: + + f^{(h)}_{2 i} + &= f^{(2 h)}_{i} + + f^{(h)}_{2 i + 1} + &= - \\frac{1}{16} f^{(2 h)}_{i - 1} + + \\frac{9}{16} f^{(2 h)}_{i} + + \\frac{9}{16} f^{(2 h)}_{i + 1} + - \\frac{1}{16} f^{(2 h)}_{i + 2} + + in each dimension. + + See :class:`transfer.InterpolationBase`. + """ + + if kwargs.get('h', 0) < 2: + raise ValueError('CubicInterpolation requires padding >= 2') + + from pymbolic.primitives import Quotient + odd_coefs = {-3: Quotient(-1, 16), -1: Quotient(9, 16), + 1: Quotient(9, 16), 3: Quotient(-1, 16)} + even_coefs = {0: 1} + + return InterpolationBase(even_coefs, odd_coefs, StencilKernel, **kwargs) diff --git a/pystella/output.py b/pystella/output.py new file mode 100644 index 0000000..fe1e5d8 --- /dev/null +++ b/pystella/output.py @@ -0,0 +1,177 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import numpy as np +import h5py + + +def get_versions(dependencies): + import importlib + import pkg_resources + from pytools import find_module_git_revision + versions = {} + git_revs = {} + for dep in dependencies: + try: + versions[dep] = pkg_resources.get_distribution(dep).version + except ModuleNotFoundError: + versions[dep] = None + try: + file = importlib.import_module(dep.replace('.', '')).__file__ + git_revs[dep] = find_module_git_revision(file, n_levels_up=1) + except ModuleNotFoundError: + git_revs[dep] = None + return versions, git_revs + + +def append(dset, data): + dset.resize(dset.shape[0]+1, axis=0) + dset[-1] = data + + +class OutputFile(h5py.File): + """ + A wrapper to :class:`h5py:File` which collects and saves useful run + information and provides functionality to append to datasets. + + .. automethod:: __init__ + .. automethod:: output + """ + + def create_from_kwargs(self, group, **kwargs): + self.create_group(group) + for key, val in kwargs.items(): + if not isinstance(val, np.ndarray): + val = np.array(val) + shape = (0,) + val.shape + maxshape = (None,) + val.shape + self[group].create_dataset(key, shape=shape, dtype=val.dtype, + maxshape=maxshape, chunks=True) + + def __init__(self, context=None, name=None, runfile=None, **kwargs): + """ + No arguments are required, but the following keyword arguments are + recognized: + + :arg context: A :class:`pyopencl.Context`. 
If not *None*, information + about the device, driver, and platform is saved to the + :attr:`attrs` dictionary. + Defaults to *None*. + + :arg name: The name of the ``.h5`` (sans the extension) file to create. + If *None*, a unique filename is chosen based on the current date and + time. + Defaults to *None*. + + :arg runfile: A file whose content will be saved as a string to + ``attrs['runfile']``, if not *None*. Useful for attaching the run file + of a simulation to its output. + Defaults to *None*. + + Any remaining keyword arguments are saved to the :attr:`attrs` dictionary. + If any value ``val`` is not of valid type to be saved, the ``val.__name__`` + attribute is saved if the value is a :class:`type` instance, or else + ``str(val)`` is saved. + + Versions and git revisions (when available) of :mod:`pystella` and its + dependencies are saved as ``'versions'`` and ``'git_revs'`` + :class:`h5py:Dataset`'s. The hostname is recorded in the ``'hostname'`` + key of the :attr:`attrs` dictionary. 
+ """ + + if name is None: + import datetime + name = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + + while True: + try: + filename = name + '.h5' + super().__init__(filename, 'x') + break + except OSError: + import time + time.sleep(1) + name = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + + if context is not None: + device, = context.devices + self.attrs['device'] = device.name + self.attrs['driver_version'] = device.driver_version + self.attrs['platform_version'] = device.platform.version + + import socket + self.attrs['hostname'] = socket.gethostname() + + for key, val in kwargs.items(): + try: + self.attrs[key] = val + except: # noqa + if isinstance(val, type): + self.attrs[key] = val.__name__ + else: + self.attrs[key] = str(val) + + if runfile is not None: + fp = open(runfile, "r") + content = fp.read() + fp.close() + self.attrs['runfile'] = content + + # output current dependency versions + dependencies = ['pystella', 'numpy', 'scipy', + 'pyopencl', 'loo.py', 'pymbolic', + 'mpi4py', 'gpyfft', 'mpi4py_fft', 'h5py'] + versions, git_revs = get_versions(dependencies) + + self.create_group('versions') + for k, v in versions.items(): + self['versions'][k] = v + + self.create_group('git_revs') + for k, v in git_revs.items(): + self['git_revs'][k] = '' if v is None else v + + def output(self, group, **kwargs): + """ + Appends values to datasets within a :class:`h5py:Group` named ``group``. + ``group`` is created if it does not exist, and the :class:`h5py:Dataset`'s + of this :class:`h5py:Group` are determined by the keys of keyword arguments. + If ``group`` already exists, iterates over each :class:`h5py:Dataset` and + appends values from keyword arguments (matching :class:`h5py:Dataset` + names to keys). + + :arg group: The :class:`h5py:Group` to append :class:`h5py:Dataset` + values to. + + If ``group`` already exists, a keyword argument for each + :class:`h5py:Dataset` in ``group`` must be provided. 
+ """ + + # create group and datasets if they don't exist + if group not in self: + self.create_from_kwargs(group, **kwargs) + + # ensure that all fields are provided + for key in self[group]: + val = kwargs.pop(key) + append(self[group][key], val) diff --git a/pystella/reduction.py b/pystella/reduction.py new file mode 100644 index 0000000..dbb2d45 --- /dev/null +++ b/pystella/reduction.py @@ -0,0 +1,356 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import numpy as np +import pyopencl.array as cla +import loopy as lp + +from warnings import filterwarnings +from loopy.diagnostic import ParameterFinderWarning +filterwarnings('ignore', category=ParameterFinderWarning) + +__doc__ = """ +.. currentmodule:: pystella +.. 
def get_mpi_reduction_op(op):
    """Map the reduction name *op* to the corresponding :mod:`mpi4py` operator.

    :arg op: One of ``"sum"``, ``"product"``, ``"max"``, or ``"min"``.

    :raises NotImplementedError: if *op* is not one of the supported names.
    """

    # imported lazily so the module remains importable without mpi4py
    from mpi4py import MPI
    _MPI_REDUCTION_OPS = {
        "sum": MPI.SUM,
        "product": MPI.PROD,
        "max": MPI.MAX,
        "min": MPI.MIN,
    }

    try:
        return _MPI_REDUCTION_OPS[op]
    except KeyError:
        raise NotImplementedError('MPI allreduce for operation %s' % op)


def get_numpy_reduction_op(op):
    """Map the reduction name *op* to the corresponding :mod:`numpy` function.

    :arg op: One of ``"sum"``, ``"product"``, ``"max"``, or ``"min"``.

    :raises NotImplementedError: if *op* is not one of the supported names.
    """

    _NUMPY_REDUCTION_OPS = {
        "sum": np.sum,
        "product": np.prod,
        "max": np.max,
        "min": np.min,
    }

    try:
        return _NUMPY_REDUCTION_OPS[op]
    except KeyError:
        raise NotImplementedError('numpy reduction for operation %s' % op)


def red_stmnt(assignee, expr, op):
    """Return a :class:`loopy.Assignment` assigning to *assignee* the
    reduction of *expr* over the ``i`` iname.

    :arg assignee: The left-hand side of the generated assignment.
    :arg expr: The expression to reduce; passed through
        :func:`pystella.Indexer` before being wrapped in the reduction.
    :arg op: The name of the :mod:`loopy` reduction operation (e.g., ``"sum"``).
    """

    from pystella import Indexer
    # note: ('i',) -- the original ('i') is just the string 'i', not a
    # one-tuple of inames (loopy tolerates a bare string, but the tuple is
    # what was clearly intended)
    red = lp.symbolic.Reduction(operation=op,
                                inames=('i',),
                                expr=Indexer(expr),
                                allow_simultaneous=True)
    return lp.Assignment(assignee, red)
import numpy as np
import loopy as lp
from pystella.field import Field, Indexer
from pystella.elementwise import ElementWiseMap
from pymbolic import var

__doc__ = """
.. currentmodule:: pystella.step
.. autoclass:: Stepper

Classical Runge-Kutta methods
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. autoclass:: RungeKuttaStepper
.. currentmodule:: pystella
.. autoclass:: RungeKutta4
.. autoclass:: RungeKutta3SSP
.. autoclass:: RungeKutta3Heun
.. autoclass:: RungeKutta3Nystrom
.. autoclass:: RungeKutta3Ralston
.. autoclass:: RungeKutta2Midpoint
.. autoclass:: RungeKutta2Ralston

Low-storage Runge-Kutta methods
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. currentmodule:: pystella.step
.. autoclass:: LowStorageRKStepper
.. currentmodule:: pystella
.. autoclass:: LowStorageRK54
.. autoclass:: LowStorageRK3Williamson
.. autoclass:: LowStorageRK3Inhomogeneous
.. autoclass:: LowStorageRK3SSP
"""


class Stepper:
    """
    The base class for time steppers, with no implementation of a particular time
    stepper. Currently, only explicit timesteppers are supported.

    .. automethod:: __init__
    .. automethod:: __call__

    .. attribute:: num_stages

        The number of substeps/stages per timestep.

    .. attribute:: expected_order

        The expected convergence order of *global* error, i.e.
        :math:`n` such that the global error is :math:`\\mathcal{O}(\\Delta t^n)`.
    """

    num_stages = None
    expected_order = None

    def make_steps(self, MapKernel=ElementWiseMap, **kwargs):
        # subclasses generate the list of per-stage kernels here
        raise NotImplementedError

    def __init__(self, input, MapKernel=ElementWiseMap, **kwargs):
        """
        :arg input: May be one of the following:

            * a :class:`dict` whose values represent the right-hand side
              of the ODEs to solve, i.e., `(key, value)` pairs corresponding to
              :math:`(y, f)` such that

              .. math::

                  \\frac{\\mathrm{d} y}{\\mathrm{d} t} = f,

              where :math:`f` is an arbitrary function of kernel data.
              Both keys and values must be :mod:`pymbolic` expressions.

            * a :class:`~pystella.Sector`. In this case, the right-hand side
              dictionary will be obtained from :attr:`~pystella.Sector.rhs_dict`,
              and :attr:`args` from :meth:`~pystella.Sector.get_args`.

            * a :class:`list` of :class:`~pystella.Sector`'s. In this case, the
              input obtained from each :class:`~pystella.Sector` (as described
              above) will be combined.

        The following keyword arguments are recognized:

        :arg MapKernel: The kernel class which each substep/stage will be an
            instance of---i.e., one of :class:`~pystella.ElementWiseMap` or its
            subclasses. Defaults to :class:`~pystella.ElementWiseMap`.

        :arg dt: A :class:`float` fixing the value of the timestep interval.
            Defaults to *None*, in which case it is not fixed at kernel creation.

        The remaining arguments are passed to :meth:`MapKernel.__init__` for
        each substep of the timestepper (i.e., see the documentation of
        :class:`~pystella.ElementWiseMap`).

        :raises TypeError: if *input* is not a :class:`dict`, a
            :class:`~pystella.Sector`, or a :class:`list` thereof.
        """

        single_stage = kwargs.pop('single_stage', False)
        from pystella import Sector
        if isinstance(input, Sector):
            self.rhs_dict = input.rhs_dict
            self.args = input.get_args(single_stage=single_stage)
        elif isinstance(input, list):
            self.args = [arg for s in input
                         for arg in s.get_args(single_stage=single_stage)]
            self.rhs_dict = dict(i for s in input for i in s.rhs_dict.items())
        elif isinstance(input, dict):
            self.rhs_dict = input
            self.args = kwargs.pop('args', ['...'])
        else:
            # previously fell through silently and failed later with an
            # opaque AttributeError on self.args
            raise TypeError(
                "input must be a dict, a Sector, or a list of Sectors, not %s"
                % type(input).__name__)

        # every stage kernel takes the timestep as a scalar argument
        self.args = self.args + [lp.ValueArg('dt')]

        dt = kwargs.pop('dt', None)
        fixed_parameters = kwargs.pop('fixed_parameters', dict())
        if dt is not None:
            fixed_parameters.update(dict(dt=dt))

        self.num_odes = len(self.rhs_dict.keys())
        self.steps = self.make_steps(**kwargs, fixed_parameters=fixed_parameters)

    def __call__(self, stage, queue=None, **kwargs):
        """
        Calls substep/stage ``stage`` (:attr:`steps[stage]`) of the timestepper,
        i.e., :func:`pystella.ElementWiseMap.__call__` for the kernel for
        substep/stage ``stage``.

        :arg stage: The substep/stage of the timestepper to call.

        :returns: The :class:`pyopencl.Event` associated with the kernel
            invocation.
        """

        evt, _ = self.steps[stage](queue, **kwargs)
        return evt
class RungeKuttaStepper(Stepper):
    """
    The base implementation of classical, explicit Runge-Kutta time steppers,
    which operate by storing and operating on multiple copies of each unknown
    array. Subclasses must provide an implementation of :meth:`step_statements`
    which returns a key-value pair implementing a specific substep of the
    particular timestepper.

    .. warning::

        To minimize the required storage per unknown (i.e., number of
        temporaries), the implementation of most subclasses overwrite arrays that
        are being read as input to compute right-hand sides. This means that any
        non-local (stencil-type) operations must be precomputed and cached
        *globally* (unless otherwise noted).

    :raises ValueError: if the keys of :attr:`rhs_dict` are not
        :class:`~pystella.Field`'s (or :class:`pymbolic.primitives.Subscript`'s
        thereof). This is required for :meth:`make_steps` to be able to prepend
        unknown arrays' subscripts with the index corresponding to the temporary
        storage axis.
    """

    def step_statements(self, stage, f, dt, rhs):
        # subclasses return a dict of {assignee: expression} statements
        # implementing substep ``stage`` of the scheme for unknown ``f``
        raise NotImplementedError

    def make_steps(self, MapKernel=ElementWiseMap, **kwargs):
        rhs = var('rhs')
        dt = var('dt')
        # q indexes the temporary-storage ("copy") axis of each unknown array
        q = var('q')
        fixed_parameters = kwargs.pop('fixed_parameters', dict())

        # temporary statements evaluating each right-hand side, with each
        # unknown read from storage copy q
        rhs_statements = {rhs[i]: Indexer(value, prepend_with=(q,))
                          for i, value in enumerate(self.rhs_dict.values())}

        steps = []
        for stage in range(self.num_stages):
            RK_dict = {}
            for i, f in enumerate(self.rhs_dict.keys()):
                # ensure that key is either a Field or a Subscript of a Field
                # so that Indexer can prepend the q index
                from pymbolic.primitives import Subscript
                key_has_field = False
                if isinstance(f, Field):
                    key_has_field = True
                elif isinstance(f, Subscript):
                    if isinstance(f.aggregate, Field):
                        key_has_field = True

                if not key_has_field:
                    raise ValueError("rhs_dict keys must be Field instances")

                statements = self.step_statements(stage, f, dt, rhs[i])
                for k, v in statements.items():
                    RK_dict[k] = v

            # right-hand sides read storage copy 0 on the first stage and
            # copy 1 on every subsequent stage
            fixed_parameters.update(q=0 if stage == 0 else 1)

            # NOTE(review): access-order checking is disabled, presumably
            # because stage updates read and write the same arrays -- confirm
            options = lp.Options(enforce_variable_access_ordered="no_check")
            step = MapKernel(map_dict=RK_dict, tmp_dict=rhs_statements,
                             args=self.args, **kwargs, options=options,
                             fixed_parameters=fixed_parameters)
            steps.append(step)

        return steps
+ """ + + num_stages = 4 + expected_order = 4 + + def step_statements(self, stage, f, dt, rhs): + fq = [Indexer(f, prepend_with=(q,)) for q in range(3)] + + if stage == 0: + return {fq[1]: fq[0] + dt/2 * rhs, + fq[2]: fq[0] + dt/6 * rhs} + elif stage == 1: + return {fq[1]: fq[0] + dt/2 * rhs, + fq[2]: fq[2] + dt/3 * rhs} + elif stage == 2: + return {fq[1]: fq[0] + dt * rhs, + fq[2]: fq[2] + dt/3 * rhs} + elif stage == 3: + return {fq[0]: fq[2] + dt/6 * rhs} + + +class RungeKutta3Heun(RungeKuttaStepper): + """ + Heun's three-stage, third-order Runge-Kutta method. + Requires unknown arrays to have temporary storage axes of length three. + """ + + num_stages = 3 + expected_order = 3 + + def step_statements(self, stage, f, dt, rhs): + fq = [Indexer(f, prepend_with=(q,)) for q in range(3)] + + if stage == 0: + return {fq[1]: fq[0] + dt/3 * rhs, + fq[2]: fq[0] + dt/4 * rhs} + elif stage == 1: + return {fq[1]: fq[0] + dt*2/3 * rhs} + elif stage == 2: + return {fq[0]: fq[2] + dt*3/4 * rhs} + + +class RungeKutta3Nystrom(RungeKuttaStepper): + """ + Nystrom's three-stage, third-order Runge-Kutta method. + Requires unknown arrays to have temporary storage axes of length three. + """ + + num_stages = 3 + expected_order = 3 + + def step_statements(self, stage, f, dt, rhs): + fq = [Indexer(f, prepend_with=(q,)) for q in range(3)] + + if stage == 0: + return {fq[1]: fq[0] + dt*2/3 * rhs, + fq[2]: fq[0] + dt*2/8 * rhs} + elif stage == 1: + return {fq[1]: fq[0] + dt*2/3 * rhs, + fq[2]: fq[2] + dt*3/8 * rhs} + elif stage == 2: + return {fq[0]: fq[2] + dt*3/8 * rhs} + + +class RungeKutta3Ralston(RungeKuttaStepper): + """ + Ralston's three-stage, third-order Runge-Kutta method. + Requires unknown arrays to have temporary storage axes of length three. 
+ """ + + num_stages = 3 + expected_order = 3 + + def step_statements(self, stage, f, dt, rhs): + fq = [Indexer(f, prepend_with=(q,)) for q in range(3)] + + if stage == 0: + return {fq[1]: fq[0] + dt/2 * rhs, + fq[2]: fq[0] + dt*2/9 * rhs} + elif stage == 1: + return {fq[1]: fq[0] + dt*3/4 * rhs, + fq[2]: fq[2] + dt*1/3 * rhs} + elif stage == 2: + return {fq[0]: fq[2] + dt*4/9 * rhs} + + +class RungeKutta3SSP(RungeKuttaStepper): + """ + A three-stage, third-order strong-stability preserving Runge-Kutta method. + Requires unknown arrays to have temporary storage axes of length two. + """ + + num_stages = 3 + expected_order = 3 + + def step_statements(self, stage, f, dt, rhs): + fq = [Indexer(f, prepend_with=(q,)) for q in range(3)] + + if stage == 0: + return {fq[1]: fq[0] + dt * rhs} + elif stage == 1: + return {fq[1]: 3/4 * fq[0] + 1/4 * fq[1] + dt/4 * rhs} + elif stage == 2: + return {fq[0]: 1/3 * fq[0] + 2/3 * fq[1] + dt*2/3 * rhs} + + +class RungeKutta2Midpoint(RungeKuttaStepper): + """ + The "midpoint" method, a two-stage, second-order Runge-Kutta method. + Requires unknown arrays to have temporary storage axes of length two. + + .. note:: + + Right-hand side operations *can* safely involve non-local computations + of unknown arrays for this method. 
+ """ + + num_stages = 2 + expected_order = 2 + + def step_statements(self, stage, f, dt, rhs): + fq = [Indexer(f, prepend_with=(q,)) for q in range(2)] + + if stage == 0: + return {fq[1]: fq[0] + dt/2 * rhs} + elif stage == 1: + return {fq[0]: fq[0] + dt * rhs} + + +# possible order reduction +class RungeKutta2Heun(RungeKuttaStepper): + num_stages = 2 + expected_order = 2 + + def step_statements(self, stage, f, dt, rhs): + fq = [Indexer(f, prepend_with=(q,)) for q in range(2)] + + if stage == 0: + return {fq[1]: fq[0] + dt * rhs, + fq[0]: fq[0] + dt/2 * rhs} + elif stage == 1: + return {fq[0]: fq[0] + dt/2 * rhs} + + +class RungeKutta2Ralston(RungeKuttaStepper): + """ + Ralstons's two-stage, second-order Runge-Kutta method. + Requires unknown arrays to have temporary storage axes of length two. + """ + + num_stages = 2 + expected_order = 2 + + def step_statements(self, stage, f, dt, rhs): + fq = [Indexer(f, prepend_with=(q,)) for q in range(2)] + + if stage == 0: + return {fq[1]: fq[0] + dt*2/3 * rhs, + fq[0]: fq[0] + dt/4 * rhs} + elif stage == 1: + return {fq[0]: fq[0] + dt*3/4 * rhs} + + +class LowStorageRKStepper(Stepper): + """ + The base implementation of low-storage, explicit Runge-Kutta time steppers, + which operate by storing and operating on a single copy of each unknown array, + plus an auxillary temporary array. + + The substeps are expressed in a standard form, drawing coefficients from + a subclass's provided values of :attr:`_A`, :attr:`_B`, and :attr:`_C`. + + .. automethod:: __init__ + """ + + _A = [] + _B = [] + _C = [] + + def make_steps(self, MapKernel=ElementWiseMap, **kwargs): + self.args = self.args + [lp.GlobalArg('k_tmp', shape=lp.auto)] + + rhs = var('rhs') + dt = var('dt') + # filter out indices for zero axes + # FIXME: Field.indices should never include offset, so that this can just + # replicate test_field.indices (rename to index_tuple?) 
+ test_field = list(self.rhs_dict.keys())[0] + from pymbolic.primitives import Subscript + if isinstance(test_field, Field): + num_indices = len(test_field.indices) + elif isinstance(test_field, Subscript): + if isinstance(test_field.aggregate, Field): + num_indices = len(test_field.aggregate.indices) + else: + num_indices = len(test_field.index_tuple) + else: + num_indices = 0 + + indices = ('i', 'j', 'k')[:num_indices] + k = Field('k_tmp', indices=indices) + + rhs_statements = {rhs[i]: Indexer(value) + for i, value in enumerate(self.rhs_dict.values())} + + steps = [] + for stage in range(self.num_stages): + RK_dict = {} + for i, key in enumerate(self.rhs_dict.keys()): + f = Indexer(key) + k_i = Indexer(k[i]) + RK_dict[k_i] = self._A[stage] * k_i + dt * rhs[i] + RK_dict[f] = f + self._B[stage] * k_i + + step = MapKernel(map_dict=RK_dict, tmp_dict=rhs_statements, + args=self.args, **kwargs) + steps.append(step) + + return steps + + def __init__(self, input, k_tmp, **kwargs): + """ + :arg k_tmp: The array used for temporary + calculations. Its outer-/left-most axis (i.e., the axis of largest + stride) must have length equal to the total number of unknown ODEs. + + .. note:: + + ``k_tmp`` may be replaced by inputting a :class:`pyopencl.CommandQueue` + in a future version of :mod:`pystella`. In this case, the creation of + this array would be done automatically. + + Otherwise identical to :func:`Stepper.__init__`. + """ + + super().__init__(input, single_stage=True, **kwargs) + self.k_tmp = k_tmp + + def __call__(self, stage, queue=None, **kwargs): + evt, _ = self.steps[stage](queue, k_tmp=self.k_tmp, **kwargs) + return evt + + +class LowStorageRK54(LowStorageRKStepper): + """ + A five-stage, fourth-order, low-storage Runge-Kutta method. 
class LowStorageRK54(LowStorageRKStepper):
    """
    A five-stage, fourth-order, low-storage Runge-Kutta method.

    See
    Carpenter, M.H., and Kennedy, C.A., Fourth-order-2N-storage
    Runge-Kutta schemes, NASA Langley Tech Report TM 109112, 1994
    """

    num_stages = 5
    expected_order = 4

    # _A multiplies the running k register, _B weights the solution update,
    # _C are the nominal stage times (not referenced in the code shown here)
    _A = [
        0,
        -567301805773 / 1357537059087,
        -2404267990393 / 2016746695238,
        -3550918686646 / 2091501179385,
        -1275806237668 / 842570457699,
    ]

    _B = [
        1432997174477 / 9575080441755,
        5161836677717 / 13612068292357,
        1720146321549 / 2090206949498,
        3134564353537 / 4481467310338,
        2277821191437 / 14882151754819,
    ]

    _C = [
        0,
        1432997174477 / 9575080441755,
        2526269341429 / 6820363962896,
        2006345519317 / 3224310063776,
        2802321613138 / 2924317926251,
    ]


class LowStorageRK3Williamson(LowStorageRKStepper):
    """
    A three-stage, third-order, low-storage Runge-Kutta method.

    See
    Williamson, J. H., Low-storage Runge-Kutta schemes,
    J. Comput. Phys., 35, 48-56, 1980
    """

    num_stages = 3
    expected_order = 3

    _A = [0, -5/9, -153/128]

    _B = [1/3, 15/16, 8/15]

    _C = [0, 4/9, 15/32]


class LowStorageRK3Inhomogeneous(LowStorageRKStepper):
    """
    A three-stage, third-order, low-storage Runge-Kutta method.
    """

    num_stages = 3
    expected_order = 3

    _A = [0, -17/32, -32/27]

    _B = [1/4, 8/9, 3/4]

    _C = [0, 15/32, 4/9]


# possible order reduction
class LowStorageRK3Symmetric(LowStorageRKStepper):
    num_stages = 3
    expected_order = 3

    _A = [0, -2/3, -1]

    _B = [1/3, 1, 1/2]

    _C = [0, 1/3, 2/3]


# possible order reduction
class LowStorageRK3PredictorCorrector(LowStorageRKStepper):
    num_stages = 3
    expected_order = 3

    _A = [0, -1/4, -4/3]

    _B = [1/2, 2/3, 1/2]

    _C = [0, 1/2, 1]


# intermediate quantities used to build the LowStorageRK3SSP coefficients
# below; NOTE(review): presumably obtained by solving the 2N-storage order
# conditions with c2 = .924574 fixed -- the derivation is not shown here
c2 = .924574
z1 = np.sqrt(36 * c2**4 + 36 * c2**3 - 135 * c2**2 + 84 * c2 - 12)
z2 = 2 * c2**2 + c2 - 2
z3 = 12 * c2**4 - 18 * c2**3 + 18 * c2**2 - 11 * c2 + 2
z4 = 36 * c2**4 - 36 * c2**3 + 13 * c2**2 - 8 * c2 + 4
z5 = 69 * c2**3 - 62 * c2**2 + 28 * c2 - 8
z6 = 34 * c2**4 - 46 * c2**3 + 34 * c2**2 - 13 * c2 + 2
B1 = c2
B2 = (12 * c2 * (c2 - 1) * (3 * z2 - z1) - (3 * z2 - z1)**2) \
    / (144 * c2 * (3 * c2 - 2) * (c2 - 1)**2)
B3 = - 24 * (3 * c2 - 2) * (c2 - 1)**2 \
    / ((3 * z2 - z1)**2 - 12 * c2 * (c2 - 1) * (3 * z2 - z1))
A2 = (- z1 * (6 * c2**2 - 4 * c2 + 1) + 3 * z3) \
    / ((2 * c2 + 1) * z1 - 3 * (c2 + 2) * (2 * c2 - 1)**2)
A3 = (- z4 * z1 + 108 * (2 * c2 - 1) * c2**5 - 3 * (2 * c2 - 1) * z5) \
    / (24 * z1 * c2 * (c2 - 1)**4 + 72 * c2 * z6 + 72 * c2**6 * (2 * c2 - 13))


class LowStorageRK3SSP(LowStorageRKStepper):
    """
    A three-stage, third-order, strong-stability preserving, low-storage
    Runge-Kutta method.
    """

    num_stages = 3
    expected_order = 3

    _A = [0, A2, A3]

    _B = [B1, B2, B3]

    _C = [0, B1, B1 + B2 * (A2 + 1)]


# convenience list of the public steppers; excludes the variants marked
# "possible order reduction" above
all_steppers = [RungeKutta4, RungeKutta3SSP, RungeKutta3Heun, RungeKutta3Nystrom,
                RungeKutta3Ralston, RungeKutta2Midpoint,
                RungeKutta2Ralston, LowStorageRK54,
                LowStorageRK3Williamson, LowStorageRK3Inhomogeneous,
                LowStorageRK3SSP]
from setuptools import setup, find_packages


# authoritative version in pytools/__init__.py
def find_git_revision(tree_root):
    """Return the HEAD commit hash of the git checkout at *tree_root*,
    or *None* if *tree_root* is not a git checkout or git fails.
    """
    # Keep this routine self-contained so that it can be copy-pasted into
    # setup.py.

    from os.path import join, exists, abspath
    tree_root = abspath(tree_root)

    # not a git checkout (e.g., building from an sdist): no revision available
    if not exists(join(tree_root, ".git")):
        return None

    from subprocess import Popen, PIPE, STDOUT
    p = Popen(["git", "rev-parse", "HEAD"], shell=False,
              stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True,
              cwd=tree_root)
    (git_rev, _) = p.communicate()

    import sys
    if sys.version_info >= (3,):
        # subprocess output is bytes on python 3
        git_rev = git_rev.decode()

    git_rev = git_rev.rstrip()

    retcode = p.returncode
    assert retcode is not None
    if retcode != 0:
        from warnings import warn
        warn("unable to find git revision")
        return None

    return git_rev


def write_git_revision(package_name):
    """Record the current git revision in ``<package_name>/_git_rev.py``."""
    from os.path import dirname, join
    dn = dirname(__file__)
    git_rev = find_git_revision(dn)

    with open(join(dn, package_name, "_git_rev.py"), "w") as outf:
        outf.write("GIT_REVISION = %s\n" % repr(git_rev))


# capture the revision at build/install time so the installed package can
# report which commit it was built from
write_git_revision("pystella")


setup(name="pystella",
      version="2019.5",
      description="A code generator for grid-based PDE solving on CPUs and GPUs",
      long_description=open("README.rst", "rt").read(),

      install_requires=[
          "numpy",
          "pyopencl",
          "loo.py",
      ],

      author="Zachary J Weiner",
      license="MIT",
      packages=find_packages(),
      )
substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +from time import time +import pyopencl as cl + + +def timer(kernel, ntime=200, nwarmup=2): + for i in range(nwarmup): + kernel() + + start = time() + for i in range(ntime): + evt = kernel() + + if isinstance(evt, cl.Event): + evt.wait() + + end = time() + + return (end - start) / ntime * 1e3 + + +def get_exec_arg_dict(): + """ + Interprets command line arguments (obtained from `sys.argv`) as key-value + pairs. Entries corresponding to values are passed to :func:`eval` and stored + as such, unless :func:`eval` raises an exception, in which case the string + input itself is stored. + + :returns: A :class:`dict` of the command-line arguments. + """ + + def eval_unless_str(string): + try: + x = eval(string) + except: # noqa: E722 + x = string + return x + + import sys + ll = sys.argv[1:] + return dict(zip(ll[::2], map(eval_unless_str, ll[1::2]))) diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 0000000..1490c2e --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,22 @@ +def pytest_addoption(parser): + parser.addoption("--grid_shape", action="store", default=(128,)*3) + parser.addoption("--proc_shape", action="store", default=(1,)*3) + + +def tuplify(string): + if isinstance(string, str): + return tuple(int(i) for i in string.split(',')) + else: + return string + + +def pytest_generate_tests(metafunc): + # This is called for every test. Only get/set command line arguments + # if the argument is specified in the list of test "fixturenames". 
+ grid_shape = metafunc.config.option.grid_shape + if 'grid_shape' in metafunc.fixturenames and grid_shape is not None: + metafunc.parametrize("grid_shape", [tuplify(grid_shape)]) + + proc_shape = metafunc.config.option.proc_shape + if 'proc_shape' in metafunc.fixturenames and proc_shape is not None: + metafunc.parametrize("proc_shape", [tuplify(proc_shape)]) diff --git a/test/test_decomp.py b/test/test_decomp.py new file mode 100644 index 0000000..9697249 --- /dev/null +++ b/test/test_decomp.py @@ -0,0 +1,196 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + + +import numpy as np +import pyopencl as cl +import pyopencl.clrandom as clr +import pyopencl.array as cla +import pystella as ps +import pytest + +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl as pytest_generate_tests) + + +@pytest.mark.parametrize("h", [1, 2]) +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) +@pytest.mark.parametrize("_grid_shape", [None, (128, 64, 32)]) +@pytest.mark.parametrize("pass_rank_shape", [True, False]) +def test_share_halos(ctx_factory, grid_shape, proc_shape, h, dtype, + _grid_shape, pass_rank_shape, timing=False): + if ctx_factory: + ctx = ctx_factory() + else: + ctx = ps.choose_device_and_make_context() + + queue = cl.CommandQueue(ctx) + grid_shape = _grid_shape or grid_shape + rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape)) + mpi = ps.DomainDecomposition( + proc_shape, h, rank_shape=(rank_shape if pass_rank_shape else None) + ) + + rng = clr.ThreefryGenerator(ctx, seed=12321) + data = rng.uniform(queue, tuple(Ni + 2*h for Ni in grid_shape), dtype).get() + data[:h, :, :] = data[-2*h:-h, :, :].copy() + data[-h:, :, :] = data[h:2*h, :, :].copy() + data[:, :h, :] = data[:, -2*h:-h, :].copy() + data[:, -h:, :] = data[:, h:2*h, :].copy() + data[:, :, :h] = data[:, :, -2*h:-h].copy() + data[:, :, -h:] = data[:, :, h:2*h].copy() + + subdata = np.empty(tuple(ni + 2*h for ni in rank_shape), dtype) + rank_slice = tuple(slice(ri * ni + h, (ri+1) * ni + h) + for ri, ni in zip(mpi.rank_tuple, rank_shape)) + subdata[h:-h, h:-h, h:-h] = data[rank_slice].copy() + + subdata_device = clr.rand(queue, tuple(ni + 2*h for ni in rank_shape), dtype) + cl.enqueue_copy(queue, subdata_device.data, subdata) + + mpi.share_halos(queue, subdata_device) + cl.enqueue_copy(queue, subdata, subdata_device.data) + + pencil_slice = tuple(slice(ri * ni, (ri+1) * ni + 2*h) + for ri, ni in zip(mpi.rank_tuple, rank_shape)) + assert (subdata == data[pencil_slice]).all(), \ + "rank %d %s has incorrect halo 
data \n" % (mpi.rank, mpi.rank_tuple) + + # test that can call with different-shaped input + if not pass_rank_shape: + subdata_device_new = clr.rand(queue, tuple(ni//2 + 2*h for ni in rank_shape), + dtype) + mpi.share_halos(queue, subdata_device_new) + + if timing: + from common import timer + t = timer(lambda: mpi.share_halos(queue, fx=subdata_device)) + if mpi.rank == 0: + print("share_halos took %.3f ms for grid_shape=%s, h=%d, proc_shape=%s" + % (t, str(grid_shape), h, str(proc_shape))) + + +@pytest.mark.parametrize("h", [1, 2]) +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) +@pytest.mark.parametrize("_grid_shape", [None, (128, 64, 32)]) +def test_gather_scatter(ctx_factory, grid_shape, proc_shape, h, dtype, + _grid_shape, timing=False): + if ctx_factory: + ctx = ctx_factory() + else: + ctx = ps.choose_device_and_make_context() + + queue = cl.CommandQueue(ctx) + grid_shape = _grid_shape or grid_shape + rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape)) + mpi = ps.DomainDecomposition(proc_shape, h) + + rank_slice = tuple(slice(ri * ni, (ri+1) * ni) + for ri, ni in zip(mpi.rank_tuple, rank_shape)) + pencil_shape = tuple(ni + 2*h for ni in rank_shape) + + # create random data with same seed on all ranks + rng = clr.ThreefryGenerator(ctx, seed=12321) + data = rng.uniform(queue, grid_shape, dtype) + + # cl.Array -> cl.Array + subdata = cla.zeros(queue, pencil_shape, dtype) + mpi.scatter_array(queue, data, subdata, 0) + sub_h = subdata.get() + data_h = data.get() + assert (sub_h[h:-h, h:-h, h:-h] == data_h[rank_slice]).all() + + data_test = cla.zeros_like(data) + mpi.gather_array(queue, subdata, data_test, 0) + data_test_h = data_test.get() + if mpi.rank == 0: + assert (data_test_h == data_h).all() + + # np.ndarray -> np.ndarray + mpi.scatter_array(queue, data_h, sub_h, 0) + assert (sub_h[h:-h, h:-h, h:-h] == data_h[rank_slice]).all() + + mpi.gather_array(queue, sub_h, data_test_h, 0) + if mpi.rank == 0: + assert (data_test_h == 
data_h).all() + + # scatter cl.Array -> np.ndarray + sub_h[:] = 0 + mpi.scatter_array(queue, data, sub_h, 0) + assert (sub_h[h:-h, h:-h, h:-h] == data_h[rank_slice]).all() + + # gather np.ndarray -> cl.Array + data_test[:] = 0 + mpi.gather_array(queue, sub_h, data_test, 0) + data_test_h = data_test.get() + if mpi.rank == 0: + assert (data_test_h == data_h).all() + + # scatter np.ndarray -> cl.Array + subdata[:] = 0 + mpi.scatter_array(queue, data_h, subdata, 0) + sub_h = subdata.get() + assert (sub_h[h:-h, h:-h, h:-h] == data_h[rank_slice]).all() + + # gather cl.Array -> np.ndarray + data_test_h[:] = 0 + mpi.gather_array(queue, subdata, data_test_h, 0) + if mpi.rank == 0: + assert (data_test_h == data_h).all() + + if timing: + from common import timer + ntime = 25 + times = {} + + times['scatter cl.Array -> cl.Array'] = \ + timer(lambda: mpi.scatter_array(queue, data, subdata, 0), ntime=ntime) + times['scatter cl.Array -> np.ndarray'] = \ + timer(lambda: mpi.scatter_array(queue, data, sub_h, 0), ntime=ntime) + times['scatter np.ndarray -> cl.Array'] = \ + timer(lambda: mpi.scatter_array(queue, data_h, subdata, 0), ntime=ntime) + times['scatter np.ndarray -> np.ndarray'] = \ + timer(lambda: mpi.scatter_array(queue, data_h, sub_h, 0), ntime=ntime) + + times['gather cl.Array -> cl.Array'] = \ + timer(lambda: mpi.gather_array(queue, subdata, data, 0), ntime=ntime) + times['gather cl.Array -> np.ndarray'] = \ + timer(lambda: mpi.gather_array(queue, subdata, data_h, 0), ntime=ntime) + times['gather np.ndarray -> cl.Array'] = \ + timer(lambda: mpi.gather_array(queue, sub_h, data, 0), ntime=ntime) + times['gather np.ndarray -> np.ndarray'] = \ + timer(lambda: mpi.gather_array(queue, sub_h, data_h, 0), ntime=ntime) + + if mpi.rank == 0: + print("grid_shape=%s, h=%d, proc_shape=%s" + % (str(grid_shape), h, str(proc_shape))) + for key, val in times.items(): + print(key, 'took', '%.3f' % val, 'ms') + + +if __name__ == "__main__": + args = {'grid_shape': (256,)*3, 'proc_shape': 
(1,)*3, 'h': 2, + 'dtype': np.float64, '_grid_shape': None} + from common import get_exec_arg_dict + args.update(get_exec_arg_dict()) + test_share_halos(None, **args, pass_rank_shape=True, timing=True) + test_gather_scatter(None, **args, timing=True) diff --git a/test/test_derivs.py b/test/test_derivs.py new file mode 100644 index 0000000..d11dfd1 --- /dev/null +++ b/test/test_derivs.py @@ -0,0 +1,165 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + + +import numpy as np +import pyopencl as cl +import pyopencl.array as cla +import pyopencl.clmath as clm +import pystella as ps +import pytest + +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl as pytest_generate_tests) + + +@pytest.mark.filterwarnings( + "ignore::pyopencl.characterize.CLCharacterizationWarning") +@pytest.mark.filterwarnings("ignore::loopy.diagnostic.LoopyAdvisory") +@pytest.mark.parametrize("h", [0, 1, 2, 3, 4]) +@pytest.mark.parametrize("dtype", [np.float64]) +@pytest.mark.parametrize("stream", [True, False]) +def test_gradient_laplacian(ctx_factory, grid_shape, proc_shape, h, dtype, + stream, timing=False): + if h == 0 and stream is True: + pytest.skip('no streaming spectral') + + if ctx_factory: + ctx = ctx_factory() + else: + ctx = ps.choose_device_and_make_context() + + queue = cl.CommandQueue(ctx) + rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape)) + mpi = ps.DomainDecomposition(proc_shape, h, rank_shape) + + L = (3, 5, 7) + dx = tuple(Li / Ni for Li, Ni in zip(L, grid_shape)) + dk = tuple(2 * np.pi / Li for Li in L) + + if h == 0: + def get_evals_1(k, dx): + return k + + def get_evals_2(k, dx): + return - k**2 + + fft = ps.DFT(mpi, ctx, queue, grid_shape, dtype) + derivs = ps.SpectralGradientLaplacian(fft, dk) + else: + from pystella.derivs import FirstCenteredDifference, SecondCenteredDifference + get_evals_1 = FirstCenteredDifference(h).get_eigenvalues + get_evals_2 = SecondCenteredDifference(h).get_eigenvalues + if stream: + try: + derivs = ps.GradientLaplacian(mpi, h, dx, stream=stream) + except: # noqa + pytest.skip("StreamingStencil unavailable") + else: + derivs = ps.GradientLaplacian(mpi, h, dx, stream=False) + + pencil_shape = tuple(ni + 2*h for ni in rank_shape) + + # set up test data + fx_h = np.empty(pencil_shape, dtype) + kvec = np.array(dk) * np.array([-5, 4, -3]).astype(dtype) + xvec = np.meshgrid(*[dxi * np.arange(ri*ni, (ri+1)*ni) + for dxi, ri, ni in zip(dx, 
mpi.rank_tuple, rank_shape)], + indexing='ij') + + phases = sum(ki * xi for ki, xi in zip(kvec, xvec)) + if h > 0: + fx_h[h:-h, h:-h, h:-h] = np.sin(phases) + else: + fx_h[:] = np.sin(phases) + fx_cos = np.cos(phases) + + fx = cla.empty(queue, pencil_shape, dtype) + fx.set(fx_h) + + lap = cla.empty(queue, rank_shape, dtype) + grd = cla.empty(queue, (3,)+rank_shape, dtype) + + derivs(queue, fx=fx, lap=lap, grd=grd) + + eff_kmag_sq = sum(get_evals_2(kvec_i, dxi) for dxi, kvec_i in zip(dx, kvec)) + + lap_true = cla.to_device(queue, eff_kmag_sq * np.sin(phases)) + diff = clm.fabs(lap - lap_true) + + max_err = cla.max(diff) / cla.max(clm.fabs(lap_true)) + avg_err = cla.sum(diff) / cla.sum(clm.fabs(lap_true)) + + max_rtol = 1.e-11 if dtype == np.float64 else 3.e-4 + avg_rtol = 1.e-12 if dtype == np.float64 else 5.e-5 + + assert max_err < max_rtol and avg_err < avg_rtol, \ + "lap inaccurate for h=%d, grid_shape=%s, proc_shape=%s" \ + % (h, grid_shape, proc_shape) + + for i in range(3): + eff_k = get_evals_1(kvec[i], dx[i]) + + pdi_true = cla.to_device(queue, eff_k * fx_cos) + diff = clm.fabs(grd[i] - pdi_true) + + max_err = cla.max(diff) / cla.max(clm.fabs(pdi_true)) + avg_err = cla.sum(diff) / cla.sum(clm.fabs(pdi_true)) + + max_rtol = 1.e-12 if dtype == np.float64 else 1.e-5 + avg_rtol = 1.e-13 if dtype == np.float64 else 3.e-6 + + assert max_err < max_rtol and avg_err < avg_rtol, \ + "pd%d inaccurate for h=%d, grid_shape=%s, proc_shape=%s" \ + % (i, h, grid_shape, proc_shape) + + if timing: + from common import timer + + times = {} + times['gradient and laplacian'] = \ + timer(lambda: derivs(queue, fx=fx, lap=lap, grd=grd)) + times['gradient'] = \ + timer(lambda: derivs(queue, fx=fx, grd=grd)) + times['laplacian'] = timer(lambda: derivs(queue, fx=fx, lap=lap)) + times['pdx'] = timer(lambda: derivs(queue, fx=fx, pdx=grd[0])) + times['pdy'] = timer(lambda: derivs(queue, fx=fx, pdy=grd[1])) + times['pdz'] = timer(lambda: derivs(queue, fx=fx, pdz=grd[2])) + + if mpi.rank 
== 0: + print("grid_shape=%s, h=%d, proc_shape=%s" + % (str(grid_shape), h, str(proc_shape))) + for key, val in times.items(): + print(key, 'took', '%.3f' % val, 'ms') + + +if __name__ == "__main__": + args = {'grid_shape': (256,)*3, 'proc_shape': (1,)*3, 'dtype': np.float64, + 'h': 2} + from common import get_exec_arg_dict + args.update(get_exec_arg_dict()) + + for stream in [True, False]: + test_gradient_laplacian(None, **args, stream=stream, timing=True) + + args['h'] = 0 + test_gradient_laplacian(None, **args, stream=False, timing=True) diff --git a/test/test_dft.py b/test/test_dft.py new file mode 100644 index 0000000..39c7a92 --- /dev/null +++ b/test/test_dft.py @@ -0,0 +1,114 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + + +import numpy as np +import pyopencl as cl +import pyopencl.clrandom as clr +import pyopencl.array as cla +import pystella as ps +import pytest + +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl as pytest_generate_tests) + + +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) +def test_dft(ctx_factory, grid_shape, proc_shape, dtype, timing=False): + if ctx_factory: + ctx = ctx_factory() + else: + ctx = ps.choose_device_and_make_context() + + queue = cl.CommandQueue(ctx) + h = 1 + rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape)) + mpi = ps.DomainDecomposition(proc_shape, h, rank_shape) + + fft = ps.DFT(mpi, ctx, queue, grid_shape, dtype) + grid_size = np.product(grid_shape) + + if proc_shape[0] * proc_shape[1] * proc_shape[2] == 1: + rng = clr.ThreefryGenerator(ctx, seed=12321) + fx = rng.uniform(queue, grid_shape, dtype) + 1.e-2 + fx1 = fx.get() + + fk = fft.dft(fx) + fk1 = fk.get() + fk_np = np.fft.rfftn(fx1) + + fx2 = fft.idft(fk).get() + fx_np = np.fft.irfftn(fk1) + + rtol = 1.e-11 if dtype == np.float64 else 2.e-3 + assert np.allclose(fx1, fx2 / grid_size, rtol=rtol, atol=0), \ + "IDFT(DFT(f)) != f for grid_shape=%s" % str(grid_shape) + + assert np.allclose(fk_np, fk1, rtol=rtol, atol=0), \ + "DFT disagrees with numpy for grid_shape=%s" % str(grid_shape) + + assert np.allclose(fx_np, fx2 / grid_size, rtol=rtol, atol=0), \ + "IDFT disagrees with numpy for grid_shape=%s" % str(grid_shape) + + fx_cl = cla.empty(queue, rank_shape, dtype) + pencil_shape = tuple(ni + 2*h for ni in rank_shape) + fx_cl_halo = cla.empty(queue, pencil_shape, dtype) + fx_np = np.empty(rank_shape, dtype) + fx_np_halo = np.empty(pencil_shape, dtype) + fk_cl = cla.empty(queue, fft.shape(True), fft.fk.dtype) + fk_np = np.empty(fft.shape(True), fft.fk.dtype) + + # FIXME: check that these actually produce the correct result + fx_types = {'cl': fx_cl, 'cl halo': fx_cl_halo, + 'np': fx_np, 'np halo': fx_np_halo, + 'None': None} + + 
fk_types = {'cl': fk_cl, 'np': fk_np, 'None': None} + + # run all of these to ensure no runtime errors even if no timing + if timing: + ntime = 20 + else: + ntime = 1 + + from common import timer + + if mpi.rank == 0: + print("N = %s" % str(grid_shape)) + + from itertools import product + for (a, input_), (b, output) in product(fx_types.items(), fk_types.items()): + t = timer(lambda: fft.dft(input_, output), ntime=ntime) + if mpi.rank == 0: + print("dft(%s, %s) took %.3f ms" % (a, b, t)) + + for (a, input_), (b, output) in product(fk_types.items(), fx_types.items()): + t = timer(lambda: fft.idft(input_, output), ntime=ntime) + if mpi.rank == 0: + print("idft(%s, %s) took %.3f ms" % (a, b, t)) + + +if __name__ == "__main__": + args = {'grid_shape': (256,)*3, 'proc_shape': (1,)*3, 'dtype': np.float64} + from common import get_exec_arg_dict + args.update(get_exec_arg_dict()) + test_dft(None, **args, timing=True) diff --git a/test/test_elementwise.py b/test/test_elementwise.py new file mode 100644 index 0000000..52c01aa --- /dev/null +++ b/test/test_elementwise.py @@ -0,0 +1,104 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_elementwise(ctx_factory, grid_shape, proc_shape, dtype, timing=False):
    """Check :class:`ElementWiseMap` kernels (with temporaries and a
    single-instruction map) against the same arithmetic done with
    pyopencl array operations.

    :arg timing: when True, also print kernel time and effective bandwidth.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))

    from pymbolic import var
    a = var('a')
    b = var('b')

    from pystella.field import Field
    x = Field('x')
    y = Field('y')
    z = Field('z')

    # temporaries a[0], a[1], b feed the main assignments below
    tmp_dict = {a[0]: x + 2,
                a[1]: 2 + x * y,
                b: x + y / 2}
    map_dict = {x: a[0] * y**2 * x + a[1] * b,
                z: z + a[1] * b}
    single_insn = {x: y + z}

    ew_map = ps.ElementWiseMap(map_dict, tmp_dict=tmp_dict)

    x = clr.rand(queue, rank_shape, dtype=dtype)
    y = clr.rand(queue, rank_shape, dtype=dtype)
    z = clr.rand(queue, rank_shape, dtype=dtype)

    # reference values computed with pyopencl array arithmetic
    # (before the kernel mutates x and z in place)
    a0 = x + 2
    a1 = 2 + x * y
    b = x + y / 2
    x_true = a0 * y**2 * x + a1 * b
    z_true = z + a1 * b

    ew_map(queue, x=x, y=y, z=z)

    rtol = 5.e-14 if dtype == np.float64 else 1.e-5

    # fixed misspelled assertion messages ("innaccurate" -> "inaccurate")
    assert np.allclose(x.get(), x_true.get(), rtol=rtol, atol=0), \
        "x inaccurate for grid_shape=%s, proc_shape=%s" \
        % (str(grid_shape), str(proc_shape))

    assert np.allclose(z.get(), z_true.get(), rtol=rtol, atol=0), \
        "z inaccurate for grid_shape=%s, proc_shape=%s" \
        % (str(grid_shape), str(proc_shape))

    # test success of single instruction
    ew_map_single = ps.ElementWiseMap(single_insn)
    ew_map_single(queue, x=x, y=y, z=z)

    assert np.allclose(x.get(), y.get() + z.get(), rtol=rtol, atol=0), \
        "x inaccurate for grid_shape=%s, proc_shape=%s" \
        % (str(grid_shape), str(proc_shape))

    if timing:
        from common import timer
        t = timer(lambda: ew_map(queue, x=x, y=y, z=z)[0])
        print("elementwise map took %.3f ms for grid_shape=%s, proc_shape=%s"
              % (t, str(grid_shape), str(proc_shape)))
        # 5 global arrays touched per site: read x, y, z, write x, z
        print("Bandwidth = %.1f GB/s" % (5 * x.nbytes/1024**3 / t * 1000))
+""" + + +import numpy as np +import pyopencl as cl +import pyopencl.clrandom as clr +import pystella as ps +import pytest +# pylint: disable=no-member + +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl as pytest_generate_tests) + + +@pytest.mark.parametrize("h", [1, 2]) +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) +def test_scalar_energy(ctx_factory, grid_shape, proc_shape, h, dtype, timing=False): + if ctx_factory: + ctx = ctx_factory() + else: + ctx = ps.choose_device_and_make_context() + + queue = cl.CommandQueue(ctx) + rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape)) + mpi = ps.DomainDecomposition(proc_shape, h, rank_shape) + + grid_size = np.product(grid_shape) + + nscalars = 2 + + def potential(f): + phi, chi = f[0], f[1] + return 1/2 * phi**2 + 1/2 * chi**2 + 1/2 * phi**2 * chi**2 + + scalar_sector = ps.ScalarSector(nscalars, potential=potential) + scalar_energy = ps.Reduction(mpi, scalar_sector, + rank_shape=rank_shape, grid_size=grid_size, h=h) + + pencil_shape = tuple(ni+2*h for ni in rank_shape) + f = clr.rand(queue, (nscalars,)+pencil_shape, dtype) + dfdt = clr.rand(queue, (nscalars,)+pencil_shape, dtype) + lap = clr.rand(queue, (nscalars,)+rank_shape, dtype) + + energy = scalar_energy(queue, f=f, dfdt=dfdt, lap_f=lap, a=np.array(1.)) + + kin_test = [] + grad_test = [] + for fld in range(nscalars): + df_h = dfdt[fld].get() + rank_sum = np.sum(df_h[h:-h, h:-h, h:-h]**2) + kin_test.append(1/2 * mpi.allreduce(rank_sum) / grid_size) + + f_h = f[fld].get() + lap_h = lap[fld].get() + + rank_sum = np.sum(- f_h[h:-h, h:-h, h:-h] * lap_h) + grad_test.append(1/2 * mpi.allreduce(rank_sum) / grid_size) + + energy_test = {} + energy_test['kinetic'] = np.array(kin_test) + energy_test['gradient'] = np.array(grad_test) + + phi = f[0].get()[h:-h, h:-h, h:-h] + chi = f[1].get()[h:-h, h:-h, h:-h] + pot_rank = np.sum(potential([phi, chi])) + energy_test['potential'] = np.array(mpi.allreduce(pot_rank) / 
grid_size) + + rtol = 1.e-14 if dtype == np.float64 else 1.e-5 + + for key, value in energy.items(): + assert np.allclose(value, energy_test[key], rtol=rtol, atol=0), \ + "%s energy inaccurate for nscalars=%d, grid_shape=%s, proc_shape=%s" \ + % (key, nscalars, str(grid_shape), str(proc_shape)) + + if timing: + from common import timer + t = timer(lambda: scalar_energy(queue, a=np.array(1.), + f=f, dfdt=dfdt, lap_f=lap)) + if mpi.rank == 0: + print("scalar energy took " + "%.3f ms for nscalars=%d, grid_shape=%s, proc_shape=%s" + % (t, nscalars, str(grid_shape), str(proc_shape))) + + +if __name__ == "__main__": + args = {'grid_shape': (256,)*3, 'proc_shape': (1,)*3, + 'dtype': np.float64, 'h': 2} + from common import get_exec_arg_dict + args.update(get_exec_arg_dict()) + test_scalar_energy(None, **args, timing=True) diff --git a/test/test_examples.py b/test/test_examples.py new file mode 100644 index 0000000..4820250 --- /dev/null +++ b/test/test_examples.py @@ -0,0 +1,61 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import os +import subprocess +import pytest + +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl as pytest_generate_tests) + +examples = { + 'examples/phi_chi.py': 2.5e-7, +} + + +@pytest.mark.parametrize("filename, expected", examples.items()) +def test_examples(ctx_factory, grid_shape, proc_shape, filename, expected): + if proc_shape[0] * proc_shape[1] * proc_shape[2] > 1: + pytest.skip('run examples on only one rank') + + result = subprocess.run(['python', filename, 'end_time', '1'], + stdout=subprocess.PIPE) + + assert result.returncode == 0, '%s failed' % filename + + from glob import glob + from h5py import File + files = sorted(glob('20*.h5')) + f = File(files[-1], 'r') + constraint = f['energy/constraint'][-1] + print(filename, constraint) + f.close() + os.remove(files[-1]) + + assert constraint < expected, '%s constraint is wrong' % filename + + +if __name__ == "__main__": + args = {'grid_shape': (256,)*3, 'proc_shape': (1,)*3} + for example, expected in examples.items(): + test_examples(None, **args, filename=example, expected=expected) diff --git a/test/test_expansion.py b/test/test_expansion.py new file mode 100644 index 0000000..6bc3f09 --- /dev/null +++ b/test/test_expansion.py @@ -0,0 +1,82 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do 
@pytest.mark.parametrize("dtype", [np.float64])
@pytest.mark.parametrize("Stepper", [ps.RungeKutta4, ps.LowStorageRK54])
def test_expansion(ctx_factory, proc_shape, dtype, Stepper, timing=False):
    """Integrate FLRW expansion for several equations of state and compare
    the scale factor against the exact power-law solution, checking the
    Friedmann constraint as well.
    """
    if proc_shape != (1, 1, 1):
        pytest.skip("test expansion only on one rank")

    def exact_a(w, t):
        # exact power-law scale factor for constant equation of state w
        x = (1 + 3*w)
        return (x*(t/np.sqrt(3) + 2/x))**(2/x)/2**(2/x)

    from pystella.step import LowStorageRKStepper
    is_low_storage = LowStorageRKStepper in Stepper.__bases__

    for w in [0, 1/3, 1/2, 1, -1/4]:
        def energy(a):
            return a**(-1-3*w)

        def pressure(a):
            return w * energy(a)

        t = 0
        dt = .005
        expand = ps.Expansion(energy(1.), Stepper, mpl=np.sqrt(8.*np.pi))

        while t <= 10. - dt:
            for stage in range(expand.stepper.num_stages):
                # low-storage steppers keep a single copy; classical RK
                # reads slot 0 on the first stage and slot 1 afterwards
                if is_low_storage:
                    idx = 0
                else:
                    idx = 0 if stage == 0 else 1
                expand.step(stage, energy(expand.a[idx]),
                            pressure(expand.a[idx]), dt)
            t += dt

        idx = () if is_low_storage else (0)

        order = expand.stepper.expected_order
        rtol = dt**order

        print(order,
              w,
              expand.a[idx]/exact_a(w, t) - 1,
              expand.constraint(energy(expand.a[idx])))

        assert np.allclose(expand.a[idx], exact_a(w, t), rtol=rtol, atol=0), \
            "FLRW solution inaccurate for w=%f" % (w)

        assert expand.constraint(energy(expand.a[idx])) < rtol, \
            "FLRW solution disobeying constraint for w=%f" % (w)
def test_field(proc_shape):
    """Check that :class:`Field` indexing, offsets, subscripts, prepends,
    and shifts all produce the expected indexed expressions."""
    if proc_shape != (1, 1, 1):
        pytest.skip("test field only on one rank")

    # basic halo offset
    assert ps.Indexer(ps.Field('y', offset='h')) \
        == parse("y[i + h, j + h, k + h]")

    # custom index names
    assert ps.Indexer(ps.Field('y', offset='h', indices=('a', 'b', 'c'))) \
        == parse("y[a + h, b + h, c + h]")

    # systematically cover: prepends honored/ignored, subscripts passed via
    # the field name vs. via __getitem__, and their combination
    for ignore, pre in [(True, ""), (False, "0, 1, ")]:
        for name, trail in [("y", ""), ("y[4, 5]", "4, 5, ")]:
            for subscript, mid in [(None, ""), ((2, 3), "2, 3, ")]:
                fld = ps.Field(name, ignore_prepends=ignore)
                expr = fld if subscript is None else fld[subscript]
                expected = "y[%s%s%si, j, k]" % (pre, mid, trail)
                assert ps.Indexer(expr, prepend_with=(0, 1)) == parse(expected)

    # shifts combine with per-axis offsets
    fld = ps.Field('y', offset=('hx', 'hy', 'hz'))
    assert ps.Indexer(fld.shift((1, 2, 3))) \
        == parse("y[i + hx + 1, j + hy + 2, k + hz + 3]")

    # offsets and shift entries may be pymbolic variables
    fld = ps.Field('y', offset=('hx', var('hy'), 'hz'))
    assert ps.Indexer(fld.shift((1, 2, var('a')))) \
        == parse("y[i + hx + 1, j + hy + 2, k + hz + a]")
equal = True + for x in a: + equal *= x in b + for x in b: + equal *= x in a + return equal + + expressions = {x: y, y: x * z} + args = get_field_args(expressions) + assert lists_equal(args, true_args) + + expressions = x * y + z + args = get_field_args(expressions) + assert lists_equal(args, true_args) + + expressions = [x, y, y * z**2] + args = get_field_args(expressions) + assert lists_equal(args, true_args) + + +def test_sympy_interop(proc_shape): + if proc_shape != (1, 1, 1): + pytest.skip("test field only on one rank") + + from pystella.field.sympy import pymbolic_to_sympy, sympy_to_pymbolic + import sympy as sym + + f = ps.Field('f', offset='h') + g = ps.Field('g', offset='h') + + expr = f[0]**2 * g + 2 * g[1] * f + sympy_expr = pymbolic_to_sympy(expr) + new_expr = sympy_to_pymbolic(sympy_expr) + sympy_expr_2 = pymbolic_to_sympy(new_expr) + assert sym.simplify(sympy_expr - sympy_expr_2) == 0, \ + "sympy <-> pymbolic conversion not invertible" + + # from pymbolic.functions import fabs, exp, exmp1 + fabs = parse('math.fabs') + exp = parse('math.exp') + expm1 = parse('math.expm1') + x = sym.Symbol('x') + + expr = sym.Abs(x) + assert sympy_to_pymbolic(expr) == fabs(var('x')) + + expr = sym.exp(x) + assert sympy_to_pymbolic(expr) == exp(var('x')) + + expr = sym.Function('expm1')(x) + assert sympy_to_pymbolic(expr) == expm1(var('x')) + + expr = sym.Function('aaa')(x) + from pymbolic.primitives import Call, Variable + assert sympy_to_pymbolic(expr) == Call(Variable('aaa'), (Variable('x'),)) + + +if __name__ == "__main__": + test_field((1, 1, 1)) + test_dynamic_field((1, 1, 1)) + test_field_diff((1, 1, 1)) + test_get_field_args((1, 1, 1)) + test_sympy_interop((1, 1, 1)) diff --git a/test/test_multigrid.py b/test/test_multigrid.py new file mode 100644 index 0000000..42aca37 --- /dev/null +++ b/test/test_multigrid.py @@ -0,0 +1,122 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any 
@pytest.mark.filterwarnings(
    "ignore::pyopencl.characterize.CLCharacterizationWarning")
@pytest.mark.filterwarnings("ignore::loopy.diagnostic.LoopyAdvisory")
@pytest.mark.filterwarnings("ignore::loopy.diagnostic.ParameterFinderWarning")
@pytest.mark.parametrize("h", [1])
@pytest.mark.parametrize("dtype", [np.float64])
@pytest.mark.parametrize("Solver", [NewtonIterator])
@pytest.mark.parametrize("MG", [FullApproximationScheme, MultiGridSolver])
def test_multigrid(ctx_factory, grid_shape, proc_shape, h, dtype, Solver, MG,
                   timing=False):
    """Run V-cycles of a multigrid scheme on a Poisson and a Helmholtz
    problem with random zero-mean sources and check residual convergence.

    :arg Solver: the relaxation/iteration class used on each level.
    :arg MG: the multigrid driver (FAS or standard multigrid).
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))
    mpi = ps.DomainDecomposition(proc_shape, h, rank_shape)

    L = 10
    dx = L / grid_shape[0]

    # np.product is deprecated (removed in NumPy 2.0); np.prod is canonical
    statistics = ps.FieldStatistics(mpi, h, rank_shape=rank_shape,
                                    grid_size=np.prod(grid_shape))

    def get_laplacian(f):
        # symbolic centered-difference laplacian of f at stencil width h
        from pystella.derivs import _lap_coefs, centered_diff
        lap_coefs = _lap_coefs[h]
        from pymbolic import var
        return sum([centered_diff(f, lap_coefs, direction=mu, order=2)
                    for mu in range(1, 4)]) / var('dx')**2

    test_problems = {}

    from pystella import Field
    # Poisson: lap(f) = rho
    f = Field('f', offset='h')
    rho = Field('rho', offset='h')
    test_problems[f] = (get_laplacian(f), rho)

    # Helmholtz: lap(f2) - f2 = rho2
    f = Field('f2', offset='h')
    rho = Field('rho2', offset='h')
    test_problems[f] = (get_laplacian(f) - f, rho)

    solver = Solver(mpi, queue, test_problems, h=h, dtype=dtype,
                    fixed_parameters=dict(omega=1/2))
    mg = MG(solver=solver, h=h, dtype=dtype)

    def zero_mean_array():
        # random field with its global mean subtracted, halos synchronized
        f0 = clr.rand(queue, grid_shape, dtype)
        f = clr.rand(queue, tuple(ni + 2*h for ni in rank_shape), dtype)
        mpi.scatter_array(queue, f0, f, root=0)
        avg = statistics(f)['mean']
        f = f - avg
        mpi.share_halos(queue, f)
        return f

    f = zero_mean_array()
    rho = zero_mean_array()

    f2 = zero_mean_array()
    rho2 = zero_mean_array()

    poisson_errs = []
    helmholtz_errs = []
    num_v_cycles = 15 if MG == MultiGridSolver else 10
    for _ in range(num_v_cycles):
        errs = mg(mpi, queue, dx0=dx, f=f, rho=rho, f2=f2, rho2=rho2)
        poisson_errs.append(errs[-1][-1]['f'])
        helmholtz_errs.append(errs[-1][-1]['f2'])

    for name, cycle_errs in zip(['poisson', 'helmholtz'],
                                [poisson_errs, helmholtz_errs]):
        tol = 1.e-6 if MG == MultiGridSolver else 1.e-15
        # require the last two cycles to have converged (the last tightly)
        assert cycle_errs[-1][1] < tol and cycle_errs[-2][1] < 10*tol, \
            "multigrid solution to %s eqn is inaccurate for " \
            "grid_shape=%s, h=%d, proc_shape=%s" \
            % (name, str(grid_shape), h, str(proc_shape))
+""" + + +import numpy as np +import pyopencl as cl +import pyopencl.clmath as clm +import pyopencl.array as cla +import pystella as ps +from pystella.derivs import FirstCenteredDifference, SecondCenteredDifference +from pystella.fourier import gDFT +import pytest + +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl as pytest_generate_tests) + + +@pytest.mark.parametrize("h", [1, 2, 3, 4]) +@pytest.mark.parametrize("dtype", [np.float64]) +def test_effective_momenta(ctx_factory, grid_shape, proc_shape, h, dtype): + L = 10. + N = 128 + dx = 10 / N + dk = 2 * np.pi / L + k = np.linspace(-N//2+1, N//2+1, 100) + kmag = dk * k + + diff = 0 + stencil = FirstCenteredDifference(h) + for i, coef in stencil.coefs.items(): + x = dx * i + diff += coef * np.exp(1j * kmag * x) + diff += - coef * np.exp(- 1j * kmag * x) + + k_diff = np.real(diff / dx / 1j) + eff_k = stencil.get_eigenvalues(kmag, dx) + + assert np.max(np.abs(k_diff/eff_k - 1)) < 1.e-14 + + diff = 0 + stencil = SecondCenteredDifference(h) + for i, coef in stencil.coefs.items(): + x = dx * i + diff += coef * np.exp(1j * kmag * x) + if i > 0: + diff += coef * np.exp(- 1j * kmag * x) + + k_diff = np.real(diff / dx**2) + eff_k = stencil.get_eigenvalues(kmag, dx) + + assert np.max(np.abs(k_diff/eff_k - 1)) < 1.e-11 + + +def divergence_error(pdx): + div = clm.fabs(sum([pdx[mu] for mu in range(3)]))**2 + norm = sum([pdx[mu]**2 for mu in range(3)]) + div_norm = clm.sqrt(div) / clm.sqrt(norm) + max_err = cla.max(div_norm).get() + l2_err = cla.sum(div_norm).get() / div.size + return max_err, l2_err + + +def spectral_divergence_error(vector): + if isinstance(vector, cla.Array): + vector = vector.get() + if isinstance(vector, list): + N = vector[0].shape[0] + else: + N = vector.shape[1] + + pts = np.concatenate([np.arange(0, N//2+1), np.arange(-N//2+1, 0)]) + pts = pts.astype(np.float64) + pts[N//2] = 0. 
# Nyquist modes have zero first derivative + kvecs = np.meshgrid(pts, pts, pts[:N//2+1], indexing='ij') + + div = sum([kvecs[mu] * vector[mu] for mu in range(3)]) + norm = sum([np.abs(kvecs[mu] * vector[mu])**2 for mu in range(3)]) + div_norm = np.abs(div[norm != 0] / np.sqrt(norm[norm != 0])) + + # filter out modes where norm is tiny but not zero + div_norm = div_norm[div_norm < .99] + + max_err = np.max(div_norm) + l2_err = np.sum(div_norm) / div_norm.size + return max_err, l2_err + + +def is_hermitian(fk): + if isinstance(fk, cla.Array): + fk = fk.get() + + grid_shape = list(fk.shape) + grid_shape[-1] = 2 * (grid_shape[-1] - 1) + pos = [np.arange(0, Ni//2+1) for Ni in grid_shape] + neg = [np.concatenate([np.array([0]), np.arange(Ni-1, Ni//2-1, -1)]) + for Ni in grid_shape] + + test = np.array([]) + for k in [0, grid_shape[-1]//2]: + for n, p in zip(neg[0], pos[0]): + test = np.append(test, np.allclose(fk[n, neg[1], k], + np.conj(fk[p, pos[1], k]), + atol=0, rtol=1.e-12)) + test = np.append(test, np.allclose(fk[p, neg[1], k], + np.conj(fk[n, pos[1], k]), + atol=0, rtol=1.e-12)) + for n, p in zip(neg[1], pos[1]): + test = np.append(test, np.allclose(fk[neg[0], n, k], + np.conj(fk[pos[0], p, k]), + atol=0, rtol=1.e-12)) + test = np.append(test, np.allclose(fk[neg[0], p, k], + np.conj(fk[pos[0], n, k]), + atol=0, rtol=1.e-12)) + + for i in [0, grid_shape[0]//2]: + for j in [0, grid_shape[1]//2]: + for k in [0, grid_shape[2]//2]: + test = np.append(test, [np.abs(np.imag(fk[i, j, k])) < 1.e-15]) + return test.all() + + +def make_data(queue, fft): + kshape = fft.shape(True) + data = np.random.rand(*kshape) + 1j * np.random.rand(*kshape) + if isinstance(fft, gDFT): + from pystella.fourier.rayleigh import make_hermitian + data = make_hermitian(data).astype(np.complex128) + + data = fft.zero_corner_modes(data) + return cla.to_device(queue, data) + + +@pytest.mark.filterwarnings( + "ignore::pyopencl.characterize.CLCharacterizationWarning") 
@pytest.mark.filterwarnings(
    "ignore::pyopencl.characterize.CLCharacterizationWarning")
@pytest.mark.filterwarnings("ignore::loopy.diagnostic.LoopyAdvisory")
@pytest.mark.parametrize("h", [0, 2])
@pytest.mark.parametrize("pol", [False, True])
@pytest.mark.parametrize("dtype", [np.float64])
def test_vector_projector(ctx_factory, grid_shape, proc_shape, h, dtype, pol,
                          timing=False):
    """
    Test the vector projector: with pol=True, check that pol->vec->pol is the
    identity (and preserves hermiticity); with pol=False, check that
    transversified vectors are divergence-free, spectrally (h=0) or via
    finite-difference derivatives (h>0).
    """
    if proc_shape[0] * proc_shape[1] * proc_shape[2] > 1 and h == 0:
        pytest.skip("can't test continuum projectors on multiple ranks yet")

    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))
    mpi = ps.DomainDecomposition(proc_shape, h, rank_shape)

    L = (10,)*3
    dx = tuple(Li / Ni for Li, Ni in zip(L, grid_shape))

    fft = ps.DFT(mpi, ctx, queue, grid_shape, dtype)
    cdtype = fft.cdtype
    # h > 0 projects with the stencil's effective momenta; h = 0 with the
    # continuum momenta k themselves
    if h > 0:
        stencil = FirstCenteredDifference(h)
        project = ps.Projector(fft, stencil.get_eigenvalues)
    else:
        project = ps.Projector(fft, lambda k, dx: k)

    k_shape = fft.shape(True)
    vector = cla.empty(queue, (3,)+k_shape, cdtype)

    if pol:
        plus = make_data(queue, fft).astype(cdtype)
        minus = make_data(queue, fft).astype(cdtype)

        # NOTE(review): vector was already allocated identically just above;
        # this second allocation looks redundant
        vector = cla.empty(queue, (3,)+k_shape, cdtype)
        project.pol_to_vec(queue, plus, minus, vector)

        if isinstance(fft, gDFT):
            for i in range(3):
                assert is_hermitian(vector[i]), \
                    "pol->vec projection is non-hermitian for grid_shape=%s, h=%d" \
                    % (str(grid_shape), h)

        plus1 = cla.zeros_like(plus)
        minus1 = cla.zeros_like(minus)

        project.vec_to_pol(queue, plus1, minus1, vector)

        if isinstance(fft, gDFT):
            assert is_hermitian(plus1), \
                "plus polarization is not hermitian for grid_shape=%s, h=%d" % \
                (str(grid_shape), h)
            assert is_hermitian(minus1), \
                "minus polarization is not hermitian for grid_shape=%s, h=%d" % \
                (str(grid_shape), h)

        # round trip must reproduce the input polarizations
        assert np.allclose(plus1.get(), plus.get(), atol=0., rtol=1.e-11) and \
            np.allclose(minus1.get(), minus.get(), atol=0., rtol=1.e-11), \
            "pol->vec->pol is not identity mapping for grid_shape=%s, h=%d" % \
            (str(grid_shape), h)

    else:
        for mu in range(3):
            vector[mu] = make_data(queue, fft).astype(cdtype)

        # apply twice to ensure smallness
        project.transversify(queue, vector)
        project.transversify(queue, vector)

        # h=0 performs "continuum" projection
        if h == 0:
            max_err, l2_err = spectral_divergence_error(vector)
            max_rtol = 1.e-12 if dtype == np.float64 else 1.e-4
            l2_rtol = 1.e-14 if dtype == np.float64 else 1.e-6
        else:
            # transform to position space and measure the stencil divergence
            vector_x = cla.empty(queue, (3,)+tuple(ni+2*h for ni in rank_shape), dtype)
            pdx = cla.empty(queue, (3,)+rank_shape, dtype)

            derivs = ps.GradientLaplacian(mpi, h, dx)

            for mu in range(3):
                fft.idft(vector[mu], vector_x[mu])

            derivs(queue, fx=vector_x[0], pdx=pdx[0])
            derivs(queue, fx=vector_x[1], pdy=pdx[1])
            derivs(queue, fx=vector_x[2], pdz=pdx[2])

            max_err, l2_err = divergence_error(pdx)
            max_rtol = 1.e-10 if dtype == np.float64 else 1.e-4
            l2_rtol = 1.e-14 if dtype == np.float64 else 1.e-6

        assert max_err < max_rtol and l2_err < l2_rtol, \
            "%s projection result is not transverse for grid_shape=%s, h=%d" % \
            ("pol_to_vec" if pol else "transversify", str(grid_shape), h)

    if timing:
        from common import timer
        ntime = 10
        if pol:
            t = timer(lambda: project.pol_to_vec(queue, plus, minus, vector),
                      ntime=ntime)
        else:
            t = timer(lambda: project.transversify(queue, vector), ntime=ntime)
        print("%s took %.3f ms for grid_shape=%s"
              % ("pol_to_vec" if pol else "transversify", t, str(grid_shape)))


def tensor_id(i, j):
    # map a symmetric index pair (i, j), with i, j in 1..3, to a flat index
    # 0..5 into the 6-component symmetric-tensor storage
    a = i if i <= j else j
    b = j if i <= j else i
    return (7 - a) * a // 2 - 4 + b


@pytest.mark.filterwarnings(
    "ignore::pyopencl.characterize.CLCharacterizationWarning")
@pytest.mark.filterwarnings("ignore::loopy.diagnostic.LoopyAdvisory")
@pytest.mark.parametrize("h", [0, 2])
@pytest.mark.parametrize("pol", [False, True])
@pytest.mark.parametrize("dtype", [np.float64])
def test_tensor_projector(ctx_factory, grid_shape, proc_shape, h, dtype, pol,
                          timing=False):
    """
    Test the transverse-traceless projector on a random symmetric tensor:
    check hermiticity, tracelessness, and transversality of each row,
    spectrally (h=0) or via finite-difference derivatives (h>0).
    """
    if proc_shape[0] * proc_shape[1] * proc_shape[2] > 1 and h == 0:
        pytest.skip("can't test continuum projectors on multiple ranks yet")
    if pol:
        pytest.skip("No tensor polarization projector yet")

    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))
    mpi = ps.DomainDecomposition(proc_shape, h, rank_shape)

    L = (10,)*3
    dx = tuple(Li / Ni for Li, Ni in zip(L, grid_shape))

    fft = ps.DFT(mpi, ctx, queue, grid_shape, dtype)
    cdtype = fft.cdtype
    if h > 0:
        stencil = FirstCenteredDifference(h)
        project = ps.Projector(fft, stencil.get_eigenvalues)
    else:
        project = ps.Projector(fft, lambda k, dx: k)

    k_shape = fft.shape(True)
    hij = cla.empty(queue, shape=(6,)+k_shape, dtype=cdtype)

    if pol:
        # unreachable for now: the pol case is skipped above
        pass
    else:
        for mu in range(6):
            hij[mu] = make_data(queue, fft).astype(cdtype)

        project.transverse_traceless(queue, hij)

        hij_h = hij.get()

        if isinstance(fft, gDFT):
            for i in range(6):
                assert is_hermitian(hij_h[i]), \
                    "TT projection is non-hermitian for grid_shape=%s, h=%d" \
                    % (str(grid_shape), h)

        # relative trace, skipping zero-norm modes and near-degenerate ones
        trace = sum([hij_h[tensor_id(i, i)] for i in range(1, 4)])
        tracenorm = np.sqrt(sum([np.abs(hij_h[tensor_id(i, i)])**2
                                 for i in range(1, 4)]))

        trace = np.abs(trace[tracenorm != 0]) / tracenorm[tracenorm != 0]
        trace = trace[trace < .9]
        max_err = np.max(trace)
        l2_err = np.sum(trace) / trace.size

        max_rtol = 1.e-9 if dtype == np.float64 else 1.e-4
        l2_rtol = 1.e-14 if dtype == np.float64 else 1.e-6

        assert max_err < max_rtol and l2_err < l2_rtol, \
            "TT projected tensor isn't traceless for grid_shape=%s, h=%d" \
            % (str(grid_shape), h)

        # h=0 performs "continuum" projection
        if h == 0:
            # each row h_{i,:} of the tensor must itself be transverse
            for i in range(1, 4):
                vector_h = [hij_h[tensor_id(i, j)] for j in range(1, 4)]
                max_err, l2_err = spectral_divergence_error(vector_h)
                max_rtol = 1.e-9 if dtype == np.float64 else 1.e-4
                l2_rtol = 1.e-14 if dtype == np.float64 else 1.e-6

                assert max_err < max_rtol and l2_err < l2_rtol, \
                    "TT projection is not transverse for grid_shape=%s, h=%d" \
                    % (str(grid_shape), h)

        else:
            vector_x = cla.empty(queue, (3,)+tuple(ni+2*h for ni in rank_shape), dtype)
            pdx = cla.empty(queue, (3,)+rank_shape, dtype)

            derivs = ps.GradientLaplacian(mpi, h, dx)

            # NOTE(review): fft was already constructed above with identical
            # arguments; this re-creation looks redundant
            fft = ps.DFT(mpi, ctx, queue, grid_shape, dtype)

            for i in range(1, 4):
                vector = [hij[tensor_id(i, j)] for j in range(1, 4)]

                for mu in range(3):
                    fft.idft(vector[mu], vector_x[mu])

                derivs(queue, fx=vector_x[0], pdx=pdx[0])
                derivs(queue, fx=vector_x[1], pdy=pdx[1])
                derivs(queue, fx=vector_x[2], pdz=pdx[2])

                max_err, l2_err = divergence_error(pdx)
                max_rtol = 1.e-10 if dtype == np.float64 else 1.e-4
                l2_rtol = 1.e-14 if dtype == np.float64 else 1.e-6

                assert max_err < max_rtol and l2_err < l2_rtol, \
                    "TT projection is not transverse for grid_shape=%s, h=%d" \
                    % (str(grid_shape), h)

    if timing:
        from common import timer
        ntime = 10
        t = timer(lambda: project.transverse_traceless(queue, hij), ntime=ntime)
        print("TT projection took %.3f ms for grid_shape=%s" % (t, str(grid_shape)))


if __name__ == "__main__":
    args = {'grid_shape': (256,)*3, 'proc_shape': (1,)*3, 'dtype': np.float64}
    from common import get_exec_arg_dict
    args.update(get_exec_arg_dict())
    for h in range(1, 5):
        test_effective_momenta(None, **args, h=h)
    for h in range(0, 5):
        test_vector_projector(None, **args, h=h, pol=False, timing=True)
        test_vector_projector(None, **args, h=h, pol=True, timing=True)
    for h in range(0, 5):
        test_tensor_projector(None, **args, h=h, pol=False, timing=True)
import numpy as np
import pyopencl as cl
import pyopencl.array as cla
import pystella as ps
import pytest
# pylint: disable=no-member

from pyopencl.tools import ( # noqa
    pytest_generate_tests_for_pyopencl as pytest_generate_tests)


@pytest.mark.parametrize("dtype", [np.float64, np.float32])
@pytest.mark.parametrize("random", [True, False])
def test_generate_WKB(ctx_factory, grid_shape, proc_shape, dtype, random,
                      timing=False):
    """Smoke-test RayleighGenerator.generate_WKB (success of the call only)."""
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    h = 1
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))
    mpi = ps.DomainDecomposition(proc_shape, h, rank_shape)

    fft = ps.DFT(mpi, ctx, queue, grid_shape, dtype)

    L = (10,)*3
    volume = np.product(L)
    dk = tuple(2 * np.pi / Li for Li in L)
    modes = ps.RayleighGenerator(ctx, fft, dk, volume)

    # only checking that this call is successful
    fk, dfk = modes.generate_WKB(queue, random=random)

    if timing:
        ntime = 10
        from common import timer
        t = timer(lambda: modes.generate_WKB(queue, random=random), ntime=ntime)
        print("%srandom, set_modes took %.3f ms for grid_shape=%s"
              % ('' if random else 'non-', t, str(grid_shape)))


@pytest.mark.parametrize("dtype", [np.float64, np.float32])
@pytest.mark.parametrize("random", [True, False])
def test_generate(ctx_factory, grid_shape, proc_shape, dtype, random, timing=False):
    """
    Check that RayleighGenerator.generate produces fields whose binned power
    spectrum matches the requested power law k**exp, and whose position-space
    statistics have small skewness.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    h = 1
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))
    mpi = ps.DomainDecomposition(proc_shape, h, rank_shape)

    fft = ps.DFT(mpi, ctx, queue, grid_shape, dtype)

    # number of radial bins out to the corner of the k-space grid
    num_bins = int(sum(Ni**2 for Ni in grid_shape)**.5 / 2 + .5) + 1
    L = (10,)*3
    volume = np.product(L)
    dk = tuple(2 * np.pi / Li for Li in L)
    spectra = ps.PowerSpectra(mpi, fft, dk, volume)
    modes = ps.RayleighGenerator(ctx, fft, dk, volume)

    kbins = min(dk) * np.arange(0, num_bins)
    # normalization relating the binned spectrum to the analytic power law
    test_norm = 1 / 2 / np.pi**2 / np.product(grid_shape)**2

    for exp in [-1, -2, -3]:
        def power(k):
            return k**exp

        fk = modes.generate(queue, random=random, norm=1, field_ps=power)

        # compare binned spectrum to the analytic expectation, dropping the
        # zero bin and the outermost (poorly sampled) bin
        spectrum = spectra.norm * spectra.bin_power(fk, queue=queue, k_power=3)[1:-1]
        true_spectrum = test_norm * kbins[1:-1]**3 * power(kbins[1:-1])
        err = np.abs(1 - spectrum / true_spectrum)

        # looser tolerance when bins are well-populated (many modes per bin)
        tol = .1 if num_bins < 64 else .3
        assert np.max(err[num_bins//4:-4]) < tol and np.average(err) < tol, \
            "init power spectrum incorrect for %srandom k**%d" \
            % ('' if random else 'non-', exp)

        fx = fft.idft(cla.to_device(queue, fk))
        if isinstance(fx, np.ndarray):
            fx = cla.to_device(queue, fx)

        # moments of the position-space field across all ranks
        avg = mpi.allreduce(cla.sum(fx).get()) / np.product(grid_shape)
        var = mpi.allreduce(cla.sum(fx**2).get()) / np.product(grid_shape) - avg**2
        skew = mpi.allreduce(cla.sum(fx**3).get()) / np.product(grid_shape) \
            - 3 * avg * var - avg**3
        assert skew / var**1.5 < .1, \
            "init power spectrum has large skewness for %srandom k**%d" \
            % ('' if random else 'non-', exp)

    if timing:
        ntime = 10
        from common import timer
        t = timer(lambda: modes.generate(queue, random=random), ntime=ntime)
        print("%srandom, set_modes took %.3f ms for grid_shape=%s"
              % ('' if random else 'non-', t, str(grid_shape)))


@pytest.mark.parametrize("dtype", [np.float64])
def test_make_hermitian(ctx_factory, grid_shape, proc_shape, dtype):
    """
    Check that pystella.fourier.rayleigh.make_hermitian enforces exact
    f(-k) = conj(f(k)) symmetry on the k_z = 0 and Nyquist planes, and that
    self-conjugate modes are real.
    """
    if proc_shape != (1, 1, 1):
        pytest.skip("test make_hermitian only on one rank")

    kshape = (grid_shape[0], grid_shape[1], grid_shape[2]//2 + 1)
    data = np.random.rand(*kshape) + 1j * np.random.rand(*kshape)

    from pystella.fourier.rayleigh import make_hermitian
    data = make_hermitian(data)

    pos = [np.arange(0, Ni//2+1) for Ni in grid_shape]
    neg = [np.concatenate([np.array([0]), np.arange(Ni-1, Ni//2-1, -1)])
           for Ni in grid_shape]

    for k in [0, grid_shape[-1]//2]:
        for n, p in zip(neg[0], pos[0]):
            assert (data[n, neg[1], k] == np.conj(data[p, pos[1], k])).all(), \
                "Hermitian symmetry failed for data[:, :, %s]" % (k,)
            assert (data[p, neg[1], k] == np.conj(data[n, pos[1], k])).all(), \
                "Hermitian symmetry failed for data[:, :, %s]" % (k,)
        for n, p in zip(neg[1], pos[1]):
            assert (data[neg[0], n, k] == np.conj(data[pos[0], p, k])).all(), \
                "Hermitian symmetry failed for data[:, :, %s]" % (k,)
            assert (data[neg[0], p, k] == np.conj(data[pos[0], n, k])).all(), \
                "Hermitian symmetry failed for data[:, :, %s]" % (k,)

    # check modes are real
    # their k-space indices are also their array indices
    # NOTE(review): k here is the stale value left over from the loop above
    # (the Nyquist plane only); if the k = 0 plane was also intended, this
    # block should loop over both k values — confirm
    for i in [0, grid_shape[0]//2]:
        for j in [0, grid_shape[1]//2]:
            assert np.imag(data[i, j, k]) == 0, \
                "data[%s, %s, %s] is not real" % (i, j, k)


if __name__ == "__main__":
    args = {'grid_shape': (256,)*3, 'proc_shape': (1, 1, 1), 'dtype': np.float64}
    from common import get_exec_arg_dict
    args.update(get_exec_arg_dict())
    test_make_hermitian(None, **args)
    for random in [True, False]:
        test_generate_WKB(None, **args, random=random, timing=True)
        test_generate(None, **args, random=random, timing=True)
import numpy as np
import pyopencl as cl
import pyopencl.clrandom as clr
import pystella as ps
import pytest

from pyopencl.tools import ( # noqa
    pytest_generate_tests_for_pyopencl as pytest_generate_tests)


@pytest.mark.parametrize("dtype", [np.float64, np.float32])
@pytest.mark.parametrize("op", ['sum', 'max'])
@pytest.mark.parametrize("_grid_shape", [None, (128, 64, 32)])
@pytest.mark.parametrize("pass_grid_dims", [True, False])
def test_reduction(ctx_factory, grid_shape, proc_shape, dtype, op,
                   _grid_shape, pass_grid_dims, timing=False):
    """
    Check ps.Reduction against reducer.reduce_array for a random field,
    with and without passing grid dimensions to the constructor.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    h = 1
    # _grid_shape (when not None) overrides the fixture's grid_shape
    grid_shape = _grid_shape or grid_shape
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))
    mpi = ps.DomainDecomposition(proc_shape, h, rank_shape)

    from pystella import Field
    reducers = {}
    reducers['avg'] = [(Field('f'), op)]

    if pass_grid_dims:
        reducer = ps.Reduction(mpi, reducers, rank_shape=rank_shape,
                               grid_size=np.product(grid_shape))
    else:
        reducer = ps.Reduction(mpi, reducers)

    f = clr.rand(queue, rank_shape, dtype=dtype)
    result = reducer(queue, f=f)
    avg = result['avg']

    # reference value computed directly on the array; 'sum' reductions are
    # normalized by the total grid size to yield a mean
    avg_test = reducer.reduce_array(f, op)
    if op == 'sum':
        avg_test /= np.product(grid_shape)

    rtol = 5.e-14 if dtype == np.float64 else 1.e-5
    # fix: assertion message spelling ("innaccurate" -> "inaccurate")
    assert np.allclose(avg, avg_test, rtol=rtol, atol=0), \
        "%s reduction inaccurate for grid_shape=%s, proc_shape=%s" \
        % (op, str(grid_shape), str(proc_shape))

    if timing:
        from common import timer
        t = timer(lambda: reducer(queue, f=f), ntime=1000)
        if mpi.rank == 0:
            print("reduction took %.3f ms for grid_shape=%s, proc_shape=%s"
                  % (t, str(grid_shape), str(proc_shape)))
            print("Bandwidth = %.1f GB/s"
                  % (f.nbytes/1024**3 / t * 1000))


@pytest.mark.parametrize("dtype", [np.float64])
@pytest.mark.parametrize("op", ['sum'])
@pytest.mark.parametrize("_grid_shape", [None, (128, 64, 32)])
def test_reduction_with_new_shape(ctx_factory, grid_shape, proc_shape, dtype, op,
                                  _grid_shape, timing=False):
    """
    Check that an existing ps.Reduction instance still produces correct
    results when called with arrays of a new (halved) shape.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    h = 1
    grid_shape = _grid_shape or grid_shape
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))
    mpi = ps.DomainDecomposition(proc_shape, h, rank_shape)

    from pystella import Field
    reducers = {}
    reducers['avg'] = [(Field('f'), op)]

    reducer = ps.Reduction(mpi, reducers)

    f = clr.rand(queue, rank_shape, dtype=dtype)
    result = reducer(queue, f=f)
    avg = result['avg']

    avg_test = reducer.reduce_array(f, op)
    if op == 'sum':
        avg_test /= np.product(grid_shape)

    rtol = 5.e-14 if dtype == np.float64 else 1.e-5
    # fix: assertion message spelling ("innaccurate" -> "inaccurate")
    assert np.allclose(avg, avg_test, rtol=rtol, atol=0), \
        "%s reduction inaccurate for grid_shape=%s, proc_shape=%s" \
        % (op, str(grid_shape), str(proc_shape))

    # test call to reducer with new shape
    grid_shape = tuple(Ni // 2 for Ni in grid_shape)
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))
    f = clr.rand(queue, rank_shape, dtype=dtype)
    result = reducer(queue, f=f)
    avg = result['avg']

    avg_test = reducer.reduce_array(f, op)
    if op == 'sum':
        avg_test /= np.product(grid_shape)

    rtol = 5.e-14 if dtype == np.float64 else 1.e-5
    assert np.allclose(avg, avg_test, rtol=rtol, atol=0), \
        "%s reduction w/new shape inaccurate for grid_shape=%s, proc_shape=%s" \
        % (op, str(grid_shape), str(proc_shape))


@pytest.mark.parametrize("dtype", [np.float64])
@pytest.mark.parametrize("_grid_shape", [None, (128, 64, 32)])
@pytest.mark.parametrize("pass_grid_dims", [True, False])
def test_field_statistics(ctx_factory, grid_shape, proc_shape, dtype, _grid_shape,
                          pass_grid_dims, timing=False):
    """
    Check ps.FieldStatistics (mean and variance, halo zones excluded)
    against a direct numpy computation, on a field with outer (batch) axes.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    h = 1
    grid_shape = _grid_shape or grid_shape
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))
    mpi = ps.DomainDecomposition(proc_shape, h, rank_shape)

    # override h to exercise a nontrivial halo: f below carries 2*h ghost
    # zones per axis which the statistics must exclude
    h = 2
    f = clr.rand(queue, (2, 1)+tuple(ni + 2*h for ni in rank_shape), dtype=dtype)

    if pass_grid_dims:
        statistics = ps.FieldStatistics(mpi, h, rank_shape=rank_shape,
                                        grid_size=np.product(grid_shape))
    else:
        statistics = ps.FieldStatistics(mpi, h)

    stats = statistics(f)
    avg = stats['mean']
    var = stats['variance']

    # reference: strip halos, reduce over the spatial axes, combine ranks
    f_h = f.get()
    rank_sum = np.sum(f_h[..., h:-h, h:-h, h:-h], axis=(-3, -2, -1))
    avg_test = mpi.allreduce(rank_sum) / np.product(grid_shape)

    rank_sum = np.sum(f_h[..., h:-h, h:-h, h:-h]**2, axis=(-3, -2, -1))
    var_test = mpi.allreduce(rank_sum) / np.product(grid_shape) - avg_test**2

    rtol = 5.e-14 if dtype == np.float64 else 1.e-5

    # fix: assertion message spelling ("innaccurate" -> "inaccurate")
    assert np.allclose(avg, avg_test, rtol=rtol, atol=0), \
        "average inaccurate for grid_shape=%s, proc_shape=%s" \
        % (str(grid_shape), str(proc_shape))

    assert np.allclose(var, var_test, rtol=rtol, atol=0), \
        "variance inaccurate for grid_shape=%s, proc_shape=%s" \
        % (str(grid_shape), str(proc_shape))

    if timing:
        from common import timer
        t = timer(lambda: statistics(f))
        if mpi.rank == 0:
            print("field stats took "
                  "%.3f ms for outer shape %s, grid_shape=%s, proc_shape=%s"
                  % (t, f.shape[:-3], str(grid_shape), str(proc_shape)))


if __name__ == "__main__":
    args = {'grid_shape': (256,)*3, 'proc_shape': (1,)*3,
            'dtype': np.float64, '_grid_shape': None}
    from common import get_exec_arg_dict
    args.update(get_exec_arg_dict())
    for op in ['sum', 'max']:
        test_reduction(None, **args, op=op, pass_grid_dims=True, timing=True)
    test_reduction_with_new_shape(None, **args, op='sum')
    test_field_statistics(None, **args, pass_grid_dims=True, timing=True)
+""" + + +import numpy as np +import pyopencl as cl +import pyopencl.array as cla +import pyopencl.clrandom as clr +import pystella as ps +import pytest + +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl as pytest_generate_tests) + +from pystella.multigrid import JacobiIterator, NewtonIterator + + +@pytest.mark.parametrize("h", [1]) +@pytest.mark.parametrize("dtype", [np.float64]) +@pytest.mark.parametrize("Solver", [JacobiIterator, NewtonIterator]) +def test_relax(ctx_factory, grid_shape, proc_shape, h, dtype, Solver, timing=False): + if ctx_factory: + ctx = ctx_factory() + else: + ctx = ps.choose_device_and_make_context() + + queue = cl.CommandQueue(ctx) + rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape)) + mpi = ps.DomainDecomposition(proc_shape, h, rank_shape) + + L = 10 + dx = L / grid_shape[0] + dk = 2 * np.pi / L + + fft = ps.DFT(mpi, ctx, queue, grid_shape, dtype) + spectra = ps.PowerSpectra(mpi, fft, (dk,)*3, L**3) + statistics = ps.FieldStatistics(mpi, h, rank_shape=rank_shape, + grid_size=np.product(grid_shape)) + + def get_laplacian(f): + from pystella.derivs import _lap_coefs, centered_diff + lap_coefs = _lap_coefs[h] + from pymbolic import var + return sum([centered_diff(f, lap_coefs, direction=mu, order=2) + for mu in range(1, 4)]) / var('dx')**2 + + test_problems = {} + + from pystella import Field + f = Field('f', offset='h') + rho = Field('rho', offset='h') + test_problems[f] = (get_laplacian(f), rho) + + f = Field('f2', offset='h') + rho = Field('rho2', offset='h') + test_problems[f] = (get_laplacian(f) - f, rho) + + solver = Solver(mpi, queue, test_problems, h=h, dtype=dtype, + fixed_parameters=dict(omega=1/2)) + + def zero_mean_array(): + f0 = clr.rand(queue, grid_shape, dtype) + f = clr.rand(queue, tuple(ni + 2*h for ni in rank_shape), dtype) + mpi.scatter_array(queue, f0, f, root=0) + avg = statistics(f)['mean'] + f = f - avg + mpi.share_halos(queue, f) + return f + + f = zero_mean_array() + rho = 
zero_mean_array() + tmp = cla.zeros_like(f) + + f2 = zero_mean_array() + rho2 = zero_mean_array() + tmp2 = cla.zeros_like(f) + + num_iterations = 1000 + errors = {'f': [], 'f2': []} + first_mode_zeroed = {'f': [], 'f2': []} + for i in range(0, num_iterations, 2): + solver(mpi, queue, iterations=2, dx=np.array(dx), + f=f, tmp_f=tmp, rho=rho, + f2=f2, tmp_f2=tmp2, rho2=rho2) + + err = solver.get_error(queue, + f=f, r_f=tmp, rho=rho, + f2=f2, r_f2=tmp2, rho2=rho2, dx=np.array(dx)) + for k, v in err.items(): + errors[k].append(v) + + for key, resid in zip(['f', 'f2'], [tmp, tmp2]): + spectrum = spectra(resid, k_power=0) + if mpi.rank == 0: + max_amp = np.max(spectrum) + first_zero = np.argmax(spectrum[1:] < 1.e-30 * max_amp) + first_mode_zeroed[key].append(first_zero) + + for k, errs in errors.items(): + errs = np.array(errs) + iters = np.arange(1, errs.shape[0]+1) + assert (errs[10:, 0] * iters[10:] / errs[0, 0] < 1.).all(), \ + "relaxation not converging at least linearly for " \ + "grid_shape=%s, h=%d, proc_shape=%s" \ + % (str(grid_shape), h, str(proc_shape)) + + first_mode_zeroed = mpi.bcast(first_mode_zeroed, root=0) + for k, x in first_mode_zeroed.items(): + x = np.array(list(x))[2:] + assert (x[1:] <= x[:-1]).all() and np.min(x) < np.max(x) / 5, \ + "relaxation not smoothing error grid_shape=%s, h=%d, proc_shape=%s" \ + % (str(grid_shape), h, str(proc_shape)) + + +if __name__ == "__main__": + args = {'grid_shape': (128,)*3, 'proc_shape': (1,)*3, + 'dtype': np.float64, 'h': 1} + from common import get_exec_arg_dict + args.update(get_exec_arg_dict()) + test_relax(None, **args, Solver=NewtonIterator, timing=True) diff --git a/test/test_spectra.py b/test/test_spectra.py new file mode 100644 index 0000000..c470c7e --- /dev/null +++ b/test/test_spectra.py @@ -0,0 +1,198 @@ +__copyright__ = "Copyright (C) 2019 Zachary J Weiner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated 
import numpy as np
import pyopencl as cl
import pyopencl.array as cla
import pystella as ps
import pytest

from pyopencl.tools import (  # noqa
    pytest_generate_tests_for_pyopencl as pytest_generate_tests)


def make_data(*shape):
    """Return a random complex array of the given shape."""
    return np.random.rand(*shape) + 1j * np.random.rand(*shape)


def make_hermitian(data, fft):
    """Impose the reality (hermitian) condition on *data* when *fft* is a
    single-rank gDFT; always zero the corner modes.
    """
    from pystella.fourier import gDFT
    if isinstance(fft, gDFT):
        from pystella.fourier.rayleigh import make_hermitian
        data = make_hermitian(data)
    data = fft.zero_corner_modes(data)
    return data


@pytest.mark.filterwarnings(
    "ignore::pyopencl.characterize.CLCharacterizationWarning")
@pytest.mark.filterwarnings("ignore::loopy.diagnostic.LoopyAdvisory")
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
@pytest.mark.parametrize("L", [(10,)*3, (10, 7, 8), (3, 8, 19), (13.2, 5.71, 9.4),
                               (11, 11, 4), (4, 11, 11), (11, 4, 11)])
def test_spectra(ctx_factory, grid_shape, proc_shape, dtype, L, timing=False):
    """Compare PowerSpectra's binned power of a real-field transform against
    a direct np.histogram computation, for a variety of box dimensions.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    h = 1
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))
    mpi = ps.DomainDecomposition(proc_shape, h, rank_shape)

    fft = ps.DFT(mpi, ctx, queue, grid_shape, dtype)

    L = L or (3, 5, 7)
    dk = tuple(2 * np.pi / Li for Li in L)
    cdtype = fft.cdtype
    # np.prod replaces np.product (removed in numpy 2.0)
    spec = ps.PowerSpectra(mpi, fft, dk, np.prod(L), bin_width=min(dk)+.001)
    # FIXME: bin_width=min(dk) sometimes disagrees to O(.1%) with numpy...

    assert int(np.sum(spec.bin_counts)) == np.prod(grid_shape), \
        "bin counts don't sum to total number of points/modes"

    k_power = 2.
    fk = make_data(*fft.shape(True)).astype(cdtype)

    fk_d = cla.to_device(queue, fk)
    spectrum = spec.bin_power(fk_d, k_power=k_power, is_real=True)
    bins = np.arange(-.5, spec.num_bins + .5) * spec.bin_width

    sub_k = list(x.get() for x in fft.sub_k.values())
    kvecs = np.meshgrid(*sub_k, indexing='ij', sparse=False)
    kmags = np.sqrt(sum((dki * ki)**2 for dki, ki in zip(dk, kvecs)))

    # real transforms store only half the modes; double-count all but the
    # kz = 0 and kz = Nyquist planes, which are their own conjugates
    counts = 2. * np.ones_like(kmags)
    counts[kvecs[2] == 0] = 1
    counts[kvecs[2] == grid_shape[-1]//2] = 1

    max_rtol = 1.e-8 if dtype == np.float64 else 2.e-2
    avg_rtol = 1.e-11 if dtype == np.float64 else 2.e-4

    bin_counts2 = spec.bin_power(np.ones_like(fk), queue=queue, k_power=0)
    assert np.max(np.abs(bin_counts2 - 1)) < max_rtol, \
        "bin counting disagrees between PowerSpectra and np.histogram"

    hist = np.histogram(kmags, bins=bins,
                        weights=np.abs(fk)**2 * counts * kmags**k_power)[0]
    hist = mpi.allreduce(hist) / spec.bin_counts

    # skip the Nyquist mode and the zero mode
    err = np.abs((spectrum[1:-2] - hist[1:-2]) / hist[1:-2])
    assert np.max(err) < max_rtol and np.average(err) < avg_rtol, \
        "real power spectrum inaccurate for grid_shape=%s" % str(grid_shape)

    if timing:
        from common import timer
        t = timer(lambda: spec.bin_power(fk_d, k_power=k_power, is_real=True))
        print("real power spectrum took %.3f ms for grid_shape=%s"
              % (t, str(grid_shape)))

    # complex_shape = (p.grid_shape[0], p.grid_shape[1]//p.proc_shape[0],
    #                  p.grid_shape[2]//p.proc_shape[1])
    # fk = make_data(complex_shape).astype(cdtype)

    # fk_d = cla.to_device(queue, fk)
    # spectrum = spec.bin_power(fk_d, k_power=k_power, is_real=False)

    # hist = np.histogram(ckmags/dk, bins=bins,
    #                     weights=np.abs(fk)**2. * ckmags**k_power)[0]
    # hist = mpi.allreduce(hist)/spec.bin_counts

    # err = np.abs((spectrum[1:-2] - hist[1:-2]) / hist[1:-2])
    # assert np.max(err) < max_rtol and np.average(err) < avg_rtol, \
    #     "complex power spectrum inaccurate for N=%d" % (N)

    # if timing:
    #     start = time.time()
    #     for i in range(nrun):
    #         spectrum = spec.bin_power(fk_d, k_power=k_power, is_real=False)
    #     end = time.time()
    #     print("complex power spectrum took %.3f ms for N=%d"
    #           % ((end - start)/nrun*1000., N))


@pytest.mark.filterwarnings(
    "ignore::pyopencl.characterize.CLCharacterizationWarning")
@pytest.mark.filterwarnings("ignore::loopy.diagnostic.LoopyAdvisory")
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_pol_spectra(ctx_factory, grid_shape, proc_shape, dtype, timing=False):
    """Check that polarization spectra are unchanged by a round trip through
    Projector.pol_to_vec and vec_to_pol.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    h = 1
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))
    mpi = ps.DomainDecomposition(proc_shape, h, rank_shape)

    fft = ps.DFT(mpi, ctx, queue, grid_shape, dtype)

    L = (10, 8, 7)
    dk = tuple(2 * np.pi / Li for Li in L)
    cdtype = fft.cdtype
    # np.prod replaces np.product (removed in numpy 2.0)
    spec = ps.PowerSpectra(mpi, fft, dk, np.prod(L))

    k_power = 2.

    fk = make_data(*fft.shape(True)).astype(cdtype)
    fk = make_hermitian(fk, fft).astype(cdtype)
    plus = cla.to_device(queue, fk)

    fk = make_data(*fft.shape(True)).astype(cdtype)
    fk = make_hermitian(fk, fft).astype(cdtype)
    minus = cla.to_device(queue, fk)

    plus_ps_1 = spec.bin_power(plus, queue=queue, k_power=k_power)
    minus_ps_1 = spec.bin_power(minus, queue=queue, k_power=k_power)

    project = ps.Projector(fft, h)

    # round-trip: polarizations -> vector -> polarizations
    vector = cla.empty(queue, (3,)+fft.shape(True), cdtype)
    project.pol_to_vec(queue, plus, minus, vector)
    project.vec_to_pol(queue, plus, minus, vector)

    plus_ps_2 = spec.bin_power(plus, k_power=k_power)
    minus_ps_2 = spec.bin_power(minus, k_power=k_power)

    max_rtol = 1.e-8 if dtype == np.float64 else 1.e-2
    avg_rtol = 1.e-11 if dtype == np.float64 else 1.e-4

    # skip the Nyquist mode and the zero mode
    err = np.abs((plus_ps_1[1:-2] - plus_ps_2[1:-2]) / plus_ps_1[1:-2])
    assert np.max(err) < max_rtol and np.average(err) < avg_rtol, \
        "plus power spectrum inaccurate for grid_shape=%s" % str(grid_shape)

    err = np.abs((minus_ps_1[1:-2] - minus_ps_2[1:-2]) / minus_ps_1[1:-2])
    assert np.max(err) < max_rtol and np.average(err) < avg_rtol, \
        "minus power spectrum inaccurate for grid_shape=%s" % str(grid_shape)


if __name__ == "__main__":
    args = {'grid_shape': (256,)*3, 'proc_shape': (1,)*3, 'dtype': np.float64}
    from common import get_exec_arg_dict
    args.update(get_exec_arg_dict())
    test_spectra(None, **args, L=None, timing=True)
    test_pol_spectra(None, **args, timing=True)
import numpy as np
import pyopencl as cl
import pyopencl.clrandom as clr
import pystella as ps
import pytest

from pyopencl.tools import (  # noqa
    pytest_generate_tests_for_pyopencl as pytest_generate_tests)


@pytest.mark.filterwarnings(
    "ignore::pyopencl.characterize.CLCharacterizationWarning")
@pytest.mark.filterwarnings("ignore::loopy.diagnostic.LoopyAdvisory")
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
@pytest.mark.parametrize("stream", [True, False])
def test_stencil(ctx_factory, grid_shape, proc_shape, dtype, stream, h=1,
                 timing=False):
    """Check a six-point neighbor-sum stencil against a numpy slicing
    reference, for both the plain and streaming stencil kernels.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))

    from pymbolic import var
    x = var('x')
    y = var('y')
    i, j, k = var('i'), var('j'), var('k')

    # y[i, j, k] = sum of x's six nearest neighbors at distance h
    # (indices into x are shifted by +h since x carries halo padding)
    map_dict = {}
    map_dict[y[i, j, k]] = x[i + h + h, j + h, k + h] \
        + x[i + h, j + h + h, k + h] \
        + x[i + h, j + h, k + h + h] \
        + x[i - h + h, j + h, k + h] \
        + x[i + h, j - h + h, k + h] \
        + x[i + h, j + h, k - h + h]

    if stream:
        try:
            stencil_map = ps.StreamingStencil(map_dict, prefetch_args=['x'], h=h)
        except Exception:
            # narrowed from a bare except: don't swallow KeyboardInterrupt
            pytest.skip("StreamingStencil unavailable")
    else:
        stencil_map = ps.Stencil(map_dict, prefetch_args=['x'], h=h)

    x = clr.rand(queue, tuple(ni + 2*h for ni in rank_shape), dtype)
    y = clr.rand(queue, rank_shape, dtype)

    # reference result via numpy slicing of the padded host array
    x_h = x.get()
    y_true = (x_h[2*h:, h:-h, h:-h]
              + x_h[h:-h, 2*h:, h:-h]
              + x_h[h:-h, h:-h, 2*h:]
              + x_h[:-2*h, h:-h, h:-h]
              + x_h[h:-h, :-2*h, h:-h]
              + x_h[h:-h, h:-h, :-2*h])

    stencil_map(queue, x=x, y=y)

    rtol = 5.e-14 if dtype == np.float64 else 1.e-5

    assert np.allclose(y.get(), y_true, rtol=rtol, atol=0), \
        "average inaccurate for grid_shape=%s, h=%d, proc_shape=%s" \
        % (str(grid_shape), h, str(proc_shape))

    if timing:
        from common import timer
        t = timer(lambda: stencil_map(queue, x=x, y=y)[0])
        print("stencil took %.3f ms for grid_shape=%s, h=%d, proc_shape=%s"
              % (t, str(grid_shape), h, str(proc_shape)))
        print("Bandwidth = %.1f GB/s"
              % ((x.nbytes + y.nbytes)/1024**3 / t * 1000))


if __name__ == "__main__":
    args = {'grid_shape': (256,)*3, 'proc_shape': (1,)*3, 'dtype': np.float64}
    from common import get_exec_arg_dict
    args.update(get_exec_arg_dict())
    for h in range(1, 4):
        for stream in [True, False]:
            test_stencil(None, **args, stream=stream, h=h, timing=True)
import numpy as np
import pyopencl as cl
import pyopencl.array as cla
import pyopencl.clrandom as clr
import pyopencl.clmath as clm
import pystella as ps
import pytest

from pyopencl.tools import (  # noqa
    pytest_generate_tests_for_pyopencl as pytest_generate_tests)

from pystella.step import all_steppers


# this only tests Stepper's correctness as an ODE solver
@pytest.mark.parametrize("dtype", [np.float64])
@pytest.mark.parametrize("Stepper", all_steppers)
def test_step(ctx_factory, proc_shape, dtype, Stepper):
    """Integrate y' = y**n (whose solution is known in closed form) at a
    sequence of timesteps and check both accuracy and the stepper's
    expected order of convergence.
    """
    if proc_shape != (1, 1, 1):
        pytest.skip("test step only on one rank")

    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)

    from pystella.step import LowStorageRKStepper
    is_low_storage = LowStorageRKStepper in Stepper.__bases__

    rank_shape = (64,)*3
    if is_low_storage:
        arr_shape = rank_shape
    else:
        # classical steppers carry a leading axis of storage copies
        arr_shape = (3,) + rank_shape

    dtlist = [.1, .05, .025, .0125]

    for n in [-1., -2., -3., -4.]:
        max_errs = {}
        for dt in dtlist:

            def sol(y0, t):
                # exact solution of y' = y**n with y(0) = y0
                return ((-1 + n)*(-t + y0**(1 - n)/(-1 + n)))**(1/(1 - n))

            y = ps.Field('y')
            rhs_dict = {y: y**n}

            import loopy as lp
            args = [lp.GlobalArg('y', shape=arr_shape, dtype=dtype)]
            y = clr.rand(queue, arr_shape, dtype=dtype) + 1.

            if is_low_storage:
                k_tmp = cla.zeros(queue, (1,)+arr_shape, dtype=dtype)
                y0 = y.copy()
                stepper = Stepper(rhs_dict, k_tmp=k_tmp, args=args, dt=dt, h=1,
                                  rank_shape=rank_shape)
            else:
                stepper = Stepper(rhs_dict, args=args, dt=dt, h=1,
                                  rank_shape=rank_shape)
                y0 = y[0].copy()

            t = 0
            errs = []
            while t < 1.:
                for s in range(stepper.num_stages):
                    stepper(s, queue=queue, y=y)
                t += dt

                # compare against the exact solution; the solved copy lives
                # at index 0 for non-low-storage steppers
                if is_low_storage:
                    errs.append(cla.max(clm.fabs(1. - sol(y0, t)/y)).get())
                else:
                    errs.append(cla.max(clm.fabs(1. - sol(y0, t)/y[0])).get())

            max_errs[dt] = max(errs)

        # expected_order computed once (was duplicated); debug print folded
        # into one informative line
        order = stepper.expected_order
        print("order =", order, "n =", n, max_errs)
        for a, b in zip(dtlist[:-1], dtlist[1:]):
            print(max_errs[a] / max_errs[b])

        rtol = dtlist[-1]**order if dtype == np.float64 else 1.e-1
        assert list(max_errs.values())[-1] < rtol, \
            "Stepper solution inaccurate for n=%f" % (n)

        # halving dt should shrink the error by roughly 2**order
        for a, b in zip(dtlist[:-1], dtlist[1:]):
            assert max_errs[a] / max_errs[b] > .9 * 2.**order, \
                "Stepper convergence failing for n=%f" % (n)


if __name__ == "__main__":
    for stepper in all_steppers:
        test_step(None, (1, 1, 1), np.float64, stepper)
import numpy as np
import pyopencl as cl
import pyopencl.array as cla
import pystella as ps
import pytest

from pyopencl.tools import (  # noqa
    pytest_generate_tests_for_pyopencl as pytest_generate_tests)


@pytest.mark.filterwarnings(
    "ignore::pyopencl.characterize.CLCharacterizationWarning")
@pytest.mark.filterwarnings("ignore::loopy.diagnostic.LoopyAdvisory")
@pytest.mark.filterwarnings("ignore::loopy.diagnostic.ParameterFinderWarning")
@pytest.mark.parametrize("h", [2])
@pytest.mark.parametrize("dtype", [np.float64])
def test_transfer(ctx_factory, grid_shape, proc_shape, h, dtype, timing=False):
    """Check multigrid restriction and interpolation operators against a
    smooth sine-wave test function, verifying their expected accuracy.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))
    mpi = ps.DomainDecomposition(proc_shape, h, rank_shape)
    # coarse grid at half the resolution in each dimension
    grid_shape_2 = tuple(Ni // 2 for Ni in grid_shape)
    rank_shape_2 = tuple(ni // 2 for ni in rank_shape)
    mpi2 = ps.DomainDecomposition(proc_shape, h, rank_shape_2)

    from pystella.multigrid import (Injection, FullWeighting,
                                    LinearInterpolation, CubicInterpolation)

    inject = Injection(h=h, dtype=dtype)
    full_weighting = FullWeighting(h=h, dtype=dtype)

    def relerr(a, b):
        return np.max(np.abs(a-b))

    for restrict in [inject, full_weighting]:
        f1h = cla.zeros(queue, tuple(ni + 2*h for ni in rank_shape), dtype)
        f2h = cla.zeros(queue, tuple(ni + 2*h for ni in rank_shape_2), dtype)

        kvec = 2 * np.pi * np.array([1, 1, 1]).astype(dtype)

        xvecs = np.meshgrid(np.linspace(0, 1, grid_shape[0]+1)[:-1],
                            np.linspace(0, 1, grid_shape[1]+1)[:-1],
                            np.linspace(0, 1, grid_shape[2]+1)[:-1], indexing='ij')

        phases = kvec[0] * xvecs[0] + kvec[1] * xvecs[1] + kvec[2] * xvecs[2]
        mpi.scatter_array(queue, np.sin(phases), f1h, root=0)
        mpi.share_halos(queue, f1h)

        restrict(queue, f1=f1h, f2=f2h)

        # compare fine-grid points shared with the coarse grid
        restrict_error = relerr(f1h.get()[h:-h:2, h:-h:2, h:-h:2],
                                f2h.get()[h:-h, h:-h, h:-h])

        # identity check with `is`: these are specific operator instances
        if restrict is inject:
            # injection copies coincident points, so it should be exact
            expected_error_bound = 1.e-15
        else:
            # full weighting is second-order accurate
            expected_error_bound = .05 / (grid_shape[0]/32)**2

        assert restrict_error < expected_error_bound, \
            "%s inaccurate for grid_shape=%s, h=%d, proc_shape=%s" \
            % ('restrict', str(grid_shape), h, str(proc_shape))

    linear_interp = LinearInterpolation(h=h, dtype=dtype)
    cubic_interp = CubicInterpolation(h=h, dtype=dtype)

    for interp in [linear_interp, cubic_interp]:
        kvec = 2 * np.pi * np.array([1, 1, 1]).astype(dtype)

        xvecs = np.meshgrid(np.linspace(0, 1, grid_shape_2[0]+1)[:-1],
                            np.linspace(0, 1, grid_shape_2[1]+1)[:-1],
                            np.linspace(0, 1, grid_shape_2[2]+1)[:-1],
                            indexing='ij')

        phases = kvec[0] * xvecs[0] + kvec[1] * xvecs[1] + kvec[2] * xvecs[2]
        mpi2.scatter_array(queue, np.sin(phases), f2h, root=0)
        mpi2.share_halos(queue, f2h)

        f1h_new = cla.zeros_like(f1h)
        interp(queue, f1=f1h_new, f2=f2h)
        mpi.share_halos(queue, f1h_new)

        interp_error = relerr(f1h_new.get(), f1h.get())

        if interp is cubic_interp:
            # cubic interpolation is fourth-order accurate
            expected_error_bound = .005 / (grid_shape[0]/32)**4
        else:
            # linear interpolation is second-order accurate
            expected_error_bound = .1 / (grid_shape[0]/32)**2

        assert interp_error < expected_error_bound, \
            "%s inaccurate for grid_shape=%s, h=%d, proc_shape=%s" \
            % ('interp', str(grid_shape), h, str(proc_shape))


if __name__ == "__main__":
    args = {'grid_shape': (128,)*3, 'proc_shape': (1,)*3,
            'dtype': np.float64, 'h': 2}
    from common import get_exec_arg_dict
    args.update(get_exec_arg_dict())
    test_transfer(None, **args, timing=True)