tskit-dev · benjeffery · Jun 26, 2025 · Jun 19, 2025
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -136,6 +136,52 @@ jobs:
           name: codecov-umbrella
           verbose: true
 
+  test-numpy1:  # CI job that builds and tests the Python package with numpy<2 installed
+    name: Numpy 1.x
+    runs-on: ubuntu-24.04
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - name: Cancel Previous Runs
+        uses: styfle/cancel-workflow-action@0.12.1
+        with:
+          access_token: ${{ github.token }}
+
+      - name: Checkout
+        uses: actions/checkout@v4.2.2
+
+      - name: Setup Python
+        uses: actions/setup-python@v5.4.0
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies  # numpy<2 is installed after the requirements file, so it is the NumPy used below
+        working-directory: python
+        run: |
+          pip install -r requirements/CI-complete/requirements.txt
+          pip install "numpy<2"
+
+      - name: Build module  # the extension is compiled after numpy<2 has been installed
+        working-directory: python
+        run: |
+          python setup.py build_ext --inplace
+
+      - name: Run tests with numpy 1.x  # only test_lowlevel.py and test_highlevel.py are run in this job
+        working-directory: python
+        run: |
+          python -m pytest -x --cov=tskit --cov-report=xml --cov-branch -n2 tests/test_lowlevel.py tests/test_highlevel.py
+
+      - name: Upload coverage to Codecov  # reported under a separate flag/name for this job
+        uses: codecov/codecov-action@v5.4.0
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          working-directory: python
+          fail_ci_if_error: false
+          flags: python-tests-numpy1
+          name: codecov-numpy1
+          verbose: true
+
   msys2:
     runs-on: windows-latest
     strategy:

diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst
@@ -19,6 +19,7 @@
 - Add ``TreeSequence.mutations_edge`` which returns the edge ID for each mutation's
   edge. (:user:`benjeffery`, :pr:`3226`, :issue:`3189`)
 
+
 **Bugfixes**
 
 - Fix bug in ``TreeSequence.pair_coalescence_counts`` when ``span_normalise=True``
@@ -30,6 +31,17 @@
 - ``ltrim``, ``rtrim``, ``trim`` and ``shift`` raise an error if used on a tree sequence
   containing a reference sequence (:user:`hyanwong`, :pr:`3210`, :issue:`2091`)
 
+- Add ``TreeSequence.sites_ancestral_state`` and ``TreeSequence.mutations_derived_state`` properties
+  to return the ancestral state of sites and derived state of mutations as NumPy arrays of
+  the new NumPy 2.0 StringDType.
+  This requires NumPy version 2 or greater, so this is now the minimum version stated in tskit's
+  dependencies. If you try to use another Python module that was compiled against NumPy 1.X you may see
+  the error "A module that was compiled using NumPy 1.x cannot be run in NumPy 2.0.0 as it may crash.".
+  If no newer version of the module is available you can still use it with tskit and NumPy 1.X by
+  building tskit from source with NumPy 1.X using ``pip install tskit --no-binary tskit``. However,
+  any use of the new properties will result in a ``RuntimeError``.
+  (:user:`benjeffery`, :pr:`3228`, :issue:`2632`)
+
 --------------------
 [0.6.4] - 2025-05-21
 --------------------

diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c
@@ -23,16 +23,27 @@
  * SOFTWARE.
  */
 
-#define PY_SSIZE_T_CLEAN
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
 #define TSK_BUG_ASSERT_MESSAGE                                                          \
     "Please open an issue on"                                                           \
     " GitHub, ideally with a reproducible example."                                     \
     " (https://github.com/tskit-dev/tskit/issues)"
 
+#define PY_SSIZE_T_CLEAN
 #include <Python.h>
-#include <structmember.h>
+#include <numpy/numpyconfig.h>
+
+#if defined(NPY_2_0_API_VERSION) && NPY_API_VERSION >= NPY_2_0_API_VERSION
+#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION
+#undef NPY_FEATURE_VERSION
+#define NPY_FEATURE_VERSION NPY_2_0_API_VERSION
+#define HAVE_NUMPY_2 1
+#else
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#define HAVE_NUMPY_2 0
+#endif
 #include <numpy/arrayobject.h>
+
+#include <structmember.h>
 #include <float.h>
 
 #include "kastore.h"
@@ -10802,6 +10813,60 @@
     return make_owned_array((PyObject *) self, size, dtype, data);
 }
 
+#if HAVE_NUMPY_2
+/* Decode a ragged string column into a new read-only 1-D NumPy array of
+ * variable-width strings (NPY_VSTRING / StringDType).
+ *
+ * ``data`` holds the concatenated bytes of all rows and ``offset`` delimits
+ * them: row i is the ``offset[i + 1] - offset[i]`` bytes starting at
+ * ``data + offset[i]``, so ``offset`` is read up to index ``num_rows``.
+ *
+ * Returns a new reference, or NULL with a Python exception set on failure.
+ */
+PyObject *
+TreeSequence_decode_ragged_string_column(
+    TreeSequence *self, tsk_size_t num_rows, const char *data, const tsk_size_t *offset)
+{
+    PyObject *ret = NULL;
+    PyObject *array = NULL;
+    char *array_data = NULL;
+    npy_intp dims[1];
+    npy_intp elsize;
+    tsk_size_t i;
+    int pack_result = 0;
+    npy_string_allocator *allocator = NULL;
+    PyArray_StringDTypeObject *string_dtype
+        = (PyArray_StringDTypeObject *) PyArray_DescrFromType(NPY_VSTRING);
+
+    /* This should only fail if an invalid dtype is passed, but assert() is
+     * compiled out under NDEBUG, so check explicitly instead of risking a
+     * NULL dereference below. */
+    if (string_dtype == NULL) {
+        goto out;
+    }
+
+    dims[0] = (npy_intp) num_rows;
+    /* PyArray_Zeros steals the descriptor reference; on success the array
+     * keeps the descriptor alive for the remainder of this function. */
+    array = PyArray_Zeros(1, dims, (PyArray_Descr *) string_dtype, 0);
+    if (array == NULL) {
+        goto out;
+    }
+    array_data = (char *) PyArray_DATA((PyArrayObject *) array);
+    elsize = ((PyArray_Descr *) string_dtype)->elsize;
+
+    allocator = NpyString_acquire_allocator(string_dtype);
+    for (i = 0; i < num_rows; i++) {
+        pack_result = NpyString_pack(allocator,
+            (npy_packed_static_string *) (array_data + (i * elsize)),
+            data + offset[i], offset[i + 1] - offset[i]);
+        if (pack_result < 0) {
+            break;
+        }
+    }
+    /* Release the allocator before we call any other Python C API functions
+     * which may require the GIL. This includes setting the error below and
+     * dropping the array on the failure path.
+     */
+    NpyString_release_allocator(allocator);
+    allocator = NULL;
+
+    if (pack_result < 0) {
+        PyErr_SetString(PyExc_MemoryError, "could not pack string.");
+        goto out;
+    }
+
+    /* Clear the writeable flag to match other arrays semantics */
+    PyArray_CLEARFLAGS((PyArrayObject *) array, NPY_ARRAY_WRITEABLE);
+
+    ret = array;
+    array = NULL;
+out:
+    Py_XDECREF(array);
+    return ret;
+}
+#endif
+
 static PyObject *
 TreeSequence_get_individuals_flags(TreeSequence *self, void *closure)
 {
@@ -11049,6 +11114,24 @@
     return ret;
 }
 
+#if HAVE_NUMPY_2
+/* Getter for ``sites_ancestral_state``: decodes the site table's ragged
+ * ancestral_state column into a NumPy string array. Returns NULL if the
+ * state check or the decoding fails. */
+static PyObject *
+TreeSequence_get_sites_ancestral_state(TreeSequence *self, void *closure)
+{
+    const tsk_site_table_t *site_table;
+
+    if (TreeSequence_check_state(self) != 0) {
+        return NULL;
+    }
+    site_table = &self->tree_sequence->tables->sites;
+    return TreeSequence_decode_ragged_string_column(self, site_table->num_rows,
+        site_table->ancestral_state, site_table->ancestral_state_offset);
+}
+#endif
+
 static PyObject *
 TreeSequence_get_sites_metadata(TreeSequence *self, void *closure)
 {
@@ -11141,6 +11224,24 @@
     return ret;
 }
 
+#if HAVE_NUMPY_2
+/* Getter for ``mutations_derived_state``: decodes the mutation table's ragged
+ * derived_state column into a NumPy string array. Returns NULL if the state
+ * check or the decoding fails. */
+static PyObject *
+TreeSequence_get_mutations_derived_state(TreeSequence *self, void *closure)
+{
+    const tsk_mutation_table_t *mutation_table;
+
+    if (TreeSequence_check_state(self) != 0) {
+        return NULL;
+    }
+    mutation_table = &self->tree_sequence->tables->mutations;
+    return TreeSequence_decode_ragged_string_column(self, mutation_table->num_rows,
+        mutation_table->derived_state, mutation_table->derived_state_offset);
+}
+#endif
+
 static PyObject *
 TreeSequence_get_mutations_metadata(TreeSequence *self, void *closure)
 {
@@ -11719,6 +11820,11 @@
     { .name = "sites_position",
         .get = (getter) TreeSequence_get_sites_position,
         .doc = "The site position array" },
+#if HAVE_NUMPY_2
+    { .name = "sites_ancestral_state",
+        .get = (getter) TreeSequence_get_sites_ancestral_state,
+        .doc = "The site ancestral state array" },
+#endif
     { .name = "sites_metadata",
         .get = (getter) TreeSequence_get_sites_metadata,
         .doc = "The site metadata array" },
@@ -11737,6 +11843,11 @@
     { .name = "mutations_time",
         .get = (getter) TreeSequence_get_mutations_time,
         .doc = "The mutation time array" },
+#if HAVE_NUMPY_2
+    { .name = "mutations_derived_state",
+        .get = (getter) TreeSequence_get_mutations_derived_state,
+        .doc = "The mutation derived state array" },
+#endif
     { .name = "mutations_metadata",
         .get = (getter) TreeSequence_get_mutations_metadata,
         .doc = "The mutation metadata array" },
@@ -14606,11 +14717,24 @@
 PyObject *
 PyInit__tskit(void)
 {
-    PyObject *module = PyModule_Create(&tskitmodule);
-    if (module == NULL) {
+    PyObject *module;
+
+#if HAVE_NUMPY_2
+    if (PyArray_ImportNumPyAPI() < 0) {
         return NULL;
     }
+#else
     import_array();
+#endif
+
+    module = PyModule_Create(&tskitmodule);
+    if (module == NULL) {
+        return NULL;
+    }
+
+    if (PyModule_AddIntConstant(module, "HAS_NUMPY_2", HAVE_NUMPY_2)) {
+        return NULL;
+    }
 
     if (register_lwt_class(module) != 0) {
         return NULL;

diff --git a/python/pyproject.toml b/python/pyproject.toml
@@ -43,7 +43,7 @@ keywords = [
 requires-python = ">=3.9"
 dependencies = [
     "jsonschema>=3.0.0",
-    "numpy>=1.23.5",
+    "numpy>=2",
 ]
 
 [project.urls]

diff --git a/python/requirements/development.txt b/python/requirements/development.txt
@@ -17,7 +17,6 @@ msprime>=1.0.0
 networkx
 newick
 ninja
-numpy
 packaging
 portion
 pre-commit

diff --git a/python/requirements/development.yml b/python/requirements/development.yml
@@ -22,7 +22,7 @@ dependencies:
   - msprime>=1.0.0
   - networkx
   - ninja
-  - numpy<2
+  - numpy
   - packaging
   - portion
   - pre-commit

diff --git a/python/tests/test_highlevel.py b/python/tests/test_highlevel.py
@@ -5321,6 +5321,83 @@ def test_mixed_sample_status(self):
         assert_array_equal(result, expected)
 
 
+class TestRaggedArrays:
+    @pytest.mark.skipif(not _tskit.HAS_NUMPY_2, reason="Requires NumPy 2.0 or higher")
+    @pytest.mark.parametrize("num_rows", [0, 1, 100])
+    @pytest.mark.parametrize("column", ["ancestral_state", "derived_state"])
+    def test_site_ancestral_state(self, num_rows, column):
+        tables = tskit.TableCollection(sequence_length=100)
+        rng = random.Random(42)
+        for i in range(num_rows):
+            state_length = rng.randint(0, 10)
+            state = "".join(
+                chr(rng.randint(0x1F300, 0x1F6FF)) for _ in range(state_length)
+            )
+            if column == "ancestral_state":
+                tables.sites.add_row(position=i, ancestral_state=state)
+            elif column == "derived_state":
+                tables.nodes.add_row()
+                tables.sites.add_row(position=i, ancestral_state="A")
+                tables.mutations.add_row(site=i, node=0, derived_state=state)
+        ts = tables.tree_sequence()
+        a = getattr(
+            ts,
+            (
+                "sites_ancestral_state"
+                if column == "ancestral_state"
+                else "mutations_derived_state"
+            ),
+        )
+        assert isinstance(a, np.ndarray)
+        assert a.shape == (num_rows,)
+        assert a.dtype == np.dtype("T")
+        assert a.size == num_rows
+
+        # Check that the value is cached
+        assert a is getattr(
+            ts,
+            (
+                "sites_ancestral_state"
+                if column == "ancestral_state"
+                else "mutations_derived_state"
+            ),
+        )
+
+        for state, row in itertools.zip_longest(
+            a, ts.sites() if column == "ancestral_state" else ts.mutations()
+        ):
+            assert state == getattr(row, column)
+
+    @pytest.mark.skipif(not _tskit.HAS_NUMPY_2, reason="Requires NumPy 2.0 or higher")
+    @pytest.mark.parametrize("ts", tsutil.get_example_tree_sequences())
+    def test_equality_sites_ancestral_state(self, ts):
+        assert_array_equal(
+            ts.sites_ancestral_state, [site.ancestral_state for site in ts.sites()]
+        )
+
+    @pytest.mark.skipif(not _tskit.HAS_NUMPY_2, reason="Requires NumPy 2.0 or higher")
+    @pytest.mark.parametrize("ts", tsutil.get_example_tree_sequences())
+    def test_equality_mutations_derived_state(self, ts):
+        assert_array_equal(
+            ts.mutations_derived_state,
+            [mutation.derived_state for mutation in ts.mutations()],
+        )
+
+    @pytest.mark.skipif(_tskit.HAS_NUMPY_2, reason="Test only on Numpy 1.X")
+    @pytest.mark.parametrize(
+        "column", ["sites_ancestral_state", "mutations_derived_state"]
+    )
+    def test_ragged_array_not_supported(self, column):
+        tables = tskit.TableCollection(sequence_length=100)
+        ts = tables.tree_sequence()
+
+        with pytest.raises(
+            RuntimeError,
+            match="requires numpy 2.0",
+        ):
+            getattr(ts, column)
+
+
 class TestSampleNodesByPloidy:
     @pytest.mark.parametrize(
         "n_samples,ploidy,expected",
-Original file line number
+Diff line change
@@ Expand Up / @@ -17,7 +17,6 @@ msprime>=1.0.0 @@
     networkx
     newick
     ninja
-    numpy
     packaging
     portion
     pre-commit
@@ Expand Down @@