diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 02f32036dd..b0cab4826c 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -136,6 +136,52 @@ jobs: name: codecov-umbrella verbose: true + test-numpy1: + name: Numpy 1.x + runs-on: ubuntu-24.04 + defaults: + run: + shell: bash + steps: + - name: Cancel Previous Runs + uses: styfle/cancel-workflow-action@0.12.1 + with: + access_token: ${{ github.token }} + + - name: Checkout + uses: actions/checkout@v4.2.2 + + - name: Setup Python + uses: actions/setup-python@v5.4.0 + with: + python-version: '3.12' + + - name: Install dependencies + working-directory: python + run: | + pip install -r requirements/CI-complete/requirements.txt + pip install "numpy<2" + + - name: Build module + working-directory: python + run: | + python setup.py build_ext --inplace + + - name: Run tests with numpy 1.x + working-directory: python + run: | + python -m pytest -x --cov=tskit --cov-report=xml --cov-branch -n2 tests/test_lowlevel.py tests/test_highlevel.py + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v5.4.0 + with: + token: ${{ secrets.CODECOV_TOKEN }} + working-directory: python + fail_ci_if_error: false + flags: python-tests-numpy1 + name: codecov-numpy1 + verbose: true + msys2: runs-on: windows-latest strategy: diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 623469fc40..2e708228c6 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -19,6 +19,7 @@ - Add ``TreeSequence.mutations_edge`` which returns the edge ID for each mutation's edge. (:user:`benjeffery`, :pr:`3226`, :issue:`3189`) + **Bugfixes** - Fix bug in ``TreeSequence.pair_coalescence_counts`` when ``span_normalise=True`` @@ -30,6 +31,17 @@ - ``ltrim``, ``rtrim``, ``trim`` and ``shift`` raise an error if used on a tree sequence containing a reference sequence (:user:`hyanwong`, :pr:`3210`, :issue:`2091`) +- Add ``TreeSequence.sites_ancestral_state`` and ``TreeSequence.mutations_derived_state`` properties + to return the ancestral state of sites and derived state of mutations as NumPy arrays of + the new numpy 2.0 StringDType. + This requires numpy version 2 or greater, as such this is now the minimum version stated in tskit's + dependencies. If you try to use another python module that was compiled against numpy 1.X you may see + the error "A module that was compiled using NumPy 1.x cannot be run in NumPy 2.0.0 as it may crash.". + If no newer version of the module is avaliable you can still use it with tskit and numpy 1.X by + building tskit from source with numpy 1.X using ``pip install tskit --no-binary tskit``. However + any use of the new properties will result in a ``RuntimeError``. + (:user:`benjeffery`, :pr:`3228`, :issue:`2632`) + -------------------- [0.6.4] - 2025-05-21 -------------------- diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index b358ae91ab..cab445e5d0 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -23,16 +23,27 @@ * SOFTWARE. */ -#define PY_SSIZE_T_CLEAN -#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #define TSK_BUG_ASSERT_MESSAGE \ "Please open an issue on" \ " GitHub, ideally with a reproducible example." \ " (https://github.com/tskit-dev/tskit/issues)" +#define PY_SSIZE_T_CLEAN #include -#include +#include + +#if defined(NPY_2_0_API_VERSION) && NPY_API_VERSION >= NPY_2_0_API_VERSION +#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION +#undef NPY_FEATURE_VERSION +#define NPY_FEATURE_VERSION NPY_2_0_API_VERSION +#define HAVE_NUMPY_2 1 +#else +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#define HAVE_NUMPY_2 0 +#endif #include + +#include #include #include "kastore.h" @@ -10802,6 +10813,60 @@ TreeSequence_make_array(TreeSequence *self, tsk_size_t size, int dtype, void *da return make_owned_array((PyObject *) self, size, dtype, data); } +#if HAVE_NUMPY_2 +PyObject * +TreeSequence_decode_ragged_string_column( + TreeSequence *self, tsk_size_t num_rows, const char *data, const tsk_size_t *offset) +{ + PyObject *ret = NULL; + PyObject *array = NULL; + char *array_data = NULL; + npy_intp dims[1]; + tsk_size_t i; + int pack_result; + npy_string_allocator *allocator = NULL; + PyArray_StringDTypeObject *string_dtype + = (PyArray_StringDTypeObject *) PyArray_DescrFromType(NPY_VSTRING); + /* This can only fail if an invalid dtype is passed */ + assert(string_dtype != NULL); + + dims[0] = (npy_intp) num_rows; + array = PyArray_Zeros(1, dims, (PyArray_Descr *) string_dtype, 0); + if (array == NULL) { + goto out; + } + array_data = (char *) PyArray_DATA((PyArrayObject *) array); + allocator = NpyString_acquire_allocator(string_dtype); + for (i = 0; i < num_rows; i++) { + pack_result = NpyString_pack(allocator, + (npy_packed_static_string + *) (array_data + (i * ((PyArray_Descr *) string_dtype)->elsize)), + data + offset[i], offset[i + 1] - offset[i]); + if (pack_result < 0) { + PyErr_SetString(PyExc_MemoryError, "could not pack string."); + goto out; + } + } + /* Release the allocator before we call any other Python C API functions + * which may require the GIL. + */ + NpyString_release_allocator(allocator); + allocator = NULL; + + /* Clear the writeable flag to match other arrays semantics */ + PyArray_CLEARFLAGS((PyArrayObject *) array, NPY_ARRAY_WRITEABLE); + + ret = array; + array = NULL; +out: + if (allocator != NULL) { + NpyString_release_allocator(allocator); + } + Py_XDECREF(array); + return ret; +} +#endif + static PyObject * TreeSequence_get_individuals_flags(TreeSequence *self, void *closure) { @@ -11049,6 +11114,24 @@ TreeSequence_get_sites_position(TreeSequence *self, void *closure) return ret; } +#if HAVE_NUMPY_2 +static PyObject * +TreeSequence_get_sites_ancestral_state(TreeSequence *self, void *closure) +{ + PyObject *ret = NULL; + tsk_site_table_t sites; + + if (TreeSequence_check_state(self) != 0) { + goto out; + } + sites = self->tree_sequence->tables->sites; + ret = TreeSequence_decode_ragged_string_column( + self, sites.num_rows, sites.ancestral_state, sites.ancestral_state_offset); +out: + return ret; +} +#endif + static PyObject * TreeSequence_get_sites_metadata(TreeSequence *self, void *closure) { @@ -11141,6 +11224,24 @@ TreeSequence_get_mutations_time(TreeSequence *self, void *closure) return ret; } +#if HAVE_NUMPY_2 +static PyObject * +TreeSequence_get_mutations_derived_state(TreeSequence *self, void *closure) +{ + PyObject *ret = NULL; + tsk_mutation_table_t mutations; + + if (TreeSequence_check_state(self) != 0) { + goto out; + } + mutations = self->tree_sequence->tables->mutations; + ret = TreeSequence_decode_ragged_string_column(self, mutations.num_rows, + mutations.derived_state, mutations.derived_state_offset); +out: + return ret; +} +#endif + static PyObject * TreeSequence_get_mutations_metadata(TreeSequence *self, void *closure) { @@ -11719,6 +11820,11 @@ static PyGetSetDef TreeSequence_getsetters[] = { { .name = "sites_position", .get = (getter) TreeSequence_get_sites_position, .doc = "The site position array" }, +#if HAVE_NUMPY_2 + { .name = "sites_ancestral_state", + .get = (getter) TreeSequence_get_sites_ancestral_state, + .doc = "The site ancestral state array" }, +#endif { .name = "sites_metadata", .get = (getter) TreeSequence_get_sites_metadata, .doc = "The site metadata array" }, @@ -11737,6 +11843,11 @@ static PyGetSetDef TreeSequence_getsetters[] = { { .name = "mutations_time", .get = (getter) TreeSequence_get_mutations_time, .doc = "The mutation time array" }, +#if HAVE_NUMPY_2 + { .name = "mutations_derived_state", + .get = (getter) TreeSequence_get_mutations_derived_state, + .doc = "The mutation derived state array" }, +#endif { .name = "mutations_metadata", .get = (getter) TreeSequence_get_mutations_metadata, .doc = "The mutation metadata array" }, @@ -14606,11 +14717,24 @@ static struct PyModuleDef tskitmodule = { PyObject * PyInit__tskit(void) { - PyObject *module = PyModule_Create(&tskitmodule); - if (module == NULL) { + PyObject *module; + +#if HAVE_NUMPY_2 + if (PyArray_ImportNumPyAPI() < 0) { return NULL; } +#else import_array(); +#endif + + module = PyModule_Create(&tskitmodule); + if (module == NULL) { + return NULL; + } + + if (PyModule_AddIntConstant(module, "HAS_NUMPY_2", HAVE_NUMPY_2)) { + return NULL; + } if (register_lwt_class(module) != 0) { return NULL; diff --git a/python/pyproject.toml b/python/pyproject.toml index a24a4fb868..80b9cc1f78 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -43,7 +43,7 @@ keywords = [ requires-python = ">=3.9" dependencies = [ "jsonschema>=3.0.0", - "numpy>=1.23.5", + "numpy>=2", ] [project.urls] diff --git a/python/requirements/development.txt b/python/requirements/development.txt index 39653c87e7..2a6d27ec62 100644 --- a/python/requirements/development.txt +++ b/python/requirements/development.txt @@ -17,7 +17,6 @@ msprime>=1.0.0 networkx newick ninja -numpy packaging portion pre-commit diff --git a/python/requirements/development.yml b/python/requirements/development.yml index 6aefcbd22a..abe9e91cf2 100644 --- a/python/requirements/development.yml +++ b/python/requirements/development.yml @@ -22,7 +22,7 @@ dependencies: - msprime>=1.0.0 - networkx - ninja - - numpy<2 + - numpy - packaging - portion - pre-commit diff --git a/python/tests/test_highlevel.py b/python/tests/test_highlevel.py index d0e690dec0..1f476be574 100644 --- a/python/tests/test_highlevel.py +++ b/python/tests/test_highlevel.py @@ -5321,6 +5321,83 @@ def test_mixed_sample_status(self): assert_array_equal(result, expected) +class TestRaggedArrays: + @pytest.mark.skipif(not _tskit.HAS_NUMPY_2, reason="Requires NumPy 2.0 or higher") + @pytest.mark.parametrize("num_rows", [0, 1, 100]) + @pytest.mark.parametrize("column", ["ancestral_state", "derived_state"]) + def test_site_ancestral_state(self, num_rows, column): + tables = tskit.TableCollection(sequence_length=100) + rng = random.Random(42) + for i in range(num_rows): + state_length = rng.randint(0, 10) + state = "".join( + chr(rng.randint(0x1F300, 0x1F6FF)) for _ in range(state_length) + ) + if column == "ancestral_state": + tables.sites.add_row(position=i, ancestral_state=state) + elif column == "derived_state": + tables.nodes.add_row() + tables.sites.add_row(position=i, ancestral_state="A") + tables.mutations.add_row(site=i, node=0, derived_state=state) + ts = tables.tree_sequence() + a = getattr( + ts, + ( + "sites_ancestral_state" + if column == "ancestral_state" + else "mutations_derived_state" + ), + ) + assert isinstance(a, np.ndarray) + assert a.shape == (num_rows,) + assert a.dtype == np.dtype("T") + assert a.size == num_rows + + # Check that the value is cached + assert a is getattr( + ts, + ( + "sites_ancestral_state" + if column == "ancestral_state" + else "mutations_derived_state" + ), + ) + + for state, row in itertools.zip_longest( + a, ts.sites() if column == "ancestral_state" else ts.mutations() + ): + assert state == getattr(row, column) + + @pytest.mark.skipif(not _tskit.HAS_NUMPY_2, reason="Requires NumPy 2.0 or higher") + @pytest.mark.parametrize("ts", tsutil.get_example_tree_sequences()) + def test_equality_sites_ancestral_state(self, ts): + assert_array_equal( + ts.sites_ancestral_state, [site.ancestral_state for site in ts.sites()] + ) + + @pytest.mark.skipif(not _tskit.HAS_NUMPY_2, reason="Requires NumPy 2.0 or higher") + @pytest.mark.parametrize("ts", tsutil.get_example_tree_sequences()) + def test_equality_mutations_derived_state(self, ts): + assert_array_equal( + ts.mutations_derived_state, + [mutation.derived_state for mutation in ts.mutations()], + ) + + @pytest.mark.skipif(_tskit.HAS_NUMPY_2, reason="Test only on Numpy 1.X") + @pytest.mark.parametrize( + "column", ["sites_ancestral_state", "mutations_derived_state"] + ) + def test_ragged_array_not_supported(self, column): + tables = tskit.TableCollection(sequence_length=100) + ts = tables.tree_sequence() + + with pytest.raises( + RuntimeError, + match="requires numpy 2.0", + ): + getattr(ts, column) + + class TestSampleNodesByPloidy: @pytest.mark.parametrize( "n_samples,ploidy,expected", diff --git a/python/tests/test_lowlevel.py b/python/tests/test_lowlevel.py index ffdff495a2..9cc8206cb1 100644 --- a/python/tests/test_lowlevel.py +++ b/python/tests/test_lowlevel.py @@ -1922,11 +1922,11 @@ def test_array_read_only(self, name, ts_fixture): def test_array_properties(self, name, ts_fixture): ts_fixture = ts_fixture.ll_tree_sequence a = getattr(ts_fixture, name) - assert a.base == ts_fixture assert not a.flags.writeable assert a.flags.aligned assert a.flags.c_contiguous assert not a.flags.owndata + assert a.base == ts_fixture b = getattr(ts_fixture, name) assert a is not b assert np.all(a == b) @@ -1976,6 +1976,104 @@ def test_generated_columns(self, ts_fixture, name): a2[:] = 0 assert a3 is not a2 + @pytest.mark.skipif(not _tskit.HAS_NUMPY_2, reason="Requires NumPy 2.0+") + @pytest.mark.parametrize( + "string_array", ["sites_ancestral_state", "mutations_derived_state"] + ) + @pytest.mark.parametrize( + "str_lengths", + ["none", "all-0", "all-1", "all-2", "mixed", "very_long", "unicode"], + ) + def test_string_arrays(self, ts_fixture, str_lengths, string_array): + if str_lengths == "none": + ts = tskit.TableCollection(1.0).tree_sequence() + else: + if str_lengths == "all-1": + ts = ts_fixture + if string_array == "sites_ancestral_state": + assert ts.num_sites > 0 + assert {len(site.ancestral_state) for site in ts.sites()} == {1} + elif string_array == "mutations_derived_state": + assert ts.num_mutations > 0 + assert {len(mut.derived_state) for mut in ts.mutations()} == {1} + else: + tables = ts_fixture.dump_tables() + + str_map = { + "all-0": lambda i, item: "", + "all-2": lambda i, item: chr(ord("A") + (i % 26)) * 2, + "mixed": lambda i, item: chr(ord("A") + (i % 26)) * (i % 20), + "very_long": lambda i, item: "A" * 100_000_000 if i == 1 else "T", + "unicode": lambda i, item: "🧬" * (i + 1), + } + + if string_array == "sites_ancestral_state": + sites = tables.sites.copy() + tables.sites.clear() + get_ancestral_state = str_map[str_lengths] + for i, site in enumerate(sites): + tables.sites.append( + site.replace(ancestral_state=get_ancestral_state(i, site)) + ) + elif string_array == "mutations_derived_state": + mutations = tables.mutations.copy() + tables.mutations.clear() + get_derived_state = str_map[str_lengths] + for i, mutation in enumerate(mutations): + tables.mutations.append( + mutation.replace( + derived_state=get_derived_state(i, mutation) + ) + ) + + ts = tables.tree_sequence() + ll_ts = ts.ll_tree_sequence + + a = getattr(ll_ts, string_array) + + # Contents + if str_lengths == "none": + assert a.size == 0 + else: + if string_array == "sites_ancestral_state": + for site in ts.sites(): + assert a[site.id] == site.ancestral_state + elif string_array == "mutations_derived_state": + for mutation in ts.mutations(): + assert a[mutation.id] == mutation.derived_state + + # Read only + with pytest.raises(AttributeError, match="not writable"): + setattr(ll_ts, string_array, None) + with pytest.raises(AttributeError, match="not writable"): + delattr(ll_ts, string_array) + + with pytest.raises(ValueError, match="assignment destination"): + a[:] = 0 + with pytest.raises(ValueError, match="assignment destination"): + a[0] = 0 + + # Properties + assert a.dtype == np.dtypes.StringDType() + assert a.flags.aligned + assert a.flags.c_contiguous + b = getattr(ll_ts, string_array) + assert a is not b + assert np.all(a == b) + + # Lifetime + a1 = getattr(ll_ts, string_array) + a2 = a1.copy() + assert a1 is not a2 + del ll_ts + # Do some memory operations + a3 = np.ones(10**6) + assert np.all(a1 == a2) + del a1 + # Just do something to touch memory + a2[:] = 0 + assert a3 is not a2 + class StatsInterfaceMixin: """ diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 59cf5a26ae..e740181f81 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -4138,6 +4138,8 @@ def __init__(self, ll_tree_sequence): self._individuals_location = None self._individuals_nodes = None self._mutations_edge = None + self._sites_ancestral_state = None + self._mutations_derived_state = None # NOTE: when we've implemented read-only access via the underlying # tables we can replace these arrays with reference to the read-only # tables here (and remove the low-level boilerplate). @@ -5952,6 +5954,20 @@ def sites_position(self): """ return self._sites_position + @property + def sites_ancestral_state(self): + """ + The ``ancestral_state`` column in the + :ref:`sec_site_table_definition` as a numpy array (dtype=StringDtype). + """ + if not _tskit.HAS_NUMPY_2: + raise RuntimeError( + "The sites_ancestral_state property requires numpy 2.0 or later." + ) + if self._sites_ancestral_state is None: + self._sites_ancestral_state = self._ll_tree_sequence.sites_ancestral_state + return self._sites_ancestral_state + @property def sites_metadata(self): """ @@ -6009,6 +6025,22 @@ def mutations_time(self): """ return self._mutations_time + @property + def mutations_derived_state(self): + """ + Access to the ``derived_state`` column in the + :ref:`sec_mutation_table_definition` as a numpy array (dtype=StringDtype). + """ + if not _tskit.HAS_NUMPY_2: + raise RuntimeError( + "The mutations_derived_state property requires numpy 2.0 or later." + ) + if self._mutations_derived_state is None: + self._mutations_derived_state = ( + self._ll_tree_sequence.mutations_derived_state + ) + return self._mutations_derived_state + @property def mutations_metadata(self): """