Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,52 @@ jobs:
name: codecov-umbrella
verbose: true

test-numpy1:
name: Numpy 1.x
runs-on: ubuntu-24.04
defaults:
run:
shell: bash
steps:
- name: Cancel Previous Runs
uses: styfle/cancel-workflow-action@0.12.1
with:
access_token: ${{ github.token }}

- name: Checkout
uses: actions/checkout@v4.2.2

- name: Setup Python
uses: actions/setup-python@v5.4.0
with:
python-version: '3.12'

- name: Install dependencies
working-directory: python
run: |
pip install -r requirements/CI-complete/requirements.txt
pip install "numpy<2"

- name: Build module
working-directory: python
run: |
python setup.py build_ext --inplace

- name: Run tests with numpy 1.x
working-directory: python
run: |
python -m pytest -x --cov=tskit --cov-report=xml --cov-branch -n2 tests/test_lowlevel.py tests/test_highlevel.py

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v5.4.0
with:
token: ${{ secrets.CODECOV_TOKEN }}
working-directory: python
fail_ci_if_error: false
flags: python-tests-numpy1
name: codecov-numpy1
verbose: true

msys2:
runs-on: windows-latest
strategy:
Expand Down
12 changes: 12 additions & 0 deletions python/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
- Add ``TreeSequence.mutations_edge`` which returns the edge ID for each mutation's
edge. (:user:`benjeffery`, :pr:`3226`, :issue:`3189`)


**Bugfixes**

- Fix bug in ``TreeSequence.pair_coalescence_counts`` when ``span_normalise=True``
Expand All @@ -30,6 +31,17 @@
- ``ltrim``, ``rtrim``, ``trim`` and ``shift`` raise an error if used on a tree sequence
containing a reference sequence (:user:`hyanwong`, :pr:`3210`, :issue:`2091`)

- Add ``TreeSequence.sites_ancestral_state`` and ``TreeSequence.mutations_derived_state`` properties
to return the ancestral state of sites and derived state of mutations as NumPy arrays of
the new numpy 2.0 StringDType.
This requires numpy version 2 or greater, as such this is now the minimum version stated in tskit's
dependencies. If you try to use another python module that was compiled against numpy 1.X you may see
the error "A module that was compiled using NumPy 1.x cannot be run in NumPy 2.0.0 as it may crash.".
If no newer version of the module is available you can still use it with tskit and numpy 1.X by
building tskit from source with numpy 1.X using ``pip install tskit --no-binary tskit``. However
any use of the new properties will result in a ``RuntimeError``.
(:user:`benjeffery`, :pr:`3228`, :issue:`2632`)

--------------------
[0.6.4] - 2025-05-21
--------------------
Expand Down
134 changes: 129 additions & 5 deletions python/_tskitmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,27 @@
* SOFTWARE.
*/

#define PY_SSIZE_T_CLEAN
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#define TSK_BUG_ASSERT_MESSAGE \
"Please open an issue on" \
" GitHub, ideally with a reproducible example." \
" (https://github.com/tskit-dev/tskit/issues)"

#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <structmember.h>
#include <numpy/numpyconfig.h>

#if defined(NPY_2_0_API_VERSION) && NPY_API_VERSION >= NPY_2_0_API_VERSION
#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION
#undef NPY_FEATURE_VERSION
#define NPY_FEATURE_VERSION NPY_2_0_API_VERSION
#define HAVE_NUMPY_2 1
#else
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#define HAVE_NUMPY_2 0
#endif
#include <numpy/arrayobject.h>

#include <structmember.h>
#include <float.h>

#include "kastore.h"
Expand Down Expand Up @@ -10802,6 +10813,60 @@
return make_owned_array((PyObject *) self, size, dtype, data);
}

#if HAVE_NUMPY_2
/*
 * Decode a ragged string column into a one-dimensional NumPy array of the
 * variable-width string dtype (NPY_VSTRING), with the writeable flag cleared.
 *
 * Row j consists of the bytes data[offset[j]] up to (but excluding)
 * data[offset[j + 1]], so offset is read at indexes 0..num_rows inclusive.
 * The bytes are passed to NpyString_pack unchanged.
 * NOTE(review): presumably the column holds UTF-8 text - confirm with callers.
 *
 * Returns a new reference on success and NULL on failure. The self argument
 * is not used.
 */
PyObject *
TreeSequence_decode_ragged_string_column(
    TreeSequence *self, tsk_size_t num_rows, const char *data, const tsk_size_t *offset)
{
    PyObject *ret = NULL;
    PyObject *array = NULL;
    PyArray_Descr *descr = NULL;
    PyArray_StringDTypeObject *string_dtype = NULL;
    npy_string_allocator *allocator = NULL;
    char *row = NULL;
    npy_intp dims[1];
    npy_intp itemsize;
    tsk_size_t j;
    int pack_result;

    /* Check the result at runtime: an assert() disappears under NDEBUG. */
    descr = PyArray_DescrFromType(NPY_VSTRING);
    if (descr == NULL) {
        goto out;
    }

    dims[0] = (npy_intp) num_rows;
    /* PyArray_Zeros steals the reference to descr, so we must not use or
     * release our pointer afterwards. */
    array = PyArray_Zeros(1, dims, descr, 0);
    descr = NULL;
    if (array == NULL) {
        goto out;
    }

    /* Take the descriptor and item size from the array itself (borrowed
     * reference, kept alive by the array) rather than from the stolen
     * pointer. */
    string_dtype = (PyArray_StringDTypeObject *) PyArray_DESCR((PyArrayObject *) array);
    itemsize = PyArray_ITEMSIZE((PyArrayObject *) array);
    row = (char *) PyArray_DATA((PyArrayObject *) array);

    allocator = NpyString_acquire_allocator(string_dtype);
    for (j = 0; j < num_rows; j++) {
        pack_result = NpyString_pack(allocator, (npy_packed_static_string *) row,
            data + offset[j], offset[j + 1] - offset[j]);
        if (pack_result < 0) {
            PyErr_SetString(PyExc_MemoryError, "could not pack string.");
            goto out;
        }
        row += itemsize;
    }
    /* Release the allocator before we call any other Python C API functions
     * which may require the GIL.
     */
    NpyString_release_allocator(allocator);
    allocator = NULL;

    /* Clear the writeable flag to match other arrays semantics */
    PyArray_CLEARFLAGS((PyArrayObject *) array, NPY_ARRAY_WRITEABLE);

    ret = array;
    array = NULL;
out:
    /* On the error path the allocator may still be held; release it before
     * dropping the array that owns the packed strings. */
    if (allocator != NULL) {
        NpyString_release_allocator(allocator);
    }
    Py_XDECREF(array);
    return ret;
}
#endif

static PyObject *
TreeSequence_get_individuals_flags(TreeSequence *self, void *closure)
{
Expand Down Expand Up @@ -11049,6 +11114,24 @@
return ret;
}

#if HAVE_NUMPY_2
/*
 * Getter for the "sites_ancestral_state" attribute. Decodes the site table's
 * ancestral_state / ancestral_state_offset columns with
 * TreeSequence_decode_ragged_string_column and returns the result.
 * Returns NULL if TreeSequence_check_state reports a non-zero status or if
 * decoding fails. The closure argument is not used.
 */
static PyObject *
TreeSequence_get_sites_ancestral_state(TreeSequence *self, void *closure)
{
    const tsk_site_table_t *site_table;

    if (TreeSequence_check_state(self) != 0) {
        return NULL;
    }
    /* Refer to the table in place; only three of its fields are needed. */
    site_table = &self->tree_sequence->tables->sites;
    return TreeSequence_decode_ragged_string_column(self, site_table->num_rows,
        site_table->ancestral_state, site_table->ancestral_state_offset);
}
#endif

static PyObject *
TreeSequence_get_sites_metadata(TreeSequence *self, void *closure)
{
Expand Down Expand Up @@ -11141,6 +11224,24 @@
return ret;
}

#if HAVE_NUMPY_2
/*
 * Getter for the "mutations_derived_state" attribute. Decodes the mutation
 * table's derived_state / derived_state_offset columns with
 * TreeSequence_decode_ragged_string_column and returns the result.
 * Returns NULL if TreeSequence_check_state reports a non-zero status or if
 * decoding fails. The closure argument is not used.
 */
static PyObject *
TreeSequence_get_mutations_derived_state(TreeSequence *self, void *closure)
{
    const tsk_mutation_table_t *mutation_table;

    if (TreeSequence_check_state(self) != 0) {
        return NULL;
    }
    /* Refer to the table in place; only three of its fields are needed. */
    mutation_table = &self->tree_sequence->tables->mutations;
    return TreeSequence_decode_ragged_string_column(self, mutation_table->num_rows,
        mutation_table->derived_state, mutation_table->derived_state_offset);
}
#endif

static PyObject *
TreeSequence_get_mutations_metadata(TreeSequence *self, void *closure)
{
Expand Down Expand Up @@ -11719,6 +11820,11 @@
{ .name = "sites_position",
.get = (getter) TreeSequence_get_sites_position,
.doc = "The site position array" },
#if HAVE_NUMPY_2
{ .name = "sites_ancestral_state",
.get = (getter) TreeSequence_get_sites_ancestral_state,
.doc = "The site ancestral state array" },
#endif
{ .name = "sites_metadata",
.get = (getter) TreeSequence_get_sites_metadata,
.doc = "The site metadata array" },
Expand All @@ -11737,6 +11843,11 @@
{ .name = "mutations_time",
.get = (getter) TreeSequence_get_mutations_time,
.doc = "The mutation time array" },
#if HAVE_NUMPY_2
{ .name = "mutations_derived_state",
.get = (getter) TreeSequence_get_mutations_derived_state,
.doc = "The mutation derived state array" },
#endif
{ .name = "mutations_metadata",
.get = (getter) TreeSequence_get_mutations_metadata,
.doc = "The mutation metadata array" },
Expand Down Expand Up @@ -14606,11 +14717,24 @@
PyObject *
PyInit__tskit(void)
{
PyObject *module = PyModule_Create(&tskitmodule);
if (module == NULL) {
PyObject *module;

#if HAVE_NUMPY_2
if (PyArray_ImportNumPyAPI() < 0) {
return NULL;
}
#else
import_array();
#endif

module = PyModule_Create(&tskitmodule);
if (module == NULL) {
return NULL;

Check warning on line 14732 in python/_tskitmodule.c

View check run for this annotation

Codecov / codecov/patch

python/_tskitmodule.c#L14732

Added line #L14732 was not covered by tests
}

if (PyModule_AddIntConstant(module, "HAS_NUMPY_2", HAVE_NUMPY_2)) {
return NULL;

Check warning on line 14736 in python/_tskitmodule.c

View check run for this annotation

Codecov / codecov/patch

python/_tskitmodule.c#L14736

Added line #L14736 was not covered by tests
}

if (register_lwt_class(module) != 0) {
return NULL;
Expand Down
2 changes: 1 addition & 1 deletion python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ keywords = [
requires-python = ">=3.9"
dependencies = [
"jsonschema>=3.0.0",
"numpy>=1.23.5",
"numpy>=2",
]

[project.urls]
Expand Down
1 change: 0 additions & 1 deletion python/requirements/development.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ msprime>=1.0.0
networkx
newick
ninja
numpy
packaging
portion
pre-commit
Expand Down
2 changes: 1 addition & 1 deletion python/requirements/development.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ dependencies:
- msprime>=1.0.0
- networkx
- ninja
- numpy<2
- numpy
- packaging
- portion
- pre-commit
Expand Down
77 changes: 77 additions & 0 deletions python/tests/test_highlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -5321,6 +5321,83 @@ def test_mixed_sample_status(self):
assert_array_equal(result, expected)


class TestRaggedArrays:
    """
    Tests for the ``sites_ancestral_state`` and ``mutations_derived_state``
    tree sequence attributes. Which tests run depends on the value of
    ``_tskit.HAS_NUMPY_2``.
    """

    @pytest.mark.skipif(not _tskit.HAS_NUMPY_2, reason="Requires NumPy 2.0 or higher")
    @pytest.mark.parametrize("num_rows", [0, 1, 100])
    @pytest.mark.parametrize("column", ["ancestral_state", "derived_state"])
    def test_site_ancestral_state(self, num_rows, column):
        # Despite its name, this test covers both columns via ``column``.
        use_sites = column == "ancestral_state"
        if use_sites:
            attr_name = "sites_ancestral_state"
        else:
            attr_name = "mutations_derived_state"

        tables = tskit.TableCollection(sequence_length=100)
        rng = random.Random(42)
        for position in range(num_rows):
            # Random-length states (possibly empty) drawn from a range of
            # non-ASCII code points.
            num_chars = rng.randint(0, 10)
            code_points = [rng.randint(0x1F300, 0x1F6FF) for _ in range(num_chars)]
            state = "".join(map(chr, code_points))
            if column == "ancestral_state":
                tables.sites.add_row(position=position, ancestral_state=state)
            elif column == "derived_state":
                tables.nodes.add_row()
                tables.sites.add_row(position=position, ancestral_state="A")
                tables.mutations.add_row(site=position, node=0, derived_state=state)
        ts = tables.tree_sequence()

        a = getattr(ts, attr_name)
        assert isinstance(a, np.ndarray)
        assert a.shape == (num_rows,)
        assert a.dtype == np.dtype("T")
        assert a.size == num_rows

        # A second access must hand back the very same object, i.e. the
        # value is cached.
        assert a is getattr(ts, attr_name)

        # zip_longest pads with None, so a length mismatch fails the comparison.
        rows = ts.sites() if use_sites else ts.mutations()
        for value, row in itertools.zip_longest(a, rows):
            assert value == getattr(row, column)

    @pytest.mark.skipif(not _tskit.HAS_NUMPY_2, reason="Requires NumPy 2.0 or higher")
    @pytest.mark.parametrize("ts", tsutil.get_example_tree_sequences())
    def test_equality_sites_ancestral_state(self, ts):
        """The array equals the per-site ``ancestral_state`` values, in order."""
        actual = ts.sites_ancestral_state
        expected = [site.ancestral_state for site in ts.sites()]
        assert_array_equal(actual, expected)

    @pytest.mark.skipif(not _tskit.HAS_NUMPY_2, reason="Requires NumPy 2.0 or higher")
    @pytest.mark.parametrize("ts", tsutil.get_example_tree_sequences())
    def test_equality_mutations_derived_state(self, ts):
        """The array equals the per-mutation ``derived_state`` values, in order."""
        actual = ts.mutations_derived_state
        expected = [mutation.derived_state for mutation in ts.mutations()]
        assert_array_equal(actual, expected)

    @pytest.mark.skipif(_tskit.HAS_NUMPY_2, reason="Test only on Numpy 1.X")
    @pytest.mark.parametrize(
        "column", ["sites_ancestral_state", "mutations_derived_state"]
    )
    def test_ragged_array_not_supported(self, column):
        """Without NumPy 2 support, accessing either attribute raises RuntimeError."""
        ts = tskit.TableCollection(sequence_length=100).tree_sequence()
        with pytest.raises(RuntimeError, match="requires numpy 2.0"):
            getattr(ts, column)


class TestSampleNodesByPloidy:
@pytest.mark.parametrize(
"n_samples,ploidy,expected",
Expand Down
Loading
Loading