diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index 2b89dd38f8..3867ff73ee 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -6686,6 +6686,172 @@ test_treeseq_row_access_errors(void) tsk_table_collection_free(&tables); } +static void +test_treeseq_get_individuals_population_errors(void) +{ + int ret; + tsk_id_t ret_id; + tsk_table_collection_t tables; + tsk_treeseq_t ts; + tsk_id_t output[2]; + + ret = tsk_table_collection_init(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tables.sequence_length = 1; + + ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 0); + ret_id = tsk_individual_table_add_row( + &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 0); + ret_id = tsk_individual_table_add_row( + &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 1); + ret_id = tsk_node_table_add_row(&tables.nodes, 0, 1.25, 0, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 0); + ret_id = tsk_node_table_add_row(&tables.nodes, 0, 1.25, TSK_NULL, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 1); + ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret_id = tsk_treeseq_get_individuals_population(&ts, output); + CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH); + + tsk_treeseq_free(&ts); + tsk_table_collection_free(&tables); +} + +static void +test_treeseq_get_individuals_population(void) +{ + int ret; + tsk_id_t ret_id; + int j; + tsk_table_collection_t tables; + tsk_treeseq_t ts; + tsk_id_t output[4]; + + ret = tsk_table_collection_init(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tables.sequence_length = 1; + + for (j = 0; j < 2; j++) { + ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, (tsk_id_t) j); + } + for (j = 0; j < 4; j++) { + ret_id = tsk_individual_table_add_row( + &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, (tsk_id_t) j); + } + ret_id = tsk_node_table_add_row(&tables.nodes, 0, 1.25, 0, 1, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 0); + ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0.0, TSK_NULL, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 1); + ret_id = tsk_node_table_add_row(&tables.nodes, 0, 3.0, 1, 3, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 2); + ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0.0, TSK_NULL, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 3); + ret_id = tsk_node_table_add_row(&tables.nodes, 0, 1.25, 0, 1, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 4); + ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret = tsk_treeseq_get_individuals_population(&ts, output); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + CU_ASSERT_EQUAL_FATAL(output[0], TSK_NULL); + CU_ASSERT_EQUAL_FATAL(output[1], 0); + CU_ASSERT_EQUAL_FATAL(output[2], TSK_NULL); + CU_ASSERT_EQUAL_FATAL(output[3], 1); + + tsk_treeseq_free(&ts); + tsk_table_collection_free(&tables); +} + +static void +test_treeseq_get_individuals_time_errors(void) +{ + int ret; + tsk_id_t ret_id; + tsk_table_collection_t tables; + tsk_treeseq_t ts; + double output[2]; + + ret = tsk_table_collection_init(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tables.sequence_length = 1; + + ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 0); + ret_id = tsk_individual_table_add_row( + &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 0); + ret_id = tsk_individual_table_add_row( + &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 1); + ret_id = tsk_node_table_add_row(&tables.nodes, 0, 1.2, 0, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 0); + ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0.8, 0, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 1); + ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret = tsk_treeseq_get_individuals_time(&ts, output); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_TIME_MISMATCH); + + tsk_treeseq_free(&ts); + tsk_table_collection_free(&tables); +} + +static void +test_treeseq_get_individuals_time(void) +{ + int ret; + tsk_id_t ret_id; + int j; + tsk_table_collection_t tables; + tsk_treeseq_t ts; + double output[4]; + + ret = tsk_table_collection_init(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tables.sequence_length = 1; + + for (j = 0; j < 2; j++) { + ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, j); + } + for (j = 0; j < 4; j++) { + ret_id = tsk_individual_table_add_row( + &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, j); + } + ret_id = tsk_node_table_add_row(&tables.nodes, 0, 1.25, 0, 1, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 0); + ret_id = tsk_node_table_add_row(&tables.nodes, 0, 3.25, 0, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 1); + ret_id = tsk_node_table_add_row(&tables.nodes, 0, 3.0, 1, 3, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 2); + ret_id = tsk_node_table_add_row(&tables.nodes, 0, 3.25, 0, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 3); + ret_id = tsk_node_table_add_row(&tables.nodes, 0, 1.25, 0, 1, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 4); + ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret = tsk_treeseq_get_individuals_time(&ts, output); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + CU_ASSERT_EQUAL_FATAL(output[0], 3.25); + CU_ASSERT_EQUAL_FATAL(output[1], 1.25); + CU_ASSERT_FATAL(tsk_is_unknown_time(output[2])); + CU_ASSERT_EQUAL_FATAL(output[3], 3.0); + + tsk_treeseq_free(&ts); + tsk_table_collection_free(&tables); +} + static void test_tree_copy_flags(void) { @@ -7637,6 +7803,13 @@ main(int argc, char **argv) /* Misc */ { "test_tree_errors", test_tree_errors }, { "test_treeseq_row_access_errors", test_treeseq_row_access_errors }, + { "test_treeseq_get_individuals_population_errors", + test_treeseq_get_individuals_population_errors }, + { "test_treeseq_get_individuals_population", + test_treeseq_get_individuals_population }, + { "test_treeseq_get_individuals_time_errors", + test_treeseq_get_individuals_time_errors }, + { "test_treeseq_get_individuals_time", test_treeseq_get_individuals_time }, { "test_tree_copy_flags", test_tree_copy_flags }, { "test_genealogical_nearest_neighbours_errors", test_genealogical_nearest_neighbours_errors }, diff --git a/c/tskit/core.c b/c/tskit/core.c index 067dd49317..3949c0ba0b 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -571,6 +571,18 @@ tsk_strerror_internal(int err) ret = "Individuals cannot be their own ancestor. " "(TSK_ERR_INDIVIDUAL_PARENT_CYCLE)"; break; + + case TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH: + ret = "Individual populations cannot be returned " + "if an individual has nodes from more than one population. " + "(TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH)"; + break; + + case TSK_ERR_INDIVIDUAL_TIME_MISMATCH: + ret = "Individual times cannot be returned " + "if an individual has nodes from more than one time. " + "(TSK_ERR_INDIVIDUAL_TIME_MISMATCH)"; + break; } return ret; } diff --git a/c/tskit/core.h b/c/tskit/core.h index 5fb0b9f46a..bdf95e3d7f 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -812,6 +812,16 @@ An individual was its own parent. An individual was its own ancestor in a cycle of references. */ #define TSK_ERR_INDIVIDUAL_PARENT_CYCLE -1702 +/** +An individual had nodes from more than one population +(and only one was requested). +*/ +#define TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH -1703 +/** +An individual had nodes from more than one time +(and only one was requested). +*/ +#define TSK_ERR_INDIVIDUAL_TIME_MISMATCH -1704 /** @} */ // clang-format on diff --git a/c/tskit/trees.c b/c/tskit/trees.c index c1fa398b81..5a77363123 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -716,6 +716,68 @@ tsk_treeseq_has_reference_sequence(const tsk_treeseq_t *self) return tsk_table_collection_has_reference_sequence(self->tables); } +int +tsk_treeseq_get_individuals_population(const tsk_treeseq_t *self, tsk_id_t *output) +{ + int ret = 0; + tsk_size_t i, j; + tsk_individual_t ind; + tsk_id_t ind_pop; + const tsk_id_t *node_population = self->tables->nodes.population; + const tsk_size_t num_individuals = self->tables->individuals.num_rows; + + tsk_memset(output, TSK_NULL, num_individuals * sizeof(*output)); + + for (i = 0; i < num_individuals; i++) { + ret = tsk_treeseq_get_individual(self, (tsk_id_t) i, &ind); + tsk_bug_assert(ret == 0); + if (ind.nodes_length > 0) { + ind_pop = -2; + for (j = 0; j < ind.nodes_length; j++) { + if (ind_pop == -2) { + ind_pop = node_population[ind.nodes[j]]; + } else if (ind_pop != node_population[ind.nodes[j]]) { + ret = TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH; + goto out; + } + } + output[ind.id] = ind_pop; + } + } +out: + return ret; +} + +int +tsk_treeseq_get_individuals_time(const tsk_treeseq_t *self, double *output) +{ + int ret = 0; + tsk_size_t i, j; + tsk_individual_t ind; + double ind_time; + const double *node_time = self->tables->nodes.time; + const tsk_size_t num_individuals = self->tables->individuals.num_rows; + + for (i = 0; i < num_individuals; i++) { + ret = tsk_treeseq_get_individual(self, (tsk_id_t) i, &ind); + tsk_bug_assert(ret == 0); + /* the default is UNKNOWN_TIME, but nodes cannot have + * UNKNOWN _TIME so this is safe. */ + ind_time = TSK_UNKNOWN_TIME; + for (j = 0; j < ind.nodes_length; j++) { + if (j == 0) { + ind_time = node_time[ind.nodes[j]]; + } else if (ind_time != node_time[ind.nodes[j]]) { + ret = TSK_ERR_INDIVIDUAL_TIME_MISMATCH; + goto out; + } + } + output[ind.id] = ind_time; + } +out: + return ret; +} + /* Stats functions */ #define GET_2D_ROW(array, row_len, row) (array + (((size_t)(row_len)) * (size_t) row)) diff --git a/c/tskit/trees.h b/c/tskit/trees.h index c0bb804873..f15b4f0dd3 100644 --- a/c/tskit/trees.h +++ b/c/tskit/trees.h @@ -697,7 +697,7 @@ Returns the location of each node in the list of samples or @endrst @param self A pointer to a tsk_treeseq_t object. -@return Returns the pointer to the breakpoint array. +@return Returns the pointer to the array of sample indexes. */ const tsk_id_t *tsk_treeseq_get_sample_index_map(const tsk_treeseq_t *self); @@ -890,6 +890,9 @@ int tsk_treeseq_split_edges(const tsk_treeseq_t *self, double time, tsk_flags_t bool tsk_treeseq_has_reference_sequence(const tsk_treeseq_t *self); +int tsk_treeseq_get_individuals_population(const tsk_treeseq_t *self, tsk_id_t *output); +int tsk_treeseq_get_individuals_time(const tsk_treeseq_t *self, double *output); + int tsk_treeseq_kc_distance(const tsk_treeseq_t *self, const tsk_treeseq_t *other, double lambda_, double *result); diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 808ab519f2..498c37fe7f 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -26,6 +26,10 @@ edges at a specific time. (:user:`jeromekelleher`, :issue:`2276`, :pr:`2296`). +- Add the ``TreeSequence.individuals_time`` and ``TreeSequence.individuals_population`` + methods to return arrays of per-individual times and populations, respectively. + (:user:`petrelharp`, :issue:`1481`, :pr:`2298`). + **Breaking Changes** - The JSON metadata codec now interprets the empty string as an empty object. This means diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index f470b35364..bb5fc240a9 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -8308,6 +8308,69 @@ TreeSequence_get_samples(TreeSequence *self) return ret; } +static PyObject * +TreeSequence_get_individuals_population(TreeSequence *self) +{ + PyObject *ret = NULL; + PyArrayObject *ret_array = NULL; + npy_intp dim; + int err; + + if (TreeSequence_check_state(self) != 0) { + goto out; + } + + dim = tsk_treeseq_get_num_individuals(self->tree_sequence); + ret_array = (PyArrayObject *) PyArray_SimpleNew(1, &dim, NPY_INT32); + if (ret_array == NULL) { + goto out; + } + + err = tsk_treeseq_get_individuals_population( + self->tree_sequence, PyArray_DATA(ret_array)); + if (err != 0) { + handle_library_error(err); + goto out; + } + + ret = (PyObject *) ret_array; + ret_array = NULL; +out: + Py_XDECREF(ret_array); + return ret; +} + +static PyObject * +TreeSequence_get_individuals_time(TreeSequence *self) +{ + PyObject *ret = NULL; + PyArrayObject *ret_array = NULL; + npy_intp dim; + int err; + + if (TreeSequence_check_state(self) != 0) { + goto out; + } + + dim = tsk_treeseq_get_num_individuals(self->tree_sequence); + ret_array = (PyArrayObject *) PyArray_SimpleNew(1, &dim, NPY_FLOAT64); + if (ret_array == NULL) { + goto out; + } + + err = tsk_treeseq_get_individuals_time(self->tree_sequence, PyArray_DATA(ret_array)); + if (err != 0) { + handle_library_error(err); + goto out; + } + + ret = (PyObject *) ret_array; + ret_array = NULL; +out: + Py_XDECREF(ret_array); + return ret; +} + static PyObject * TreeSequence_genealogical_nearest_neighbours( TreeSequence *self, PyObject *args, PyObject *kwds) @@ -9560,6 +9623,14 @@ static PyMethodDef TreeSequence_methods[] = { .ml_meth = (PyCFunction) TreeSequence_get_samples, .ml_flags = METH_NOARGS, .ml_doc = "Returns the samples." }, + { .ml_name = "get_individuals_population", + .ml_meth = (PyCFunction) TreeSequence_get_individuals_population, + .ml_flags = METH_NOARGS, + .ml_doc = "Returns the vector of per-individual populations." }, + { .ml_name = "get_individuals_time", + .ml_meth = (PyCFunction) TreeSequence_get_individuals_time, + .ml_flags = METH_NOARGS, + .ml_doc = "Returns the vector of per-individual times." }, { .ml_name = "genealogical_nearest_neighbours", .ml_meth = (PyCFunction) TreeSequence_genealogical_nearest_neighbours, .ml_flags = METH_VARARGS | METH_KEYWORDS, diff --git a/python/tests/test_cli.py b/python/tests/test_cli.py index 2cf2a1abb5..cdf86ccb34 100644 --- a/python/tests/test_cli.py +++ b/python/tests/test_cli.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2018-2021 Tskit Developers +# Copyright (c) 2018-2022 Tskit Developers # Copyright (c) 2017 University of Oxford # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -358,7 +358,9 @@ def setUpClass(cls): record_migrations=True, ) assert ts.num_migrations > 0 - cls._tree_sequence = tsutil.insert_random_ploidy_individuals(ts) + cls._tree_sequence = tsutil.insert_random_ploidy_individuals( + ts, samples_only=True + ) fd, cls._tree_sequence_file = tempfile.mkstemp( prefix="tsk_cli", suffix=".trees" ) diff --git a/python/tests/test_highlevel.py b/python/tests/test_highlevel.py index b15c438eec..6cf60144e6 100644 --- a/python/tests/test_highlevel.py +++ b/python/tests/test_highlevel.py @@ -2448,6 +2448,122 @@ def test_tree_node_edges(self): assert edge.right >= tree.interval.right assert np.all(edge_visited) + def verify_individual_vectors(self, ts): + verify_times = np.repeat(np.nan, ts.num_individuals) + verify_populations = np.repeat(tskit.NULL, ts.num_individuals) + for ind in ts.individuals(): + if len(ind.nodes) > 0: + t = {ts.node(n).time for n in ind.nodes} + p = {ts.node(n).population for n in ind.nodes} + assert len(t) <= 1 + assert len(p) <= 1 + verify_times[ind.id] = t.pop() + verify_populations[ind.id] = p.pop() + + times = ts.individuals_time + populations = ts.individuals_population + assert np.array_equal(times, verify_times, equal_nan=True) + assert np.array_equal(populations, verify_populations, equal_nan=True) + times2 = ts.individuals_time + populations2 = ts.individuals_population + assert np.array_equal(times, times2, equal_nan=True) + assert np.array_equal(populations, populations2, equal_nan=True) + # check aliases also + times3 = ts.individual_times + populations3 = ts.individual_populations + assert np.array_equal(times, times3, equal_nan=True) + assert np.array_equal(populations, populations3, equal_nan=True) + + def test_individuals_population_errors(self): + t = tskit.TableCollection(sequence_length=1) + t.individuals.add_row() + t.individuals.add_row() + for j in range(2): + t.populations.add_row() + t.nodes.add_row(time=0, population=j, individual=0) + ts = t.tree_sequence() + with pytest.raises( + _tskit.LibraryError, match="TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH" + ): + _ = ts.individuals_population + # inconsistent but NULL populations are also an error + t.nodes.clear() + t.nodes.add_row(time=0, population=1, individual=0) + t.nodes.add_row(time=0, population=tskit.NULL, individual=0) + ts = t.tree_sequence() + with pytest.raises( + _tskit.LibraryError, match="TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH" + ): + _ = ts.individuals_population + t.nodes.clear() + t.nodes.add_row(time=0, population=tskit.NULL, individual=1) + t.nodes.add_row(time=0, population=0, individual=1) + ts = t.tree_sequence() + with pytest.raises( + _tskit.LibraryError, match="TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH" + ): + _ = ts.individuals_population + + def test_individuals_time_errors(self): + t = tskit.TableCollection(sequence_length=1) + t.individuals.add_row() + for j in range(2): + t.nodes.add_row(time=j, individual=0) + ts = t.tree_sequence() + with pytest.raises( + _tskit.LibraryError, match="TSK_ERR_INDIVIDUAL_TIME_MISMATCH" + ): + _ = ts.individuals_time + + @pytest.mark.parametrize("n", [1, 10]) + def test_individual_vectors(self, n): + d = msprime.Demography.island_model([10] * n, 0.1) + ts = msprime.sim_ancestry( + {pop.name: 10 for pop in d.populations}, + demography=d, + random_seed=100 + n, + model="dtwf", + ) + ts = tsutil.insert_random_consistent_individuals(ts, seed=100 + n) + assert ts.num_individuals > 10 + self.verify_individual_vectors(ts) + + def test_individuals_location_errors(self): + t = tskit.TableCollection(sequence_length=1) + t.individuals.add_row(location=[1.0, 2.0]) + t.individuals.add_row(location=[0.0]) + ts = t.tree_sequence() + with pytest.raises(ValueError, match="locations"): + _ = ts.individuals_location + + t.clear() + t.individuals.add_row(location=[1.0, 2.0]) + t.individuals.add_row(location=[]) + t.individuals.add_row(location=[1.0, 2.0]) + t.individuals.add_row(location=[]) + ts = t.tree_sequence() + with pytest.raises(ValueError, match="locations"): + _ = ts.individuals_location + + @pytest.mark.parametrize("nlocs", [0, 1, 4]) + @pytest.mark.parametrize("num_indivs", [0, 3]) + def test_individuals_location(self, nlocs, num_indivs): + t = tskit.TableCollection(sequence_length=1) + locs = np.array([j + np.arange(nlocs) for j in range(num_indivs)]) + if len(locs) == 0: + locs = locs.reshape((num_indivs, 0)) + for j in range(num_indivs): + t.individuals.add_row(location=locs[j]) + ts = t.tree_sequence() + ts_locs = ts.individuals_location + assert locs.shape == ts_locs.shape + assert np.array_equal(locs, ts_locs) + locs2 = ts.individuals_location + assert np.array_equal(ts_locs, locs2) + # test alias + locs3 = ts.individual_locations + assert np.array_equal(ts_locs, locs3) + class TestTreeSequenceMethodSignatures: ts = msprime.simulate(10, random_seed=1234) diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py index 9525d04a31..92bb5cfef0 100644 --- a/python/tests/test_tables.py +++ b/python/tests/test_tables.py @@ -4621,7 +4621,9 @@ def get_msprime_example(self, sample_size, T, seed): random_seed=seed, ) ts = tsutil.add_random_metadata(ts, seed) - ts = tsutil.insert_random_ploidy_individuals(ts, max_ploidy=1) + ts = tsutil.insert_random_ploidy_individuals( + ts, max_ploidy=1, samples_only=True + ) return ts def get_wf_example(self, N, T, seed): @@ -4631,7 +4633,9 @@ def get_wf_example(self, N, T, seed): ts = ts.simplify() ts = tsutil.jukes_cantor(ts, 1, 10, seed=seed) ts = tsutil.add_random_metadata(ts, seed) - ts = tsutil.insert_random_ploidy_individuals(ts, max_ploidy=2) + ts = tsutil.insert_random_ploidy_individuals( + ts, max_ploidy=2, samples_only=True + ) return ts def split_example(self, ts, T): diff --git a/python/tests/test_vcf.py b/python/tests/test_vcf.py index 4073574a3c..3d2dc2bc2f 100644 --- a/python/tests/test_vcf.py +++ b/python/tests/test_vcf.py @@ -200,7 +200,9 @@ class ExamplesMixin: def test_simple_infinite_sites_random_ploidy(self): ts = msprime.simulate(10, mutation_rate=1, random_seed=2) - ts = tsutil.insert_random_ploidy_individuals(ts, min_ploidy=1) + ts = tsutil.insert_random_ploidy_individuals( + ts, min_ploidy=1, samples_only=True + ) assert ts.num_sites > 2 self.verify(ts) @@ -227,7 +229,9 @@ def test_simple_infinite_sites_ploidy_2_even_samples(self): def test_simple_jukes_cantor_random_ploidy(self): ts = msprime.simulate(10, random_seed=2) ts = tsutil.jukes_cantor(ts, num_sites=10, mu=1, seed=2) - ts = tsutil.insert_random_ploidy_individuals(ts, min_ploidy=1) + ts = tsutil.insert_random_ploidy_individuals( + ts, min_ploidy=1, samples_only=True + ) self.verify(ts) def test_single_tree_multichar_mutations(self): diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py index cb59de95f7..f2929f33f9 100644 --- a/python/tests/tsutil.py +++ b/python/tests/tsutil.py @@ -212,30 +212,74 @@ def insert_multichar_mutations(ts, seed=1, max_len=10): def insert_random_ploidy_individuals( - ts, min_ploidy=0, max_ploidy=5, max_dimension=3, seed=1 + ts, min_ploidy=0, max_ploidy=5, max_dimension=3, samples_only=False, seed=1 ): """ - Takes random contiguous subsets of the samples an assigns them to individuals. + Takes random contiguous subsets of the samples and assigns them to individuals. Also creates random locations in variable dimensions in the unit interval, - and assigns random parents (including NULL parents). + and assigns random parents (including NULL parents). Note that resulting + individuals will often have nodes with inconsistent populations and/or time. """ rng = random.Random(seed) - samples = np.array(ts.samples(), dtype=int) + if samples_only: + node_ids = np.array(ts.samples(), dtype="int") + else: + node_ids = np.arange(ts.num_nodes) j = 0 tables = ts.dump_tables() tables.individuals.clear() individual = tables.nodes.individual[:] individual[:] = tskit.NULL ind_id = -1 - while j < len(samples): + while j < len(node_ids): ploidy = rng.randint(min_ploidy, max_ploidy) - nodes = samples[j : min(j + ploidy, len(samples))] + nodes = node_ids[j : min(j + ploidy, len(node_ids))] dimension = rng.randint(0, max_dimension) location = [rng.random() for _ in range(dimension)] parents = rng.sample(range(-1, 1 + ind_id), min(1 + ind_id, rng.randint(0, 3))) ind_id = tables.individuals.add_row(location=location, parents=parents) individual[nodes] = ind_id j += ploidy + j += rng.randint(0, 5) # skip a random number + tables.nodes.individual = individual + return tables.tree_sequence() + + +def insert_random_consistent_individuals( + ts, min_ploidy=0, max_ploidy=5, min_dimension=0, max_dimension=3, seed=1 +): + """ + Takes random subsets of nodes having the same time and population and + assigns them to individuals. Also creates random locations in variable + dimensions in the unit interval, and assigns random parents (including NULL + parents). + """ + rng = random.Random(seed) + tables = ts.dump_tables() + tables.individuals.clear() + individual = tables.nodes.individual[:] + individual[:] = tskit.NULL + ind_id = -1 + pops = np.arange(ts.num_populations) + for pop in pops: + n = tables.nodes.population == pop + times = np.unique(tables.nodes.time[n]) + for t in times: + nn = np.where(np.logical_and(n, tables.nodes.time == t))[0] + rng.shuffle(nn) + j = 0 + while j < len(nn): + ploidy = rng.randint(min_ploidy, max_ploidy) + nodes = nn[j : min(j + ploidy, len(nn))] + dimension = rng.randint(min_dimension, max_dimension) + location = [rng.random() for _ in range(dimension)] + parents = rng.sample( + range(-1, 1 + ind_id), min(1 + ind_id, rng.randint(0, 3)) + ) + ind_id = tables.individuals.add_row(location=location, parents=parents) + individual[nodes] = ind_id + j += ploidy + j += rng.randint(0, 2) # skip a random number tables.nodes.individual = individual return tables.tree_sequence() diff --git a/python/tskit/trees.py b/python/tskit/trees.py index f819049fba..2ced65cc05 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -2195,8 +2195,8 @@ def num_tracked_samples(self, u=None): def preorder(self, u=NULL): """ - Returns a numpy array of node ids in preorder - . If the node u + Returns a numpy array of node ids in `preorder + `_. If the node u is specified the traversal is rooted at this node (and it will be the first element in the returned array). Otherwise, all nodes reachable from the tree roots will be returned. See :ref:`tutorials:sec_analysing_trees_traversals` for @@ -2211,8 +2211,8 @@ def preorder(self, u=NULL): def postorder(self, u=NULL): """ - Returns a numpy array of node ids in postorder - . If the node u + Returns a numpy array of node ids in `postorder + `_. If the node u is specified the traversal is rooted at this node (and it will be the last element in the returned array). Otherwise, all nodes reachable from the tree roots will be returned. See :ref:`tutorials:sec_analysing_trees_traversals` for @@ -3774,6 +3774,9 @@ def __init__(self, ll_tree_sequence): self._table_metadata_schemas = self._TableMetadataSchemas( **metadata_schema_instances ) + self._individuals_time = None + self._individuals_population = None + self._individuals_location = None # Implement the pickle protocol for TreeSequence def __getstate__(self): @@ -5073,6 +5076,71 @@ def alignments(self, *, reference_sequence=None, missing_data_character=None): a[site_pos] = h yield a.tobytes().decode("ascii") + @property + def individuals_population(self): + """ + Returns the length-``num_individuals`` array containing, for each + individual, the ``population`` attribute of their nodes, or + ``tskit.NULL`` for individuals with no nodes. Errors if any individual + has nodes with inconsistent non-NULL populations. + """ + if self._individuals_population is None: + self._individuals_population = ( + self._ll_tree_sequence.get_individuals_population() + ) + return self._individuals_population + + @property + def individual_populations(self): + # Undocumented alias for individuals_population to avoid breaking + # pre-1.0 pyslim code + return self.individuals_population + + @property + def individuals_time(self): + """ + Returns the length-``num_individuals`` array containing, for each + individual, the ``time`` attribute of their nodes or ``np.nan`` for + individuals with no nodes. Errors if any individual has nodes with + inconsistent times. + """ + if self._individuals_time is None: + self._individuals_time = self._ll_tree_sequence.get_individuals_time() + return self._individuals_time + + @property + def individual_times(self): + # Undocumented alias for individuals_time to avoid breaking + # pre-1.0 pyslim code + return self.individuals_time + + @property + def individuals_location(self): + """ + Convenience method returning the ``num_individuals x n`` array + whose row k-th row contains the ``location`` property of the k-th + individual. The method only works if all individuals' locations + have the same length (which is ``n``), and errors otherwise. + """ + if self._individuals_location is None: + individuals = self.tables.individuals + n = 0 + lens = np.unique(np.diff(individuals.location_offset)) + if len(lens) > 1: + raise ValueError("Individual locations are not all the same length.") + if len(lens) > 0: + n = lens[0] + self._individuals_location = individuals.location.reshape( + (self.num_individuals, n) + ) + return self._individuals_location + + @property + def individual_locations(self): + # Undocumented alias for individuals_time to avoid breaking + # pre-1.0 pyslim code + return self.individuals_location + def individual(self, id_): """ Returns the :ref:`individual `