Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions python/CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
----------------------
[0.4.2] - 2022-0X-XX
[0.5.0] - 2022-0X-XX
----------------------

**Changes**
Expand All @@ -10,14 +10,25 @@
- Make dumping of tables and tree sequences to disk a zero-copy operation.
(:user:`benjeffery`, :issue:`2111`, :pr:`2124`)

- Add ``return_variant_copies`` argument to ``TreeSequence.variants`` which, if False, reuses the
returned ``Variant`` object for improved performance. Defaults to True.
(:user:`benjeffery`, :issue:`605`, :pr:`2172`)

- ``tree.mrca`` now takes 2 or more arguments and gives the common ancestor of them all.
(:user:`savitakartik`, :issue:`1340`, :pr:`2121`)

**Breaking Changes**

- The JSON metadata codec now interprets the empty string as an empty object. This means
that applying a schema to an existing table will no longer necessitate modifying the
existing rows. (:user:`benjeffery`, :issue:`2064`, :pr:`2104`)
- ``tree.mrca`` now takes 2 or more arguments.
(:user:`savitakartik`, :issue:`1340`, :pr:`2121`)

- Remove the previously deprecated ``as_bytes`` argument to ``TreeSequence.variants``.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's super old, but it would be good to be quantitative about when it was deprecated (version or date, I guess)?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changing something like this does make this a major version bump, so I guess we should update our milestones accordingly (either 0.5 or 1.0 I guess)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should give some guidance on what people should do as well I suppose. If you do use as_bytes, how do you fix your code now?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's from the msprime days! 983d969

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, we don't need to worry too much then.

I guess the functionality is actually pretty handy though, so I've opened a new issue #2181. We should update this note to tell users to use this method instead. The to_macs method can then use this function too.

If you need genotypes in byte form this can be done following the code in the
``to_macs`` method on line ``5573`` of ``trees.py``.
This argument was initially deprecated more than 3 years ago when the code was part of
``msprime``.
(:user:`benjeffery`, :issue:`605`, :pr:`2172`)

----------------------
[0.4.1] - 2022-01-11
Expand Down
8 changes: 3 additions & 5 deletions python/_tskitmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -11402,18 +11402,16 @@ Variant_init(Variant *self, PyObject *args, PyObject *kwds)
}

static PyObject *
Variant_decode(Variant *self, PyObject *args, PyObject *kwds)
Variant_decode(Variant *self, PyObject *args)
{
int err;
PyObject *ret = NULL;
tsk_id_t site_id;
static char *kwlist[] = { "site", NULL };

if (Variant_check_state(self) != 0) {
goto out;
}
if (!PyArg_ParseTupleAndKeywords(
args, kwds, "O&", kwlist, &tsk_id_converter, &site_id)) {
if (!PyArg_ParseTuple(args, "O&", &tsk_id_converter, &site_id)) {
goto out;
}
err = tsk_variant_decode(self->variant, site_id, 0);
Expand Down Expand Up @@ -11534,7 +11532,7 @@ static PyGetSetDef Variant_getsetters[]
static PyMethodDef Variant_methods[] = {
{ .ml_name = "decode",
.ml_meth = (PyCFunction) Variant_decode,
.ml_flags = METH_VARARGS | METH_KEYWORDS,
.ml_flags = METH_VARARGS,
.ml_doc = "Sets the variant's genotypes to those of a given tree and site" },
{ .ml_name = "restricted_copy",
.ml_meth = (PyCFunction) Variant_restricted_copy,
Expand Down
25 changes: 1 addition & 24 deletions python/tests/test_genotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,29 +191,6 @@ def get_tree_sequence(self):
assert ts.get_num_mutations() > 10
return ts

def test_as_bytes(self):
ts = self.get_tree_sequence()
n = ts.get_sample_size()
m = ts.get_num_mutations()
A = np.zeros((m, n), dtype="u1")
B = np.zeros((m, n), dtype="u1")
for variant in ts.variants():
A[variant.index] = variant.genotypes
for variant in ts.variants(as_bytes=True):
assert isinstance(variant.genotypes, bytes)
B[variant.index] = np.frombuffer(variant.genotypes, np.uint8) - ord("0")
assert np.all(A == B)
bytes_variants = list(ts.variants(as_bytes=True))
for j, variant in enumerate(bytes_variants):
assert j == variant.index
row = np.frombuffer(variant.genotypes, np.uint8) - ord("0")
assert np.all(A[j] == row)

def test_as_bytes_fails(self):
ts = tsutil.insert_multichar_mutations(self.get_tree_sequence())
with pytest.raises(ValueError):
list(ts.variants(as_bytes=True))

def test_dtype(self):
ts = self.get_tree_sequence()
for var in ts.variants():
Expand Down Expand Up @@ -913,7 +890,7 @@ def test_simple_01_duplicate_alleles(self):
):
assert v2.alleles == alleles
assert v1.site == v2.site
g = v1.genotypes
g = np.array(v1.genotypes)
index = np.where(g == 1)
g[index] = 2
assert np.array_equal(g, v2.genotypes)
Expand Down
23 changes: 11 additions & 12 deletions python/tests/test_highlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -4196,18 +4196,6 @@ def get_instances(self, n):
return [tskit.Edgeset(left=j, right=j, parent=j, children=j) for j in range(n)]


class TestVariantContainer(SimpleContainersMixin):
def get_instances(self, n):
return [
tskit.Variant(
site=TestSiteContainer().get_instances(1)[0],
alleles=["A" * j, "T"],
genotypes=np.zeros(j, dtype=np.int8),
)
for j in range(n)
]


class TestContainersAppend:
def test_containers_append(self, ts_fixture):
"""
Expand Down Expand Up @@ -4262,3 +4250,14 @@ def test_macs(self):
assert len(col) == n
for j in range(n):
assert col[j] == haplotypes[j][site_id]

def test_macs_error(self):
    """
    Check that ``to_macs`` raises a ValueError when an allele is longer
    than a single letter.
    """
    # Smallest possible input: one sample node and one site carrying a
    # mutation whose derived state ("FOO") is three characters long.
    tc = tskit.TableCollection(1)
    tc.nodes.add_row(time=1, flags=tskit.NODE_IS_SAMPLE)
    tc.sites.add_row(position=0.5, ancestral_state="A")
    tc.mutations.add_row(node=0, site=0, derived_state="FOO")
    multichar_ts = tc.tree_sequence()
    expected_message = "macs output only supports single letter alleles"
    with pytest.raises(ValueError, match=expected_message):
        multichar_ts.to_macs()
71 changes: 36 additions & 35 deletions python/tests/test_topology.py
Original file line number Diff line number Diff line change
Expand Up @@ -1686,9 +1686,12 @@ def assert_haplotypes_equal(self, ts1, ts2):
assert h1 == h2

def assert_variants_equal(self, ts1, ts2):
v1 = list(ts1.variants(as_bytes=True))
v2 = list(ts2.variants(as_bytes=True))
assert v1 == v2
for v1, v2 in zip(
ts1.variants(copy=False),
ts2.variants(copy=False),
):
assert v1.alleles == v2.alleles
assert np.array_equal(v1.genotypes, v2.genotypes)

def check_num_samples(self, ts, x):
"""
Expand Down Expand Up @@ -2541,9 +2544,12 @@ def verify_permuted_nodes(self, ts):
assert ts.sequence_length == permuted.sequence_length
assert list(permuted.samples()) == samples
assert list(permuted.haplotypes()) == list(ts.haplotypes())
assert [v.genotypes for v in permuted.variants(as_bytes=True)] == [
v.genotypes for v in ts.variants(as_bytes=True)
]
for v1, v2 in zip(
permuted.variants(copy=False),
ts.variants(copy=False),
):
assert np.array_equal(v1.genotypes, v2.genotypes)

assert ts.num_trees == permuted.num_trees
j = 0
for t1, t2 in zip(ts.trees(), permuted.trees()):
Expand Down Expand Up @@ -3355,9 +3361,10 @@ def test_simplest_degenerate_case(self):
assert t.parent_dict == {}
assert sorted(t.roots) == [0, 1]
assert list(ts.haplotypes(isolated_as_missing=False)) == ["10", "01"]
assert [
v.genotypes for v in ts.variants(as_bytes=True, isolated_as_missing=False)
] == [b"10", b"01"]
assert np.array_equal(
np.stack([v.genotypes for v in ts.variants(isolated_as_missing=False)]),
[[1, 0], [0, 1]],
)
simplified = ts.simplify()
t1 = ts.dump_tables()
t2 = simplified.dump_tables()
Expand Down Expand Up @@ -3412,12 +3419,10 @@ def test_simplest_non_degenerate_case(self):
t = next(ts.trees())
assert t.parent_dict == {0: 4, 1: 4, 2: 5, 3: 5}
assert list(ts.haplotypes()) == ["1000", "0100", "0010", "0001"]
assert [v.genotypes for v in ts.variants(as_bytes=True)] == [
b"1000",
b"0100",
b"0010",
b"0001",
]
assert np.array_equal(
np.stack([v.genotypes for v in ts.variants()]),
[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]],
)
assert t.mrca(0, 1) == 4
assert t.mrca(0, 4) == 4
assert t.mrca(2, 3) == 5
Expand Down Expand Up @@ -3489,13 +3494,10 @@ def test_two_reducible_trees(self):
t = next(ts.trees())
assert t.parent_dict == {0: 4, 1: 5, 2: 7, 3: 7, 4: 6, 5: 6, 8: 7}
assert list(ts.haplotypes()) == ["10000", "01000", "00100", "00010"]
assert [v.genotypes for v in ts.variants(as_bytes=True)] == [
b"1000",
b"0100",
b"0010",
b"0001",
b"0000",
]
assert np.array_equal(
np.stack([v.genotypes for v in ts.variants()]),
[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 0, 0]],
)
assert t.mrca(0, 1) == 6
assert t.mrca(2, 3) == 7
assert t.mrca(2, 8) == 7
Expand All @@ -3508,14 +3510,11 @@ def test_two_reducible_trees(self):
assert ts_simplified.num_nodes == 6
assert ts_simplified.num_trees == 1
t = next(ts_simplified.trees())
# print(ts_simplified.tables)
assert list(ts_simplified.haplotypes()) == ["1000", "0100", "0010", "0001"]
assert [v.genotypes for v in ts_simplified.variants(as_bytes=True)] == [
b"1000",
b"0100",
b"0010",
b"0001",
]
assert np.array_equal(
np.stack([v.genotypes for v in ts_simplified.variants()]),
[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]],
)
# The site over the non-sample external node should have been discarded.
sites = list(t.sites())
assert sites[-1].position == 0.4
Expand Down Expand Up @@ -3618,15 +3617,17 @@ def test_mutations_over_roots(self):
t = next(ts.trees())
assert len(list(t.sites())) == 6
haplotypes = ["101100", "011100", "000011"]
variants = [b"100", b"010", b"110", b"110", b"001", b"001"]
variants = [[1, 0, 0], [0, 1, 0], [1, 1, 0], [1, 1, 0], [0, 0, 1], [0, 0, 1]]
assert list(ts.haplotypes()) == haplotypes
assert [v.genotypes for v in ts.variants(as_bytes=True)] == variants
assert np.array_equal(np.stack([v.genotypes for v in ts.variants()]), variants)
ts_simplified = ts.simplify(filter_sites=False)
assert list(ts_simplified.haplotypes(isolated_as_missing=False)) == haplotypes
assert variants == [
v.genotypes
for v in ts_simplified.variants(as_bytes=True, isolated_as_missing=False)
]
assert np.array_equal(
np.stack(
[v.genotypes for v in ts_simplified.variants(isolated_as_missing=False)]
),
variants,
)

def test_break_single_tree(self):
# Take a single largish tree from tskit, and remove the oldest record.
Expand Down
Loading