Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions python/CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
--------------------
[0.6.5] - 2025-0X-XX
--------------------

**Features**

- ``TreeSequence.map_to_vcf_model`` now also returns the transformed positions and
contig length. (:user:`benjeffery`, :pr:`XXXX`, :issue:`3173`)

--------------------
[0.6.4] - 2025-05-21
--------------------
Expand Down
59 changes: 59 additions & 0 deletions python/tests/test_highlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -5910,3 +5910,62 @@ def test_all_individuals_no_nodes(self):
ts = tables.tree_sequence()
result = ts.map_to_vcf_model()
assert result.individuals_nodes.shape == (2, 0)

def test_position_transform_default_and_custom(self):
tables = tskit.TableCollection(10.6)
tables.sites.add_row(position=1.3, ancestral_state="A")
tables.sites.add_row(position=5.7, ancestral_state="T")
tables.sites.add_row(position=9.9, ancestral_state="C")
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
ts = tables.tree_sequence()

result = ts.map_to_vcf_model()
assert np.array_equal(result.transformed_positions, [1, 6, 10])
assert result.contig_length == 11

def floor_transform(positions):
return np.floor(positions).astype(int)

result = ts.map_to_vcf_model(position_transform=floor_transform)
assert np.array_equal(result.transformed_positions, [1, 5, 9])
assert result.contig_length == 10

def test_legacy_position_transform(self):
# Test legacy transform with duplicate positions
tables = tskit.TableCollection(10.0)
tables.sites.add_row(position=1.4, ancestral_state="A")
tables.sites.add_row(position=1.6, ancestral_state="T")
tables.sites.add_row(position=1.7, ancestral_state="T")
tables.sites.add_row(position=3.2, ancestral_state="C")
tables.sites.add_row(position=3.8, ancestral_state="G")
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
ts = tables.tree_sequence()

result = ts.map_to_vcf_model(position_transform="legacy")
assert np.array_equal(result.transformed_positions, [1, 2, 3, 4, 5])
assert result.contig_length == 10

def test_position_transform_no_sites(self):
tables = tskit.TableCollection(5.5)
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
ts = tables.tree_sequence()

result = ts.map_to_vcf_model()
assert result.transformed_positions.shape == (0,)
assert result.contig_length == 6

def test_invalid_position_transform_return_shape(self):
tables = tskit.TableCollection(10.0)
tables.sites.add_row(position=1.0, ancestral_state="A")
tables.sites.add_row(position=5.0, ancestral_state="T")
tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
ts = tables.tree_sequence()

def bad_transform(positions):
return np.array([1]) # Wrong length

with pytest.raises(
ValueError,
match="Position transform must return an array of the same length",
):
ts.map_to_vcf_model(position_transform=bad_transform)
2 changes: 1 addition & 1 deletion python/tskit/_version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Definitive location for the version number.
# During development, should be x.y.z.devN
# For beta should be x.y.zbN
tskit_version = "0.6.4"
tskit_version = "0.6.5.dev0"
63 changes: 57 additions & 6 deletions python/tskit/trees.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@
class VcfModelMapping:
individuals_nodes: np.ndarray
individuals_name: np.ndarray
transformed_positions: np.ndarray
contig_length: int


class CoalescenceRecord(NamedTuple):
Expand Down Expand Up @@ -10636,13 +10638,15 @@ def map_to_vcf_model(
name_metadata_key=None,
individual_names=None,
include_non_sample_nodes=None,
position_transform=None,
):
"""
Maps the sample nodes in this tree sequence to a representation suitable for
VCF output, using the individuals if present.

Creates a VcfModelMapping object that contains both the nodes-to-individual
mapping as a 2D array of (individuals, nodes) and the individual names. The
Creates a VcfModelMapping object that contains a nodes-to-individual
mapping as a 2D array of (individuals, nodes), the individual names and VCF
compatible site positions and contig length. The
mapping is created by first checking if the tree sequence contains individuals.
If it does, the mapping is created using the individuals in the tree sequence.
By default only the sample nodes of the individuals are included in the mapping,
Expand All @@ -10652,6 +10656,12 @@ def map_to_vcf_model(
If no individuals are present, the mapping is created using only the sample nodes
and the specified ploidy.

As the tskit data model allows non-integer positions, site positions and contig
length are transformed to integer values suitable for VCF output. The
transformation is done using the `position_transform` function, which must
return an integer numpy array the same dimension as the input. By default,
this is set to ``numpy.round()`` which will round values to the nearest integer.

If neither `name_metadata_key` nor `individual_names` is not specified, the
individual names are set to "tsk_{individual_id}" for each individual. If
no individuals are present, the individual names are set to "tsk_{i}" with
Expand All @@ -10672,9 +10682,20 @@ def map_to_vcf_model(
be specified simultaneously with name_metadata_key.
:param bool include_non_sample_nodes: If True, include all nodes belonging to
the individuals in the mapping. If False, only include sample nodes.
Deafults to False.
:return: A VcfModelMapping containing the node-to-individual mapping and
individual names.
Defaults to False.
:param position_transform: A callable that transforms the
site position values into integer valued coordinates suitable for
VCF. The function takes a single positional parameter x and must
return an integer numpy array the same dimension as x. By default,
this is set to ``numpy.round()`` which will round values to the
nearest integer. If the string "legacy" is provided here, the
pre 0.2.0 legacy behaviour of rounding values to the nearest integer
(starting from 1) and avoiding the output of identical positions
by incrementing is used.
See the :ref:`sec_export_vcf_modifying_coordinates` for examples
and more information.
:return: A VcfModelMapping containing the node-to-individual mapping,
individual names, transformed positions, and transformed contig length.
:raises ValueError: If both name_metadata_key and individual_names are specified,
if ploidy is specified when individuals are present, if an invalid individual
ID is specified, if a specified individual has no nodes, or if the number of
Expand Down Expand Up @@ -10752,7 +10773,37 @@ def map_to_vcf_model(
"The number of individuals does not match the number of names"
)

return VcfModelMapping(individuals_nodes, individual_names)
def legacy_position_transform(positions):
"""
Transforms positions in the tree sequence into VCF coordinates under
the pre 0.2.0 legacy rule.
"""
last_pos = 0
transformed = []
for pos in positions:
pos = int(round(pos))
if pos <= last_pos:
pos = last_pos + 1
transformed.append(pos)
last_pos = pos
return transformed

if position_transform is None:
position_transform = np.round
elif position_transform == "legacy":
position_transform = legacy_position_transform
transformed_positions = np.array(
position_transform(self.sites_position), dtype=int
)
if transformed_positions.shape != (self.num_sites,):
raise ValueError(
"Position transform must return an array of the same length"
)
contig_length = max(1, int(position_transform([self.sequence_length])[0]))

return VcfModelMapping(
individuals_nodes, individual_names, transformed_positions, contig_length
)

############################################
#
Expand Down
34 changes: 3 additions & 31 deletions python/tskit/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,22 +28,6 @@
from . import provenance


def legacy_position_transform(positions):
"""
Transforms positions in the tree sequence into VCF coordinates under
the pre 0.2.0 legacy rule.
"""
last_pos = 0
transformed = []
for pos in positions:
pos = int(round(pos))
if pos <= last_pos:
pos = last_pos + 1
transformed.append(pos)
last_pos = pos
return transformed


class VcfWriter:
"""
Writes a VCF representation of the genotypes tree sequence to a
Expand Down Expand Up @@ -74,6 +58,7 @@ def __init__(
ploidy=ploidy,
individual_names=individual_names,
include_non_sample_nodes=include_non_sample_nodes,
position_transform=position_transform,
)
# Remove individuals with zero ploidy as these cannot be
# represented in VCF.
Expand All @@ -96,21 +81,8 @@ def __init__(
if node_id != -1:
self.samples.append(node_id)

# Transform coordinates for VCF
if position_transform is None:
position_transform = np.round
elif position_transform == "legacy":
position_transform = legacy_position_transform
self.transformed_positions = np.array(
position_transform(tree_sequence.tables.sites.position), dtype=int
)
if self.transformed_positions.shape != (tree_sequence.num_sites,):
raise ValueError(
"Position transform must return an array of the same length"
)
self.contig_length = max(
1, int(position_transform([tree_sequence.sequence_length])[0])
)
self.transformed_positions = vcf_model.transformed_positions
self.contig_length = vcf_model.contig_length
if len(self.transformed_positions) > 0:
# Arguably this should be last_pos + 1, but if we hit this
# condition the coordinate systems are all muddled up anyway
Expand Down
Loading