From f8b55c9b7e9d807a5f9a851bad8786aec6f8c045 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Fri, 23 May 2025 07:11:13 +0100 Subject: [PATCH] Include positions in VCF mapping --- python/CHANGELOG.rst | 9 +++++ python/tests/test_highlevel.py | 59 +++++++++++++++++++++++++++++++ python/tskit/_version.py | 2 +- python/tskit/trees.py | 63 ++++++++++++++++++++++++++++++---- python/tskit/vcf.py | 34 ++---------------- 5 files changed, 129 insertions(+), 38 deletions(-) diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 8cf0c594eb..5225fd0e30 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -1,3 +1,12 @@ +-------------------- +[0.6.5] - 2025-0X-XX +-------------------- + +**Features** + +- ``TreeSequence.map_to_vcf_model`` now also returns the transformed positions and + contig length. (:user:`benjeffery`, :pr:`XXXX`, :issue:`3173`) + -------------------- [0.6.4] - 2025-05-21 -------------------- diff --git a/python/tests/test_highlevel.py b/python/tests/test_highlevel.py index 0562a0f315..429d1ecc5f 100644 --- a/python/tests/test_highlevel.py +++ b/python/tests/test_highlevel.py @@ -5910,3 +5910,62 @@ def test_all_individuals_no_nodes(self): ts = tables.tree_sequence() result = ts.map_to_vcf_model() assert result.individuals_nodes.shape == (2, 0) + + def test_position_transform_default_and_custom(self): + tables = tskit.TableCollection(10.6) + tables.sites.add_row(position=1.3, ancestral_state="A") + tables.sites.add_row(position=5.7, ancestral_state="T") + tables.sites.add_row(position=9.9, ancestral_state="C") + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + ts = tables.tree_sequence() + + result = ts.map_to_vcf_model() + assert np.array_equal(result.transformed_positions, [1, 6, 10]) + assert result.contig_length == 11 + + def floor_transform(positions): + return np.floor(positions).astype(int) + + result = ts.map_to_vcf_model(position_transform=floor_transform) + assert np.array_equal(result.transformed_positions, [1, 5, 9]) + assert result.contig_length == 10 + + def test_legacy_position_transform(self): + # Test legacy transform with duplicate positions + tables = tskit.TableCollection(10.0) + tables.sites.add_row(position=1.4, ancestral_state="A") + tables.sites.add_row(position=1.6, ancestral_state="T") + tables.sites.add_row(position=1.7, ancestral_state="T") + tables.sites.add_row(position=3.2, ancestral_state="C") + tables.sites.add_row(position=3.8, ancestral_state="G") + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + ts = tables.tree_sequence() + + result = ts.map_to_vcf_model(position_transform="legacy") + assert np.array_equal(result.transformed_positions, [1, 2, 3, 4, 5]) + assert result.contig_length == 10 + + def test_position_transform_no_sites(self): + tables = tskit.TableCollection(5.5) + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + ts = tables.tree_sequence() + + result = ts.map_to_vcf_model() + assert result.transformed_positions.shape == (0,) + assert result.contig_length == 6 + + def test_invalid_position_transform_return_shape(self): + tables = tskit.TableCollection(10.0) + tables.sites.add_row(position=1.0, ancestral_state="A") + tables.sites.add_row(position=5.0, ancestral_state="T") + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + ts = tables.tree_sequence() + + def bad_transform(positions): + return np.array([1]) # Wrong length + + with pytest.raises( + ValueError, + match="Position transform must return an array of the same length", + ): + ts.map_to_vcf_model(position_transform=bad_transform) diff --git a/python/tskit/_version.py b/python/tskit/_version.py index d3785463da..9caefdbdab 100644 --- a/python/tskit/_version.py +++ b/python/tskit/_version.py @@ -1,4 +1,4 @@ # Definitive location for the version number. # During development, should be x.y.z.devN # For beta should be x.y.zbN -tskit_version = "0.6.4" +tskit_version = "0.6.5.dev0" diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 937978d78d..7354ff34fd 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -63,6 +63,8 @@ class VcfModelMapping: individuals_nodes: np.ndarray individuals_name: np.ndarray + transformed_positions: np.ndarray + contig_length: int class CoalescenceRecord(NamedTuple): @@ -10636,13 +10638,15 @@ def map_to_vcf_model( name_metadata_key=None, individual_names=None, include_non_sample_nodes=None, + position_transform=None, ): """ Maps the sample nodes in this tree sequence to a representation suitable for VCF output, using the individuals if present. - Creates a VcfModelMapping object that contains both the nodes-to-individual - mapping as a 2D array of (individuals, nodes) and the individual names. The + Creates a VcfModelMapping object that contains a nodes-to-individual + mapping as a 2D array of (individuals, nodes), the individual names and VCF + compatible site positions and contig length. The mapping is created by first checking if the tree sequence contains individuals. If it does, the mapping is created using the individuals in the tree sequence. By default only the sample nodes of the individuals are included in the mapping, @@ -10652,6 +10656,12 @@ def map_to_vcf_model( If no individuals are present, the mapping is created using only the sample nodes and the specified ploidy. + As the tskit data model allows non-integer positions, site positions and contig + length are transformed to integer values suitable for VCF output. The + transformation is done using the `position_transform` function, which must + return an integer numpy array the same dimension as the input. By default, + this is set to ``numpy.round()`` which will round values to the nearest integer. + If neither `name_metadata_key` nor `individual_names` is not specified, the individual names are set to "tsk_{individual_id}" for each individual. If no individuals are present, the individual names are set to "tsk_{i}" with @@ -10672,9 +10682,20 @@ def map_to_vcf_model( be specified simultaneously with name_metadata_key. :param bool include_non_sample_nodes: If True, include all nodes belonging to the individuals in the mapping. If False, only include sample nodes. - Deafults to False. - :return: A VcfModelMapping containing the node-to-individual mapping and - individual names. + Defaults to False. + :param position_transform: A callable that transforms the + site position values into integer valued coordinates suitable for + VCF. The function takes a single positional parameter x and must + return an integer numpy array the same dimension as x. By default, + this is set to ``numpy.round()`` which will round values to the + nearest integer. If the string "legacy" is provided here, the + pre 0.2.0 legacy behaviour of rounding values to the nearest integer + (starting from 1) and avoiding the output of identical positions + by incrementing is used. + See the :ref:`sec_export_vcf_modifying_coordinates` for examples + and more information. + :return: A VcfModelMapping containing the node-to-individual mapping, + individual names, transformed positions, and transformed contig length. :raises ValueError: If both name_metadata_key and individual_names are specified, if ploidy is specified when individuals are present, if an invalid individual ID is specified, if a specified individual has no nodes, or if the number of @@ -10752,7 +10773,37 @@ def map_to_vcf_model( "The number of individuals does not match the number of names" ) - return VcfModelMapping(individuals_nodes, individual_names) + def legacy_position_transform(positions): + """ + Transforms positions in the tree sequence into VCF coordinates under + the pre 0.2.0 legacy rule. + """ + last_pos = 0 + transformed = [] + for pos in positions: + pos = int(round(pos)) + if pos <= last_pos: + pos = last_pos + 1 + transformed.append(pos) + last_pos = pos + return transformed + + if position_transform is None: + position_transform = np.round + elif position_transform == "legacy": + position_transform = legacy_position_transform + transformed_positions = np.array( + position_transform(self.sites_position), dtype=int + ) + if transformed_positions.shape != (self.num_sites,): + raise ValueError( + "Position transform must return an array of the same length" + ) + contig_length = max(1, int(position_transform([self.sequence_length])[0])) + + return VcfModelMapping( + individuals_nodes, individual_names, transformed_positions, contig_length + ) ############################################ # diff --git a/python/tskit/vcf.py b/python/tskit/vcf.py index 502d397be0..c711058baf 100644 --- a/python/tskit/vcf.py +++ b/python/tskit/vcf.py @@ -28,22 +28,6 @@ from . import provenance -def legacy_position_transform(positions): - """ - Transforms positions in the tree sequence into VCF coordinates under - the pre 0.2.0 legacy rule. - """ - last_pos = 0 - transformed = [] - for pos in positions: - pos = int(round(pos)) - if pos <= last_pos: - pos = last_pos + 1 - transformed.append(pos) - last_pos = pos - return transformed - - class VcfWriter: """ Writes a VCF representation of the genotypes tree sequence to a @@ -74,6 +58,7 @@ def __init__( ploidy=ploidy, individual_names=individual_names, include_non_sample_nodes=include_non_sample_nodes, + position_transform=position_transform, ) # Remove individuals with zero ploidy as these cannot be # represented in VCF. @@ -96,21 +81,8 @@ def __init__( if node_id != -1: self.samples.append(node_id) - # Transform coordinates for VCF - if position_transform is None: - position_transform = np.round - elif position_transform == "legacy": - position_transform = legacy_position_transform - self.transformed_positions = np.array( - position_transform(tree_sequence.tables.sites.position), dtype=int - ) - if self.transformed_positions.shape != (tree_sequence.num_sites,): - raise ValueError( - "Position transform must return an array of the same length" - ) - self.contig_length = max( - 1, int(position_transform([tree_sequence.sequence_length])[0]) - ) + self.transformed_positions = vcf_model.transformed_positions + self.contig_length = vcf_model.contig_length if len(self.transformed_positions) > 0: # Arguably this should be last_pos + 1, but if we hit this # condition the coordinate systems are all muddled up anyway