diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 1207780a0e..99b31454f7 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -1,3 +1,13 @@ +---------------------- +[0.4.2] - 2022-0X-XX +---------------------- + +**Changes** + +- ``VcfWriter.write`` now prints the site ID of variants in the ID field of the output VCF files. + (:user:`roohy`, :issue:`2103`, :pr:`2107`) + + ---------------------- [0.4.1] - 2022-01-11 ---------------------- diff --git a/python/tests/test_vcf.py b/python/tests/test_vcf.py index 31a7ed2f3c..627826940f 100644 --- a/python/tests/test_vcf.py +++ b/python/tests/test_vcf.py @@ -127,11 +127,12 @@ def legacy_write_vcf(tree_sequence, output, ploidy, contig_id): print(file=output) for variant in tree_sequence.variants(): pos = positions[variant.index] + site_id = variant.site.id assert variant.num_alleles == 2 print( contig_id, pos, - ".", + site_id, variant.alleles[0], variant.alleles[1], ".", @@ -473,6 +474,13 @@ def test_bad_individuals(self): with pytest.raises(ValueError): ts.write_vcf(io.StringIO(), individuals=[1, 2, ts.num_individuals]) + def test_vcf_ids(self): + ts = msprime.simulate(2, mutation_rate=2, random_seed=1) + assert ts.num_sites > 0 + with ts_to_pyvcf(ts) as vcf_reader: + for vcf_record, site in zip(vcf_reader, ts.sites()): + assert int(vcf_record.ID) == site.id + class TestRoundTripIndividuals(ExamplesMixin): """ diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 0dd4320a4c..51c5f38d97 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -5236,6 +5236,11 @@ def write_vcf( check that the alleles result in a valid VCF---for example, it is possible to use the tab character as an allele, leading to a broken VCF. + The ID value in the output VCF file is the integer ID of the corresponding + :ref:`site ` (``site.id``). Subsequently, + These ID values can be utilized to match the contents of the VCF file + to the sites in the tree sequence object. + The ``position_transform`` argument provides a way to flexibly translate the genomic location of sites in tskit to the appropriate value in VCF. There are two fundamental differences in the way that tskit and VCF define diff --git a/python/tskit/vcf.py b/python/tskit/vcf.py index ee0bc5786e..a9792cfb12 100644 --- a/python/tskit/vcf.py +++ b/python/tskit/vcf.py @@ -187,12 +187,13 @@ def write(self, output): "on GitHub if this limitation affects you." ) pos = self.transformed_positions[variant.index] + site_id = variant.site.id ref = variant.alleles[0] alt = ",".join(variant.alleles[1:]) if len(variant.alleles) > 1 else "." print( self.contig_id, pos, - ".", + site_id, ref, alt, ".",