diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst index 20e00706df..dc4882d393 100644 --- a/c/CHANGELOG.rst +++ b/c/CHANGELOG.rst @@ -20,11 +20,14 @@ - Method ``tsk_individual_table_add_row`` has an extra arguments ``parents`` and ``parents_length``. -**Breaking changes** - - Add an ``options`` argument to ``tsk_table_collection_subset`` (:user:`petrelharp`, :pr:`1108`), to allow for retaining the order of populations. +**Changes** + +- Allow mutations that have the same derived state as their parent mutation. + (:user:`benjeffery`, :issue:`1180`, :pr:`1233`) + **Bugfixes** ---------------------- diff --git a/c/tests/test_genotypes.c b/c/tests/test_genotypes.c index c035b99ae2..050bc4c92b 100644 --- a/c/tests/test_genotypes.c +++ b/c/tests/test_genotypes.c @@ -864,7 +864,7 @@ test_single_tree_many_alleles(void) } static void -test_single_tree_inconsistent_mutations(void) +test_single_tree_silent_mutations_i16(void) { const char *sites = "0.0 0\n" "0.1 0\n" @@ -896,7 +896,7 @@ test_single_tree_inconsistent_mutations(void) ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); ret = tsk_vargen_next(&vargen, &var); - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INCONSISTENT_MUTATIONS); + CU_ASSERT_EQUAL_FATAL(ret, 1); tsk_vargen_free(&vargen); } } @@ -904,6 +904,95 @@ test_single_tree_inconsistent_mutations(void) tsk_treeseq_free(&ts); } +static void +test_single_tree_silent_mutations_i8(void) +{ + int ret = 0; + tsk_treeseq_t ts; + tsk_vargen_t vargen; + tsk_variant_t *var; + + /* Add some silent mutations */ + const char *silent_ex_sites = "0.125 0\n" + "0.25 0\n" + "0.5 0\n" + "0.75 0\n"; + /* site, node, derived_state, [parent, time] */ + const char *silent_ex_mutations + = "0 5 0 -1\n" /* Silent mutation over mutation 1 */ + "0 2 1 0\n" + "1 4 1 -1\n" + "1 0 0 2\n" /* Back mutation over 0 */ + "1 0 0 3\n" /* Silent mutation under back mutation */ + "2 0 1 -1\n" /* recurrent mutations over samples */ + "2 1 1 -1\n" + "2 2 1 -1\n" + "2 3 1 -1\n" + "3 0 0 -1\n" /* Single silent mutation at a site */ + ; + + tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, + silent_ex_sites, silent_ex_mutations, NULL, NULL, 0); + + ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_vargen_print_state(&vargen, _devnull); + + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); + CU_ASSERT_EQUAL(var->genotypes.i8[1], 0); + CU_ASSERT_EQUAL(var->genotypes.i8[2], 1); + CU_ASSERT_EQUAL(var->genotypes.i8[3], 0); + CU_ASSERT_EQUAL(var->num_alleles, 2); + CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); + CU_ASSERT_EQUAL(var->site->id, 0); + CU_ASSERT_EQUAL(var->site->mutations_length, 2); + + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); + CU_ASSERT_EQUAL(var->genotypes.i8[1], 1); + CU_ASSERT_EQUAL(var->genotypes.i8[2], 0); + CU_ASSERT_EQUAL(var->genotypes.i8[3], 0); + CU_ASSERT_EQUAL(var->num_alleles, 2); + CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); + CU_ASSERT_EQUAL(var->site->id, 1); + CU_ASSERT_EQUAL(var->site->mutations_length, 3); + + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_EQUAL(var->genotypes.i8[0], 1); + CU_ASSERT_EQUAL(var->genotypes.i8[1], 1); + CU_ASSERT_EQUAL(var->genotypes.i8[2], 1); + CU_ASSERT_EQUAL(var->genotypes.i8[3], 1); + CU_ASSERT_EQUAL(var->num_alleles, 2); + CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); + CU_ASSERT_EQUAL(var->site->id, 2); + CU_ASSERT_EQUAL(var->site->mutations_length, 4); + + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); + CU_ASSERT_EQUAL(var->genotypes.i8[1], 0); + CU_ASSERT_EQUAL(var->genotypes.i8[2], 0); + CU_ASSERT_EQUAL(var->genotypes.i8[3], 0); + CU_ASSERT_EQUAL(var->num_alleles, 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); + CU_ASSERT_EQUAL(var->site->id, 3); + CU_ASSERT_EQUAL(var->site->mutations_length, 1); + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret = tsk_vargen_free(&vargen); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_treeseq_free(&ts); +} + int main(int argc, char **argv) { @@ -922,8 +1011,10 @@ main(int argc, char **argv) { "test_single_tree_user_alleles_errors", test_single_tree_user_alleles_errors }, { "test_single_tree_subsample", test_single_tree_subsample }, { "test_single_tree_many_alleles", test_single_tree_many_alleles }, - { "test_single_tree_inconsistent_mutations", - test_single_tree_inconsistent_mutations }, + { "test_single_tree_silent_mutations_i16", + test_single_tree_silent_mutations_i16 }, + { "test_single_tree_silent_mutations_i8", test_single_tree_silent_mutations_i8 }, + { NULL, NULL }, }; diff --git a/c/tests/testlib.c b/c/tests/testlib.c index 96584fbb34..8efe8af516 100644 --- a/c/tests/testlib.c +++ b/c/tests/testlib.c @@ -42,6 +42,7 @@ const char *single_tree_ex_edges = "0 1 4 0,1\n" const char *single_tree_ex_sites = "0.125 0\n" "0.25 0\n" "0.5 0\n"; +/* site, node, derived_state, [parent, time] */ const char *single_tree_ex_mutations = "0 2 1 -1\n" "1 4 1 -1\n" diff --git a/c/tskit/core.c b/c/tskit/core.c index faa3487bac..08bc4e5a6d 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -284,9 +284,6 @@ tsk_strerror_internal(int err) case TSK_ERR_MUTATION_PARENT_INCONSISTENT: ret = "Mutation parent references form a loop."; break; - case TSK_ERR_INCONSISTENT_MUTATIONS: - ret = "Inconsistent mutations: state already equal to derived state"; - break; case TSK_ERR_UNSORTED_MUTATIONS: ret = "Mutations must be provided in non-decreasing site order and " "non-increasing time order within each site"; diff --git a/c/tskit/core.h b/c/tskit/core.h index 91a989eb8d..e26ee15490 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -247,13 +247,12 @@ not found in the file. #define TSK_ERR_MUTATION_PARENT_EQUAL -501 #define TSK_ERR_MUTATION_PARENT_AFTER_CHILD -502 #define TSK_ERR_MUTATION_PARENT_INCONSISTENT -503 -#define TSK_ERR_INCONSISTENT_MUTATIONS -504 -#define TSK_ERR_UNSORTED_MUTATIONS -505 -#define TSK_ERR_NON_SINGLE_CHAR_MUTATION -506 -#define TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE -507 -#define TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION -508 -#define TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE -509 -#define TSK_ERR_MUTATION_TIME_HAS_BOTH_KNOWN_AND_UNKNOWN -510 +#define TSK_ERR_UNSORTED_MUTATIONS -504 +#define TSK_ERR_NON_SINGLE_CHAR_MUTATION -505 +#define TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE -506 +#define TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION -507 +#define TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE -508 +#define TSK_ERR_MUTATION_TIME_HAS_BOTH_KNOWN_AND_UNKNOWN -509 /* Sample errors */ #define TSK_ERR_DUPLICATE_SAMPLE -600 diff --git a/c/tskit/genotypes.c b/c/tskit/genotypes.c index 12db3e1857..647641c4e3 100644 --- a/c/tskit/genotypes.c +++ b/c/tskit/genotypes.c @@ -321,10 +321,7 @@ tsk_vargen_update_genotypes_i8_sample_list( if (index != TSK_NULL) { stop = list_right[node]; while (true) { - if (genotypes[index] == (int8_t) derived) { - ret = TSK_ERR_INCONSISTENT_MUTATIONS; - goto out; - } + ret += genotypes[index] == TSK_MISSING_DATA; genotypes[index] = (int8_t) derived; if (index == stop) { @@ -333,7 +330,7 @@ tsk_vargen_update_genotypes_i8_sample_list( index = list_next[index]; } } -out: + return ret; } @@ -354,10 +351,7 @@ tsk_vargen_update_genotypes_i16_sample_list( if (index != TSK_NULL) { stop = list_right[node]; while (true) { - if (genotypes[index] == (int16_t) derived) { - ret = TSK_ERR_INCONSISTENT_MUTATIONS; - goto out; - } + ret += genotypes[index] == TSK_MISSING_DATA; genotypes[index] = (int16_t) derived; if (index == stop) { @@ -366,7 +360,7 @@ tsk_vargen_update_genotypes_i16_sample_list( index = list_next[index]; } } -out: + return ret; } @@ -422,13 +416,10 @@ tsk_vargen_visit_i8(tsk_vargen_t *self, tsk_id_t sample_index, tsk_id_t derived) tsk_bug_assert(derived < INT8_MAX); tsk_bug_assert(sample_index != -1); - if (genotypes[sample_index] == (int8_t) derived) { - ret = TSK_ERR_INCONSISTENT_MUTATIONS; - goto out; - } + ret = genotypes[sample_index] == TSK_MISSING_DATA; genotypes[sample_index] = (int8_t) derived; -out: + return ret; } @@ -440,13 +431,10 @@ tsk_vargen_visit_i16(tsk_vargen_t *self, tsk_id_t sample_index, tsk_id_t derived tsk_bug_assert(derived < INT16_MAX); tsk_bug_assert(sample_index != -1); - if (genotypes[sample_index] == (int16_t) derived) { - ret = TSK_ERR_INCONSISTENT_MUTATIONS; - goto out; - } + ret = genotypes[sample_index] == TSK_MISSING_DATA; genotypes[sample_index] = (int16_t) derived; -out: + return ret; } diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 93ec6b45fa..9f82b20273 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -32,6 +32,11 @@ population indexing and lossless node reordering with subset. (:user:`petrelharp`, :pr:`1097`) +**Changes** + +- Allow mutations that have the same derived state as their parent mutation. + (:user:`benjeffery`, :issue:`1180`, :pr:`1233`) + **Breaking changes** - tskit now requires Python 3.6 (:user:`benjeffery`, :pr:`xxxx`) diff --git a/python/tests/test_genotypes.py b/python/tests/test_genotypes.py index 1455a173af..8c6c702915 100644 --- a/python/tests/test_genotypes.py +++ b/python/tests/test_genotypes.py @@ -329,7 +329,7 @@ def test_recurrent_mutations_over_samples(self): assert variant.alleles == ("0", "1") assert np.all(variant.genotypes == np.ones(ts.sample_size)) - def test_recurrent_mutations_errors(self): + def test_silent_mutations(self): ts = self.get_tree_sequence() tree = next(ts.trees()) tables = ts.dump_tables() @@ -342,8 +342,7 @@ def test_recurrent_mutations_errors(self): tables.mutations.add_row(site=site, node=u, derived_state="1") tables.mutations.add_row(site=site, node=sample, derived_state="1") ts_new = tables.tree_sequence() - with pytest.raises(exceptions.LibraryError): - list(ts_new.variants()) + assert all([v.genotypes[sample] == 1 for v in ts_new.variants()]) def test_zero_samples(self): ts = self.get_tree_sequence() @@ -780,7 +779,7 @@ def test_recurrent_mutations_over_samples(self): for h in ts_new.haplotypes(): assert ones == h - def test_recurrent_mutations_errors(self): + def test_silent_mutations(self): ts = msprime.simulate(10, random_seed=2) tables = ts.dump_tables() tree = next(ts.trees()) @@ -791,9 +790,7 @@ def test_recurrent_mutations_errors(self): tables.mutations.add_row(site=site, node=u, derived_state="1") tables.mutations.add_row(site=site, node=tree.root, derived_state="1") ts_new = tables.tree_sequence() - with pytest.raises(exceptions.LibraryError): - list(ts_new.haplotypes()) - ts_new.haplotypes() + all(h == 1 for h in ts_new.haplotypes()) def test_back_mutations(self): base_ts = msprime.simulate(10, random_seed=2) diff --git a/python/tests/test_stats.py b/python/tests/test_stats.py index 2783e184d6..27b4b49a76 100644 --- a/python/tests/test_stats.py +++ b/python/tests/test_stats.py @@ -195,7 +195,7 @@ def test_tree_sequence_simulated_mutations(self): def set_partitions(collection): """ - Returns an ierator over all partitions of the specified set. + Returns an iterator over all partitions of the specified set. From https://stackoverflow.com/questions/19368375/set-partitions-in-python """ diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py index c309062f66..1ada3ac795 100644 --- a/python/tests/test_tables.py +++ b/python/tests/test_tables.py @@ -3871,6 +3871,7 @@ def get_wf_example(self, N, T, seed): ts = tsutil.insert_random_ploidy_individuals(ts, max_ploidy=2) return ts + @pytest.mark.skip("Fails due to #1225") def test_no_mutation_times(self): ts = self.get_wf_example(10, 4, seed=925) self.verify_subset_union(ts) diff --git a/python/tests/test_tree_stats.py b/python/tests/test_tree_stats.py index a4489a2c92..aa47c53c3e 100644 --- a/python/tests/test_tree_stats.py +++ b/python/tests/test_tree_stats.py @@ -773,6 +773,11 @@ def test_single_tree_jukes_cantor(self): ts = tsutil.jukes_cantor(ts, 20, 1, seed=10) self.verify(ts) + def test_single_tree_single_site_many_silent(self): + ts = msprime.simulate(6, random_seed=1) + ts = tsutil.jukes_cantor(ts, 1, 20, seed=10) + self.verify(ts) + def test_single_tree_multichar_mutations(self): ts = msprime.simulate(6, random_seed=1, mutation_rate=1) ts = tsutil.insert_multichar_mutations(ts) diff --git a/python/tests/test_utilities.py b/python/tests/test_utilities.py index fd7cbac078..4e6895768e 100644 --- a/python/tests/test_utilities.py +++ b/python/tests/test_utilities.py @@ -53,6 +53,18 @@ def test_n50_multiroot(self): ts = tsutil.jukes_cantor(ts, 5, 2, seed=2) self.verify(ts) + def test_silent_mutations(self): + ts = msprime.simulate(50, random_seed=1) + ts = tsutil.jukes_cantor(ts, 5, 2, seed=2) + num_silent = 0 + for m in ts.mutations(): + if ( + m.parent != -1 + and ts.mutation(m.parent).derived_state == m.derived_state + ): + num_silent += 1 + assert num_silent > 20 + class TestCaterpillarTree: """ diff --git a/python/tests/test_vcf.py b/python/tests/test_vcf.py index 5e5b1f1a2a..3dfd0a0d95 100644 --- a/python/tests/test_vcf.py +++ b/python/tests/test_vcf.py @@ -376,7 +376,10 @@ def verify_records(self, pyvcf_records, pysam_records): assert pyvcf_record.CHROM == pysam_record.chrom assert pyvcf_record.POS == pysam_record.pos assert pyvcf_record.ID == pysam_record.id - assert pyvcf_record.ALT == list(pysam_record.alts) + if pysam_record.alts: + assert pyvcf_record.ALT == list(pysam_record.alts) + else: + assert pyvcf_record.ALT == [] or pyvcf_record.ALT == [None] assert pyvcf_record.REF == pysam_record.ref assert pysam_record.filter[0].name == "PASS" assert pyvcf_record.FORMAT == "GT" @@ -489,7 +492,9 @@ def verify(self, ts): ): assert vcf_row.POS == np.round(variant.site.position) assert variant.alleles[0] == vcf_row.REF - assert list(variant.alleles[1:]) == vcf_row.ALT + assert list(variant.alleles[1:]) == [ + allele for allele in vcf_row.ALT if allele is not None + ] j = 0 for individual, sample in itertools.zip_longest( map(ts.individual, indivs), vcf_row.samples diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py index 37e09805f7..9c012c03f6 100644 --- a/python/tests/tsutil.py +++ b/python/tests/tsutil.py @@ -509,14 +509,14 @@ def generate_site_mutations( x = random.expovariate(mu) new_state = state while x < branch_length: - new_state = random.choice([s for s in states if s != state]) - if multiple_per_node and (state != new_state): + new_state = random.choice(states) + if multiple_per_node: mutation_table.add_row(site, u, new_state, parent) parent = mutation_table.num_rows - 1 state = new_state x += random.expovariate(mu) else: - if (not multiple_per_node) and (state != new_state): + if not multiple_per_node: mutation_table.add_row(site, u, new_state, parent) parent = mutation_table.num_rows - 1 state = new_state diff --git a/python/tskit/vcf.py b/python/tskit/vcf.py index 6733c24dac..578a73e5b6 100644 --- a/python/tskit/vcf.py +++ b/python/tskit/vcf.py @@ -188,7 +188,7 @@ def write(self, output): ) pos = self.transformed_positions[variant.index] ref = variant.alleles[0] - alt = ",".join(variant.alleles[1:]) + alt = ",".join(variant.alleles[1:]) if len(variant.alleles) > 1 else "." print( self.contig_id, pos,