From f0465e849889e3c401c68cf9993efcc38f53f26b Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 28 Nov 2019 20:03:39 +0000 Subject: [PATCH] User-specified allele mapping for genotypes. --- c/CHANGELOG.rst | 3 + c/dev-tools/dev-cli.c | 2 +- c/tests/test_genotypes.c | 278 +++++++++++++++++++++++++++++---- c/tests/test_trees.c | 8 +- c/tskit/core.c | 17 +- c/tskit/core.h | 11 +- c/tskit/genotypes.c | 166 ++++++++++++++++---- c/tskit/genotypes.h | 5 +- docs/python-api.rst | 2 + python/CHANGELOG.rst | 3 + python/_tskitmodule.c | 89 ++++++++++- python/tests/test_genotypes.py | 205 ++++++++++++++++++++++++ python/tests/test_lowlevel.py | 24 ++- python/tskit/__init__.py | 8 + python/tskit/trees.py | 76 ++++++--- 15 files changed, 783 insertions(+), 114 deletions(-) diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst index a4584e9207..9e5f54ca25 100644 --- a/c/CHANGELOG.rst +++ b/c/CHANGELOG.rst @@ -16,6 +16,9 @@ In development. of distinct alleles supported by 8 bit genotypes has therefore dropped from 255 to 127, with a similar reduction for 16 bit genotypes. +- Change the ``tsk_vargen_init`` method to take an extra parameter ``alleles``. + To keep the current behaviour, set this parameter to NULL. + **New features** - Add the ``TSK_KEEP_UNARY`` option to simplify (:user:`gtsambos`). See :issue:`1` diff --git a/c/dev-tools/dev-cli.c b/c/dev-tools/dev-cli.c index 1f89b21736..968b91fc37 100644 --- a/c/dev-tools/dev-cli.c +++ b/c/dev-tools/dev-cli.c @@ -62,7 +62,7 @@ print_variants(tsk_treeseq_t *ts) tsk_variant_t* var; printf("variants (%d) \n", (int) tsk_treeseq_get_num_sites(ts)); - ret = tsk_vargen_init(&vg, ts, NULL, 0, 0); + ret = tsk_vargen_init(&vg, ts, NULL, 0, NULL, 0); if (ret != 0) { fatal_library_error(ret, "tsk_vargen_alloc"); } diff --git a/c/tests/test_genotypes.c b/c/tests/test_genotypes.c index 3ae034ceba..060b1bb713 100644 --- a/c/tests/test_genotypes.c +++ b/c/tests/test_genotypes.c @@ -29,7 +29,7 @@ #include static void -test_simplest_vargen_missing_data(void) +test_simplest_missing_data(void) { const char *nodes = "1 0 0\n" @@ -45,7 +45,7 @@ test_simplest_vargen_missing_data(void) CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 1); - ret = tsk_vargen_init(&vargen, &ts, NULL, 0, 0); + ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); @@ -57,7 +57,7 @@ test_simplest_vargen_missing_data(void) CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); - ret = tsk_vargen_init(&vargen, &ts, NULL, 0, TSK_16_BIT_GENOTYPES); + ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, TSK_16_BIT_GENOTYPES); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); @@ -69,7 +69,7 @@ test_simplest_vargen_missing_data(void) CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); - ret = tsk_vargen_init(&vargen, &ts, NULL, 0, TSK_IMPUTE_MISSING_DATA); + ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, TSK_IMPUTE_MISSING_DATA); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); @@ -85,7 +85,172 @@ test_simplest_vargen_missing_data(void) } static void -test_single_tree_vargen_char_alphabet(void) +test_simplest_missing_data_user_alleles(void) +{ + const char *nodes = + "1 0 0\n" + "1 0 0\n"; + const char *sites = + "0.0 A\n"; + tsk_treeseq_t ts; + tsk_vargen_t vargen; + tsk_variant_t *var; + const char *alleles[] = {"A", NULL}; + int ret; + + tsk_treeseq_from_text(&ts, 1, nodes, "", NULL, sites, NULL, NULL, NULL); + CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 2); + CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 1); + + ret = tsk_vargen_init(&vargen, &ts, NULL, 0, alleles, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_EQUAL(var->site->position, 0.0); + CU_ASSERT_TRUE(var->has_missing_data); + CU_ASSERT_EQUAL(var->genotypes.i8[0], TSK_MISSING_DATA); + CU_ASSERT_EQUAL(var->genotypes.i8[1], TSK_MISSING_DATA); + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_vargen_free(&vargen); + + ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, TSK_16_BIT_GENOTYPES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_EQUAL(var->site->position, 0.0); + CU_ASSERT_TRUE(var->has_missing_data); + CU_ASSERT_EQUAL(var->genotypes.i16[0], TSK_MISSING_DATA); + CU_ASSERT_EQUAL(var->genotypes.i16[1], TSK_MISSING_DATA); + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_vargen_free(&vargen); + + ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, TSK_IMPUTE_MISSING_DATA); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_EQUAL(var->site->position, 0.0); + CU_ASSERT_FALSE(var->has_missing_data); + CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); + CU_ASSERT_EQUAL(var->genotypes.i8[1], 0); + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_vargen_free(&vargen); + + tsk_treeseq_free(&ts); +} + +static void +test_single_tree_user_alleles(void) +{ + int ret = 0; + const char *sites = + "0.0 G\n" + "0.125 A\n" + "0.25 C\n" + "0.5 A\n"; + const char *mutations = + "0 0 T -1\n" + "1 1 C -1\n" + "2 0 G -1\n" + "2 1 A -1\n" + "2 2 T -1\n" // A bunch of different sample mutations + "3 4 T -1\n" + "3 0 A 5\n"; // A back mutation from T -> A + tsk_treeseq_t ts; + tsk_vargen_t vargen; + tsk_variant_t *var; + const char *alleles[] = {"A", "C", "G", "T", NULL}; + + tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, + sites, mutations, NULL, NULL); + ret = tsk_vargen_init(&vargen, &ts, NULL, 0, alleles, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + tsk_vargen_print_state(&vargen, _devnull); + + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_EQUAL(var->site->position, 0.0); + CU_ASSERT_EQUAL_FATAL(var->num_alleles, 4); + CU_ASSERT_EQUAL(var->allele_lengths[0], 1); + CU_ASSERT_EQUAL(var->allele_lengths[1], 1); + CU_ASSERT_EQUAL(var->allele_lengths[2], 1); + CU_ASSERT_EQUAL(var->allele_lengths[3], 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "A", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "C", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[2], "G", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[3], "T", 1); + CU_ASSERT_FALSE(var->has_missing_data); + CU_ASSERT_EQUAL(var->genotypes.i8[0], 3); + CU_ASSERT_EQUAL(var->genotypes.i8[1], 2); + CU_ASSERT_EQUAL(var->genotypes.i8[2], 2); + CU_ASSERT_EQUAL(var->genotypes.i8[3], 2); + + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_EQUAL(var->site->position, 0.125); + CU_ASSERT_EQUAL(var->num_alleles, 4); + CU_ASSERT_EQUAL(var->allele_lengths[0], 1); + CU_ASSERT_EQUAL(var->allele_lengths[1], 1); + CU_ASSERT_EQUAL(var->allele_lengths[2], 1); + CU_ASSERT_EQUAL(var->allele_lengths[3], 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "A", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "C", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[2], "G", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[3], "T", 1); + CU_ASSERT_FALSE(var->has_missing_data); + CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); + CU_ASSERT_EQUAL(var->genotypes.i8[1], 1); + CU_ASSERT_EQUAL(var->genotypes.i8[2], 0); + CU_ASSERT_EQUAL(var->genotypes.i8[3], 0); + + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_EQUAL(var->site->position, 0.25); + CU_ASSERT_EQUAL(var->num_alleles, 4); + CU_ASSERT_EQUAL(var->allele_lengths[0], 1); + CU_ASSERT_EQUAL(var->allele_lengths[1], 1); + CU_ASSERT_EQUAL(var->allele_lengths[2], 1); + CU_ASSERT_EQUAL(var->allele_lengths[3], 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "A", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "C", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[2], "G", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[3], "T", 1); + CU_ASSERT_FALSE(var->has_missing_data); + CU_ASSERT_EQUAL(var->genotypes.i8[0], 2); + CU_ASSERT_EQUAL(var->genotypes.i8[1], 0); + CU_ASSERT_EQUAL(var->genotypes.i8[2], 3); + CU_ASSERT_EQUAL(var->genotypes.i8[3], 1); + + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_EQUAL(var->site->position, 0.5); + CU_ASSERT_EQUAL(var->num_alleles, 4); + CU_ASSERT_EQUAL(var->allele_lengths[0], 1); + CU_ASSERT_EQUAL(var->allele_lengths[1], 1); + CU_ASSERT_EQUAL(var->allele_lengths[2], 1); + CU_ASSERT_EQUAL(var->allele_lengths[3], 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "A", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "C", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[2], "G", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[3], "T", 1); + CU_ASSERT_FALSE(var->has_missing_data); + CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); + CU_ASSERT_EQUAL(var->genotypes.i8[1], 3); + CU_ASSERT_EQUAL(var->genotypes.i8[2], 0); + CU_ASSERT_EQUAL(var->genotypes.i8[3], 0); + + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + tsk_vargen_free(&vargen); + tsk_treeseq_free(&ts); +} + +static void +test_single_tree_char_alphabet(void) { int ret = 0; const char *sites = @@ -107,7 +272,7 @@ test_single_tree_vargen_char_alphabet(void) tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, sites, mutations, NULL, NULL); - ret = tsk_vargen_init(&vargen, &ts, NULL, 0, 0); + ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); @@ -178,7 +343,7 @@ test_single_tree_vargen_char_alphabet(void) } static void -test_single_tree_vargen_binary_alphabet(void) +test_single_tree_binary_alphabet(void) { int ret = 0; tsk_treeseq_t ts; @@ -187,7 +352,7 @@ test_single_tree_vargen_binary_alphabet(void) tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL); - ret = tsk_vargen_init(&vargen, &ts, NULL, 0, 0); + ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_print_state(&vargen, _devnull); @@ -236,7 +401,7 @@ test_single_tree_vargen_binary_alphabet(void) } static void -test_single_tree_vargen_non_samples(void) +test_single_tree_non_samples(void) { int ret = 0; tsk_treeseq_t ts; @@ -248,11 +413,11 @@ test_single_tree_vargen_non_samples(void) tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL); /* It's an error to hand in non-samples without imputation turned on */ - ret = tsk_vargen_init(&vargen, &ts, samples, 2, 0); + ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUST_IMPUTE_NON_SAMPLES); tsk_vargen_free(&vargen); - ret = tsk_vargen_init(&vargen, &ts, samples, 2, TSK_IMPUTE_MISSING_DATA); + ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, TSK_IMPUTE_MISSING_DATA); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_print_state(&vargen, _devnull); @@ -294,10 +459,8 @@ test_single_tree_vargen_non_samples(void) tsk_treeseq_free(&ts); } - - static void -test_single_tree_vargen_errors(void) +test_single_tree_errors(void) { int ret; tsk_treeseq_t ts; @@ -306,22 +469,22 @@ test_single_tree_vargen_errors(void) tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL); - ret = tsk_vargen_init(&vargen, &ts, samples, 2, 0); + ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); samples[0] = -1; - ret = tsk_vargen_init(&vargen, &ts, samples, 2, 0); + ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_OUT_OF_BOUNDS); tsk_vargen_free(&vargen); samples[0] = 7; - ret = tsk_vargen_init(&vargen, &ts, samples, 2, 0); + ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_OUT_OF_BOUNDS); tsk_vargen_free(&vargen); samples[0] = 3; - ret = tsk_vargen_init(&vargen, &ts, samples, 2, 0); + ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE); tsk_vargen_free(&vargen); @@ -329,7 +492,57 @@ test_single_tree_vargen_errors(void) } static void -test_single_tree_vargen_subsample(void) +test_single_tree_user_alleles_errors(void) +{ + int ret; + tsk_treeseq_t ts; + tsk_vargen_t vargen; + tsk_variant_t *var; + int j; + /* The maximium number of alleles is 127. We need space for one more plus the + * sentinel */ + const int max_alleles = 129; + const char * acct_alleles[] = {"A", "C", "G", "T", NULL}; + const char * zero_allele[] = {"0", NULL}; + const char * no_alleles[] = {NULL}; + const char * many_alleles[max_alleles]; + tsk_id_t samples[] = {0, 3}; + + tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, + single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL); + + /* these are 0/1 alleles */ + ret = tsk_vargen_init(&vargen, &ts, samples, 2, acct_alleles, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_ALLELE_NOT_FOUND); + tsk_vargen_free(&vargen); + + /* pass just the 0 allele alleles at all */ + ret = tsk_vargen_init(&vargen, &ts, samples, 2, zero_allele, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_ALLELE_NOT_FOUND); + tsk_vargen_free(&vargen); + + /* Empty allele list is an error */ + ret = tsk_vargen_init(&vargen, &ts, samples, 2, no_alleles, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_ZERO_ALLELES); + tsk_vargen_free(&vargen); + + for (j = 0; j < max_alleles; j++) { + many_alleles[j] = "0"; + } + many_alleles[128] = NULL; + ret = tsk_vargen_init(&vargen, &ts, samples, 2, many_alleles, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TOO_MANY_ALLELES); + tsk_vargen_free(&vargen); + + tsk_treeseq_free(&ts); +} + +static void +test_single_tree_subsample(void) { int ret = 0; tsk_treeseq_t ts; @@ -339,7 +552,7 @@ test_single_tree_vargen_subsample(void) tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL); - ret = tsk_vargen_init(&vargen, &ts, samples, 2, 0); + ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_print_state(&vargen, _devnull); @@ -380,7 +593,7 @@ test_single_tree_vargen_subsample(void) CU_ASSERT_EQUAL_FATAL(ret, 0); /* Zero samples */ - ret = tsk_vargen_init(&vargen, &ts, samples, 0, 0); + ret = tsk_vargen_init(&vargen, &ts, samples, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_print_state(&vargen, _devnull); @@ -418,7 +631,7 @@ test_single_tree_vargen_subsample(void) } static void -test_single_tree_vargen_many_alleles(void) +test_single_tree_many_alleles(void) { int ret = 0; tsk_treeseq_t ts; @@ -452,7 +665,7 @@ test_single_tree_vargen_many_alleles(void) if (l == 1) { options = TSK_16_BIT_GENOTYPES; } - ret = tsk_vargen_init(&vargen, &ts, NULL, 0, options); + ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, options); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_print_state(&vargen, _devnull); ret = tsk_vargen_next(&vargen, &var); @@ -504,7 +717,7 @@ test_single_tree_inconsistent_mutations(void) for (s = 0; s < 2; s++) { for (f = 0; f < sizeof(options) / sizeof(*options); f++) { - ret = tsk_vargen_init(&vargen, &ts, samples[s], num_samples, options[f]); + ret = tsk_vargen_init(&vargen, &ts, samples[s], num_samples, NULL, options[f]); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); @@ -523,13 +736,16 @@ int main(int argc, char **argv) { CU_TestInfo tests[] = { - {"test_simplest_vargen_missing_data", test_simplest_vargen_missing_data}, - {"test_single_tree_vargen_char_alphabet", test_single_tree_vargen_char_alphabet}, - {"test_single_tree_vargen_binary_alphabet", test_single_tree_vargen_binary_alphabet}, - {"test_single_tree_vargen_non_samples", test_single_tree_vargen_non_samples}, - {"test_single_tree_vargen_errors", test_single_tree_vargen_errors}, - {"test_single_tree_vargen_subsample", test_single_tree_vargen_subsample}, - {"test_single_tree_vargen_many_alleles", test_single_tree_vargen_many_alleles}, + {"test_simplest_missing_data", test_simplest_missing_data}, + {"test_simplest_missing_data_user_alleles", test_simplest_missing_data_user_alleles}, + {"test_single_tree_user_alleles", test_single_tree_user_alleles}, + {"test_single_tree_char_alphabet", test_single_tree_char_alphabet}, + {"test_single_tree_binary_alphabet", test_single_tree_binary_alphabet}, + {"test_single_tree_non_samples", test_single_tree_non_samples}, + {"test_single_tree_errors", test_single_tree_errors}, + {"test_single_tree_user_alleles_errors", test_single_tree_user_alleles_errors}, + {"test_single_tree_subsample", test_single_tree_subsample}, + {"test_single_tree_many_alleles", test_single_tree_many_alleles}, {"test_single_tree_inconsistent_mutations", test_single_tree_inconsistent_mutations}, {NULL, NULL}, }; diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index 95df43169c..11eae1fbe6 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -432,9 +432,9 @@ verify_simplify_genotypes(tsk_treeseq_t *ts, tsk_treeseq_t *subset, /* tsk_treeseq_print_state(ts, stdout); */ /* tsk_treeseq_print_state(subset, stdout); */ - ret = tsk_vargen_init(&vargen, ts, NULL, 0, 0); + ret = tsk_vargen_init(&vargen, ts, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_vargen_init(&subset_vargen, subset, NULL, 0, TSK_IMPUTE_MISSING_DATA); + ret = tsk_vargen_init(&subset_vargen, subset, NULL, 0, NULL, TSK_IMPUTE_MISSING_DATA); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(m, tsk_treeseq_get_num_sites(subset)); @@ -1004,7 +1004,7 @@ test_simplest_non_sample_leaf_records(void) CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 4); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1); - ret = tsk_vargen_init(&vargen, &ts, NULL, 0, 0); + ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, 0); tsk_vargen_print_state(&vargen, _devnull); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); @@ -1250,7 +1250,7 @@ test_simplest_back_mutations(void) CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1); - ret = tsk_vargen_init(&vargen, &ts, NULL, 0, 0); + ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); diff --git a/c/tskit/core.c b/c/tskit/core.c index 65013ac75e..617be5f2af 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -268,15 +268,9 @@ tsk_strerror_internal(int err) case TSK_ERR_MUTATION_PARENT_AFTER_CHILD: ret = "Parent mutation ID must be < current ID"; break; - case TSK_ERR_TOO_MANY_ALLELES: - ret = "Cannot have more than 127 alleles"; - break; case TSK_ERR_INCONSISTENT_MUTATIONS: ret = "Inconsistent mutations: state already equal to derived state"; break; - case TSK_ERR_NON_SINGLE_CHAR_MUTATION: - ret = "Only single char mutations supported"; - break; case TSK_ERR_UNSORTED_MUTATIONS: ret = "Mutations must be provided in non-decreasing site order"; break; @@ -363,11 +357,20 @@ tsk_strerror_internal(int err) ret = "Bad genotype value provided"; break; - /* Missing data errors */ + /* Genotype decoding errors */ + case TSK_ERR_TOO_MANY_ALLELES: + ret = "Cannot have more than 127 alleles"; + break; + case TSK_ERR_ZERO_ALLELES: + ret = "Must have at least one allele when specifying an allele map"; + break; case TSK_ERR_MUST_IMPUTE_NON_SAMPLES: ret = "Cannot generate genotypes for non-samples unless missing data " "imputation is enabled"; break; + case TSK_ERR_ALLELE_NOT_FOUND: + ret = "An allele was not found in the user-specified allele map"; + break; /* Distance metric errors */ case TSK_ERR_SAMPLE_SIZE_MISMATCH: diff --git a/c/tskit/core.h b/c/tskit/core.h index 54d02adb14..3f7b64c929 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -180,10 +180,8 @@ of tskit. #define TSK_ERR_MUTATION_PARENT_DIFFERENT_SITE -500 #define TSK_ERR_MUTATION_PARENT_EQUAL -501 #define TSK_ERR_MUTATION_PARENT_AFTER_CHILD -502 -#define TSK_ERR_TOO_MANY_ALLELES -503 -#define TSK_ERR_INCONSISTENT_MUTATIONS -504 -#define TSK_ERR_NON_SINGLE_CHAR_MUTATION -505 -#define TSK_ERR_UNSORTED_MUTATIONS -506 +#define TSK_ERR_INCONSISTENT_MUTATIONS -503 +#define TSK_ERR_UNSORTED_MUTATIONS -505 /* Sample errors */ #define TSK_ERR_DUPLICATE_SAMPLE -600 @@ -219,8 +217,11 @@ of tskit. #define TSK_ERR_GENOTYPES_ALL_MISSING -1000 #define TSK_ERR_BAD_GENOTYPE -1001 -/* Missing data errors */ +/* Genotype decoding errors */ #define TSK_ERR_MUST_IMPUTE_NON_SAMPLES -1100 +#define TSK_ERR_ALLELE_NOT_FOUND -1101 +#define TSK_ERR_TOO_MANY_ALLELES -1102 +#define TSK_ERR_ZERO_ALLELES -1103 /* Distance metric errors */ #define TSK_ERR_SAMPLE_SIZE_MISMATCH -1200 diff --git a/c/tskit/genotypes.c b/c/tskit/genotypes.c index dc2f9074d2..3382a7cbfb 100644 --- a/c/tskit/genotypes.c +++ b/c/tskit/genotypes.c @@ -39,8 +39,18 @@ void tsk_vargen_print_state(tsk_vargen_t *self, FILE *out) { + tsk_size_t j; + fprintf(out, "tsk_vargen state\n"); + fprintf(out, "tree_index = %d\n", self->tree.index); fprintf(out, "tree_site_index = %d\n", (int) self->tree_site_index); + fprintf(out, "user_alleles = %d\n", self->user_alleles); + fprintf(out, "num_alleles = %d\n", self->variant.num_alleles); + for (j = 0; j < self->variant.num_alleles; j++) { + fprintf(out, "\tlen = %d, '%.*s'\n", self->variant.allele_lengths[j], + self->variant.allele_lengths[j], self->variant.alleles[j]); + + } } static int @@ -59,16 +69,49 @@ tsk_vargen_next_tree(tsk_vargen_t *self) return ret; } +/* Copy the fixed allele mapping specified by the user into local + * memory. */ +static int +tsk_vargen_copy_alleles(tsk_vargen_t *self, const char **alleles) +{ + int ret = 0; + tsk_size_t j; + size_t total_len, allele_len, offset; + + self->variant.num_alleles = self->variant.max_alleles; + + total_len = 0; + for (j = 0; j < self->variant.num_alleles; j++) { + allele_len = strlen(alleles[j]); + self->variant.allele_lengths[j] = (tsk_size_t) allele_len; + total_len += allele_len; + } + self->user_alleles_mem = malloc(total_len * sizeof(char *)); + if (self->user_alleles_mem == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + offset = 0; + for (j = 0; j < self->variant.num_alleles; j++) { + strcpy(self->user_alleles_mem + offset, alleles[j]); + self->variant.alleles[j] = self->user_alleles_mem + offset; + offset += self->variant.allele_lengths[j]; + } +out: + return ret; +} + int tsk_vargen_init(tsk_vargen_t *self, tsk_treeseq_t *tree_sequence, - tsk_id_t *samples, size_t num_samples, tsk_flags_t options) + tsk_id_t *samples, size_t num_samples, const char **alleles, + tsk_flags_t options) { int ret = TSK_ERR_NO_MEMORY; tsk_flags_t tree_options; const tsk_flags_t *flags = tree_sequence->tables->nodes.flags; - size_t j, num_nodes, num_samples_alloc; + size_t j, num_nodes, num_samples_alloc, max_alleles_limit; bool impute_missing = !!(options & TSK_IMPUTE_MISSING_DATA); - tsk_size_t max_alleles = 4; + tsk_size_t max_alleles; tsk_id_t u; assert(tree_sequence != NULL); @@ -119,12 +162,31 @@ tsk_vargen_init(tsk_vargen_t *self, tsk_treeseq_t *tree_sequence, if (self->options & TSK_16_BIT_GENOTYPES) { self->variant.genotypes.i16 = malloc( num_samples_alloc * sizeof(*self->variant.genotypes.i16)); + max_alleles_limit = INT16_MAX; } else { self->variant.genotypes.i8 = malloc( num_samples_alloc * sizeof(*self->variant.genotypes.i8)); + max_alleles_limit = INT8_MAX; + } + + if (alleles == NULL) { + self->user_alleles = false; + max_alleles = 4; /* Arbitrary --- we'll rarely have more than this */ + } else { + self->user_alleles = true; + /* Count the input alleles. The end is designated by the NULL sentinel. */ + for (max_alleles = 0; alleles[max_alleles] != NULL; max_alleles++); + if (max_alleles > max_alleles_limit) { + ret = TSK_ERR_TOO_MANY_ALLELES; + goto out; + } + if (max_alleles == 0) { + ret = TSK_ERR_ZERO_ALLELES; + goto out; + } } self->variant.max_alleles = max_alleles; - self->variant.alleles = malloc(max_alleles * sizeof(*self->variant.alleles)); + self->variant.alleles = calloc(max_alleles, sizeof(*self->variant.alleles)); self->variant.allele_lengths = malloc(max_alleles * sizeof(*self->variant.allele_lengths)); /* Because genotypes is a union we can check the pointer */ @@ -133,6 +195,13 @@ tsk_vargen_init(tsk_vargen_t *self, tsk_treeseq_t *tree_sequence, ret = TSK_ERR_NO_MEMORY; goto out; } + if (self->user_alleles) { + ret = tsk_vargen_copy_alleles(self, alleles); + if (ret != 0) { + goto out; + } + } + /* When a list of samples is given, we use the traversal based algorithm * and turn off the sample list tracking in the tree */ tree_options = 0; @@ -161,6 +230,7 @@ tsk_vargen_free(tsk_vargen_t *self) tsk_safe_free(self->variant.genotypes.i8); tsk_safe_free(self->variant.alleles); tsk_safe_free(self->variant.allele_lengths); + tsk_safe_free(self->user_alleles_mem); tsk_safe_free(self->samples); if (self->sample_index_map_allocated) { tsk_safe_free(self->sample_index_map); @@ -207,7 +277,7 @@ tsk_vargen_expand_alleles(tsk_vargen_t *self) * same reason. */ static int TSK_WARN_UNUSED -tsk_vargen_update_genotypes_i8_sample_list(tsk_vargen_t *self, tsk_id_t node, tsk_size_t derived) +tsk_vargen_update_genotypes_i8_sample_list(tsk_vargen_t *self, tsk_id_t node, tsk_id_t derived) { int8_t *restrict genotypes = self->variant.genotypes.i8; const tsk_id_t *restrict list_left = self->tree.left_sample; @@ -238,7 +308,7 @@ tsk_vargen_update_genotypes_i8_sample_list(tsk_vargen_t *self, tsk_id_t node, ts } static int TSK_WARN_UNUSED -tsk_vargen_update_genotypes_i16_sample_list(tsk_vargen_t *self, tsk_id_t node, tsk_size_t derived) +tsk_vargen_update_genotypes_i16_sample_list(tsk_vargen_t *self, tsk_id_t node, tsk_id_t derived) { int16_t *restrict genotypes = self->variant.genotypes.i16; const tsk_id_t *restrict list_left = self->tree.left_sample; @@ -274,10 +344,10 @@ tsk_vargen_update_genotypes_i16_sample_list(tsk_vargen_t *self, tsk_id_t node, t * and so we use a visit function to avoid duplicating code. */ -typedef int (*visit_func_t)(tsk_vargen_t *, tsk_id_t, tsk_size_t); +typedef int (*visit_func_t)(tsk_vargen_t *, tsk_id_t, tsk_id_t); static int TSK_WARN_UNUSED -tsk_vargen_traverse(tsk_vargen_t *self, tsk_id_t node, tsk_size_t derived, visit_func_t visit) +tsk_vargen_traverse(tsk_vargen_t *self, tsk_id_t node, tsk_id_t derived, visit_func_t visit) { int ret = 0; tsk_id_t * restrict stack = self->tree.stack1; @@ -309,7 +379,7 @@ tsk_vargen_traverse(tsk_vargen_t *self, tsk_id_t node, tsk_size_t derived, visit } static int -tsk_vargen_visit_i8(tsk_vargen_t *self, tsk_id_t sample_index, tsk_size_t derived) +tsk_vargen_visit_i8(tsk_vargen_t *self, tsk_id_t sample_index, tsk_id_t derived) { int ret = 0; int8_t *restrict genotypes = self->variant.genotypes.i8; @@ -326,7 +396,7 @@ tsk_vargen_visit_i8(tsk_vargen_t *self, tsk_id_t sample_index, tsk_size_t derive } static int -tsk_vargen_visit_i16(tsk_vargen_t *self, tsk_id_t sample_index, tsk_size_t derived) +tsk_vargen_visit_i16(tsk_vargen_t *self, tsk_id_t sample_index, tsk_id_t derived) { int ret = 0; int16_t *restrict genotypes = self->variant.genotypes.i16; @@ -343,13 +413,13 @@ tsk_vargen_visit_i16(tsk_vargen_t *self, tsk_id_t sample_index, tsk_size_t deriv } static int TSK_WARN_UNUSED -tsk_vargen_update_genotypes_i8_traversal(tsk_vargen_t *self, tsk_id_t node, tsk_size_t derived) +tsk_vargen_update_genotypes_i8_traversal(tsk_vargen_t *self, tsk_id_t node, tsk_id_t derived) { return tsk_vargen_traverse(self, node, derived, tsk_vargen_visit_i8); } static int TSK_WARN_UNUSED -tsk_vargen_update_genotypes_i16_traversal(tsk_vargen_t *self, tsk_id_t node, tsk_size_t derived) +tsk_vargen_update_genotypes_i16_traversal(tsk_vargen_t *self, tsk_id_t node, tsk_id_t derived) { return tsk_vargen_traverse(self, node, derived, tsk_vargen_visit_i16); } @@ -394,18 +464,36 @@ tsk_vargen_mark_missing_i8(tsk_vargen_t *self) return ret; } +static tsk_id_t +tsk_vargen_get_allele_index(tsk_vargen_t *self, const char *allele, tsk_size_t length) +{ + tsk_id_t ret = -1; + tsk_size_t j; + const tsk_variant_t *var = &self->variant; + + for (j = 0; j < var->num_alleles; j++) { + if (length == var->allele_lengths[j] + && memcmp(allele, var->alleles[j], length) == 0) { + ret = (tsk_id_t) j; + break; + } + } + return ret; +} + static int tsk_vargen_update_site(tsk_vargen_t *self) { int ret = 0; - tsk_size_t j, derived; + tsk_id_t allele_index; + tsk_size_t j; tsk_variant_t *var = &self->variant; tsk_site_t *site = var->site; tsk_mutation_t mutation; bool genotypes16 = !!(self->options & TSK_16_BIT_GENOTYPES); bool impute_missing = !!(self->options & TSK_IMPUTE_MISSING_DATA); bool by_traversal = self->samples != NULL; - int (*update_genotypes)(tsk_vargen_t *, tsk_id_t, tsk_size_t); + int (*update_genotypes)(tsk_vargen_t *, tsk_id_t, tsk_id_t); bool (*mark_missing)(tsk_vargen_t *); /* For now we use a traversal method to find genotypes when we have a @@ -427,11 +515,20 @@ tsk_vargen_update_site(tsk_vargen_t *self) update_genotypes = tsk_vargen_update_genotypes_i8_traversal; } } - - /* Ancestral state is always allele 0 */ - var->alleles[0] = site->ancestral_state; - var->allele_lengths[0] = site->ancestral_state_length; - var->num_alleles = 1; + if (self->user_alleles) { + allele_index = tsk_vargen_get_allele_index(self, site->ancestral_state, + site->ancestral_state_length); + if (allele_index == -1) { + ret = TSK_ERR_ALLELE_NOT_FOUND; + goto out; + } + } else { + /* Ancestral state is always allele 0 */ + var->alleles[0] = site->ancestral_state; + var->allele_lengths[0] = site->ancestral_state_length; + var->num_alleles = 1; + allele_index = 0; + } /* The algorithm for generating the allelic state of every sample works by * examining each mutation in order, and setting the state for all the @@ -443,34 +540,37 @@ tsk_vargen_update_site(tsk_vargen_t *self) * in the list of mutations. This guarantees the correctness of this algorithm. */ if (genotypes16) { - memset(self->variant.genotypes.i16, 0, 2 * self->num_samples); + for (j = 0; j < self->num_samples; j++) { + self->variant.genotypes.i16[j] = (int16_t) allele_index; + } } else { - memset(self->variant.genotypes.i8, 0, self->num_samples); + for (j = 0; j < self->num_samples; j++) { + self->variant.genotypes.i8[j] = (int8_t) allele_index; + } } for (j = 0; j < site->mutations_length; j++) { mutation = site->mutations[j]; /* Compute the allele index for this derived state value. */ - derived = 0; - while (derived < var->num_alleles) { - if (mutation.derived_state_length == var->allele_lengths[derived] - && memcmp(mutation.derived_state, var->alleles[derived], - var->allele_lengths[derived]) == 0) { - break; + allele_index = tsk_vargen_get_allele_index(self, + mutation.derived_state, mutation.derived_state_length); + if (allele_index == -1) { + if (self->user_alleles) { + ret = TSK_ERR_ALLELE_NOT_FOUND; + goto out; } - derived++; - } - if (derived == var->num_alleles) { if (var->num_alleles == var->max_alleles) { ret = tsk_vargen_expand_alleles(self); if (ret != 0) { goto out; } } - var->alleles[derived] = mutation.derived_state; - var->allele_lengths[derived] = mutation.derived_state_length; + allele_index = (tsk_id_t) var->num_alleles; + var->alleles[allele_index] = mutation.derived_state; + var->allele_lengths[allele_index] = mutation.derived_state_length; var->num_alleles++; } - ret = update_genotypes(self, mutation.node, derived); + + ret = update_genotypes(self, mutation.node, allele_index); if (ret != 0) { goto out; } diff --git a/c/tskit/genotypes.h b/c/tskit/genotypes.h index dbfcd33ef6..3200647bf8 100644 --- a/c/tskit/genotypes.h +++ b/c/tskit/genotypes.h @@ -55,6 +55,8 @@ typedef struct { tsk_id_t *samples; tsk_id_t *sample_index_map; bool sample_index_map_allocated; + bool user_alleles; + char *user_alleles_mem; size_t tree_site_index; int finished; tsk_tree_t tree; @@ -63,7 +65,8 @@ typedef struct { } tsk_vargen_t; int tsk_vargen_init(tsk_vargen_t *self, tsk_treeseq_t *tree_sequence, - tsk_id_t *samples, size_t num_samples, tsk_flags_t options); + tsk_id_t *samples, size_t num_samples, const char **alleles, + tsk_flags_t options); int tsk_vargen_next(tsk_vargen_t *self, tsk_variant_t **variant); int tsk_vargen_free(tsk_vargen_t *self); void tsk_vargen_print_state(tsk_vargen_t *self, FILE *out); diff --git a/docs/python-api.rst b/docs/python-api.rst index 36acd7e96d..0068bd9d41 100644 --- a/docs/python-api.rst +++ b/docs/python-api.rst @@ -64,6 +64,8 @@ Constants .. autodata:: tskit.REVERSE :annotation: = -1 +.. autodata:: tskit.ALLELES_ACGT + ++++++++++++++++++++++++ Simple container classes diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 83069e321a..157fc78c89 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -16,6 +16,9 @@ In development - Access the number of children of a node in a tree directly using ``tree.num_children(u)`` (:user:`hyanwong`, :pr:`436`). +- User specified allele mapping for genotypes in ``variants`` and + ``genotype_matrix`` (:user:`jeromekelleher`, :pr:`430`). + **Bugfixes** -------------------- diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index 6428d66356..fbb5eb8ee0 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -512,6 +512,57 @@ convert_transitions(tsk_state_transition_t *transitions, size_t num_transitions) return ret; } +static const char ** +parse_allele_list(PyObject *allele_tuple) +{ + const char **ret = NULL; + const char **alleles = NULL; + PyObject *str; + Py_ssize_t j, num_alleles; + + if (! PyTuple_Check(allele_tuple)) { + PyErr_SetString(PyExc_TypeError, "Fixed allele list must be a tuple"); + goto out; + } + + num_alleles = PyTuple_Size(allele_tuple); + if (num_alleles == 0) { + PyErr_SetString(PyExc_ValueError, "Must specify at least one allele"); + goto out; + } + /* Leave space for the sentinel, and initialise to NULL */ + alleles = PyMem_Calloc(num_alleles + 1, sizeof(*alleles)); + if (alleles == NULL) { + PyErr_NoMemory(); + goto out; + } + for (j = 0; j < num_alleles; j++) { + str = PyTuple_GetItem(allele_tuple, j); + if (str == NULL) { + goto out; + } + if (!PyUnicode_Check(str)) { + PyErr_SetString(PyExc_TypeError, "alleles must be strings"); + goto out; + } + /* PyUnicode_AsUTF8AndSize caches the UTF8 representation of the string + * within the object, and we're not responsible for freeing it. Thus, + * once we're sure the string object stays alive for the lifetime of the + * returned string, we can be sure it's safe. These strings are immediately + * copied during tsk_vargen_init, so the operation is safe. + */ + alleles[j] = PyUnicode_AsUTF8AndSize(str, NULL); + if (alleles[j] == NULL) { + goto out; + } + } + ret = alleles; + alleles = NULL; +out: + PyMem_Free(alleles); + return ret; +} + /*=================================================================== * General table code. *=================================================================== @@ -7278,17 +7329,19 @@ static PyObject * TreeSequence_get_genotype_matrix(TreeSequence *self, PyObject *args, PyObject *kwds) { PyObject *ret = NULL; - static char *kwlist[] = {"impute_missing_data", NULL}; + static char *kwlist[] = {"impute_missing_data", "alleles", NULL}; int err; size_t num_sites; size_t num_samples; npy_intp dims[2]; + PyObject *py_alleles = Py_None; PyArrayObject *genotype_matrix = NULL; tsk_vargen_t *vg = NULL; char *V; tsk_variant_t *variant; size_t j; int impute_missing_data = 0; + const char **alleles = NULL; tsk_flags_t options = 0; if (TreeSequence_check_tree_sequence(self) != 0) { @@ -7296,12 +7349,21 @@ TreeSequence_get_genotype_matrix(TreeSequence *self, PyObject *args, PyObject *k } /* TODO add option for 16 bit genotypes */ - if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i", kwlist, &impute_missing_data)) { + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|iO", kwlist, + &impute_missing_data, &py_alleles)) { goto out; } if (impute_missing_data) { options |= TSK_IMPUTE_MISSING_DATA; } + + if (py_alleles != Py_None) { + alleles = parse_allele_list(py_alleles); + if (alleles == NULL) { + goto out; + } + } + num_sites = tsk_treeseq_get_num_sites(self->tree_sequence); num_samples = tsk_treeseq_get_num_samples(self->tree_sequence); dims[0] = num_sites; @@ -7317,7 +7379,7 @@ TreeSequence_get_genotype_matrix(TreeSequence *self, PyObject *args, PyObject *k PyErr_NoMemory(); goto out; } - err = tsk_vargen_init(vg, self->tree_sequence, NULL, 0, options); + err = tsk_vargen_init(vg, self->tree_sequence, NULL, 0, alleles, options); if (err != 0) { handle_library_error(err); goto out; @@ -7339,6 +7401,7 @@ TreeSequence_get_genotype_matrix(TreeSequence *self, PyObject *args, PyObject *k PyMem_Free(vg); } Py_XDECREF(genotype_matrix); + PyMem_Free(alleles); return ret; } @@ -8739,21 +8802,25 @@ VariantGenerator_init(VariantGenerator *self, PyObject *args, PyObject *kwds) { int ret = -1; int err; - static char *kwlist[] = {"tree_sequence", "samples", "impute_missing_data", NULL}; + static char *kwlist[] = {"tree_sequence", "samples", "impute_missing_data", + "alleles", NULL}; TreeSequence *tree_sequence = NULL; PyObject *samples_input = Py_None; + PyObject *py_alleles = Py_None; PyArrayObject *samples_array = NULL; tsk_id_t *samples = NULL; size_t num_samples = 0; int impute_missing_data = 0; + const char **alleles = NULL; npy_intp *shape; tsk_flags_t options = 0; /* TODO add option for 16 bit genotypes */ self->variant_generator = NULL; self->tree_sequence = NULL; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "O!|Oi", kwlist, - &TreeSequenceType, &tree_sequence, &samples_input, &impute_missing_data)) { + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O!|OiO", kwlist, + &TreeSequenceType, &tree_sequence, &samples_input, + &impute_missing_data, &py_alleles)) { goto out; } if (impute_missing_data) { @@ -8774,6 +8841,12 @@ VariantGenerator_init(VariantGenerator *self, PyObject *args, PyObject *kwds) num_samples = (size_t) shape[0]; samples = PyArray_DATA(samples_array); } + if (py_alleles != Py_None) { + alleles = parse_allele_list(py_alleles); + if (alleles == NULL) { + goto out; + } + } self->variant_generator = PyMem_Malloc(sizeof(tsk_vargen_t)); if (self->variant_generator == NULL) { PyErr_NoMemory(); @@ -8783,13 +8856,15 @@ VariantGenerator_init(VariantGenerator *self, PyObject *args, PyObject *kwds) * to avoid this we would INCREF the samples array above and keep a reference * to in the object struct */ err = tsk_vargen_init(self->variant_generator, - self->tree_sequence->tree_sequence, samples, num_samples, options); + self->tree_sequence->tree_sequence, samples, num_samples, alleles, + options); if (err != 0) { handle_library_error(err); goto out; } ret = 0; out: + PyMem_Free(alleles); Py_XDECREF(samples_array); return ret; } diff --git a/python/tests/test_genotypes.py b/python/tests/test_genotypes.py index 3362240a47..506a0fb7a9 100644 --- a/python/tests/test_genotypes.py +++ b/python/tests/test_genotypes.py @@ -30,6 +30,7 @@ import msprime import tskit + from tskit import exceptions import tests.tsutil as tsutil import tests.test_wright_fisher as wf @@ -617,3 +618,207 @@ def test_missing_data(self): self.assertEqual(h, [c, c]) h = list(ts.haplotypes(impute_missing_data=True)) self.assertEqual(h, ["A", "A"]) + + +class TestUserAlleles(unittest.TestCase): + """ + Tests the functionality of providing a user-specified allele mapping. + """ + + def test_simple_01(self): + ts = msprime.simulate(10, mutation_rate=5, random_seed=2) + self.assertGreater(ts.num_sites, 2) + G1 = ts.genotype_matrix() + G2 = ts.genotype_matrix(alleles=("0", "1")) + self.assertTrue(np.array_equal(G1, G2)) + for v1, v2 in itertools.zip_longest( + ts.variants(), ts.variants(alleles=("0", "1"))): + self.assertEqual(v1.alleles, v2.alleles) + self.assertEqual(v1.site, v2.site) + self.assertTrue(np.array_equal(v1.genotypes, v2.genotypes)) + + def test_simple_01_trailing_alleles(self): + ts = msprime.simulate(10, mutation_rate=5, random_seed=2) + self.assertGreater(ts.num_sites, 2) + G1 = ts.genotype_matrix() + alleles = ("0", "1", "2", "xxxxx") + G2 = ts.genotype_matrix(alleles=alleles) + self.assertTrue(np.array_equal(G1, G2)) + for v1, v2 in itertools.zip_longest( + ts.variants(), ts.variants(alleles=alleles)): + self.assertEqual(v2.alleles, alleles) + self.assertEqual(v1.site, v2.site) + self.assertTrue(np.array_equal(v1.genotypes, v2.genotypes)) + + def test_simple_01_leading_alleles(self): + ts = msprime.simulate(10, mutation_rate=5, random_seed=2) + self.assertGreater(ts.num_sites, 2) + G1 = ts.genotype_matrix() + alleles = ("A", "B", "C", "0", "1") + G2 = ts.genotype_matrix(alleles=alleles) + self.assertTrue(np.array_equal(G1 + 3, G2)) + for v1, v2 in itertools.zip_longest( + ts.variants(), ts.variants(alleles=alleles)): + self.assertEqual(v2.alleles, alleles) + self.assertEqual(v1.site, v2.site) + self.assertTrue(np.array_equal(v1.genotypes + 3, v2.genotypes)) + + def test_simple_01_duplicate_alleles(self): + ts = msprime.simulate(10, mutation_rate=5, random_seed=2) + self.assertGreater(ts.num_sites, 2) + G1 = ts.genotype_matrix() + alleles = ("0", "0", "1") + G2 = ts.genotype_matrix(alleles=alleles) + index = np.where(G1 == 1) + G1[index] = 2 + self.assertTrue(np.array_equal(G1, G2)) + for v1, v2 in itertools.zip_longest( + ts.variants(), ts.variants(alleles=alleles)): + self.assertEqual(v2.alleles, alleles) + self.assertEqual(v1.site, v2.site) + g = v1.genotypes + index = np.where(g == 1) + g[index] = 2 + self.assertTrue(np.array_equal(g, v2.genotypes)) + + def test_simple_acgt(self): + ts = msprime.simulate(10, random_seed=2) + ts = msprime.mutate( + ts, rate=4, random_seed=2, model=msprime.InfiniteSites(msprime.NUCLEOTIDES)) + self.assertGreater(ts.num_sites, 2) + alleles = tskit.ALLELES_ACGT + G = ts.genotype_matrix(alleles=alleles) + for v1, v2 in itertools.zip_longest(ts.variants(), ts.variants(alleles=alleles)): + self.assertEqual(v2.alleles, alleles) + self.assertEqual(v1.site, v2.site) + h1 = "".join(v1.alleles[g] for g in v1.genotypes) + h2 = "".join(v2.alleles[g] for g in v2.genotypes) + self.assertEqual(h1, h2) + self.assertTrue(np.array_equal(v2.genotypes, G[v1.site.id])) + + def test_missing_alleles(self): + ts = msprime.simulate(10, random_seed=2) + ts = msprime.mutate( + ts, rate=4, random_seed=2, model=msprime.InfiniteSites(msprime.NUCLEOTIDES)) + self.assertGreater(ts.num_sites, 2) + bad_allele_examples = [ + tskit.ALLELES_01, tuple(["A"]), ("C", "T", "G"), ("AA", "C", "T", "G"), + tuple(["ACTG"])] + for bad_alleles in bad_allele_examples: + with self.assertRaises(exceptions.LibraryError): + ts.genotype_matrix(alleles=bad_alleles) + with self.assertRaises(exceptions.LibraryError): + list(ts.variants(alleles=bad_alleles)) + + def test_too_many_alleles(self): + ts = msprime.simulate(10, mutation_rate=5, random_seed=2) + for n in range(128, 138): + bad_alleles = tuple(["0" for _ in range(n)]) + with self.assertRaises(exceptions.LibraryError): + ts.genotype_matrix(alleles=bad_alleles) + with self.assertRaises(exceptions.LibraryError): + list(ts.variants(alleles=bad_alleles)) + + def test_zero_allele(self): + ts = msprime.simulate(10, mutation_rate=5, random_seed=2) + with self.assertRaises(ValueError): + ts.genotype_matrix(alleles=tuple()) + with self.assertRaises(ValueError): + list(ts.variants(alleles=tuple())) + + def test_missing_data(self): + tables = tskit.TableCollection(1) + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + tables.sites.add_row(0.5, "0") + tables.mutations.add_row(0, 0, "1") + + ts = tables.tree_sequence() + for impute in [True, False]: + G1 = ts.genotype_matrix(impute_missing_data=impute) + G2 = ts.genotype_matrix( + impute_missing_data=impute, alleles=tskit.ALLELES_01) + self.assertTrue(np.array_equal(G1, G2)) + vars1 = ts.variants(impute_missing_data=impute) + vars2 = ts.variants(impute_missing_data=impute, alleles=tskit.ALLELES_01) + for v1, v2 in itertools.zip_longest(vars1, vars2): + self.assertEqual(v2.alleles, v1.alleles) + self.assertEqual(v1.site, v2.site) + self.assertTrue(np.array_equal(v1.genotypes, v2.genotypes)) + + +class TestUserAllelesRoundTrip(unittest.TestCase): + """ + Tests that we correctly produce haplotypes in a variety of situations for + the user specified allele map encoding. + """ + def verify(self, ts, alleles): + for v1, v2 in itertools.zip_longest(ts.variants(), ts.variants(alleles=alleles)): + h1 = [v1.alleles[g] for g in v1.genotypes] + h2 = [v2.alleles[g] for g in v2.genotypes] + self.assertEqual(h1, h2) + + def test_simple_01(self): + ts = msprime.simulate(5, mutation_rate=2, random_seed=3) + self.assertGreater(ts.num_sites, 3) + valid_alleles = [ + tskit.ALLELES_01, + ("0", "1", "xry"), + ("xry", "0", "1", "xry"), + tuple([str(j) for j in range(127)]), + tuple(["0" for j in range(126)] + ["1"]), + ] + for alleles in valid_alleles: + self.verify(ts, alleles) + + def test_simple_acgt(self): + ts = msprime.simulate(5, random_seed=3) + ts = msprime.mutate( + ts, rate=4, random_seed=3, model=msprime.InfiniteSites(msprime.NUCLEOTIDES)) + self.assertGreater(ts.num_sites, 3) + valid_alleles = [ + tskit.ALLELES_ACGT, + ("A", "C", "T", "G", "AAAAAAAAAAAAAA"), + ("AA", "CC", "TT", "GG", "A", "C", "T", "G"), + ] + for alleles in valid_alleles: + self.verify(ts, alleles) + + def test_jukes_cantor(self): + ts = msprime.simulate(6, random_seed=1, mutation_rate=1) + ts = tsutil.jukes_cantor(ts, 20, 1, seed=10) + valid_alleles = [ + tskit.ALLELES_ACGT, + ("A", "C", "T", "G", "AAAAAAAAAAAAAA"), + ("AA", "CC", "TT", "GG", "A", "C", "T", "G"), + ] + for alleles in valid_alleles: + self.verify(ts, alleles) + + def test_multichar_mutations(self): + ts = msprime.simulate(6, random_seed=1, recombination_rate=2) + ts = tsutil.insert_multichar_mutations(ts) + self.assertGreater(ts.num_sites, 5) + all_alleles = set() + for var in ts.variants(): + all_alleles.update(var.alleles) + all_alleles = tuple(all_alleles) + self.verify(ts, all_alleles) + self.verify(ts, all_alleles[::-1]) + + def test_simple_01_missing_data(self): + ts = msprime.simulate(6, mutation_rate=2, random_seed=3) + tables = ts.dump_tables() + # Add another sample node. This will be missing data everywhere. + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + ts = tables.tree_sequence() + self.assertGreater(ts.num_sites, 3) + valid_alleles = [ + tskit.ALLELES_01, + ("0", "1", "xry"), + ("xry", "0", "1", "xry"), + tuple([str(j) for j in range(127)]), + tuple(["0" for j in range(126)] + ["1"]), + ] + for alleles in valid_alleles: + self.verify(ts, alleles) diff --git a/python/tests/test_lowlevel.py b/python/tests/test_lowlevel.py index be52e0bd4c..b9ee7fa1ed 100644 --- a/python/tests/test_lowlevel.py +++ b/python/tests/test_lowlevel.py @@ -341,7 +341,11 @@ def test_get_genotype_matrix_interface(self): G = ts.get_genotype_matrix() self.assertEqual(G.shape, (num_sites, num_samples)) self.assertRaises( - TypeError, ts.get_genotype_matrix, impute_missing_data=None) + TypeError, ts.get_genotype_matrix, impute_missing_data=None) + self.assertRaises( + TypeError, ts.get_genotype_matrix, alleles="XYZ") + self.assertRaises( + ValueError, ts.get_genotype_matrix, alleles=tuple()) G = ts.get_genotype_matrix(impute_missing_data=True) self.assertEqual(G.shape, (num_sites, num_samples)) @@ -1093,6 +1097,24 @@ def test_constructor(self): TypeError, _tskit.VariantGenerator, ts, impute_missing_data=None) self.assertRaises( _tskit.LibraryError, _tskit.VariantGenerator, ts, samples=[-1, 2]) + self.assertRaises( + TypeError, _tskit.VariantGenerator, ts, alleles=1234) + + def test_alleles(self): + ts = self.get_example_tree_sequence() + for bad_type in [["a", "b"], "sdf", 234]: + with self.assertRaises(TypeError): + _tskit.VariantGenerator(ts, samples=[1, 2], alleles=bad_type) + with self.assertRaises(ValueError): + _tskit.VariantGenerator(ts, samples=[1, 2], alleles=tuple()) + + for bad_allele_type in [None, 0, b"x", []]: + with self.assertRaises(TypeError): + _tskit.VariantGenerator(ts, samples=[1, 2], alleles=(bad_allele_type,)) + + too_many_alleles = tuple(str(j) for j in range(128)) + with self.assertRaises(_tskit.LibraryError): + _tskit.VariantGenerator(ts, samples=[1, 2], alleles=too_many_alleles) def test_iterator(self): ts = self.get_example_tree_sequence() diff --git a/python/tskit/__init__.py b/python/tskit/__init__.py index a9a8690e68..6747e023e0 100644 --- a/python/tskit/__init__.py +++ b/python/tskit/__init__.py @@ -39,6 +39,14 @@ #: decreasing genomic coordinate values). REVERSE = _tskit.REVERSE +#: The allele mapping where the strings "0" and "1" map to genotype +#: values 0 and 1. +ALLELES_01 = ("0", "1") + +#: The allele mapping where the four nucleotides A, C, G and T map to +#: the genotype integers 0, 1, 2, and 3, respectively. +ALLELES_ACGT = ("A", "C", "G", "T") + from tskit.provenance import __version__ # NOQA from tskit.provenance import validate_provenance # NOQA from tskit.formats import * # NOQA diff --git a/python/tskit/trees.py b/python/tskit/trees.py index bc62d3ac8b..fe1db506b1 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -332,10 +332,13 @@ class Variant(SimpleContainer): mapping sample IDs to the observed alleles. Each element in the ``alleles`` tuple is a string, representing the - actual observed state for a given sample. The first element of this - tuple is guaranteed to be the same as the site's ``ancestral_state`` value. - The list of alleles is also guaranteed not to contain any duplicates. - However, allelic values may be listed that are not referred to by any + actual observed state for a given sample. The ``alleles`` tuple is + generated in one of two ways. The first (and default) way is for + ``tskit`` to generate the encoding on the fly as alleles are encountered + while generating genotypes. In this case, the first element of this + tuple is guaranteed to be the same as the site's ``ancestral_state`` value + and the list of alleles is also guaranteed not to contain any duplicates. + Note that allelic values may be listed that are not referred to by any samples. For example, if we have a site that is fixed for the derived state (i.e., we have a mutation over the tree root), all genotypes will be 1, but the alleles list will be equal to ``('0', '1')``. Other than the @@ -343,6 +346,12 @@ class Variant(SimpleContainer): no particular order, and the ordering should not be relied upon (but see the notes on missing data below). + The second way is for the user to define the mapping between + genotype values and allelic state strings using the + ``alleles`` parameter to the :meth:`TreeSequence.variants` method. + In this case, there is no indication of which allele is the ancestral state, + as the ordering is determined by the user. + The ``genotypes`` represent the observed allelic states for each sample, such that ``var.alleles[var.genotypes[j]]`` gives the string allele for sample ID ``j``. Thus, the elements of the genotypes array are @@ -2935,14 +2944,25 @@ def haplotypes(self, impute_missing_data=False, missing_data_character="-"): for h in H: yield h.tostring().decode('ascii') - def variants(self, as_bytes=False, samples=None, impute_missing_data=False): + def variants( + self, as_bytes=False, samples=None, impute_missing_data=False, + alleles=None): """ Returns an iterator over the variants in this tree sequence. See the :class:`Variant` class for details on the fields of each returned - object. By default the ``genotypes`` for the variants are numpy arrays, - corresponding to indexes into the ``alleles`` array. If the - ``as_bytes`` parameter is true, these allelic values are recorded - directly into a bytes array. + object. The ``genotypes`` for the variants are numpy arrays, + corresponding to indexes into the ``alleles`` attribute in the + :class:`Variant` object. By default, the ``alleles`` for each + site are generated automatically, such that the ancestral state + is at the zeroth index and subsequent alleles are listed in no + particular order. This means that the encoding of alleles in + terms of genotype values can vary from site-to-site, which is + sometimes inconvenient. It is possible to specify a fixed mapping + from allele strings to genotype values using the ``alleles`` + parameter. For example, if we set ``alleles=("A", "C", "G", "T")``, + this will map allele "A" to 0, "C" to 1 and so on (the + :data:`ALLELES_ACGT` constant provides a shortcut for this + common mapping). By default, genotypes are generated for all samples. The ``samples`` parameter allows us to specify the nodes for which genotypes are @@ -2955,9 +2975,11 @@ def variants(self, as_bytes=False, samples=None, impute_missing_data=False): If :ref:`missing data` is present at a given site, the genotypes array will contain a special value - ``tskit.MISSING_DATA`` (-1) to identify these missing samples. - See the :class:`Variant` class for more details on how missing - data is reported. + :data:`MISSING_DATA` (-1) to identify these missing samples, + and the ``alleles`` tuple will end with the value ``None`` + (note that this is true whether we specify a fixed mapping + using the ``alleles`` parameter or not). See the :class:`Variant` + class for more details on how missing data is reported. Missing data is reported by default, but if ``impute_missing_data`` is set to to True, missing data will be imputed such that all @@ -2970,19 +2992,21 @@ def variants(self, as_bytes=False, samples=None, impute_missing_data=False): The ``as_bytes`` parameter is kept as a compatibility option for older code. It is not the recommended way of accessing variant data, and will be deprecated in a later - release. Another method will be provided to obtain the allelic - states for each site directly. + release. :param bool as_bytes: If True, the genotype values will be returned - as a Python bytes object. This is useful in certain situations - (i.e., directly printing the genotypes) or when numpy is - not available. Otherwise, genotypes are returned as a numpy - array (the default). + as a Python bytes object. Legacy use only. :param array_like samples: An array of sample IDs for which to generate genotypes, or None for all samples. Default: None. :param bool impute_missing_data: If True, the allele assigned to any isolated samples is the ancestral state; that is, we impute missing data as the ancestral state. Default: False. + :param tuple alleles: A tuple of strings defining the encoding of + alleles as integer genotype values. At least one allele must be provided. + If duplicate alleles are provided, output genotypes will always be + encoded as the first occurance of the allele. If None (the default), + the alleles are encoded as they are encountered during genotype + generation. :return: An iterator of all variants this tree sequence. :rtype: iter(:class:`Variant`) """ @@ -2990,7 +3014,7 @@ def variants(self, as_bytes=False, samples=None, impute_missing_data=False): # present form was chosen. iterator = _tskit.VariantGenerator( self._ll_tree_sequence, samples=samples, - impute_missing_data=impute_missing_data) + impute_missing_data=impute_missing_data, alleles=alleles) for site_id, genotypes, alleles in iterator: site = self.site(site_id) if as_bytes: @@ -3003,14 +3027,12 @@ def variants(self, as_bytes=False, samples=None, impute_missing_data=False): genotypes = bytes_genotypes.tobytes() yield Variant(site, alleles, genotypes) - def genotype_matrix(self, impute_missing_data=False): + def genotype_matrix(self, impute_missing_data=False, alleles=None): """ Returns an :math:`m \\times n` numpy array of the genotypes in this tree sequence, where :math:`m` is the number of sites and :math:`n` the number of samples. The genotypes are the indexes into the array - of ``alleles``, as described for the :class:`Variant` class. The value - 0 always corresponds to the ancestal state, and values > 0 represent - distinct derived states. + of ``alleles``, as described for the :class:`Variant` class. If there is :ref:`missing data` present the genotypes array will contain a special value @@ -3028,11 +3050,17 @@ def genotype_matrix(self, impute_missing_data=False): :param bool impute_missing_data: If True, the allele assigned to any isolated samples is the ancestral state; that is, we impute missing data as the ancestral state. Default: False. + :param tuple alleles: A tuple of strings describing the encoding of + alleles to genotype values. At least one allele must be provided. + If duplicate alleles are provided, output genotypes will always be + encoded as the first occurance of the allele. If None (the default), + the alleles are encoded as they are encountered during genotype + generation. :return: The full matrix of genotypes. :rtype: numpy.ndarray (dtype=np.int8) """ return self._ll_tree_sequence.get_genotype_matrix( - impute_missing_data=impute_missing_data) + impute_missing_data=impute_missing_data, alleles=alleles) def individual(self, id_): """