From ad1e01779ddcb673f42e13707e2f382083f3b1b6 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Thu, 27 Jan 2022 14:44:59 +0000 Subject: [PATCH] Change genotypes to 32bit --- c/CHANGELOG.rst | 9 +- c/tests/test_genotypes.c | 370 ++++++++++-------------------- c/tests/test_haplotype_matching.c | 20 +- c/tests/test_trees.c | 64 +++--- c/tskit/genotypes.c | 152 +++--------- c/tskit/genotypes.h | 6 +- c/tskit/haplotype_matching.c | 18 +- c/tskit/haplotype_matching.h | 10 +- c/tskit/tables.c | 3 +- c/tskit/trees.c | 16 +- c/tskit/trees.h | 6 +- python/_tskitmodule.c | 25 +- python/tests/test_genotypes.py | 57 ++--- python/tests/test_lowlevel.py | 9 +- 14 files changed, 267 insertions(+), 498 deletions(-) diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst index dc56315205..fdbbffd160 100644 --- a/c/CHANGELOG.rst +++ b/c/CHANGELOG.rst @@ -2,7 +2,14 @@ [0.99.16] - 2022-0X-XX ---------------------- -- Make dumping of tables and tree seqences to disk a zero-copy operation. +**Breaking changes** + +- Change the type of genotypes to ``int32_t``, removing the TSK_16_BIT_GENOTYPES flag option. + (:user:`benjeffery`, :issue:`463`, :pr:`2108`) + +**Features** + +- Make dumping of tables and tree sequences to disk a zero-copy operation. (:user:`benjeffery`, :issue:`2111`, :pr:`2124`) ---------------------- diff --git a/c/tests/test_genotypes.c b/c/tests/test_genotypes.c index df351ba408..b8a5c68a1b 100644 --- a/c/tests/test_genotypes.c +++ b/c/tests/test_genotypes.c @@ -49,20 +49,8 @@ test_simplest_missing_data(void) CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site->position, 0.0); CU_ASSERT_TRUE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i8[0], TSK_MISSING_DATA); - CU_ASSERT_EQUAL(var->genotypes.i8[1], TSK_MISSING_DATA); - ret = tsk_vargen_next(&vargen, &var); - CU_ASSERT_EQUAL_FATAL(ret, 0); - tsk_vargen_free(&vargen); - - ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, TSK_16_BIT_GENOTYPES); - CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_vargen_next(&vargen, &var); - CU_ASSERT_EQUAL_FATAL(ret, 1); - CU_ASSERT_EQUAL(var->site->position, 0.0); - CU_ASSERT_TRUE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i16[0], TSK_MISSING_DATA); - CU_ASSERT_EQUAL(var->genotypes.i16[1], TSK_MISSING_DATA); + CU_ASSERT_EQUAL(var->genotypes[0], TSK_MISSING_DATA); + CU_ASSERT_EQUAL(var->genotypes[1], TSK_MISSING_DATA); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); @@ -73,8 +61,8 @@ test_simplest_missing_data(void) CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site->position, 0.0); CU_ASSERT_FALSE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); @@ -105,8 +93,8 @@ test_simplest_missing_data_user_alleles(void) CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site->position, 0.0); CU_ASSERT_TRUE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i8[0], TSK_MISSING_DATA); - CU_ASSERT_EQUAL(var->genotypes.i8[1], TSK_MISSING_DATA); + CU_ASSERT_EQUAL(var->genotypes[0], TSK_MISSING_DATA); + CU_ASSERT_EQUAL(var->genotypes[1], TSK_MISSING_DATA); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); @@ -117,30 +105,7 @@ test_simplest_missing_data_user_alleles(void) CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site->position, 0.0); CU_ASSERT_TRUE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i8[0], TSK_MISSING_DATA); - ret = tsk_vargen_next(&vargen, &var); - CU_ASSERT_EQUAL_FATAL(ret, 0); - tsk_vargen_free(&vargen); - - ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, TSK_16_BIT_GENOTYPES); - CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_vargen_next(&vargen, &var); - CU_ASSERT_EQUAL_FATAL(ret, 1); - CU_ASSERT_EQUAL(var->site->position, 0.0); - CU_ASSERT_TRUE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i16[0], TSK_MISSING_DATA); - CU_ASSERT_EQUAL(var->genotypes.i16[1], TSK_MISSING_DATA); - ret = tsk_vargen_next(&vargen, &var); - CU_ASSERT_EQUAL_FATAL(ret, 0); - tsk_vargen_free(&vargen); - - ret = tsk_vargen_init(&vargen, &ts, samples, 1, alleles, TSK_16_BIT_GENOTYPES); - CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_vargen_next(&vargen, &var); - CU_ASSERT_EQUAL_FATAL(ret, 1); - CU_ASSERT_EQUAL(var->site->position, 0.0); - CU_ASSERT_TRUE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i8[0], TSK_MISSING_DATA); + CU_ASSERT_EQUAL(var->genotypes[0], TSK_MISSING_DATA); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); @@ -151,8 +116,8 @@ test_simplest_missing_data_user_alleles(void) CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site->position, 0.0); CU_ASSERT_FALSE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); @@ -185,20 +150,8 @@ test_simplest_missing_data_mutations(void) CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site->position, 0.0); CU_ASSERT_TRUE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[1], TSK_MISSING_DATA); - ret = tsk_vargen_next(&vargen, &var); - CU_ASSERT_EQUAL_FATAL(ret, 0); - tsk_vargen_free(&vargen); - - ret = tsk_vargen_init(&vargen, &ts, NULL, 0, alleles, TSK_16_BIT_GENOTYPES); - CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_vargen_next(&vargen, &var); - CU_ASSERT_EQUAL_FATAL(ret, 1); - CU_ASSERT_EQUAL(var->site->position, 0.0); - CU_ASSERT_TRUE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i16[0], 1); - CU_ASSERT_EQUAL(var->genotypes.i16[1], TSK_MISSING_DATA); + CU_ASSERT_EQUAL(var->genotypes[0], 1); + CU_ASSERT_EQUAL(var->genotypes[1], TSK_MISSING_DATA); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); @@ -210,19 +163,7 @@ test_simplest_missing_data_mutations(void) CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site->position, 0.0); CU_ASSERT_FALSE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 1); - ret = tsk_vargen_next(&vargen, &var); - CU_ASSERT_EQUAL_FATAL(ret, 0); - tsk_vargen_free(&vargen); - - ret = tsk_vargen_init(&vargen, &ts, samples, 1, alleles, TSK_16_BIT_GENOTYPES); - CU_ASSERT_EQUAL_FATAL(ret, 0); - tsk_vargen_print_state(&vargen, _devnull); - ret = tsk_vargen_next(&vargen, &var); - CU_ASSERT_EQUAL_FATAL(ret, 1); - CU_ASSERT_EQUAL(var->site->position, 0.0); - CU_ASSERT_FALSE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i16[0], 1); + CU_ASSERT_EQUAL(var->genotypes[0], 1); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); @@ -233,8 +174,8 @@ test_simplest_missing_data_mutations(void) CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site->position, 0.0); CU_ASSERT_FALSE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 1); + CU_ASSERT_EQUAL(var->genotypes[1], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); @@ -268,8 +209,8 @@ test_simplest_missing_data_mutations_all_samples(void) CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site->position, 0.0); CU_ASSERT_FALSE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 1); + CU_ASSERT_EQUAL(var->genotypes[0], 1); + CU_ASSERT_EQUAL(var->genotypes[1], 1); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); @@ -281,21 +222,8 @@ test_simplest_missing_data_mutations_all_samples(void) CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site->position, 0.0); CU_ASSERT_FALSE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 1); - ret = tsk_vargen_next(&vargen, &var); - CU_ASSERT_EQUAL_FATAL(ret, 0); - tsk_vargen_free(&vargen); - - ret = tsk_vargen_init(&vargen, &ts, samples, 2, alleles, TSK_16_BIT_GENOTYPES); - CU_ASSERT_EQUAL_FATAL(ret, 0); - tsk_vargen_print_state(&vargen, _devnull); - ret = tsk_vargen_next(&vargen, &var); - CU_ASSERT_EQUAL_FATAL(ret, 1); - CU_ASSERT_EQUAL(var->site->position, 0.0); - CU_ASSERT_FALSE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i16[0], 1); - CU_ASSERT_EQUAL(var->genotypes.i16[1], 1); + CU_ASSERT_EQUAL(var->genotypes[0], 1); + CU_ASSERT_EQUAL(var->genotypes[1], 1); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); @@ -306,8 +234,8 @@ test_simplest_missing_data_mutations_all_samples(void) CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site->position, 0.0); CU_ASSERT_FALSE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 1); + CU_ASSERT_EQUAL(var->genotypes[0], 1); + CU_ASSERT_EQUAL(var->genotypes[1], 1); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); @@ -356,10 +284,10 @@ test_single_tree_user_alleles(void) CU_ASSERT_NSTRING_EQUAL(var->alleles[2], "G", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[3], "T", 1); CU_ASSERT_FALSE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 3); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 2); - CU_ASSERT_EQUAL(var->genotypes.i8[2], 2); - CU_ASSERT_EQUAL(var->genotypes.i8[3], 2); + CU_ASSERT_EQUAL(var->genotypes[0], 3); + CU_ASSERT_EQUAL(var->genotypes[1], 2); + CU_ASSERT_EQUAL(var->genotypes[2], 2); + CU_ASSERT_EQUAL(var->genotypes[3], 2); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); @@ -374,10 +302,10 @@ test_single_tree_user_alleles(void) CU_ASSERT_NSTRING_EQUAL(var->alleles[2], "G", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[3], "T", 1); CU_ASSERT_FALSE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[2], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[3], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 1); + CU_ASSERT_EQUAL(var->genotypes[2], 0); + CU_ASSERT_EQUAL(var->genotypes[3], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); @@ -392,10 +320,10 @@ test_single_tree_user_alleles(void) CU_ASSERT_NSTRING_EQUAL(var->alleles[2], "G", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[3], "T", 1); CU_ASSERT_FALSE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 2); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[2], 3); - CU_ASSERT_EQUAL(var->genotypes.i8[3], 1); + CU_ASSERT_EQUAL(var->genotypes[0], 2); + CU_ASSERT_EQUAL(var->genotypes[1], 0); + CU_ASSERT_EQUAL(var->genotypes[2], 3); + CU_ASSERT_EQUAL(var->genotypes[3], 1); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); @@ -410,10 +338,10 @@ test_single_tree_user_alleles(void) CU_ASSERT_NSTRING_EQUAL(var->alleles[2], "G", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[3], "T", 1); CU_ASSERT_FALSE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 3); - CU_ASSERT_EQUAL(var->genotypes.i8[2], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[3], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 3); + CU_ASSERT_EQUAL(var->genotypes[2], 0); + CU_ASSERT_EQUAL(var->genotypes[3], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); @@ -456,10 +384,10 @@ test_single_tree_char_alphabet(void) CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "A", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "T", 1); CU_ASSERT_FALSE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[2], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[3], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 1); + CU_ASSERT_EQUAL(var->genotypes[1], 0); + CU_ASSERT_EQUAL(var->genotypes[2], 0); + CU_ASSERT_EQUAL(var->genotypes[3], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); @@ -470,10 +398,10 @@ test_single_tree_char_alphabet(void) CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "A", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "TTTAAGGG", 8); CU_ASSERT_FALSE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[2], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[3], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 1); + CU_ASSERT_EQUAL(var->genotypes[2], 0); + CU_ASSERT_EQUAL(var->genotypes[3], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); @@ -488,10 +416,10 @@ test_single_tree_char_alphabet(void) CU_ASSERT_NSTRING_EQUAL(var->alleles[2], "AT", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[3], "T", 1); CU_ASSERT_FALSE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 2); - CU_ASSERT_EQUAL(var->genotypes.i8[2], 3); - CU_ASSERT_EQUAL(var->genotypes.i8[3], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 1); + CU_ASSERT_EQUAL(var->genotypes[1], 2); + CU_ASSERT_EQUAL(var->genotypes[2], 3); + CU_ASSERT_EQUAL(var->genotypes[3], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); @@ -502,10 +430,10 @@ test_single_tree_char_alphabet(void) CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "A", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "T", 1); CU_ASSERT_FALSE(var->has_missing_data); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[2], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[3], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 1); + CU_ASSERT_EQUAL(var->genotypes[2], 0); + CU_ASSERT_EQUAL(var->genotypes[3], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); @@ -530,10 +458,10 @@ test_single_tree_binary_alphabet(void) ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[2], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[3], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 0); + CU_ASSERT_EQUAL(var->genotypes[2], 1); + CU_ASSERT_EQUAL(var->genotypes[3], 0); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); @@ -542,10 +470,10 @@ test_single_tree_binary_alphabet(void) ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[2], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[3], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 1); + CU_ASSERT_EQUAL(var->genotypes[2], 0); + CU_ASSERT_EQUAL(var->genotypes[3], 0); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); @@ -554,10 +482,10 @@ test_single_tree_binary_alphabet(void) ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[2], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[3], 1); + CU_ASSERT_EQUAL(var->genotypes[0], 1); + CU_ASSERT_EQUAL(var->genotypes[1], 1); + CU_ASSERT_EQUAL(var->genotypes[2], 1); + CU_ASSERT_EQUAL(var->genotypes[3], 1); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); @@ -595,8 +523,8 @@ test_single_tree_non_samples(void) ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 0); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); @@ -605,8 +533,8 @@ test_single_tree_non_samples(void) ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 1); + CU_ASSERT_EQUAL(var->genotypes[1], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 1); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); @@ -615,8 +543,8 @@ test_single_tree_non_samples(void) ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 0); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); @@ -670,14 +598,12 @@ test_single_tree_user_alleles_errors(void) tsk_treeseq_t ts; tsk_vargen_t vargen; tsk_variant_t *var; - int j; + /* The maximium number of alleles is 127. We need space for one more plus the * sentinel */ - const int max_alleles = 129; const char *acct_alleles[] = { "A", "C", "G", "T", NULL }; const char *zero_allele[] = { "0", NULL }; const char *no_alleles[] = { NULL }; - const char *many_alleles[max_alleles]; tsk_id_t samples[] = { 0, 3 }; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, @@ -702,13 +628,13 @@ test_single_tree_user_alleles_errors(void) CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_ZERO_ALLELES); tsk_vargen_free(&vargen); - for (j = 0; j < max_alleles; j++) { - many_alleles[j] = "0"; - } - many_alleles[128] = NULL; - ret = tsk_vargen_init(&vargen, &ts, samples, 2, many_alleles, 0); - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TOO_MANY_ALLELES); - tsk_vargen_free(&vargen); + // for (j = 0; j < max_alleles; j++) { + // many_alleles[j] = "0"; + // } + // many_alleles[128] = NULL; + // ret = tsk_vargen_init(&vargen, &ts, samples, 2, many_alleles, 0); + // CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TOO_MANY_ALLELES); + // tsk_vargen_free(&vargen); tsk_treeseq_free(&ts); } @@ -730,8 +656,8 @@ test_single_tree_subsample(void) ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 0); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); @@ -740,8 +666,8 @@ test_single_tree_subsample(void) ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 0); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); @@ -750,8 +676,8 @@ test_single_tree_subsample(void) ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 1); + CU_ASSERT_EQUAL(var->genotypes[0], 1); + CU_ASSERT_EQUAL(var->genotypes[1], 1); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); @@ -811,8 +737,7 @@ test_single_tree_many_alleles(void) tsk_vargen_t vargen; tsk_variant_t *var; tsk_size_t num_alleles = 257; - tsk_id_t j, k, l; - tsk_flags_t options; + tsk_id_t j, k; char alleles[num_alleles]; tsk_table_collection_t tables; @@ -833,80 +758,34 @@ test_single_tree_many_alleles(void) CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); - for (l = 0; l < 2; l++) { - options = 0; - if (l == 1) { - options = TSK_16_BIT_GENOTYPES; - } - ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, options); - CU_ASSERT_EQUAL_FATAL(ret, 0); - tsk_vargen_print_state(&vargen, _devnull); - ret = tsk_vargen_next(&vargen, &var); - /* We have j + 2 alleles. So, if j >= 126, we should fail with 8bit - * genotypes */ - if (l == 0 && j >= 126) { - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TOO_MANY_ALLELES); - } else { - CU_ASSERT_EQUAL_FATAL(ret, 1); - CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "Y", 1); - for (k = 1; k < (tsk_id_t) var->num_alleles; k++) { - CU_ASSERT_EQUAL(k - 1, (tsk_id_t) var->allele_lengths[k]); - CU_ASSERT_NSTRING_EQUAL( - var->alleles[k], alleles, var->allele_lengths[k]); - } - CU_ASSERT_EQUAL(var->num_alleles, (tsk_size_t) j + 2); - } - ret = tsk_vargen_free(&vargen); - CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_vargen_print_state(&vargen, _devnull); + ret = tsk_vargen_next(&vargen, &var); + /* We have j + 2 alleles. So, if j >= 126, we should fail with 8bit + * genotypes */ + // if (j >= 126) { + // CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TOO_MANY_ALLELES); + // } else { + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "Y", 1); + for (k = 1; k < (tsk_id_t) var->num_alleles; k++) { + CU_ASSERT_EQUAL(k - 1, (tsk_id_t) var->allele_lengths[k]); + CU_ASSERT_NSTRING_EQUAL(var->alleles[k], alleles, var->allele_lengths[k]); } + CU_ASSERT_EQUAL(var->num_alleles, (tsk_size_t) j + 2); + // } + ret = tsk_vargen_free(&vargen); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_treeseq_free(&ts); } tsk_table_collection_free(&tables); } static void -test_single_tree_silent_mutations_i16(void) -{ - const char *sites = "0.0 0\n" - "0.1 0\n" - "0.2 0\n"; - const char *mutations = "0 0 1\n" - "1 1 1\n" - "2 4 1\n" - "2 0 1\n"; - tsk_treeseq_t ts; - tsk_variant_t *var; - tsk_vargen_t vargen; - tsk_flags_t options[] = { 0, TSK_16_BIT_GENOTYPES }; - tsk_id_t all_samples[] = { 0, 1, 2, 3 }; - tsk_id_t *samples[] = { NULL, all_samples }; - tsk_size_t num_samples = 4; - tsk_size_t s, f; - int ret; - - tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, - sites, mutations, NULL, NULL, 0); - - for (s = 0; s < 2; s++) { - for (f = 0; f < sizeof(options) / sizeof(*options); f++) { - ret = tsk_vargen_init( - &vargen, &ts, samples[s], num_samples, NULL, options[f]); - CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_vargen_next(&vargen, &var); - CU_ASSERT_EQUAL_FATAL(ret, 1); - ret = tsk_vargen_next(&vargen, &var); - CU_ASSERT_EQUAL_FATAL(ret, 1); - ret = tsk_vargen_next(&vargen, &var); - CU_ASSERT_EQUAL_FATAL(ret, 1); - tsk_vargen_free(&vargen); - } - } - - tsk_treeseq_free(&ts); -} - -static void -test_single_tree_silent_mutations_i8(void) +test_single_tree_silent_mutations(void) { int ret = 0; tsk_treeseq_t ts; @@ -941,10 +820,10 @@ test_single_tree_silent_mutations_i8(void) ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[2], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[3], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 0); + CU_ASSERT_EQUAL(var->genotypes[2], 1); + CU_ASSERT_EQUAL(var->genotypes[3], 0); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); @@ -953,10 +832,10 @@ test_single_tree_silent_mutations_i8(void) ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[2], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[3], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 1); + CU_ASSERT_EQUAL(var->genotypes[2], 0); + CU_ASSERT_EQUAL(var->genotypes[3], 0); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); @@ -965,10 +844,10 @@ test_single_tree_silent_mutations_i8(void) ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[2], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[3], 1); + CU_ASSERT_EQUAL(var->genotypes[0], 1); + CU_ASSERT_EQUAL(var->genotypes[1], 1); + CU_ASSERT_EQUAL(var->genotypes[2], 1); + CU_ASSERT_EQUAL(var->genotypes[3], 1); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); @@ -977,10 +856,10 @@ test_single_tree_silent_mutations_i8(void) ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[2], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[3], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 0); + CU_ASSERT_EQUAL(var->genotypes[2], 0); + CU_ASSERT_EQUAL(var->genotypes[3], 0); CU_ASSERT_EQUAL(var->num_alleles, 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); @@ -1012,10 +891,7 @@ main(int argc, char **argv) { "test_single_tree_user_alleles_errors", test_single_tree_user_alleles_errors }, { "test_single_tree_subsample", test_single_tree_subsample }, { "test_single_tree_many_alleles", test_single_tree_many_alleles }, - { "test_single_tree_silent_mutations_i16", - test_single_tree_silent_mutations_i16 }, - { "test_single_tree_silent_mutations_i8", test_single_tree_silent_mutations_i8 }, - + { "test_single_tree_silent_mutations", test_single_tree_silent_mutations }, { NULL, NULL }, }; diff --git a/c/tests/test_haplotype_matching.c b/c/tests/test_haplotype_matching.c index 6500fd33b3..928bdc3545 100644 --- a/c/tests/test_haplotype_matching.c +++ b/c/tests/test_haplotype_matching.c @@ -49,7 +49,7 @@ tsk_ls_hmm_next_probability_test(tsk_ls_hmm_t *TSK_UNUSED(self), } static int -run_test_hmm(tsk_ls_hmm_t *hmm, int8_t *haplotype, tsk_compressed_matrix_t *output) +run_test_hmm(tsk_ls_hmm_t *hmm, int32_t *haplotype, tsk_compressed_matrix_t *output) { int ret = 0; @@ -79,7 +79,7 @@ test_single_tree_missing_alleles(void) double rho[] = { 0, 0.25, 0.25 }; double mu[] = { 0.125, 0.125, 0.125 }; - int8_t h[] = { 0, 0, 0, 0 }; + int32_t h[] = { 0, 0, 0, 0 }; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); @@ -108,7 +108,7 @@ test_single_tree_exact_match(void) double rho[] = { 0.0, 0.25, 0.25 }; double mu[] = { 0, 0, 0 }; - int8_t h[] = { 1, 1, 1 }; + int32_t h[] = { 1, 1, 1 }; tsk_id_t path[3]; double decoded_compressed_matrix[12]; unsigned int precision; @@ -167,7 +167,7 @@ test_single_tree_missing_haplotype_data(void) double rho[] = { 0.0, 0.25, 0.25 }; double mu[] = { 0, 0, 0 }; - int8_t h[] = { 1, TSK_MISSING_DATA, 1 }; + int32_t h[] = { 1, TSK_MISSING_DATA, 1 }; tsk_id_t path[3]; double decoded_compressed_matrix[12]; @@ -211,7 +211,7 @@ test_single_tree_match_impossible(void) double rho[] = { 0.0, 0.25, 0.25 }; double mu[] = { 0, 0, 0 }; /* This haplotype can't happen with a mutation rate of 0 */ - int8_t h[] = { 0, 0, 0 }; + int32_t h[] = { 0, 0, 0 }; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); @@ -247,7 +247,7 @@ test_single_tree_errors(void) double rho[] = { 0.0, 0.25, 0.25 }; double mu[] = { 0, 0, 0 }; - int8_t h[] = { 0, 0, 0 }; + int32_t h[] = { 0, 0, 0 }; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); @@ -309,7 +309,7 @@ test_single_tree_compressed_matrix(void) double rho[] = { 0.0, 0.25, 0.25 }; double mu[] = { 0.1, 0.1, 0.1 }; - int8_t h[] = { 0, 0, 0 }; + int32_t h[] = { 0, 0, 0 }; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); @@ -374,7 +374,7 @@ test_single_tree_viterbi_matrix(void) tsk_ls_hmm_t ls_hmm; double rho[] = { 0.0, 0.25, 0.25 }; double mu[] = { 0, 0, 0 }; - int8_t h[] = { 1, 1, 1 }; + int32_t h[] = { 1, 1, 1 }; tsk_id_t path[3]; tsk_value_transition_t T[2]; int j; @@ -448,7 +448,7 @@ test_multi_tree_exact_match(void) double rho[] = { 0.0, 0.25, 0.25 }; double mu[] = { 0, 0, 0 }; - int8_t h[] = { 1, 1, 1 }; + int32_t h[] = { 1, 1, 1 }; tsk_id_t path[3]; double decoded_compressed_matrix[12]; unsigned int precision; @@ -530,7 +530,7 @@ test_caterpillar_tree_many_values(void) tsk_ls_hmm_t ls_hmm; tsk_compressed_matrix_t matrix; double unused[] = { 0, 0, 0, 0, 0 }; - int8_t h[] = { 0, 0, 0, 0, 0 }; + int32_t h[] = { 0, 0, 0, 0, 0 }; tsk_size_t n[] = { 8, 16, diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index 5fe63adb26..38f7f576a3 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -486,7 +486,7 @@ verify_simplify_genotypes(tsk_treeseq_t *ts, tsk_treeseq_t *subset, tsk_vargen_t vargen, subset_vargen; tsk_variant_t *variant, *subset_variant; tsk_size_t j, k; - int8_t a1, a2; + int32_t a1, a2; const tsk_id_t *sample_index_map; sample_index_map = tsk_treeseq_get_sample_index_map(ts); @@ -511,8 +511,8 @@ verify_simplify_genotypes(tsk_treeseq_t *ts, tsk_treeseq_t *subset, CU_ASSERT_EQUAL(variant->site->position, subset_variant->site->position); for (k = 0; k < num_samples; k++) { CU_ASSERT_FATAL(sample_index_map[samples[k]] < (tsk_id_t) ts->num_samples); - a1 = variant->genotypes.i8[sample_index_map[samples[k]]]; - a2 = subset_variant->genotypes.i8[k]; + a1 = variant->genotypes[sample_index_map[samples[k]]]; + a2 = subset_variant->genotypes[k]; /* printf("a1 = %d, a2 = %d\n", a1, a2); */ /* printf("k = %d original node = %d " */ /* "original_index = %d a1=%.*s a2=%.*s\n", */ @@ -1330,29 +1330,29 @@ test_simplest_non_sample_leaf_records(void) CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 1); + CU_ASSERT_EQUAL(var->genotypes[1], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 1); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 1); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); @@ -1659,9 +1659,9 @@ test_simplest_back_mutations(void) CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); - CU_ASSERT_EQUAL(var->genotypes.i8[0], 0); - CU_ASSERT_EQUAL(var->genotypes.i8[1], 1); - CU_ASSERT_EQUAL(var->genotypes.i8[2], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 1); + CU_ASSERT_EQUAL(var->genotypes[2], 0); CU_ASSERT_EQUAL(var->site->id, 0); CU_ASSERT_EQUAL(var->site->mutations_length, 2); tsk_vargen_free(&vargen); @@ -3069,10 +3069,10 @@ test_simplest_map_mutations(void) const char *edges = "0 1 2 0,1\n"; tsk_treeseq_t ts; tsk_tree_t t; - int8_t genotypes[] = { 0, 0 }; + int32_t genotypes[] = { 0, 0 }; tsk_size_t num_transitions; tsk_state_transition_t *transitions; - int8_t ancestral_state; + int32_t ancestral_state; int ret; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); @@ -3151,10 +3151,10 @@ test_simplest_nonbinary_map_mutations(void) const char *edges = "0 1 4 0,1,2,3\n"; tsk_treeseq_t ts; tsk_tree_t t; - int8_t genotypes[] = { 0, 0, 0, 0 }; + int32_t genotypes[] = { 0, 0, 0, 0 }; tsk_size_t num_transitions; tsk_state_transition_t *transitions; - int8_t ancestral_state; + int32_t ancestral_state; int ret; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); @@ -3197,10 +3197,10 @@ test_simplest_unary_map_mutations(void) "0 1 4 2,3\n"; tsk_treeseq_t ts; tsk_tree_t t; - int8_t genotypes[] = { 0, 0 }; + int32_t genotypes[] = { 0, 0 }; tsk_size_t num_transitions; tsk_state_transition_t *transitions; - int8_t ancestral_state; + int32_t ancestral_state; int ret; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); @@ -3241,10 +3241,10 @@ test_simplest_non_sample_leaf_map_mutations(void) const char *edges = "0 1 2 0,1,3,4\n"; tsk_treeseq_t ts; tsk_tree_t t; - int8_t genotypes[] = { 0, 0 }; + int32_t genotypes[] = { 0, 0 }; tsk_size_t num_transitions; tsk_state_transition_t *transitions; - int8_t ancestral_state; + int32_t ancestral_state; int ret; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); @@ -3283,10 +3283,10 @@ test_simplest_internal_sample_map_mutations(void) const char *edges = "0 1 2 0,1\n"; tsk_treeseq_t ts; tsk_tree_t t; - int8_t genotypes[] = { 0, 0, 0 }; + int32_t genotypes[] = { 0, 0, 0 }; tsk_size_t num_transitions; tsk_state_transition_t *transitions; - int8_t ancestral_state; + int32_t ancestral_state; int ret; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); @@ -3340,10 +3340,10 @@ test_simplest_multiple_root_map_mutations(void) "0 1 5 2,3\n"; tsk_treeseq_t ts; tsk_tree_t t; - int8_t genotypes[] = { 0, 0, 0, 0 }; + int32_t genotypes[] = { 0, 0, 0, 0 }; tsk_size_t num_transitions; tsk_state_transition_t *transitions; - int8_t ancestral_state; + int32_t ancestral_state; int ret; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); @@ -3397,10 +3397,10 @@ test_simplest_chained_map_mutations(void) "0 1 4 2,3\n"; tsk_treeseq_t ts; tsk_tree_t t; - int8_t genotypes[] = { 0, 0, 0, 0 }; + int32_t genotypes[] = { 0, 0, 0, 0 }; tsk_size_t num_transitions; tsk_state_transition_t *transitions; - int8_t ancestral_state; + int32_t ancestral_state; int ret; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); @@ -4543,11 +4543,11 @@ test_single_tree_map_mutations(void) { tsk_treeseq_t ts; tsk_tree_t t; - int8_t genotypes[] = { 0, 1, 1, 1 }; + int32_t genotypes[] = { 0, 1, 1, 1 }; int ret = 0; tsk_size_t num_transitions; tsk_state_transition_t *transitions; - int8_t ancestral_state, j; + int32_t ancestral_state, j; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); @@ -4689,11 +4689,11 @@ test_single_tree_map_mutations_internal_samples(void) "0.00000000 1.00000000 8 7\n"; tsk_treeseq_t ts; tsk_tree_t t; - int8_t genotypes[] = { 0, 2, 2, 1, 0 }; + int32_t genotypes[] = { 0, 2, 2, 1, 0 }; int ret = 0; tsk_size_t num_transitions; tsk_state_transition_t *transitions; - int8_t ancestral_state; + int32_t ancestral_state; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 5); diff --git a/c/tskit/genotypes.c b/c/tskit/genotypes.c index ab90790412..ba2bb9ba08 100644 --- a/c/tskit/genotypes.c +++ b/c/tskit/genotypes.c @@ -171,11 +171,7 @@ tsk_variant_init(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence, self->options = options; - if (self->options & TSK_16_BIT_GENOTYPES) { - max_alleles_limit = INT16_MAX; - } else { - max_alleles_limit = INT8_MAX; - } + max_alleles_limit = INT32_MAX; if (alleles == NULL) { self->user_alleles = false; @@ -241,14 +237,10 @@ tsk_variant_init(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence, } } - if (options & TSK_16_BIT_GENOTYPES) { - self->genotypes.i16 - = tsk_malloc(num_samples_alloc * sizeof(*self->genotypes.i16)); - } else { - self->genotypes.i8 = tsk_malloc(num_samples_alloc * sizeof(*self->genotypes.i8)); - } + self->genotypes = tsk_malloc(num_samples_alloc * sizeof(*self->genotypes)); + /* Because genotypes is a union we can check the pointer */ - if (self->genotypes.i8 == NULL || self->alleles == NULL + if (self->genotypes == NULL || self->alleles == NULL || self->allele_lengths == NULL) { ret = TSK_ERR_NO_MEMORY; goto out; @@ -293,7 +285,7 @@ tsk_vargen_init(tsk_vargen_t *self, const tsk_treeseq_t *tree_sequence, int tsk_variant_free(tsk_variant_t *self) { - tsk_safe_free(self->genotypes.i8); + tsk_safe_free(self->genotypes); tsk_safe_free(self->alleles); tsk_safe_free(self->allele_lengths); tsk_safe_free(self->user_alleles_mem); @@ -317,11 +309,8 @@ tsk_variant_expand_alleles(tsk_variant_t *self) { int ret = 0; void *p; - tsk_size_t hard_limit = INT8_MAX; + tsk_size_t hard_limit = INT32_MAX; - if (self->options & TSK_16_BIT_GENOTYPES) { - hard_limit = INT16_MAX; - } if (self->max_alleles == hard_limit) { ret = TSK_ERR_TOO_MANY_ALLELES; goto out; @@ -351,17 +340,17 @@ tsk_variant_expand_alleles(tsk_variant_t *self) * same reason. */ static int TSK_WARN_UNUSED -tsk_variant_update_genotypes_i8_sample_list( +tsk_variant_update_genotypes_sample_list( tsk_variant_t *self, tsk_id_t node, tsk_id_t derived) { - int8_t *restrict genotypes = self->genotypes.i8; + int32_t *restrict genotypes = self->genotypes; const tsk_id_t *restrict list_left = self->tree->left_sample; const tsk_id_t *restrict list_right = self->tree->right_sample; const tsk_id_t *restrict list_next = self->tree->next_sample; tsk_id_t index, stop; int ret = 0; - tsk_bug_assert(derived < INT8_MAX); + tsk_bug_assert(derived < INT32_MAX); index = list_left[node]; if (index != TSK_NULL) { @@ -369,37 +358,7 @@ tsk_variant_update_genotypes_i8_sample_list( while (true) { ret += genotypes[index] == TSK_MISSING_DATA; - genotypes[index] = (int8_t) derived; - if (index == stop) { - break; - } - index = list_next[index]; - } - } - - return ret; -} - -static int TSK_WARN_UNUSED -tsk_variant_update_genotypes_i16_sample_list( - tsk_variant_t *self, tsk_id_t node, tsk_id_t derived) -{ - int16_t *restrict genotypes = self->genotypes.i16; - const tsk_id_t *restrict list_left = self->tree->left_sample; - const tsk_id_t *restrict list_right = self->tree->right_sample; - const tsk_id_t *restrict list_next = self->tree->next_sample; - tsk_id_t index, stop; - int ret = 0; - - tsk_bug_assert(derived < INT16_MAX); - - index = list_left[node]; - if (index != TSK_NULL) { - stop = list_right[node]; - while (true) { - - ret += genotypes[index] == TSK_MISSING_DATA; - genotypes[index] = (int16_t) derived; + genotypes[index] = (int32_t) derived; if (index == stop) { break; } @@ -455,81 +414,36 @@ tsk_variant_traverse( } static int -tsk_variant_visit_i8(tsk_variant_t *self, tsk_id_t sample_index, tsk_id_t derived) -{ - int ret = 0; - int8_t *restrict genotypes = self->genotypes.i8; - - tsk_bug_assert(derived < INT8_MAX); - tsk_bug_assert(sample_index != -1); - - ret = genotypes[sample_index] == TSK_MISSING_DATA; - genotypes[sample_index] = (int8_t) derived; - - return ret; -} - -static int -tsk_variant_visit_i16(tsk_variant_t *self, tsk_id_t sample_index, tsk_id_t derived) +tsk_variant_visit(tsk_variant_t *self, tsk_id_t sample_index, tsk_id_t derived) { int ret = 0; - int16_t *restrict genotypes = self->genotypes.i16; + int32_t *restrict genotypes = self->genotypes; - tsk_bug_assert(derived < INT16_MAX); + tsk_bug_assert(derived < INT32_MAX); tsk_bug_assert(sample_index != -1); ret = genotypes[sample_index] == TSK_MISSING_DATA; - genotypes[sample_index] = (int16_t) derived; + genotypes[sample_index] = (int32_t) derived; return ret; } static int TSK_WARN_UNUSED -tsk_variant_update_genotypes_i8_traversal( - tsk_variant_t *self, tsk_id_t node, tsk_id_t derived) -{ - return tsk_variant_traverse(self, node, derived, tsk_variant_visit_i8); -} - -static int TSK_WARN_UNUSED -tsk_variant_update_genotypes_i16_traversal( +tsk_variant_update_genotypes_traversal( tsk_variant_t *self, tsk_id_t node, tsk_id_t derived) { - return tsk_variant_traverse(self, node, derived, tsk_variant_visit_i16); + return tsk_variant_traverse(self, node, derived, tsk_variant_visit); } static tsk_size_t -tsk_variant_mark_missing_i16(tsk_variant_t *self) +tsk_variant_mark_missing(tsk_variant_t *self) { tsk_size_t num_missing = 0; const tsk_id_t *restrict left_child = self->tree->left_child; const tsk_id_t *restrict right_sib = self->tree->right_sib; const tsk_id_t *restrict sample_index_map = self->sample_index_map; const tsk_id_t N = self->tree->virtual_root; - int16_t *restrict genotypes = self->genotypes.i16; - tsk_id_t root, sample_index; - - for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) { - if (left_child[root] == TSK_NULL) { - sample_index = sample_index_map[root]; - if (sample_index != TSK_NULL) { - genotypes[sample_index] = TSK_MISSING_DATA; - num_missing++; - } - } - } - return num_missing; -} - -static tsk_size_t -tsk_variant_mark_missing_i8(tsk_variant_t *self) -{ - tsk_size_t num_missing = 0; - const tsk_id_t *restrict left_child = self->tree->left_child; - const tsk_id_t *restrict right_sib = self->tree->right_sib; - const tsk_id_t *restrict sample_index_map = self->sample_index_map; - const tsk_id_t N = self->tree->virtual_root; - int8_t *restrict genotypes = self->genotypes.i8; + int32_t *restrict genotypes = self->genotypes; tsk_id_t root, sample_index; for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) { @@ -569,7 +483,6 @@ tsk_variant_update_site(tsk_variant_t *self) int no_longer_missing; const tsk_site_t *site = self->site; tsk_mutation_t mutation; - bool genotypes16 = !!(self->options & TSK_16_BIT_GENOTYPES); bool impute_missing = !!(self->options & TSK_ISOLATED_NOT_MISSING); bool by_traversal = self->alt_samples != NULL; int (*update_genotypes)(tsk_variant_t *, tsk_id_t, tsk_id_t); @@ -581,19 +494,13 @@ tsk_variant_update_site(tsk_variant_t *self) * we only have a small number of samples, it's probably better to * do it by traversal. For large sets of samples though, it may be * better to use the sample list infrastructure. */ - if (genotypes16) { - mark_missing = tsk_variant_mark_missing_i16; - update_genotypes = tsk_variant_update_genotypes_i16_sample_list; - if (by_traversal) { - update_genotypes = tsk_variant_update_genotypes_i16_traversal; - } - } else { - mark_missing = tsk_variant_mark_missing_i8; - update_genotypes = tsk_variant_update_genotypes_i8_sample_list; - if (by_traversal) { - update_genotypes = tsk_variant_update_genotypes_i8_traversal; - } + + mark_missing = tsk_variant_mark_missing; + update_genotypes = tsk_variant_update_genotypes_sample_list; + if (by_traversal) { + update_genotypes = tsk_variant_update_genotypes_traversal; } + if (self->user_alleles) { allele_index = tsk_variant_get_allele_index( self, site->ancestral_state, site->ancestral_state_length); @@ -618,15 +525,10 @@ tsk_variant_update_site(tsk_variant_t *self) * field, where we require that a mutation's parent must appear before it * in the list of mutations. This guarantees the correctness of this algorithm. */ - if (genotypes16) { - for (j = 0; j < self->num_samples; j++) { - self->genotypes.i16[j] = (int16_t) allele_index; - } - } else { - for (j = 0; j < self->num_samples; j++) { - self->genotypes.i8[j] = (int8_t) allele_index; - } + for (j = 0; j < self->num_samples; j++) { + self->genotypes[j] = (int32_t) allele_index; } + /* We mark missing data *before* updating the genotypes because * mutations directly over samples should not be missing */ num_missing = 0; diff --git a/c/tskit/genotypes.h b/c/tskit/genotypes.h index 81cb92c0aa..71044d863a 100644 --- a/c/tskit/genotypes.h +++ b/c/tskit/genotypes.h @@ -32,7 +32,6 @@ extern "C" { #include -#define TSK_16_BIT_GENOTYPES (1 << 0) #define TSK_ISOLATED_NOT_MISSING (1 << 1) typedef struct { @@ -43,10 +42,7 @@ typedef struct { tsk_size_t num_alleles; tsk_size_t max_alleles; bool has_missing_data; - union { - int8_t *i8; - int16_t *i16; - } genotypes; + int32_t *genotypes; tsk_size_t num_samples; tsk_size_t num_sites; diff --git a/c/tskit/haplotype_matching.c b/c/tskit/haplotype_matching.c index 48f56e200b..fda3144ba1 100644 --- a/c/tskit/haplotype_matching.c +++ b/c/tskit/haplotype_matching.c @@ -377,7 +377,7 @@ tsk_ls_hmm_get_allele_index(tsk_ls_hmm_t *self, tsk_id_t site, const char *allel { int ret = TSK_ERR_ALLELE_NOT_FOUND; const char **alleles = self->alleles[site]; - const tsk_id_t num_alleles = self->num_alleles[site]; + const tsk_id_t num_alleles = (tsk_id_t) self->num_alleles[site]; tsk_id_t j; @@ -395,7 +395,7 @@ tsk_ls_hmm_get_allele_index(tsk_ls_hmm_t *self, tsk_id_t site, const char *allel static int tsk_ls_hmm_update_probabilities( - tsk_ls_hmm_t *self, const tsk_site_t *site, int8_t haplotype_state) + tsk_ls_hmm_t *self, const tsk_site_t *site, int32_t haplotype_state) { int ret = 0; tsk_id_t root; @@ -403,7 +403,7 @@ tsk_ls_hmm_update_probabilities( tsk_id_t *restrict parent = self->parent; tsk_id_t *restrict T_index = self->transition_index; tsk_value_transition_t *restrict T = self->transitions; - int8_t *restrict allelic_state = self->allelic_state; + int32_t *restrict allelic_state = self->allelic_state; const tsk_id_t left_root = tsk_tree_get_left_root(tree); tsk_mutation_t mut; tsk_id_t j, u, v; @@ -417,7 +417,7 @@ tsk_ls_hmm_update_probabilities( goto out; } for (root = left_root; root != TSK_NULL; root = tree->right_sib[root]) { - allelic_state[root] = (int8_t) ret; + allelic_state[root] = (int32_t) ret; } for (j = 0; j < (tsk_id_t) site->mutations_length; j++) { @@ -428,7 +428,7 @@ tsk_ls_hmm_update_probabilities( goto out; } u = mut.node; - allelic_state[u] = (int8_t) ret; + allelic_state[u] = (int32_t) ret; if (T_index[u] == TSK_NULL) { while (T_index[u] == TSK_NULL) { u = parent[u]; @@ -922,7 +922,7 @@ tsk_ls_hmm_compress(tsk_ls_hmm_t *self) static int tsk_ls_hmm_process_site( - tsk_ls_hmm_t *self, const tsk_site_t *site, int8_t haplotype_state) + tsk_ls_hmm_t *self, const tsk_site_t *site, int32_t haplotype_state) { int ret = 0; double x, normalisation_factor; @@ -961,7 +961,7 @@ tsk_ls_hmm_process_site( } int -tsk_ls_hmm_run(tsk_ls_hmm_t *self, int8_t *haplotype, +tsk_ls_hmm_run(tsk_ls_hmm_t *self, int32_t *haplotype, int (*next_probability)(tsk_ls_hmm_t *, tsk_id_t, double, bool, tsk_id_t, double *), double (*compute_normalisation_factor)(struct _tsk_ls_hmm_t *), void *output) { @@ -1061,7 +1061,7 @@ tsk_ls_hmm_next_probability_forward(tsk_ls_hmm_t *self, tsk_id_t site_id, double } int -tsk_ls_hmm_forward(tsk_ls_hmm_t *self, int8_t *haplotype, +tsk_ls_hmm_forward(tsk_ls_hmm_t *self, int32_t *haplotype, tsk_compressed_matrix_t *output, tsk_flags_t options) { int ret = 0; @@ -1143,7 +1143,7 @@ tsk_ls_hmm_next_probability_viterbi(tsk_ls_hmm_t *self, tsk_id_t site, double p_ } int -tsk_ls_hmm_viterbi(tsk_ls_hmm_t *self, int8_t *haplotype, tsk_viterbi_matrix_t *output, +tsk_ls_hmm_viterbi(tsk_ls_hmm_t *self, int32_t *haplotype, tsk_viterbi_matrix_t *output, tsk_flags_t options) { int ret = 0; diff --git a/c/tskit/haplotype_matching.h b/c/tskit/haplotype_matching.h index 4c392a1b94..7f70d32915 100644 --- a/c/tskit/haplotype_matching.h +++ b/c/tskit/haplotype_matching.h @@ -92,7 +92,7 @@ typedef struct _tsk_ls_hmm_t { double *mutation_rate; const char ***alleles; unsigned int precision; - uint8_t *num_alleles; + uint32_t *num_alleles; tsk_size_t num_samples; tsk_size_t num_sites; tsk_size_t num_nodes; @@ -123,7 +123,7 @@ typedef struct _tsk_ls_hmm_t { tsk_id_t *transition_parent; /* The number of samples directly subtended by a transition */ tsk_size_t *num_transition_samples; - int8_t *allelic_state; + int32_t *allelic_state; /* Algorithms set these values before they are run */ int (*next_probability)( struct _tsk_ls_hmm_t *, tsk_id_t, double, bool, tsk_id_t, double *); @@ -136,11 +136,11 @@ int tsk_ls_hmm_init(tsk_ls_hmm_t *self, tsk_treeseq_t *tree_sequence, int tsk_ls_hmm_set_precision(tsk_ls_hmm_t *self, unsigned int precision); int tsk_ls_hmm_free(tsk_ls_hmm_t *self); void tsk_ls_hmm_print_state(tsk_ls_hmm_t *self, FILE *out); -int tsk_ls_hmm_forward(tsk_ls_hmm_t *self, int8_t *haplotype, +int tsk_ls_hmm_forward(tsk_ls_hmm_t *self, int32_t *haplotype, tsk_compressed_matrix_t *output, tsk_flags_t options); -int tsk_ls_hmm_viterbi(tsk_ls_hmm_t *self, int8_t *haplotype, +int tsk_ls_hmm_viterbi(tsk_ls_hmm_t *self, int32_t *haplotype, tsk_viterbi_matrix_t *output, tsk_flags_t options); -int tsk_ls_hmm_run(tsk_ls_hmm_t *self, int8_t *haplotype, +int tsk_ls_hmm_run(tsk_ls_hmm_t *self, int32_t *haplotype, int (*next_probability)(tsk_ls_hmm_t *, tsk_id_t, double, bool, tsk_id_t, double *), double (*compute_normalisation_factor)(struct _tsk_ls_hmm_t *), void *output); diff --git a/c/tskit/tables.c b/c/tskit/tables.c index 43be3fee67..d9d889c0a9 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -11849,7 +11849,8 @@ tsk_check_subset_equality(tsk_table_collection_t *self, goto out; } if (!tsk_table_collection_equals(&self_copy, &other_copy, - TSK_CMP_IGNORE_TS_METADATA | TSK_CMP_IGNORE_PROVENANCE)) { + TSK_CMP_IGNORE_TS_METADATA | TSK_CMP_IGNORE_PROVENANCE + | TSK_CMP_IGNORE_REFERENCE_SEQUENCE)) { ret = TSK_ERR_UNION_DIFF_HISTORIES; goto out; } diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 90623320d6..9d499d69d0 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -4635,13 +4635,13 @@ tsk_tree_postorder( /* Parsimony methods */ static inline uint64_t -set_bit(uint64_t value, int8_t bit) +set_bit(uint64_t value, int32_t bit) { return value | (1ULL << bit); } static inline bool -bit_is_set(uint64_t value, int8_t bit) +bit_is_set(uint64_t value, int32_t bit) { return (value & (1ULL << bit)) != 0; } @@ -4677,15 +4677,15 @@ get_smallest_set_bit(uint64_t v) * Given Tree", Biometrics 1973. */ int TSK_WARN_UNUSED -tsk_tree_map_mutations(tsk_tree_t *self, int8_t *genotypes, - double *TSK_UNUSED(cost_matrix), tsk_flags_t options, int8_t *r_ancestral_state, +tsk_tree_map_mutations(tsk_tree_t *self, int32_t *genotypes, + double *TSK_UNUSED(cost_matrix), tsk_flags_t options, int32_t *r_ancestral_state, tsk_size_t *r_num_transitions, tsk_state_transition_t **r_transitions) { int ret = 0; struct stack_elem { tsk_id_t node; tsk_id_t transition_parent; - int8_t state; + int32_t state; }; const tsk_size_t num_samples = self->tree_sequence->num_samples; const tsk_id_t *restrict left_child = self->left_child; @@ -4706,13 +4706,13 @@ tsk_tree_map_mutations(tsk_tree_t *self, int8_t *genotypes, tsk_id_t u, v; /* The largest possible number of transitions is one over every sample */ tsk_state_transition_t *transitions = tsk_malloc(num_samples * sizeof(*transitions)); - int8_t allele, ancestral_state; + int32_t allele, ancestral_state; int stack_top; struct stack_elem s; tsk_size_t j, num_transitions, max_allele_count, num_nodes; tsk_size_t allele_count[HARTIGAN_MAX_ALLELES]; tsk_size_t non_missing = 0; - int8_t num_alleles = 0; + int32_t num_alleles = 0; if (optimal_set == NULL || preorder_stack == NULL || transitions == NULL || nodes == NULL) { @@ -4748,7 +4748,7 @@ tsk_tree_map_mutations(tsk_tree_t *self, int8_t *genotypes, ret = TSK_ERR_BAD_ANCESTRAL_STATE; goto out; } else if (ancestral_state >= num_alleles) { - num_alleles = (int8_t)(ancestral_state + 1); + num_alleles = (int32_t)(ancestral_state + 1); } } diff --git a/c/tskit/trees.h b/c/tskit/trees.h index 896820e268..5378922d26 100644 --- a/c/tskit/trees.h +++ b/c/tskit/trees.h @@ -507,11 +507,11 @@ int tsk_tree_preorder_samples( typedef struct { tsk_id_t node; tsk_id_t parent; - int8_t state; + int32_t state; } tsk_state_transition_t; -int tsk_tree_map_mutations(tsk_tree_t *self, int8_t *genotypes, double *cost_matrix, - tsk_flags_t options, int8_t *ancestral_state, tsk_size_t *num_transitions, +int tsk_tree_map_mutations(tsk_tree_t *self, int32_t *genotypes, double *cost_matrix, + tsk_flags_t options, int32_t *ancestral_state, tsk_size_t *num_transitions, tsk_state_transition_t **transitions); int tsk_tree_kc_distance( diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index 5f5c4e5295..b68a7f3919 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -584,14 +584,14 @@ make_variant(tsk_variant_t *variant, tsk_size_t num_samples) PyObject *ret = NULL; npy_intp dims = num_samples; PyObject *alleles = make_alleles(variant); - PyArrayObject *genotypes = (PyArrayObject *) PyArray_SimpleNew(1, &dims, NPY_INT8); + PyArrayObject *genotypes = (PyArrayObject *) PyArray_SimpleNew(1, &dims, NPY_INT32); /* TODO update this to account for 16 bit variants when we provide the * high-level interface. */ if (genotypes == NULL || alleles == NULL) { goto out; } - memcpy(PyArray_DATA(genotypes), variant->genotypes.i8, num_samples * sizeof(int8_t)); + memcpy(PyArray_DATA(genotypes), variant->genotypes, num_samples * sizeof(int32_t)); ret = Py_BuildValue("iOO", variant->site->id, genotypes, alleles); out: Py_XDECREF(genotypes); @@ -9287,7 +9287,7 @@ TreeSequence_get_genotype_matrix(TreeSequence *self, PyObject *args, PyObject *k PyObject *py_alleles = Py_None; PyArrayObject *genotype_matrix = NULL; tsk_vargen_t *vg = NULL; - char *V; + int32_t *V; tsk_variant_t *variant; tsk_size_t j; int isolated_as_missing = 1; @@ -9319,11 +9319,11 @@ TreeSequence_get_genotype_matrix(TreeSequence *self, PyObject *args, PyObject *k dims[0] = num_sites; dims[1] = num_samples; - genotype_matrix = (PyArrayObject *) PyArray_SimpleNew(2, dims, NPY_INT8); + genotype_matrix = (PyArrayObject *) PyArray_SimpleNew(2, dims, NPY_INT32); if (genotype_matrix == NULL) { goto out; } - V = (char *) PyArray_DATA(genotype_matrix); + V = (int32_t *) PyArray_DATA(genotype_matrix); vg = PyMem_Malloc(sizeof(tsk_vargen_t)); if (vg == NULL) { PyErr_NoMemory(); @@ -9336,8 +9336,7 @@ TreeSequence_get_genotype_matrix(TreeSequence *self, PyObject *args, PyObject *k } j = 0; while ((err = tsk_vargen_next(vg, &variant)) == 1) { - memcpy( - V + (j * num_samples), variant->genotypes.i8, num_samples * sizeof(int8_t)); + memcpy(V + (j * num_samples), variant->genotypes, num_samples * sizeof(int32_t)); j++; } if (err != 0) { @@ -10459,7 +10458,7 @@ Tree_map_mutations(Tree *self, PyObject *args, PyObject *kwds) PyObject *py_ancestral_state = Py_None; PyArrayObject *genotypes_array = NULL; static char *kwlist[] = { "genotypes", "ancestral_state", NULL }; - int8_t ancestral_state; + int32_t ancestral_state; tsk_state_transition_t *transitions = NULL; tsk_size_t num_transitions; npy_intp *shape; @@ -10474,7 +10473,7 @@ Tree_map_mutations(Tree *self, PyObject *args, PyObject *kwds) goto out; } genotypes_array = (PyArrayObject *) PyArray_FROMANY( - genotypes, NPY_INT8, 1, 1, NPY_ARRAY_IN_ARRAY); + genotypes, NPY_INT32, 1, 1, NPY_ARRAY_IN_ARRAY); if (genotypes_array == NULL) { goto out; } @@ -10493,10 +10492,10 @@ Tree_map_mutations(Tree *self, PyObject *args, PyObject *kwds) } /* Note this does allow large numbers to overflow, but higher levels * should be checking for these error anyway. */ - ancestral_state = (int8_t) PyLong_AsLong(py_ancestral_state); + ancestral_state = (int32_t) PyLong_AsLong(py_ancestral_state); } - err = tsk_tree_map_mutations(self->tree, (int8_t *) PyArray_DATA(genotypes_array), + err = tsk_tree_map_mutations(self->tree, (int32_t *) PyArray_DATA(genotypes_array), NULL, options, &ancestral_state, &num_transitions, &transitions); if (err != 0) { handle_library_error(err); @@ -12056,7 +12055,7 @@ LsHmm_forward_matrix(LsHmm *self, PyObject *args) } num_sites = (npy_intp) tsk_treeseq_get_num_sites(self->tree_sequence->tree_sequence); haplotype_array = (PyArrayObject *) PyArray_FROMANY( - haplotype, NPY_INT8, 1, 1, NPY_ARRAY_IN_ARRAY); + haplotype, NPY_INT32, 1, 1, NPY_ARRAY_IN_ARRAY); if (haplotype_array == NULL) { goto out; } @@ -12097,7 +12096,7 @@ LsHmm_viterbi_matrix(LsHmm *self, PyObject *args) } num_sites = (npy_intp) tsk_treeseq_get_num_sites(self->tree_sequence->tree_sequence); haplotype_array = (PyArrayObject *) PyArray_FROMANY( - haplotype, NPY_INT8, 1, 1, NPY_ARRAY_IN_ARRAY); + haplotype, NPY_INT32, 1, 1, NPY_ARRAY_IN_ARRAY); if (haplotype_array == NULL) { goto out; } diff --git a/python/tests/test_genotypes.py b/python/tests/test_genotypes.py index acd1109b5a..b6d66999b1 100644 --- a/python/tests/test_genotypes.py +++ b/python/tests/test_genotypes.py @@ -217,7 +217,7 @@ def test_as_bytes_fails(self): def test_dtype(self): ts = self.get_tree_sequence() for var in ts.variants(): - assert var.genotypes.dtype == np.int8 + assert var.genotypes.dtype == np.int32 def test_dtype_conversion(self): # Check if we hit any issues if we assume the variants are uint8 @@ -256,19 +256,15 @@ def test_many_alleles(self): num_alleles = 1 for allele in alleles[1:]: ts = tables.tree_sequence() - if num_alleles > 127: - with pytest.raises(exceptions.LibraryError): - next(ts.variants()) - else: - var = next(ts.variants()) - assert not var.has_missing_data - assert var.num_alleles == num_alleles - assert len(var.alleles) == num_alleles - assert list(var.alleles) == alleles[:num_alleles] - assert var.alleles[var.genotypes[0]] == alleles[num_alleles - 1] - for u in ts.samples(): - if u != 0: - assert var.alleles[var.genotypes[u]] == alleles[0] + var = next(ts.variants()) + assert not var.has_missing_data + assert var.num_alleles == num_alleles + assert len(var.alleles) == num_alleles + assert list(var.alleles) == alleles[:num_alleles] + assert var.alleles[var.genotypes[0]] == alleles[num_alleles - 1] + for u in ts.samples(): + if u != 0: + assert var.alleles[var.genotypes[u]] == alleles[0] tables.mutations.add_row(0, 0, allele, parent=parent) parent += 1 num_alleles += 1 @@ -288,22 +284,19 @@ def test_many_alleles_missing_data(self): num_alleles = 1 for allele in alleles[1:]: ts = tables.tree_sequence() - if num_alleles > 127: - with pytest.raises(exceptions.LibraryError): - next(ts.variants()) - else: - var = next(ts.variants()) - assert var.has_missing_data - assert var.num_alleles == num_alleles - assert len(var.alleles) == num_alleles + 1 - assert list(var.alleles)[:-1] == alleles[:num_alleles] - assert var.alleles[-1] is None - assert var.alleles[var.genotypes[0]] == alleles[num_alleles - 1] - assert var.genotypes[-1] == -1 - samples = ts.samples() - for u in samples[:-1]: - if u != 0: - assert var.alleles[var.genotypes[u]] == alleles[0] + + var = next(ts.variants()) + assert var.has_missing_data + assert var.num_alleles == num_alleles + assert len(var.alleles) == num_alleles + 1 + assert list(var.alleles)[:-1] == alleles[:num_alleles] + assert var.alleles[-1] is None + assert var.alleles[var.genotypes[0]] == alleles[num_alleles - 1] + assert var.genotypes[-1] == -1 + samples = ts.samples() + for u in samples[:-1]: + if u != 0: + assert var.alleles[var.genotypes[u]] == alleles[0] tables.mutations.add_row(0, 0, allele, parent=parent) parent += 1 num_alleles += 1 @@ -322,12 +315,12 @@ def test_no_mutations(self): def test_genotype_matrix(self): ts = self.get_tree_sequence() - G = np.empty((ts.num_sites, ts.num_samples), dtype=np.int8) + G = np.empty((ts.num_sites, ts.num_samples), dtype=np.int32) for v in ts.variants(): G[v.index, :] = v.genotypes G2 = ts.genotype_matrix() assert np.array_equal(G, G2) - assert G2.dtype == np.int8 + assert G2.dtype == np.int32 def test_recurrent_mutations_over_samples(self): ts = self.get_tree_sequence() diff --git a/python/tests/test_lowlevel.py b/python/tests/test_lowlevel.py index 63350cbf78..56c1f0c676 100644 --- a/python/tests/test_lowlevel.py +++ b/python/tests/test_lowlevel.py @@ -2315,10 +2315,6 @@ def test_alleles(self): with pytest.raises(TypeError): _tskit.VariantGenerator(ts, samples=[1, 2], alleles=(bad_allele_type,)) - too_many_alleles = tuple(str(j) for j in range(128)) - with pytest.raises(_tskit.LibraryError): - _tskit.VariantGenerator(ts, samples=[1, 2], alleles=too_many_alleles) - def test_iterator(self): ts = self.get_example_tree_sequence() self.verify_iterator(_tskit.VariantGenerator(ts)) @@ -3182,7 +3178,6 @@ def test_map_mutations_errors(self): ts = self.get_example_tree_sequence() tree = _tskit.Tree(ts) n = ts.get_num_samples() - genotypes = np.zeros(n, dtype=np.int8) with pytest.raises(TypeError): tree.map_mutations() for bad_size in [0, 1, n - 1, n + 1]: @@ -3191,9 +3186,9 @@ def test_map_mutations_errors(self): for bad_type in [None, {}, set()]: with pytest.raises(TypeError): tree.map_mutations([bad_type] * n) - for bad_type in [np.uint8, np.uint64, np.float32]: + for bad_type in [np.uint32, np.uint64, np.float32]: with pytest.raises(TypeError): - tree.map_mutations(np.zeros(bad_size, dtype=bad_type)) + tree.map_mutations(np.zeros(n, dtype=bad_type)) genotypes = np.zeros(n, dtype=np.int8) tree.map_mutations(genotypes) for bad_value in [64, 65, 127, -2]: