diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 8105a987..2159d019 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,13 +4,35 @@ **Breaking changes**: -- +- `reference_sequence` is now a tskit attribute, no longer managed by pyslim. + It is no longer mutable on tree sequences (only TableCollections), and + previous calls to `ts.reference_sequence` to get the actual sequence + should be replaced by `ts.reference_sequence.data`. + +- `annotate_defaults` no longer returns a SlimTreeSequence; wrap the call + in `pyslim.SlimTreeSequence( )` to retain previous behavior. + +- Old-style "legacy" metadata (previously deprecated) has been removed. + See `the documentation `_ + for instructions on migrating your code. + +- The `SlimTreeSequence` class is now deprecated, and using it produces a + warning, and will be removed at some point in the future (not soon). **New features**: - Added `pyslim.population_size( )` to compute an array giving numbers of individuals across a grid of space and time bins. ({user}giliapatterson) +- `recapitate` is updated to use new demography features in msprime 1.0. + +- Methods of the SlimTreeSequence class are now methods of pyslim: + `recapitate`, `individual_parents`, `individual_ages_at`, + `has_individual_parents`, `individuals_alive_at`, + `mutation_at`, `nucleotide_at`, `slim_time`. For instance it is now + recommended to call `pyslim.recapitate(ts, ...)` instead of + `ts.recapitate(...)`. + ******************** [0.600] - 2021-02-24 ******************** diff --git a/docs/_toc.yml b/docs/_toc.yml index cb71c078..e772cba1 100644 --- a/docs/_toc.yml +++ b/docs/_toc.yml @@ -11,6 +11,7 @@ - file: vignette_coalescent_diversity - file: vignette_parallel_phylo - file: metadata + - file: previous_versions - part: pyslim reference chapters: diff --git a/docs/metadata.md b/docs/metadata.md index 92535243..5b4b47bc 100644 --- a/docs/metadata.md +++ b/docs/metadata.md @@ -19,7 +19,7 @@ import numpy as np import random random.seed(23) -ts = pyslim.load("example_sim.trees") +ts = tskit.load("example_sim.trees") tables = ts.tables ``` @@ -243,12 +243,13 @@ mod_ts.dump("modified_ts.trees") ### Metadata entries -SLiM records additional information in the metadata columns of Population, Individual, Node, and Mutation tables, +SLiM records additional information in the metadata columns of Individual, Node, and Mutation tables, in a binary format using the python ``struct`` module. See {ref}`tskit's metadata documentation ` for details on how this works. Nothing besides this binary information can be stored in the metadata of these tables if the tree sequence is to be used by SLiM, and so when ``pyslim`` annotates an existing tree sequence, anything in those columns is overwritten. +Population metadata is stored as JSON, however, which is more flexible. For more detailed documentation on the contents and format of the metadata, see the SLiM manual. Of particular note is that *nodes* and *populations* may have empty metadata. @@ -256,104 +257,3 @@ SLiM will not use the metadata of nodes that are not associated with alive indiv so this can safely be omitted (and makes recapitation easier). And, populations not used by SLiM will have empty metadata. All remaining metadata are required (besides edges and sites, whose metadata is not used at all). - - -(sec_legacy_metadata)= - -## Legacy metadata - -In previous versions of pyslim, -SLiM-specific metadata was provided as customized objects: -for instance, for a node ``n`` provided by a ``SlimTreeSequence``, -we'd have ``n.metadata`` as a ``NodeMetadata`` object, -with attributes ``n.metadata.slim_id`` and ``n.metadata.is_null`` and ``n.metadata.genome_type``. -However, with tskit 0.3, -the capacity to deal with structured metadata -was implemented in {ref}`tskit itself `, -and so pyslim shifted to using the tskit-native metadata tools. -As a result, parsed metadata is provided as a dictionary instead of an object, -so that now ``n.metadata`` would be a dict, -with entries ``n.metadata["slim_id"]`` and ``n.metadata["is_null"]`` and ``n.metadata["genome_type"]``. -Annotation should be done with tskit methods (e.g., ``packset_metadata``). - -.. note:: - - Until pyslim version 0.600, the old-style metadata was still available, - but this functionality has been removed. - -Here are more detailed notes on how to migrate a script from the legacy -metadata handling. If you run into issues, please ask (open a discussion on github). - -**1.** Use top-level metadata instead of ``slim_provenance``: -previously, information about the model type and the time counter (generation) -in SLiM was provided in the Provenances table, made available through -the ``ts.slim_provenance`` object. This is still available but deprecated, -and should be obtained from the *top-level* metadata object, ``ts.metadata["SLiM"]``. -So, in your scripts ``ts.slim_provenance.model_type`` should be replaced with -``ts.metadata["SLiM"]["model_type"]``, -and (although it's not deprecated), probably ``ts.slim_generation`` should -probably be replaced with -``ts.metadata["SLiM"]["generation"]``. - -**2.** Switch metadata objects to dicts: -if ``md`` is the ``metadata`` property of a population, individual, or node, -this means replacing ``md.X`` with ``md["X"]``. -The ``migration_records`` property of population metadata is similarly -a list of dicts rather than a list of objects, so instead of -``ts.population(1).metadata.migration_records[0].source_subpop`` -we would write -``ts.population(1).metadata["migration_records"][0]["source_subpop"]``. - -Mutations were previously a bit different - if ``mut`` is a mutation -(e.g., ``mut = ts.mutation(0)``) -then ``mut.metadata`` was previously a list of MutationMetadata objects. -Now, ``mut.metadata`` is a dict, with a single entry: -``mut.metadata["mutation_list"]`` is a list of dicts, each containing the information -that was previously in the MutationMetadata objects. -So, for instance, instead of ``mut.metadata[0].selection_coeff`` -we would write ``mut.metadata["mutation_list"][0]["selection_coeff"]``. - -**3.** The ``decode_X`` and ``encode_X`` methods are now deprecated, -as this is handled by tskit itself. -For instance, ``encode_node`` would take a NodeMetadata object -and produce the raw bytes necessary to encode it in a Node table, -and ``decode_node`` would do the inverse operation. -This is now handled by the relevant MetadataSchema object: -for nodes one can obtain this as ``nms = ts.tables.nodes.metadata_schema``, -which has the methods ``nms.validate_and_encode_row`` and ``nms.decode_row``. -Decoding is for the most part not necessary, -since the metadata is automatically decoded, -but ``pyslim.decode_node(raw_md)`` could be replaced by ``nms.decode_row(raw_md)``. -Encoding is necessary to modify tables, -and ``pyslim.encode_node(md)`` can be replaced by ``nms.validate_and_encode_row(md)`` -(where furthermore ``md`` should now be a dict rather than a NodeMetadata object). - -**4.** The ``annotate_X_metadata`` methods are deprecated, -as again tskit has tools to do this. -These methods would set the metadata column of a table - -for instance, if ``metadata`` is a list of NodeMetadata objects, then -``annotate_node_metadata(tables, metadata)`` would modify ``tables.nodes`` in place -to contain the (encoded) metadata in the list ``metadata``. -Now, this could be done as follows (where now ``metadata`` is a list of metadata dicts): - -```{code-cell} -metadata = [ {'slim_id': k, 'is_null': False, 'genome_type': 0} - for k in range(tables.nodes.num_rows) ] -nms = tables.nodes.metadata_schema -tables.nodes.packset_metadata( - [nms.validate_and_encode_row(r) for r in metadata]) -``` - -If speed is an issue, then ``encode_row`` can be substituted for ``validate_and_encode_row``, -but at the risk of missing errors in metadata. - -**5.** the ``extract_X_metadata`` methods are not necessary, -since the metadata in the tables of a TableCollection are automatically decoded. -For instance, ``[ind.metadata["sex"] for ind in tables.individuals]`` will obtain -a list of sexes of the individuals in the IndividualTable. - -:::{warning} - It is our intention to remain backwards-compatible for a time. - However, the legacy code will disappear at some point in the future, - so please migrate over scripts you intend to rely on. -::: diff --git a/docs/previous_versions.md b/docs/previous_versions.md new file mode 100644 index 00000000..992915b4 --- /dev/null +++ b/docs/previous_versions.md @@ -0,0 +1,159 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{code-cell} +:tags: [remove-cell] +import pyslim, tskit, msprime + +ts = tskit.load("example_sim.trees") +tables = ts.tables +``` + + +(sec_previous_versions)= + + +# Migrating from previous versions of pyslim + +A number of features that were first introduced in pyslim have been made part of core +tskit functionality. For instance, reference sequence support was provided (although +loosely) inpyslim to support SLiM's nucleotide models, but is now part of a standard +tskit {class}`tskit.TreeSequence`. Similarly, metadata processing in tskit made +code to do this within pyslim obsolete; this "legacy metadata" code has been removed +and instructions for how to migrate your code are {ref}`below `. + +In fact, we are now at the (very good) place where we don't need +the {class}`pyslim.SlimTreeSequence` class any longer - it only makes code more compliated. +So, pyslim is migrating to be purely functional: instead of providing the SlimTreeSequence +class with specialized methods, all methods will be functions of TreeSequences, +that take in a tree sequence and return something +(a modified tree sequence or some summary of it). +Backwards compatibility will be maintained for some time, but we request that you +switch over sooner, as your code will be cleaner and faster. + +To do this, you should + +1. Remove all calls to `pyslim.SlimTreeSequence( )`. They are unnecessary. +2. Replace `ts.slim_generation` with `ts.metadata['SLiM']['generation']`, + and `ts.model_type` with `ts.metadata['SLiM']['model_type']`. +3. Replace `ts.reference_sequence` with `ts.reference_sequence.data`. +4. Replace calls to `ts.recapitate(...)` with `pyslim.recapitate(ts, ...)`, + and similarly with other SlimTreeSequence methods. + +If you encounter difficulties, please post an +[issue](https://github.com/tskit-dev/pyslim/issues) +or [discussion](https://github.com/tskit-dev/pyslim/discussions) on github. + + +(sec_legacy_metadata)= + +## Legacy metadata + +In previous versions of pyslim, +SLiM-specific metadata was provided as customized objects: +for instance, for a node ``n`` provided by a ``SlimTreeSequence``, +we'd have ``n.metadata`` as a ``NodeMetadata`` object, +with attributes ``n.metadata.slim_id`` and ``n.metadata.is_null`` and ``n.metadata.genome_type``. +However, with tskit 0.3, +the capacity to deal with structured metadata +was implemented in {ref}`tskit itself `, +and so pyslim shifted to using the tskit-native metadata tools. +As a result, parsed metadata is provided as a dictionary instead of an object, +so that now ``n.metadata`` would be a dict, +with entries ``n.metadata["slim_id"]`` and ``n.metadata["is_null"]`` and ``n.metadata["genome_type"]``. +Annotation should be done with tskit methods (e.g., ``packset_metadata``). + +.. note:: + + Until pyslim version 0.600, the old-style metadata was still available, + but this functionality has been removed. + +Here are more detailed notes on how to migrate a script from the legacy +metadata handling. If you run into issues, please ask (open a discussion on github). + +**1.** Use top-level metadata instead of ``slim_provenance``: +previously, information about the model type and the time counter (generation) +in SLiM was provided in the Provenances table, made available through +the ``ts.slim_provenance`` object. This is still available but deprecated, +and should be obtained from the *top-level* metadata object, ``ts.metadata["SLiM"]``. +So, in your scripts ``ts.slim_provenance.model_type`` should be replaced with +``ts.metadata["SLiM"]["model_type"]``, +and (although it's not deprecated), probably ``ts.slim_generation`` should +probably be replaced with +``ts.metadata["SLiM"]["generation"]``. + +**2.** Switch metadata objects to dicts: +if ``md`` is the ``metadata`` property of a population, individual, or node, +this means replacing ``md.X`` with ``md["X"]``. +The ``migration_records`` property of population metadata is similarly +a list of dicts rather than a list of objects, so instead of +``ts.population(1).metadata.migration_records[0].source_subpop`` +we would write +``ts.population(1).metadata["migration_records"][0]["source_subpop"]``. + +Mutations were previously a bit different - if ``mut`` is a mutation +(e.g., ``mut = ts.mutation(0)``) +then ``mut.metadata`` was previously a list of MutationMetadata objects. +Now, ``mut.metadata`` is a dict, with a single entry: +``mut.metadata["mutation_list"]`` is a list of dicts, each containing the information +that was previously in the MutationMetadata objects. +So, for instance, instead of ``mut.metadata[0].selection_coeff`` +we would write ``mut.metadata["mutation_list"][0]["selection_coeff"]``. + +**3.** The ``decode_X`` and ``encode_X`` methods are now deprecated, +as this is handled by tskit itself. +For instance, ``encode_node`` would take a NodeMetadata object +and produce the raw bytes necessary to encode it in a Node table, +and ``decode_node`` would do the inverse operation. +This is now handled by the relevant MetadataSchema object: +for nodes one can obtain this as ``nms = ts.tables.nodes.metadata_schema``, +which has the methods ``nms.validate_and_encode_row`` and ``nms.decode_row``. +Decoding is for the most part not necessary, +since the metadata is automatically decoded, +but ``pyslim.decode_node(raw_md)`` could be replaced by ``nms.decode_row(raw_md)``. +Encoding is necessary to modify tables, +and ``pyslim.encode_node(md)`` can be replaced by ``nms.validate_and_encode_row(md)`` +(where furthermore ``md`` should now be a dict rather than a NodeMetadata object). + +**4.** The ``annotate_X_metadata`` methods are deprecated, +as again tskit has tools to do this. +These methods would set the metadata column of a table - +for instance, if ``metadata`` is a list of NodeMetadata objects, then +``annotate_node_metadata(tables, metadata)`` would modify ``tables.nodes`` in place +to contain the (encoded) metadata in the list ``metadata``. +Now, this could be done as follows (where now ``metadata`` is a list of metadata dicts): + +```{code-cell} +metadata = [ {'slim_id': k, 'is_null': False, 'genome_type': 0} + for k in range(tables.nodes.num_rows) ] +nms = tables.nodes.metadata_schema +tables.nodes.packset_metadata( + [nms.validate_and_encode_row(r) for r in metadata] +) +``` + +If speed is an issue, then ``encode_row`` can be substituted for ``validate_and_encode_row``, +but at the risk of missing errors in metadata. + +**5.** the ``extract_X_metadata`` methods are not necessary, +since the metadata in the tables of a TableCollection are automatically decoded. +For instance, ``[ind.metadata["sex"] for ind in tables.individuals]`` will obtain +a list of sexes of the individuals in the IndividualTable. + +:::{warning} + It is our intention to remain backwards-compatible for a time. + However, the legacy code will disappear at some point in the future, + so please migrate over scripts you intend to rely on. +::: +======= +>>>>>>> 483184a (deprecation start) diff --git a/docs/python_api.md b/docs/python_api.md index 1e7713a3..278f6771 100644 --- a/docs/python_api.md +++ b/docs/python_api.md @@ -18,7 +18,7 @@ kernelspec: from IPython.display import SVG import numpy as np - ts = pyslim.load("example_sim.trees") + ts = tskit.load("example_sim.trees") tables = ts.tables ``` @@ -51,12 +51,19 @@ available in pyslim. ## Summarizing tree sequences -Additionally, ``pyslim`` contains the following summary methods: +Additionally, ``pyslim`` contains the following methods: ```{eval-rst} .. autosummary:: + has_individual_parents + individual_ages_at + individual_parents + individuals_alive_at + mutation_at + nucleotide_at population_size + slim_time ``` diff --git a/docs/tutorial.md b/docs/tutorial.md index 5b4faeae..44dbef8c 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -308,7 +308,7 @@ can be done with the {meth}`tskit.TreeSequence.simplify` method: ```{code-cell} import numpy as np np.random.seed(3) -alive_inds = rts.individuals_alive_at(0) +alive_inds = pyslim.individuals_alive_at(rts, 0) keep_indivs = np.random.choice(alive_inds, 100, replace=False) keep_nodes = [] for i in keep_indivs: @@ -356,14 +356,12 @@ but any of the other mutation models in msprime could be used. This works as follows: ```{code-cell} -ts = pyslim.SlimTreeSequence( - msprime.sim_mutations( +ts = msprime.sim_mutations( sts, rate=1e-8, model=msprime.SLiMMutationModel(type=0), keep=True, - ) - ) +) print(f"The tree sequence now has {ts.num_mutations} mutations,\n" f"and mean pairwise nucleotide diversity is {ts.diversity():0.3e}.") @@ -412,8 +410,8 @@ the list of individual IDs of all those alive at the end of the simulation (i.e., zero time units ago), we could do: ```{code-cell} -orig_ts = pyslim.load("example_sim.trees") -alive = orig_ts.individuals_alive_at(0) +orig_ts = tskit.load("example_sim.trees") +alive = pyslim.individuals_alive_at(orig_ts, 0) print(f"There are {len(alive)} individuals alive in the final generation.") ``` @@ -425,9 +423,7 @@ and write their SNPs to a VCF is: ```{code-cell} np.random.seed(1) keep_indivs = np.random.choice(alive, 100, replace=False) -ts = pyslim.SlimTreeSequence( - msprime.sim_mutations(orig_ts, rate=1e-8, random_seed=1) -) +ts = msprime.sim_mutations(orig_ts, rate=1e-8, random_seed=1) with open("example_snps.vcf", "w") as vcffile: ts.write_vcf(vcffile, individuals=keep_indivs) ``` @@ -456,9 +452,7 @@ keep_nodes = [] for i in keep_indivs: keep_nodes.extend(orig_ts.individual(i).nodes) sts = rts.simplify(keep_nodes) -ts = pyslim.SlimTreeSequence( - msprime.sim_mutations(sts, rate=1e-8, random_seed=1) -) +ts = msprime.sim_mutations(sts, rate=1e-8, random_seed=1) ``` Individuals are retained by simplify if any of their nodes are, so we would get an alive individual without sample nodes if, for instance, @@ -488,7 +482,7 @@ using {meth}`is_sample() `: ```{code-cell} indivlist = [] -for i in ts.individuals_alive_at(0): +for i in pyslim.individuals_alive_at(ts, 0): ind = ts.individual(i) if ts.node(ind.nodes[0]).is_sample(): indivlist.append(i) @@ -520,8 +514,8 @@ To count up how many individuals are in each population, we could do: ```{code-cell} -orig_ts = pyslim.load("migrants.trees") -alive = orig_ts.individuals_alive_at(0) +orig_ts = tskit.load("migrants.trees") +alive = pyslim.individuals_alive_at(orig_ts, 0) num_alive = [0 for _ in range(orig_ts.num_populations)] for i in alive: ind = orig_ts.individual(i) @@ -566,15 +560,15 @@ demography.add_migration_rate_change( time=orig_ts.metadata['SLiM']['generation'], rate=0.1, source="p2", dest="p1", ) -rts = pyslim.recapitate(orig_ts, demography=demography, - recombination_rate=1e-8, - random_seed=4 +rts = pyslim.recapitate( + orig_ts, demography=demography, + recombination_rate=1e-8, + random_seed=4 ) -ts = pyslim.SlimTreeSequence( - msprime.sim_mutations( - rts, rate=1e-8, - model=msprime.SLiMMutationModel(type=0), - random_seed=7) +ts = msprime.sim_mutations( + rts, rate=1e-8, + model=msprime.SLiMMutationModel(type=0), + random_seed=7 ) ``` @@ -589,7 +583,7 @@ that are alive at the end of the simulation. ```{code-cell} pop_indivs = [[], [], []] pop_nodes = [[], [], []] -for i in ts.individuals_alive_at(0): +for i in pyslim.individuals_alive_at(ts, 0): ind = ts.individual(i) pop_indivs[ind.population].append(i) pop_nodes[ind.population].extend(ind.nodes) @@ -652,7 +646,7 @@ max_age = max([ind.metadata["age"] for ind in ts.individuals()]) age_table = np.zeros((max_age + 1, 2)) age_labels = {pyslim.INDIVIDUAL_TYPE_FEMALE: 'females', pyslim.INDIVIDUAL_TYPE_MALE: 'males'} -for i in ts.individuals_alive_at(0): +for i in pyslim.individuals_alive_at(ts, 0): ind = ts.individual(i) age_table[ind.metadata["age"], ind.metadata["sex"]] += 1 @@ -685,8 +679,9 @@ and {attr}`.SlimTreeSequence.individual_populations` as follows: ```{code-cell} alive = ts.individuals_alive_at(0) adults = alive[ts.individual_ages[alive] > 2] -pops = [np.where( - ts.individual_populations[adults] == k)[0] for k in [1, 2]] +pops = [ + np.where(ts.individual_populations[adults] == k)[0] for k in [1, 2] +] sample_inds = [np.random.choice(pop, 10, replace=False) for pop in pops] sample_nodes = [] for samp in sample_inds: @@ -894,13 +889,11 @@ stored in the mutation metadata. To modify the mutations to be under selection, see {ref}`sec_vignette_coalescent_diversity`. ```{code-cell} -ts = pyslim.SlimTreeSequence( - msprime.sim_mutations( - ts, rate=1e-8, - model=msprime.SLiMMutationModel(type=0), - random_seed=9 - ) - ) +ts = msprime.sim_mutations( + ts, rate=1e-8, + model=msprime.SLiMMutationModel(type=0), + random_seed=9 +) ``` Now the mutations have SLiM metadata. For instance, here's the first mutation: @@ -949,7 +942,7 @@ slim -s 23 selection.slim First, let's see how many mutations there are: ```{code-cell} -ts = pyslim.load("selection.trees") +ts = tskit.load("selection.trees") print(f"Number of sites: {ts.num_sites}\n" f"Number of mutations: {ts.num_mutations}") ``` diff --git a/docs/vignette_coalescent_diversity.md b/docs/vignette_coalescent_diversity.md index 1f654f26..91ff2bd1 100644 --- a/docs/vignette_coalescent_diversity.md +++ b/docs/vignette_coalescent_diversity.md @@ -286,7 +286,7 @@ This runs quickly, since it's only 100 generations. First, let's look at what mutations are present. ```{code-cell} -ts = pyslim.load("vignette_annotated.trees") +ts = tskit.load("vignette_annotated.trees") num_stacked = np.array([len(m.metadata["mutation_list"]) for m in ts.mutations()]) old_mut = np.array([m.time > ts.slim_generation - 1 - 1e-12 for m in ts.mutations()]) assert sum(old_mut) == ots.num_mutations diff --git a/docs/vignette_continuing.md b/docs/vignette_continuing.md index 64e7c04c..19afa790 100644 --- a/docs/vignette_continuing.md +++ b/docs/vignette_continuing.md @@ -58,7 +58,7 @@ slim -s 5 rapid_adaptation.slim We can see what happened in the GUI, but let's pull some more statistics out of the tree sequence: ```{code-cell} -ts = pyslim.load("rapid_adaptation.trees") +ts = tskit.load("rapid_adaptation.trees") # allele frequencies p = ts.sample_count_stat( @@ -78,7 +78,7 @@ We'll add SLiM mutations with "mutation type" 0 so first we check that all the existing mutations are of a different type. ```{code-cell} -rts = ts.recapitate(Ne=1000, recombination_rate=1e-8, random_seed=6) +rts = pyslim.recapitate(ts, Ne=1000, recombination_rate=1e-8, random_seed=6) # check type m0 is not used: mut_types = set([md['mutation_type'] @@ -88,11 +88,10 @@ print(f"Keeping {rts.num_mutations} existing mutations of type(s) {mut_types}.") assert 0 not in mut_types # add type m0 mutations -rts = pyslim.SlimTreeSequence( - msprime.sim_mutations( +rts = msprime.sim_mutations( rts, rate=1e-8, random_seed=7, keep=True, - model=msprime.SLiMMutationModel(type=0)) - ) + model=msprime.SLiMMutationModel(type=0) +) p = rts.sample_count_stat( [rts.samples()], lambda x: x/20000, 1, windows='sites', @@ -163,7 +162,7 @@ new_nodes = np.where(new_tables.nodes.time == new_time)[0] print(f"There are {len(new_nodes)} nodes from the start of the new simulation.") # There are 4425 nodes from the start of the new simulation. -slim_indivs = rts.individuals_alive_at(0) +slim_indivs = pyslim.individuals_alive_at(rts, 0) slim_nodes = [] for ind in slim_indivs: slim_nodes.extend(ts.individual(ind).nodes) @@ -189,7 +188,7 @@ tables.union(new_tables, node_map, check_shared_equality=False) # get back the tree sequence -full_ts = pyslim.SlimTreeSequence(tables.tree_sequence()) +full_ts = tables.tree_sequence() p = full_ts.sample_count_stat( [full_ts.samples()], lambda x: x/20000, 1, diff --git a/docs/vignette_parallel_phylo.md b/docs/vignette_parallel_phylo.md index 794948b6..783d1e11 100644 --- a/docs/vignette_parallel_phylo.md +++ b/docs/vignette_parallel_phylo.md @@ -304,8 +304,9 @@ samples per population. ```{code-cell} rng = np.random.default_rng(seed=123) -ind_alive = tsu.individuals_alive_at(0) -ind_pops = tsu.individual_populations[ind_alive] +ind_alive = pyslim.individuals_alive_at(tsu, 0) +_, populations, _ = pyslim.individual_times_populations_ages(tsu) +ind_pops = populations[ind_alive] subsample_indivs = [ rng.choice(ind_alive[ind_pops == pop_ids[name]], 2) for name in pops @@ -314,11 +315,9 @@ subsample_nodes = [ np.concatenate([tsu.individual(i).nodes for i in x]) for x in subsample_indivs ] -tsus = pyslim.SlimTreeSequence( - tsu.simplify( +tsus = tsu.simplify( np.concatenate(subsample_nodes), filter_populations=False, - ) ) pop_labels = {v: k for k, v in pop_ids.items()} SVG(tsus.draw_svg( diff --git a/docs/vignette_space.md b/docs/vignette_space.md index 3d202042..fc945c7c 100644 --- a/docs/vignette_space.md +++ b/docs/vignette_space.md @@ -60,7 +60,7 @@ slim -s 23 vignette_space.slim Ok, now let's have a quick look at the output: ```{code-cell} -slim_ts = pyslim.load("spatial_sim.trees") +slim_ts = tskit.load("spatial_sim.trees") print(f"The tree sequence has {slim_ts.num_trees} trees\n" f"on a genome of length {slim_ts.sequence_length},\n" f"{slim_ts.num_individuals} individuals, {slim_ts.num_samples} 'sample' genomes,\n" @@ -76,8 +76,9 @@ Let's have a look at how old those individuals are, by tabulating when they were born: ```{code-cell} -for t in np.unique(slim_ts.individual_times): - print(f"There are {np.sum(slim_ts.individual_times == t)} individuals from time {t}.") +individual_times, _, _ = pyslim.individual_times_populations_ages(slim_ts) +for t in np.unique(individual_times): + print(f"There are {np.sum(individual_times == t)} individuals from time {t}.") ``` These "times" record the birth times of each individual. @@ -94,7 +95,7 @@ Let's check that all these individuals are alive at either (a) today or (b) 1000 ```{code-cell} for t in [0, 1000]: - alive = slim_ts.individuals_alive_at(t) + alive = pyslim.individuals_alive_at(slim_ts, t) print(f"There were {len(alive)} individuals alive {t} time steps in the past.") ``` @@ -137,14 +138,12 @@ we would need to pass ``keep_input_roots=True`` to allow recapitation. ```{code-cell} recap_ts = pyslim.recapitate(slim_ts, recombination_rate=1e-8, ancestral_Ne=1000) -ts = pyslim.SlimTreeSequence( - msprime.sim_mutations( +ts = msprime.sim_mutations( recap_ts, rate=1e-8, model=msprime.SLiMMutationModel(type=0), keep=True, - ) - ) + ) ts.dump("spatial_sim.recap.trees") print(f"The tree sequence now has {ts.num_trees} trees,\n" @@ -185,7 +184,7 @@ We'll get genomes to work with by pulling out np.random.seed(23) -alive = ts.individuals_alive_at(0) +alive = pyslim.individuals_alive_at(ts, 0) locs = ts.individual_locations[alive, :] W = 35 @@ -199,7 +198,7 @@ groups = { np.abs(locs[:, 1] - W/2) < w/2)] } -old_ones = ts.individuals_alive_at(1000) +old_ones = pyslim.individuals_alive_at(ts, 1000) groups['ancient'] = np.random.choice(old_ones, size=5) for k in groups: diff --git a/pyslim/slim_tree_sequence.py b/pyslim/slim_tree_sequence.py index af4ab42d..8fefffdf 100644 --- a/pyslim/slim_tree_sequence.py +++ b/pyslim/slim_tree_sequence.py @@ -26,6 +26,17 @@ NUCLEOTIDES = ['A', 'C', 'G', 'T'] +def _deprecation_warning(w): + warnings.warn( + "The SlimTreeSequence class is deprecated, and will be removed in the " + "near future, as all important functionality is provided by tskit. " + "Please see the " + "`documentation `_. " + f"{w}", + FutureWarning + ) + + def load(path, legacy_metadata=False): ''' Load the SLiM-compatible tree sequence found in the .trees file at ``path``. @@ -40,7 +51,7 @@ def load(path, legacy_metadata=False): "for how to update your script." ) - ts = SlimTreeSequence.load(path, legacy_metadata=legacy_metadata) + ts = SlimTreeSequence.load(path) return ts @@ -58,7 +69,7 @@ def annotate_defaults(ts, **kwargs): ''' Takes a tree sequence (as produced by msprime, for instance), and adds in the information necessary for SLiM to use it as an initial state, filling in - mostly default values. Returns a :class:`SlimTreeSequence`. + mostly default values. Returns a :class:`tskit.TreeSequence`. :param TreeSequence ts: A :class:`TreeSequence`. :param string model_type: SLiM model type: either "WF" or "nonWF". @@ -71,7 +82,7 @@ def annotate_defaults(ts, **kwargs): ''' tables = ts.dump_tables() annotate_defaults_tables(tables, **kwargs) - return SlimTreeSequence.load_tables(tables) + return tables.tree_sequence() def annotate_defaults_tables(tables, model_type, slim_generation, reference_sequence=None, @@ -119,11 +130,12 @@ class MetadataDictWrapper(dict): def __getattr__(self, name): if name in self.keys(): raise AttributeError( - f"'dict' object has no attribute '{name}'. " - "It looks like you're trying to use the legacy " - "metadata interface: see " - "`the documentation `_ " - "for how to switch over your script") + f"'dict' object has no attribute '{name}'. " + "It looks like you're trying to use the legacy " + "metadata interface: see the " + "`documentation `_ " + "for how to switch over your script" + ) else: raise AttributeError(f"'dict' object has no attribute '{name}'") @@ -133,10 +145,12 @@ def __getitem__(self, key): except KeyError as e: if isinstance(key, int): msg = e.args[0] - e.args = (f"{msg}: It looks like you're trying to use the legacy " - "metadata interface: see " - "`the documentation `_ " - "for how to switch over your script",) + e.args = ( + f"{msg}: It looks like you're trying to use the legacy " + "metadata interface: see the " + "`documentation `_ " + "for how to switch over your script", + ) raise e @@ -173,6 +187,7 @@ def __init__(self, ts, legacy_metadata=False): "`_" "for how to update your script." ) + _deprecation_warning("") if not (isinstance(ts.metadata, dict) and 'SLiM' in ts.metadata and ts.metadata['SLiM']['file_version'] in compatible_slim_file_versions): @@ -185,24 +200,11 @@ def __init__(self, ts, legacy_metadata=False): # pre-extract individual metadata self.individual_locations = ts.tables.individuals.location self.individual_locations.shape = (int(len(self.individual_locations)/3), 3) - self.individual_ages = np.zeros(ts.num_individuals, dtype='int') - if self.model_type != "WF": - self.individual_ages = np.fromiter(map(lambda ind: ind.metadata['age'], ts.individuals()), dtype='int64') - - self.individual_times = np.zeros(ts.num_individuals) - self.individual_populations = np.repeat(np.int32(-1), ts.num_individuals) - if not np.all(unique_labels_by_group(ts.tables.nodes.individual, - ts.tables.nodes.population)): - raise ValueError("Individual has nodes from more than one population.") - if not np.all(unique_labels_by_group(ts.tables.nodes.individual, - ts.tables.nodes.time)): - raise ValueError("Individual has nodes from more than one time.") - has_indiv = (ts.tables.nodes.individual >= 0) - which_indiv = ts.tables.nodes.individual[has_indiv] - # if we did not do the sanity check above then an individual with nodes in more than one pop - # would get the pop of their last node in the list - self.individual_populations[which_indiv] = ts.tables.nodes.population[has_indiv] - self.individual_times[which_indiv] = ts.tables.nodes.time[has_indiv] + + times, populations, ages = individual_times_populations_ages(self) + self.individual_times = times + self.individual_populations = populations + self.individual_ages = ages def __getstate__(self): return { @@ -216,16 +218,17 @@ def __setstate__(self, state): @property def slim_generation(self): - # return self.metadata['SLiM']['generation'] + _deprecation_warning("Please access ts.metadata['SLiM']['generation'] instead.") return self._slim_generation @slim_generation.setter def slim_generation(self, value): - # TODO: throw a deprecation warning here after stdpopsim updates + _deprecation_warning("Please access ts.metadata['SLiM']['generation'] instead.") self._slim_generation = value @property def model_type(self): + _deprecation_warning("Please access ts.metadata['SLiM']['model_type'] instead.") return self.metadata['SLiM']['model_type'] @classmethod @@ -254,17 +257,17 @@ def load_tables(cls, tables, **kwargs): def dump(self, path, **kwargs): ''' - Dumps the tree sequence to the path specified. This is mostly just a wrapper for - :func:`tskit.TreeSequence.dump`, but also writes out the reference sequence. + Dumps the tree sequence to the path specified. This is only a wrapper + :func:`tskit.TreeSequence.dump`. :param str path: The file path to write the TreeSequence to. :param kwargs: Additional keyword args to pass to tskit.TreeSequence.dump ''' # temporary until we remove support for setting slim_generation - if self.slim_generation != self.metadata['SLiM']['generation']: + if self._slim_generation != self.metadata['SLiM']['generation']: tables = self.dump_tables() md = tables.metadata - md['SLiM']['generation'] = self.slim_generation + md['SLiM']['generation'] = self._slim_generation tables.metadata = md tables.dump(path, **kwargs) else: @@ -282,10 +285,10 @@ def simplify(self, *args, **kwargs): :rtype SlimTreeSequence: ''' # temporary until we remove support for setting slim_generation - if self.slim_generation != self.metadata['SLiM']['generation']: + if self._slim_generation != self.metadata['SLiM']['generation']: tables = self.dump_tables() md = tables.metadata - md['SLiM']['generation'] = self.slim_generation + md['SLiM']['generation'] = self._slim_generation tables.metadata = md # note we have to go to a tree sequence to get the map_nodes argument sts = tables.tree_sequence().simplify(*args, **kwargs) @@ -407,6 +410,7 @@ def recapitate(self, if "keep_first_generation" in kwargs: raise ValueError("The keep_first_generation argument is deprecated: " "the FIRST_GEN flag is no longer used.") + _deprecation_warning("Please use pyslim.recapitate( ) instead.") if recombination_map is None: recombination_map = msprime.RecombinationMap( @@ -418,10 +422,10 @@ def recapitate(self, population_configurations = [msprime.PopulationConfiguration() for _ in range(self.num_populations)] # temporary until we remove support for setting slim_generation - if self.slim_generation != self.metadata['SLiM']['generation']: + if self._slim_generation != self.metadata['SLiM']['generation']: tables = self.dump_tables() md = tables.metadata - md['SLiM']['generation'] = self.slim_generation + md['SLiM']['generation'] = self._slim_generation tables.metadata = md ts = tables.tree_sequence() else: @@ -454,34 +458,8 @@ def mutation_at(self, node, position, time=None): :returns: Index of the mutation in question, or -1 if none. ''' - if position < 0 or position >= self.sequence_length: - raise ValueError("Position {} not valid.".format(position)) - if node < 0 or node >= self.num_nodes: - raise ValueError("Node {} not valid.".format(node)) - if time is None: - time = self.node(node).time - tree = self.at(position) - site_pos = self.tables.sites.position - out = tskit.NULL - if position in site_pos: - site_index = np.where(site_pos == position)[0][0] - site = self.site(site_index) - mut_nodes = [] - # look for only mutations that occurred before `time` - # not strictly necessary if time was None - for mut in site.mutations: - if mut.time >= time: - mut_nodes.append(mut.node) - n = node - while n > -1 and n not in mut_nodes: - n = tree.parent(n) - if n >= 0: - # do careful error checking here - for mut in site.mutations: - if mut.node == n: - assert(out == tskit.NULL or out == mut.parent) - out = mut.id - return out + _deprecation_warning("Please use pyslim.mutation_at(ts, ...) instead.") + return mutation_at(self, node=node, position=position, time=time) def nucleotide_at(self, node, position, time=None): ''' @@ -500,16 +478,8 @@ def nucleotide_at(self, node, position, time=None): :returns: Index of the nucleotide in ``NUCLEOTIDES`` (0=A, 1=C, 2=G, 3=T). ''' - if not self.has_reference_sequence(): - raise ValueError("This tree sequence has no reference sequence.") - mut_id = self.mutation_at(node, position, time) - if mut_id == tskit.NULL: - out = NUCLEOTIDES.index(self.reference_sequence.data[int(position)]) - else: - mut = self.mutation(mut_id) - k = np.argmax([u["slim_time"] for u in mut.metadata["mutation_list"]]) - out = mut.metadata["mutation_list"][k]["nucleotide"] - return out + _deprecation_warning("Please use pyslim.nucleotide_at(ts, ...) instead.") + return nucleotide_at(self, node=node, position=position, time=time) @property def slim_provenance(self): @@ -582,62 +552,11 @@ def individuals_alive_at(self, time, stage='late', remembered_stage=None, :param bool samples_only: Whether to return only individuals who have at least one node marked as samples. """ - if stage not in ("late", "early"): - raise ValueError(f"Unknown stage '{stage}': " - "should be either 'early' or 'late'.") - - if remembered_stage is None: - remembered_stage = self.metadata['SLiM']['stage'] - - if remembered_stage not in ("late", "early"): - raise ValueError(f"Unknown remembered_stage '{remembered_stage}': " - "should be either 'early' or 'late'.") - if remembered_stage != self.metadata['SLiM']['stage']: - warnings.warn(f"Provided remembered_stage '{remembered_stage}' does not" - " match the stage at which the tree sequence was saved" - f" ('{self.metadata['SLiM']['stage']}'). This is not necessarily" - " an error, but mismatched stages will lead to inconsistencies:" - " make sure you know what you're doing.") - - # birth_time is the time ago that they were first alive in 'late' - # in a nonWF model they are alive for the same time step's 'early' - # but in a WF model the first 'early' they were alive for is one more recent - birth_time = self.individual_times - # birth_time - birth_offset is the first time ago they were alive - # during stage 'stage' - if stage == "early" and self.metadata['SLiM']['model_type'] == "WF": - birth_offset = 1 - else: - birth_offset = 0 - # ages is the number of complete life cycles they are known to have lived through, - # and so individuals have lived through at least 'age + 1' of both stages. - # In nonWF models, they live for one more 'early' than 'late', - # but this is only reflected in their age if Remembered in 'early'. - ages = self.individual_ages - # ages + age_offset + 1 is the number of 'stage' stages they are known - # to have lived through - if (self.metadata['SLiM']['model_type'] == "WF" - or stage == remembered_stage): - age_offset = 0 - else: - if (remembered_stage == "early" - and stage == "late"): - age_offset = -1 - else: - age_offset = 1 - # if adjusted age=0 then they are be alive at exactly one time step - alive_bool = np.logical_and( - birth_time >= time + birth_offset, - birth_time - ages <= time + birth_offset + age_offset) - - if population is not None: - alive_bool &= np.isin(self.individual_populations, population) - if samples_only: - alive_bool &= (0 < np.bincount(1 + self.tables.nodes.individual, - self.tables.nodes.flags & tskit.NODE_IS_SAMPLE, - minlength=1 + self.num_individuals)[1:]) - - return np.where(alive_bool)[0] + _deprecation_warning("Please use pyslim.individuals_alive_at(ts, ...) instead.") + return individuals_alive_at( + self, time=time, stage=stage, remembered_stage=remembered_stage, + population=population, samples_only=samples_only + ) def individual_ages_at(self, time, stage="late", remembered_stage="late"): """ @@ -665,11 +584,10 @@ def individual_ages_at(self, time, stage="late", remembered_stage="late"): :param str remembered_stage: The stage in the SLiM life cycle during which individuals were Remembered. """ - ages = np.repeat(np.nan, self.num_individuals) - alive = self.individuals_alive_at(time, stage=stage, - remembered_stage=remembered_stage) - ages[alive] = self.individual_times[alive] - time - return ages + _deprecation_warning("Please use pyslim.individual_ages_at(ts, ...) instead.") + return individual_ages_at( + self, time=time, stage=stage, remembered_stage=remembered_stage + ) def slim_time(self, time, stage="late"): """ @@ -699,13 +617,8 @@ def slim_time(self, time, stage="late"): :param string stage: The stage of the SLiM life cycle that the SLiM time should be computed for. """ - slim_time = self.metadata['SLiM']['generation'] - time - if self.metadata['SLiM']['model_type'] == "WF": - if (self.metadata['SLiM']['stage'] == "early" and stage == "late"): - slim_time -= 1 - if (self.metadata['SLiM']['stage'] == "late" and stage == "early"): - slim_time += 1 - return slim_time + _deprecation_warning("Please use pyslim.slim_time(ts, ...) instead.") + return slim_time(self, time=time, stage=stage) def first_generation_individuals(self): """ @@ -726,53 +639,6 @@ def first_generation_individuals(self): "instead.", FutureWarning) return np.where(self.tables.individuals.flags & INDIVIDUAL_RETAINED > 0)[0] - def _do_individual_parents_stuff(self, return_parents=False): - # Helper for has_individual_parents and individual_parents, - # which share a lot of machinery. - edges = self.tables.edges - nodes = self.tables.nodes - edge_parent_indiv = nodes.individual[edges.parent] - edge_child_indiv = nodes.individual[edges.child] - # nodes whose parent nodes are all in the same individual - unique_parent_nodes = unique_labels_by_group( - edges.child, - edge_parent_indiv, - minlength=nodes.num_rows) - unique_parent_edges = unique_parent_nodes[edges.child] - # edges describing relationships between individuals - indiv_edges = np.logical_and( - np.logical_and(edge_parent_indiv != tskit.NULL, - edge_child_indiv != tskit.NULL), - unique_parent_edges) - # individual edges where the parent was alive during "late" - # of the time step before the child is born - child_births = self.individual_times[edge_child_indiv[indiv_edges]] - parent_births = self.individual_times[edge_parent_indiv[indiv_edges]] - alive_edges = indiv_edges.copy() - if self.metadata['SLiM']['model_type'] == "WF": - alive_edges[indiv_edges] = (child_births + 1 == parent_births) - else: - parent_deaths = parent_births - self.individual_ages[edge_parent_indiv[indiv_edges]] - alive_edges[indiv_edges] = (child_births + 1 >= parent_deaths) - edge_spans = edges.right - edges.left - parental_span = np.bincount(edge_child_indiv[alive_edges], - weights=edge_spans[alive_edges], minlength=self.num_individuals) - # we could also check for edges without individual parents terminating - # in this individual, but this is unnecessary as the entire genome is - # accounted for - has_all_parents = (parental_span == 2 * self.sequence_length) - if return_parents: - full_parent_edges = np.logical_and( - alive_edges, - has_all_parents[edge_child_indiv]) - parents = np.unique(np.column_stack( - [edge_parent_indiv[full_parent_edges], - edge_child_indiv[full_parent_edges]] - ), axis=0) - return parents - else: - return has_all_parents - def individual_parents(self): ''' Finds all parent-child relationships in the tree sequence (as far as we @@ -786,7 +652,8 @@ def individual_parents(self): :return: An array of individual IDs, with row [i, j] if individual i is a parent of individual j. ''' - return self._do_individual_parents_stuff(return_parents=True) + _deprecation_warning("Please use pyslim.individual_parents(ts, ...) instead.") + return individual_parents(self) def has_individual_parents(self): ''' @@ -808,7 +675,8 @@ def has_individual_parents(self): :return: A boolean array of length equal to ``targets``. ''' - return self._do_individual_parents_stuff(return_parents=False) + _deprecation_warning("Please use pyslim.has_individual_parents(ts, ...) instead.") + return has_individual_parents(self) def _set_nodes_individuals(tables, age): @@ -1080,3 +948,377 @@ def update_tables(tables): md = tables.metadata md['SLiM']['file_version'] = slim_file_version tables.metadata = md + + +def mutation_at(ts, node, position, time=None): + ''' + Finds the mutation present in the genome of ``node`` at ``position``, + returning -1 if there is no such mutation recorded in the tree + sequence. Warning: if ``node`` is not actually in the tree sequence + (e.g., not ancestral to any samples) at ``position``, then this + function will return -1, possibly erroneously. If `time` is provided, + returns the last mutation at ``position`` inherited by ``node`` that + occurred at or before ``time`` ago. + + :param TreeSequence ts: The tree sequence. + :param int node: The index of a node in the tree sequence. + :param float position: A position along the genome. + :param int time: The time ago that we want the nucleotide, or None, + in which case the ``time`` of ``node`` is used. + + :returns: Index of the mutation in question, or -1 if none. + ''' + if position < 0 or position >= ts.sequence_length: + raise ValueError("Position {} not valid.".format(position)) + if node < 0 or node >= ts.num_nodes: + raise ValueError("Node {} not valid.".format(node)) + if time is None: + time = ts.node(node).time + tree = ts.at(position) + site_pos = ts.tables.sites.position + out = tskit.NULL + if position in site_pos: + site_index = np.where(site_pos == position)[0][0] + site = ts.site(site_index) + mut_nodes = [] + # look for only mutations that occurred before `time` + # not strictly necessary if time was None + for mut in site.mutations: + if mut.time >= time: + mut_nodes.append(mut.node) + n = node + while n > -1 and n not in mut_nodes: + n = tree.parent(n) + if n >= 0: + # do careful error checking here + for mut in site.mutations: + if mut.node == n: + assert(out == tskit.NULL or out == mut.parent) + out = mut.id + return out + + +def nucleotide_at(ts, node, position, time=None): + ''' + Finds the nucleotide present in the genome of ``node`` at ``position``. + Warning: if ``node`` is not actually in the tree sequence (e.g., not + ancestral to any samples) at ``position``, then this function will + return the reference sequence nucleotide, possibly erroneously. If + `time` is provided, returns the last nucletide produced by a mutation + at ``position`` inherited by ``node`` that occurred at or before + ``time`` ago. + + :param TreeSequence ts: The tree sequence. + :param int node: The index of a node in the tree sequence. + :param float position: A position along the genome. + :param int time: The time ago that we want the nucleotide, or None, + in which case the ``time`` of ``node`` is used. + + :returns: Index of the nucleotide in ``NUCLEOTIDES`` (0=A, 1=C, 2=G, 3=T). + ''' + if not ts.has_reference_sequence(): + raise ValueError("This tree sequence has no reference sequence.") + mut_id = pyslim.mutation_at(ts, node, position, time) + if mut_id == tskit.NULL: + out = NUCLEOTIDES.index(ts.reference_sequence.data[int(position)]) + else: + mut = ts.mutation(mut_id) + k = np.argmax([u["slim_time"] for u in mut.metadata["mutation_list"]]) + out = mut.metadata["mutation_list"][k]["nucleotide"] + return out + +def individual_times_populations_ages(ts): + tables = ts.tables + times = np.zeros(ts.num_individuals) + populations = np.repeat(np.int32(-1), ts.num_individuals) + if not np.all(unique_labels_by_group(tables.nodes.individual, + tables.nodes.population)): + raise ValueError("Individual has nodes from more than one population.") + if not np.all(unique_labels_by_group(tables.nodes.individual, + tables.nodes.time)): + raise ValueError("Individual has nodes from more than one time.") + has_indiv = (tables.nodes.individual >= 0) + which_indiv = tables.nodes.individual[has_indiv] + # if we did not do the sanity check above then an individual with nodes in more than one pop + # would get the pop of their last node in the list + times[which_indiv] = tables.nodes.time[has_indiv] + populations[which_indiv] = tables.nodes.population[has_indiv] + if ts.metadata['SLiM']['model_type'] == "WF": + ages = np.zeros(ts.num_individuals, dtype='int') + else: + ages = tables.individuals.metadata_vector("age") + return times, populations, ages + + +def individuals_alive_at(ts, time, stage='late', remembered_stage=None, + population=None, samples_only=False): + """ + Returns an array giving the IDs of all individuals that are known to be + alive at the given time ago. This is determined using their birth time + ago (given by their `time` attribute) and, for nonWF models, + their `age` attribute (which is equal to their age at the last time + they were Remembered). See also :meth:`.individual_ages_at`. + + In WF models, birth occurs after "early()", so that individuals are only + alive during "late()" for the time step when they have age zero, + while in nonWF models, birth occurs before "early()", so they are alive + for both stages. + + In both WF and nonWF models, mortality occurs between + "early()" and "late()", so that individuals are last alive during the + "early()" stage of the time step of their final age, and if individuals + are alive during "late()" they will also be alive during "early()" of the + next time step. This means it is important to know during which stage + individuals were Remembered - for instance, if the call to + sim.treeSeqRememberIndividuals() was made during "early()" of a given time step, + then those individuals might not have survived until "late()" of that + time step. Since SLiM does not record the stage at which individuals + were Remembered, you can specify this by setting ``remembered_stages``: + it should be the stage during which *all* calls to + ``sim.treeSeqRememberIndividuals()``, as well as to ``sim.treeSeqOutput()``, + were made. + + Note also that in nonWF models, birth occurs before "early()", so the + possible parents in a given time step are those that are alive in + "early()" and have age greater than zero, or, equivalently, are alive in + "late()" during the previous time step. + In WF models, birth occurs after "early()", so possible parents in a + given time step are those that are alive during "early()" of that time + step or are alive during "late()" of the previous time step. + + :param TreeSequence ts: The tree sequence. + :param float time: The number of time steps ago. + :param str stage: The stage in the SLiM life cycle that we are inquiring + about (either "early" or "late"; defaults to "late"). + :param str remembered_stage: The stage in the SLiM life cycle + during which individuals were Remembered (defaults to the stage the + tree sequence was recorded at, stored in metadata). + :param int population: If given, return only individuals in the + population(s) with these population ID(s). + :param bool samples_only: Whether to return only individuals who have at + least one node marked as samples. + """ + if stage not in ("late", "early"): + raise ValueError(f"Unknown stage '{stage}': " + "should be either 'early' or 'late'.") + + if remembered_stage is None: + remembered_stage = ts.metadata['SLiM']['stage'] + + if remembered_stage not in ("late", "early"): + raise ValueError(f"Unknown remembered_stage '{remembered_stage}': " + "should be either 'early' or 'late'.") + if remembered_stage != ts.metadata['SLiM']['stage']: + warnings.warn(f"Provided remembered_stage '{remembered_stage}' does not" + " match the stage at which the tree sequence was saved" + f" ('{ts.metadata['SLiM']['stage']}'). This is not necessarily" + " an error, but mismatched stages will lead to inconsistencies:" + " make sure you know what you're doing.") + + # birth_time is the time ago that they were first alive in 'late' + # in a nonWF model they are alive for the same time step's 'early' + # but in a WF model the first 'early' they were alive for is one more recent + birth_time, populations, ages = individual_times_populations_ages(ts) + # birth_time - birth_offset is the first time ago they were alive + # during stage 'stage' + if stage == "early" and ts.metadata['SLiM']['model_type'] == "WF": + birth_offset = 1 + else: + birth_offset = 0 + # ages is the number of complete life cycles they are known to have lived through, + # and so individuals have lived through at least 'age + 1' of both stages. + # In nonWF models, they live for one more 'early' than 'late', + # but this is only reflected in their age if Remembered in 'early'. + # So: + # ages + age_offset + 1 is the number of 'stage' stages they are known + # to have lived through + if (ts.metadata['SLiM']['model_type'] == "WF" + or stage == remembered_stage): + age_offset = 0 + else: + if (remembered_stage == "early" + and stage == "late"): + age_offset = -1 + else: + age_offset = 1 + # if adjusted age=0 then they are be alive at exactly one time step + alive_bool = np.logical_and( + birth_time >= time + birth_offset, + birth_time - ages <= time + birth_offset + age_offset) + + if population is not None: + alive_bool &= np.isin(populations, population) + if samples_only: + nodes = ts.tables.nodes + alive_bool &= (0 < np.bincount(1 + nodes.individual, + nodes.flags & tskit.NODE_IS_SAMPLE, + minlength=1 + ts.num_individuals)[1:]) + + return np.where(alive_bool)[0] + + +def individual_ages_at(ts, time, stage="late", remembered_stage="late"): + """ + Returns the `ages` of each individual at the corresponding time ago, + which will be ``nan`` if the individual is either not born yet or dead. + This is computed as the time ago the individual was born (found by the + `time` associated with the the individual's nodes) minus the `time` + argument; while "death" is inferred from the individual's ``age``, + recorded in metadata. These values are the same as what would be shown + in SLiM during the corresponding time step and stage. + + Since age increments at the end of each time step, + the age is the number of time steps ends the individual has lived + through, so if they were born in time step `time`, then their age + will be zero. + + In a WF model, this method does not provide any more information than + does :meth:`.individuals_alive_at`, but for consistency, non-nan ages + will be 0 in "late" and 1 in "early". + See :meth:`.individuals_alive_at` for further discussion. + + :param TreeSequence ts: The tree sequence. + :param float time: The reference time ago. + :param str stage: The stage in the SLiM life cycle used to determine who + is alive (either "early" or "late"; defaults to "late"). + :param str remembered_stage: The stage in the SLiM life cycle during which + individuals were Remembered. + """ + ages = np.repeat(np.nan, ts.num_individuals) + alive = individuals_alive_at( + ts, time, stage=stage, + remembered_stage=remembered_stage + ) + times, _, _ = individual_times_populations_ages(ts) + ages[alive] = times[alive] - time + return ages + + +def slim_time(ts, time, stage="late"): + """ + Converts the given "tskit times" (i.e., in units of time before the end + of the simulation) to SLiM times (those recorded by SLiM, usually in units + of generations since the start of the simulation). Although the latter are + always integers, these will not be if the provided times are not integers. + + When the tree sequence is written out, SLiM records the value of its + current generation, which can be found in the metadata: + ``ts.metadata['SLiM']['generation']``. In most cases, the “SLiM time” + referred to by a time ago in the tree sequence (i.e., the value that would + be reported by sim.generation within SLiM at the point in time thus + referenced) can be obtained by subtracting that time ago from + ``ts.metadata['SLiM']['generation']``. However, in WF models, birth + happens between the “early()” and “late()” stages, so if the tree + sequence was written out using sim.treeSeqOutput() during “early()” in + a WF model, the tree sequence’s times measure time before the last set + of individuals are born, i.e., before SLiM time step + ``ts.metadata['SLiM']['generation'] - 1``. + + In some situations (e.g., mutations added during early() in WF models) + this may not return what you expect. See :ref:`sec_metadata_converting_times` + for more discussion. + + :param TreeSequence ts: The tree sequence. + :param array time: An array of times to be converted. + :param string stage: The stage of the SLiM life cycle that the SLiM time + should be computed for. + """ + slim_time = ts.metadata['SLiM']['generation'] - time + if ts.metadata['SLiM']['model_type'] == "WF": + if (ts.metadata['SLiM']['stage'] == "early" and stage == "late"): + slim_time -= 1 + if (ts.metadata['SLiM']['stage'] == "late" and stage == "early"): + slim_time += 1 + return slim_time + + +def _do_individual_parents_stuff(ts, return_parents=False): + # Helper for has_individual_parents and individual_parents, + # which share a lot of machinery. + tables = ts.tables + edges = tables.edges + nodes = tables.nodes + edge_parent_indiv = nodes.individual[edges.parent] + edge_child_indiv = nodes.individual[edges.child] + # nodes whose parent nodes are all in the same individual + unique_parent_nodes = unique_labels_by_group( + edges.child, + edge_parent_indiv, + minlength=nodes.num_rows) + unique_parent_edges = unique_parent_nodes[edges.child] + # edges describing relationships between individuals + indiv_edges = np.logical_and( + np.logical_and(edge_parent_indiv != tskit.NULL, + edge_child_indiv != tskit.NULL), + unique_parent_edges) + # individual edges where the parent was alive during "late" + # of the time step before the child is born + + times, _, ages = individual_times_populations_ages(ts) + child_births = times[edge_child_indiv[indiv_edges]] + parent_births = times[edge_parent_indiv[indiv_edges]] + alive_edges = indiv_edges.copy() + if ts.metadata['SLiM']['model_type'] == "WF": + alive_edges[indiv_edges] = (child_births + 1 == parent_births) + else: + parent_deaths = parent_births - ages[edge_parent_indiv[indiv_edges]] + alive_edges[indiv_edges] = (child_births + 1 >= parent_deaths) + edge_spans = edges.right - edges.left + parental_span = np.bincount(edge_child_indiv[alive_edges], + weights=edge_spans[alive_edges], minlength=ts.num_individuals) + # we could also check for edges without individual parents terminating + # in this individual, but this is unnecessary as the entire genome is + # accounted for + has_all_parents = (parental_span == 2 * ts.sequence_length) + if return_parents: + full_parent_edges = np.logical_and( + alive_edges, + has_all_parents[edge_child_indiv]) + parents = np.unique(np.column_stack( + [edge_parent_indiv[full_parent_edges], + edge_child_indiv[full_parent_edges]] + ), axis=0) + return parents + else: + return has_all_parents + + +def individual_parents(ts): + ''' + Finds all parent-child relationships in the tree sequence (as far as we + can tell). The output will be a two-column array with row [i,j] + indicating that individual i is a parent of individual j. See + :meth:`.has_individual_parents` for exactly which parents are returned. + + See :meth:`.individuals_alive_at` for further discussion about how + this is determined based on when the individuals were Remembered. + + :param TreeSequence ts: The tree sequence. + :return: An array of individual IDs, with row [i, j] if individual i is + a parent of individual j. + ''' + return _do_individual_parents_stuff(ts, return_parents=True) + + +def has_individual_parents(ts): + ''' + Finds which individuals have both their parent individuals also present + in the tree sequence, as far as we can tell. To do this, we return a + boolean array with True for those individuals for which: + + - all edges terminating in that individual's nodes are in individuals, + - each of the individual's nodes inherit from a single individual only, + - those parental individuals were alive when the individual was born, + - the parental individuals account for two whole genomes. + + This returns a boolean array indicating for each individual whether all + these are true. Note in particular that individuals with only *one* + recorded parent are *not* counted as "having parents". + + See :meth:`.individuals_alive_at` for further discussion about how + this is determined based on when the individuals were Remembered. + + :param TreeSequence ts: The tree sequence. + :return: A boolean array of length equal to ``targets``. + ''' + return _do_individual_parents_stuff(ts, return_parents=False) diff --git a/tests/conftest.py b/tests/conftest.py index 59095597..9a6c6a42 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,7 @@ import pytest import random import subprocess +import warnings from filelock import FileLock import pyslim @@ -54,7 +55,7 @@ def __init__(self, out_dir): self.Outfile( path=os.path.join(out_dir, "out.trees"), slim_name="TREES_FILE", # The var containing the path name for SLiM output - post_process=pyslim.load, # function applied on path to create returned obj + post_process=load, # function applied on path to create returned obj key="ts", # The key to use for the post-processed item in the returned dict ), self.Outfile( @@ -64,6 +65,7 @@ def __init__(self, out_dir): key="info", ), ] + def __getitem__(self, index): return self._outfiles[index] @@ -79,6 +81,13 @@ def results(self): return res +def load(*args, **kwargs): + with warnings.catch_warnings(): + # why isn't this suppressing the warnings?? + warnings.simplefilter("ignore", FutureWarning) + ts = pyslim.load(*args, **kwargs) + return ts + def run_slim(recipe, out_dir, recipe_dir="test_recipes", **kwargs): """ Run the recipe, present in the recipe_dir, outputting to files in out_dir diff --git a/tests/test_annotation.py b/tests/test_annotation.py index d69736ac..b864501a 100644 --- a/tests/test_annotation.py +++ b/tests/test_annotation.py @@ -448,7 +448,7 @@ def test_load_without_provenance( in_tables.sort() cleared_ts = pyslim.SlimTreeSequence( in_tables.tree_sequence(), - ) + ) out_ts = helper_functions.run_slim_restart(cleared_ts, restart_name, tmp_path) out_tables = out_ts.dump_tables() out_tables.provenances.clear() diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 8bcc8d4b..b28a5a9c 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -135,7 +135,8 @@ def test_set_tree_sequence_metadata(self, recipe): spatial_dimensionality='xy', spatial_periodicity='y', separate_sexes=False, - nucleotide_based=True) + nucleotide_based=True + ) self.validate_slim_metadata(tables) assert tables.metadata['SLiM']['model_type'] == "WF" assert tables.metadata['SLiM']['generation'] == 99 @@ -207,7 +208,8 @@ def test_load_tables(self, recipe): ts = recipe["ts"] assert isinstance(ts, pyslim.SlimTreeSequence) tables = ts.dump_tables() - new_ts = pyslim.load_tables(tables) + with pytest.warns(FutureWarning): + new_ts = pyslim.load_tables(tables) assert isinstance(new_ts, pyslim.SlimTreeSequence) new_tables = new_ts.dump_tables() assert tables == new_tables @@ -219,19 +221,22 @@ def test_load(self, recipe): assert isinstance(msp_ts, tskit.TreeSequence) # transfer tables msp_tables = msp_ts.dump_tables() - new_ts = pyslim.load_tables(msp_tables) + with pytest.warns(FutureWarning): + new_ts = pyslim.load_tables(msp_tables) assert isinstance(new_ts, pyslim.SlimTreeSequence) self.verify_times(msp_ts, new_ts) new_tables = new_ts.dump_tables() self.assertTableCollectionsEqual(msp_tables, new_tables) # convert directly - new_ts = pyslim.SlimTreeSequence(msp_ts) + with pytest.warns(FutureWarning): + new_ts = pyslim.SlimTreeSequence(msp_ts) assert isinstance(new_ts, pyslim.SlimTreeSequence) self.verify_times(msp_ts, new_ts) new_tables = new_ts.dump_tables() self.assertTableCollectionsEqual(msp_tables, new_tables) # load to pyslim from file - slim_ts = pyslim.load(fn) + with pytest.warns(FutureWarning): + slim_ts = pyslim.load(fn) assert isinstance(slim_ts, pyslim.SlimTreeSequence) slim_tables = slim_ts.dump_tables() self.assertTableCollectionsEqual(msp_tables, slim_tables) @@ -245,7 +250,8 @@ def test_dump_equality(self, recipe, tmp_path): tmp_file = os.path.join(tmp_path, "test_dump.trees") ts = recipe["ts"] ts.dump(tmp_file) - ts2 = pyslim.load(tmp_file) + with pytest.warns(FutureWarning): + ts2 = pyslim.load(tmp_file) assert ts.num_samples == ts2.num_samples assert ts.sequence_length == ts2.sequence_length assert ts.tables == ts2.dump_tables() @@ -259,9 +265,11 @@ def test_legacy_error(self, recipe, tmp_path): ts = recipe["ts"] ts.dump(tmp_file) with pytest.raises(ValueError, match="legacy metadata tools"): - _ = pyslim.load(tmp_file, legacy_metadata=True) + with pytest.warns(FutureWarning): + _ = pyslim.load(tmp_file, legacy_metadata=True) with pytest.raises(ValueError, match="legacy metadata tools"): - _ = pyslim.SlimTreeSequence(ts, legacy_metadata=True) + with pytest.warns(FutureWarning): + _ = pyslim.SlimTreeSequence(ts, legacy_metadata=True) diff --git a/tests/test_spatial.py b/tests/test_spatial.py index 3791e675..ef25408a 100644 --- a/tests/test_spatial.py +++ b/tests/test_spatial.py @@ -42,7 +42,7 @@ def population_size_simple(self, ts, x_bins, y_bins, time_bins, **kwargs): # Location, times, and ages of individuals locations = ts.individual_locations - times = ts.individual_times + times, _, _ = pyslim.individual_times_populations_ages(ts) # Iterate through location bins and time bins for i in np.arange(nxbins): diff --git a/tests/test_tree_sequence.py b/tests/test_tree_sequence.py index c0344930..5190d082 100644 --- a/tests/test_tree_sequence.py +++ b/tests/test_tree_sequence.py @@ -41,7 +41,8 @@ def test_inconsistent_nodes(self): pyslim.annotate_defaults_tables(tables, model_type='nonWF', slim_generation=1) ts = tables.tree_sequence() with pytest.raises(ValueError): - _ = pyslim.SlimTreeSequence(ts) + with pytest.warns(FutureWarning): + _ = pyslim.SlimTreeSequence(ts) def test_inconsistent_times(self): clean_tables = self.clean_example() @@ -51,7 +52,8 @@ def test_inconsistent_times(self): tables.nodes.add_row(time=j, flags=tskit.NODE_IS_SAMPLE, population=1, individual=0) ts = tables.tree_sequence() with pytest.raises(ValueError): - _ = pyslim.SlimTreeSequence(ts) + with pytest.warns(FutureWarning): + _ = pyslim.SlimTreeSequence(ts) def test_bad_metadata(self): clean_tables = self.clean_example() @@ -60,7 +62,8 @@ def test_bad_metadata(self): tables.metadata = {} ts = tables.tree_sequence() with pytest.raises(ValueError): - _ = pyslim.SlimTreeSequence(ts) + with pytest.warns(FutureWarning): + _ = pyslim.SlimTreeSequence(ts) # tmp_path is a pytest fixture, and is a pathlib.Path object @pytest.mark.skip(reason="deprecating this feature") @@ -68,15 +71,20 @@ def test_bad_metadata(self): def test_slim_generation(self, recipe, tmp_path): # tests around awkward backwards-compatible patch for setting slim_generation ts = recipe["ts"] - assert ts.slim_generation == ts.metadata['SLiM']['generation'] + with pytest.warns(FutureWarning): + assert ts.slim_generation == ts.metadata['SLiM']['generation'] new_sg = 12345 - ts.slim_generation = new_sg - assert ts.slim_generation == new_sg + with pytest.warns(FutureWarning): + ts.slim_generation = new_sg + with pytest.warns(FutureWarning): + assert ts.slim_generation == new_sg # check persists through dump/load temp_file = tmp_path / "temp.trees" ts.dump(temp_file.name) - loaded_ts = pyslim.load(temp_file.name) - assert loaded_ts.slim_generation == new_sg + with pytest.warns(FutureWarning): + loaded_ts = pyslim.load(temp_file.name) + with pytest.warns(FutureWarning): + assert loaded_ts.slim_generation == new_sg assert loaded_ts.metadata['SLiM']['generation'] == new_sg # check persists through recapitate recap = self.do_recapitate(ts, recombination_rate=1e-8, ancestral_Ne=10) @@ -87,10 +95,18 @@ def test_slim_generation(self, recipe, tmp_path): def test_pickle(self): ts = self.clean_example().tree_sequence() - ts = pyslim.SlimTreeSequence(ts) - roundtripped = pickle.loads(pickle.dumps(ts)) + with pytest.warns(FutureWarning): + ts = pyslim.SlimTreeSequence(ts) + with pytest.warns(FutureWarning): + roundtripped = pickle.loads(pickle.dumps(ts)) assert roundtripped == ts + def test_deprecation(self): + tables = self.clean_example() + ts = tables.tree_sequence() + with pytest.warns(FutureWarning, match="SlimTreeSequence class is deprecated"): + ts = pyslim.SlimTreeSequence(ts) + class TestSlimTime(tests.PyslimTestCase): # Tests for slim_time() @@ -100,25 +116,17 @@ def test_slim_time(self, recipe): if "init_mutated" not in recipe: for mut in ts.mutations(): mut_time = max([x['slim_time'] for x in mut.metadata['mutation_list']]) - assert mut_time == ts.slim_time(mut.time) + with pytest.warns(FutureWarning): + assert mut_time == ts.slim_time(mut.time) + assert mut_time == pyslim.slim_time(ts, mut.time) # the mutations in "init_mutated" examples have mutations that are *added* # in *early*, and so their times match in that stage. else: for mut in ts.mutations(): mut_time = max([x['slim_time'] for x in mut.metadata['mutation_list']]) - assert mut_time == ts.slim_time(mut.time, stage="early") - - -class TestMutate(tests.PyslimTestCase): - # Tests for making a tree sequence a SlimTreeSequence - # again after msprime.mutate. - - @pytest.mark.parametrize('recipe', recipe_eq(exclude="user_metadata"), indirect=True) - def test_mutate(self, recipe): - ts = recipe["ts"] - mts = msprime.mutate(ts, rate=1e-8, random_seed=5) - pts = pyslim.SlimTreeSequence(mts) - assert ts.metadata == pts.metadata + with pytest.warns(FutureWarning): + assert mut_time == ts.slim_time(mut.time, stage="early") + assert mut_time == pyslim.slim_time(ts, mut.time, stage="early") class TestRecapitate(tests.PyslimTestCase): @@ -245,7 +253,8 @@ def test_with_demography(self, recipe): def test_old_recapitate_errors(self, recipe): ts = recipe["ts"] with pytest.raises(ValueError): - _ = ts.recapitate( + with pytest.warns(FutureWarning): + _ = ts.recapitate( recombination_rate=0.0, keep_first_generation=True) @@ -254,14 +263,16 @@ def test_old_recapitation(self, recipe): if ts.num_populations <= 2: # if not we need migration rates recomb_rate = 1.0 / ts.sequence_length - recap = ts.recapitate(recombination_rate=recomb_rate) + with pytest.warns(FutureWarning): + recap = ts.recapitate(recombination_rate=recomb_rate) # there should be no new mutations assert ts.num_mutations == recap.num_mutations assert ts.num_sites == recap.num_sites assert list(ts.tables.sites.position) == list(recap.tables.sites.position) self.check_recap_consistency(ts, recap) - recap = ts.recapitate(recombination_rate=recomb_rate, Ne=1e-6) + with pytest.warns(FutureWarning): + recap = ts.recapitate(recombination_rate=recomb_rate, Ne=1e-6) self.check_recap_consistency(ts, recap) if ts.metadata['SLiM']['generation'] < 200: for t in recap.trees(): @@ -272,7 +283,8 @@ def test_old_recapitation(self, recipe): positions = [0.0, ts.sequence_length], rates = [recomb_rate, 0.0], num_loci=int(ts.sequence_length)) - recap = ts.recapitate(recombination_map=recombination_map, Ne=1e-6) + with pytest.warns(FutureWarning): + recap = ts.recapitate(recombination_map=recombination_map, Ne=1e-6) self.check_recap_consistency(ts, recap) @@ -290,13 +302,23 @@ def test_individual_embellishments(self, recipe): # Test the individual additional information. ts = recipe["ts"] is_wf = (ts.metadata["SLiM"]["model_type"] == "WF") + times, populations, ages = pyslim.individual_times_populations_ages(ts) + assert np.all( + times == ts.individual_times + ) + assert np.all( + ages == ts.individual_ages + ) + assert np.all( + populations == ts.individual_populations + ) for j, ind in enumerate(ts.individuals()): - assert ts.individual_times[j] == ind.time + assert times[j] == ind.time if is_wf: - assert ts.individual_ages[j] == 0 + assert ages[j] == 0 else: - assert ts.individual_ages[j] == ind.metadata["age"] - assert ts.individual_populations[j] == ind.population + assert ages[j] == ind.metadata["age"] + assert populations[j] == ind.population assert np.array_equal(ts.individual_locations[j], ind.location) def test_first_gen_nodes(self, recipe): @@ -323,7 +345,11 @@ def test_slim_time(self, recipe): ts = recipe["ts"] # check that slim_times make sense, i.e., that # slim_generation == (time + slim_time + (model_type == "WF" and stage="early")) - offset = (ts.metadata["SLiM"]["model_type"] == "WF") and (ts.metadata["SLiM"]["stage"] == "early") + offset = ( + (ts.metadata["SLiM"]["model_type"] == "WF") + and + (ts.metadata["SLiM"]["stage"] == "early") + ) for mut in ts.mutations(): mut_slim_time = max([u["slim_time"] for u in mut.metadata["mutation_list"]]) assert ts.metadata['SLiM']['generation'] == mut_slim_time + mut.time + offset @@ -337,11 +363,11 @@ def test_errors(self, recipe): ts = recipe["ts"] for stage in ['abcd', 10, []]: with pytest.raises(ValueError): - ts.individuals_alive_at(0, stage=stage) + pyslim.individuals_alive_at(ts, 0, stage=stage) with pytest.raises(ValueError): - ts.individuals_alive_at(0, remembered_stage=stage) + pyslim.individuals_alive_at(ts, 0, remembered_stage=stage) with pytest.raises(ValueError): - ts.individual_ages_at(0, stage=stage) + pyslim.individual_ages_at(ts, 0, stage=stage) @pytest.mark.parametrize('recipe', [next(recipe_eq("pedigree", "WF"))], indirect=True) def test_mismatched_remembered_stage(self, recipe): @@ -349,42 +375,43 @@ def test_mismatched_remembered_stage(self, recipe): info = recipe["info"] if "remembered_early" in recipe: with pytest.warns(UserWarning): - ts.individuals_alive_at(0, remembered_stage="late") + pyslim.individuals_alive_at(ts, 0, remembered_stage="late") else: with pytest.warns(UserWarning): - ts.individuals_alive_at(0, remembered_stage="early") + pyslim.individuals_alive_at(ts, 0, remembered_stage="early") @pytest.mark.parametrize('recipe', recipe_eq("multipop", exclude="remembered_early"), indirect=True) def test_population(self, recipe): ts = recipe["ts"] - all_inds = ts.individuals_alive_at(0) + all_inds = pyslim.individuals_alive_at(ts, 0) + times, populations, ages = pyslim.individual_times_populations_ages(ts) for p in range(ts.num_populations): - sub_inds = ts.individuals_alive_at(0, population=p) - assert set(sub_inds) == set(all_inds[ts.individual_populations == p]) - sub_inds = ts.individuals_alive_at(0, population=[p]) - assert set(sub_inds) == set(all_inds[ts.individual_populations == p]) - sub_inds = ts.individuals_alive_at(0, population=np.arange(p)) - assert set(sub_inds) == set(all_inds[ts.individual_populations != p]) + sub_inds = pyslim.individuals_alive_at(ts, 0, population=p) + assert set(sub_inds) == set(all_inds[populations == p]) + sub_inds = pyslim.individuals_alive_at(ts, 0, population=[p]) + assert set(sub_inds) == set(all_inds[populations == p]) + sub_inds = pyslim.individuals_alive_at(ts, 0, population=np.arange(p)) + assert set(sub_inds) == set(all_inds[populations != p]) @pytest.mark.parametrize('recipe', recipe_eq("nonWF", exclude="remembered_early"), indirect=True) def test_samples_only(self, recipe): ts = recipe["ts"] - all_inds = ts.individuals_alive_at(0) - assert set(all_inds) == set(ts.individuals_alive_at(0, samples_only=False)) + all_inds = pyslim.individuals_alive_at(ts, 0) + assert set(all_inds) == set(pyslim.individuals_alive_at(ts, 0, samples_only=False)) sub_inds = np.random.choice(all_inds, size=min(len(all_inds), 4), replace=False) flags = np.array([n.flags & (tskit.NODE_IS_SAMPLE * n.individual in sub_inds) for n in ts.nodes()], dtype=np.uint32) tables = ts.dump_tables() tables.nodes.flags = flags - new_ts = pyslim.SlimTreeSequence(tables.tree_sequence()) - assert set(sub_inds) == set(new_ts.individuals_alive_at(0, samples_only=True)) + new_ts = tables.tree_sequence() + assert set(sub_inds) == set(pyslim.individuals_alive_at(new_ts, 0, samples_only=True)) @pytest.mark.parametrize('recipe', recipe_eq(exclude="remembered_early"), indirect=True) def test_after_simplify(self, recipe): ts = recipe["ts"] sts = ts.simplify() - orig_inds = ts.individuals_alive_at(0) - simp_inds = sts.individuals_alive_at(0) + orig_inds = pyslim.individuals_alive_at(ts, 0) + simp_inds = pyslim.individuals_alive_at(sts, 0) odict = {ts.individual(i).metadata["pedigree_id"]: i for i in orig_inds} sdict = {sts.individual(i).metadata["pedigree_id"]: i for i in simp_inds} for slim_id in odict: @@ -416,11 +443,13 @@ def test_ages(self, recipe): else: check_stages = ('early', 'late') for stage in check_stages: - alive = ts.individuals_alive_at( + alive = pyslim.individuals_alive_at( + ts, time, stage=stage, remembered_stage=remembered_stage) - ages = ts.individual_ages_at( + ages = pyslim.individual_ages_at( + ts, time, stage=stage, remembered_stage=remembered_stage) @@ -448,6 +477,7 @@ def verify_has_parents(self, ts): node_indivs = ts.tables.nodes.individual parent_ids = [set() for _ in ts.individuals()] node_parent_ids = [set() for _ in ts.nodes()] + times, _, ages = pyslim.individual_times_populations_ages(ts) for t in ts.trees(): for i in ts.individuals(): if len(i.nodes) != 2: @@ -461,13 +491,13 @@ def verify_has_parents(self, ts): if p == tskit.NULL: right_answer[i.id] = False else: - ptime = ts.individual_times[p] + ptime = times[p] parent_alive = True if ts.metadata["SLiM"]["model_type"] == "WF": if i.time + 1 != ptime: parent_alive = False else: - pdeath = ptime - ts.individual_ages[p] + pdeath = ptime - ages[p] if i.time + 1 < pdeath: parent_alive = False if not parent_alive: @@ -488,9 +518,9 @@ def verify_has_parents(self, ts): if right_answer[j]: for pp in p: right_parents.append([pp, j]) - has_parents = ts.has_individual_parents() + has_parents = pyslim.has_individual_parents(ts) right_parents = np.sort(np.array(right_parents), axis=0) - parents = np.sort(ts.individual_parents(), axis=0) + parents = np.sort(pyslim.individual_parents(ts), axis=0) assert np.array_equal(right_answer, has_parents) assert np.array_equal(right_parents, parents) @@ -510,7 +540,7 @@ def test_everyone(self, recipe): right_answer = np.repeat(True, ts.num_individuals) first_gen = self.get_first_gen(ts) right_answer[first_gen] = False - has_parents = ts.has_individual_parents() + has_parents = pyslim.has_individual_parents(ts) assert np.array_equal(right_answer, has_parents) self.verify_has_parents(ts) @@ -524,7 +554,7 @@ def test_post_recap(self, recipe): assert(ts.num_populations <= 2) ts = self.do_recapitate(ts, recombination_rate=0.01, ancestral_Ne=10) assert(ts.num_individuals == ts.num_individuals) - has_parents = ts.has_individual_parents() + has_parents = pyslim.has_individual_parents(ts) assert np.array_equal(right_answer, has_parents) self.verify_has_parents(ts) @@ -540,7 +570,7 @@ def test_post_simplify(self, recipe): ts = ts.simplify(samples=keep_nodes, filter_individuals=True, keep_input_roots=True) assert(ts.num_populations <= 2) ts = self.do_recapitate(ts, recombination_rate=0.01, ancestral_Ne=10) - has_parents = ts.has_individual_parents() + has_parents = pyslim.has_individual_parents(ts) assert sum(has_parents) > 0 self.verify_has_parents(ts) @@ -554,8 +584,8 @@ def test_pedigree_parents_everyone(self, recipe): # and so the parentage would not be reported by `individual_parents()`. ts = recipe["ts"] info = recipe["info"] - has_parents = ts.has_individual_parents() - parents = ts.individual_parents() + has_parents = pyslim.has_individual_parents(ts) + parents = pyslim.individual_parents(ts) slim_map = {} for ind in ts.individuals(): slim_map[ind.metadata["pedigree_id"]] = ind.id @@ -584,8 +614,8 @@ def test_pedigree_parents(self, recipe): # or REMEMBERED individuals (not RETAINED), and the same for parental genomes. ts = recipe["ts"] info = recipe["info"] - has_parents = ts.has_individual_parents() - parents = ts.individual_parents() + has_parents = pyslim.has_individual_parents(ts) + parents = pyslim.individual_parents(ts) slim_map = {} for ind in ts.individuals(): slim_map[ind.metadata["pedigree_id"]] = ind.id @@ -636,6 +666,7 @@ def test_simplify(self, recipe): class TestReferenceSequence(tests.PyslimTestCase): ''' Test for operations involving the reference sequence + TODO: run some of these tests on fewer examples ''' def test_reference_sequence(self, recipe): @@ -659,13 +690,13 @@ def test_mutation_at_errors(self, recipe): ts = recipe["ts"] u = ts.samples()[0] with pytest.raises(ValueError): - ts.mutation_at(-2, 3) + pyslim.mutation_at(ts, -2, 3) with pytest.raises(ValueError): - ts.mutation_at(u, -3) + pyslim.mutation_at(ts, u, -3) with pytest.raises(ValueError): - ts.mutation_at(ts.num_nodes + 2, 3) + pyslim.mutation_at(ts, ts.num_nodes + 2, 3) with pytest.raises(ValueError): - ts.mutation_at(u, ts.sequence_length) + pyslim.mutation_at(ts, u, ts.sequence_length) def test_nucleotide_at_errors(self, recipe): ts = recipe["ts"] @@ -675,7 +706,7 @@ def test_nucleotide_at_errors(self, recipe): has_nucleotides = (mut_md["mutation_list"][0]["nucleotide"] >= 0) if not has_nucleotides: with pytest.raises(ValueError): - ts.nucleotide_at(u, 3) + pyslim.nucleotide_at(ts, u, 3) def test_mutation_at(self, recipe): random.seed(42) @@ -685,12 +716,14 @@ def test_mutation_at(self, recipe): pos = random.randint(0, ts.sequence_length - 1) tree = ts.at(pos) parent = tree.parent(node) - a = ts.mutation_at(node, pos) + a = pyslim.mutation_at(ts, node, pos) + with pytest.warns(FutureWarning): + assert a == ts.mutation_at(node, pos) if parent == tskit.NULL: assert a == tskit.NULL else: - b = ts.mutation_at(parent, pos) - c = ts.mutation_at(node, pos, ts.node(parent).time) + b = pyslim.mutation_at(ts, parent, pos) + c = pyslim.mutation_at(ts, node, pos, ts.node(parent).time) assert b == c for k in np.where(node == ts.tables.mutations.node)[0]: mut = ts.mutation(k) @@ -712,13 +745,15 @@ def test_nucleotide_at(self, recipe): pos = random.randint(0, ts.sequence_length - 1) tree = ts.at(pos) parent = tree.parent(node) - a = ts.nucleotide_at(node, pos) + a = pyslim.nucleotide_at(ts, node, pos) + with pytest.warns(FutureWarning): + assert a == ts.nucleotide_at(node, pos) if parent == tskit.NULL: nuc = ts.reference_sequence.data[int(pos)] assert a == pyslim.NUCLEOTIDES.index(nuc) else: - b = ts.nucleotide_at(parent, pos) - c = ts.nucleotide_at(node, pos, ts.node(parent).time) + b = pyslim.nucleotide_at(ts, parent, pos) + c = pyslim.nucleotide_at(ts, node, pos, ts.node(parent).time) assert b == c for k in np.where(node == ts.tables.mutations.node)[0]: mut = ts.mutation(k)