-
Notifications
You must be signed in to change notification settings - Fork 84
/
msprime.h
539 lines (482 loc) · 18.2 KB
/
msprime.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
/*
** Copyright (C) 2015-2020 University of Oxford
**
** This file is part of msprime.
**
** msprime is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** msprime is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with msprime. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __MSPRIME_H__
#define __MSPRIME_H__
#include <stdio.h>
#include <stdbool.h>
#include <gsl/gsl_rng.h>
#define TSK_BUG_ASSERT_MESSAGE \
"Please report this issue on GitHub, ideally with a reproducible example." \
" (https://github.com/tskit-dev/msprime/issues)"
#include <tskit.h>
#include "util.h"
#include "avl.h"
#include "fenwick.h"
#include "object_heap.h"
#include "rate_map.h"
#define MSP_MODEL_HUDSON 0
#define MSP_MODEL_SMC 1
#define MSP_MODEL_SMC_PRIME 2
#define MSP_MODEL_BETA 3
#define MSP_MODEL_DIRAC 4
#define MSP_MODEL_DTWF 5
#define MSP_MODEL_SWEEP 6
#define MSP_MODEL_WF_PED 7
/* Exit codes from msp_run to distinguish different reasons for exiting
* before coalescence. */
#define MSP_EXIT_COALESCENCE 0
#define MSP_EXIT_MAX_EVENTS 1
#define MSP_EXIT_MAX_TIME 2
#define MSP_EXIT_MODEL_COMPLETE 3
#define MSP_NODE_IS_RE_EVENT (1u << 17)
#define MSP_NODE_IS_CA_EVENT (1u << 18)
#define MSP_NODE_IS_MIG_EVENT (1u << 19)
#define MSP_NODE_IS_CEN_EVENT (1u << 20)
#define MSP_NODE_IS_GC_EVENT (1u << 21)
#define MSP_NODE_IS_PASS_THROUGH (1u << 22)
/* Flags for verify */
#define MSP_VERIFY_BREAKPOINTS (1 << 1)
/* Flags for mutgen */
#define MSP_KEEP_SITES (1 << 0)
#define MSP_DISCRETE_SITES (1 << 1)
/* Pedigree states */
#define MSP_PED_STATE_UNCLIMBED 0
#define MSP_PED_STATE_CLIMBING 1
#define MSP_PED_STATE_CLIMB_COMPLETE 2
typedef tsk_id_t population_id_t;
typedef tsk_id_t label_id_t;
typedef struct segment_t_t {
population_id_t population;
label_id_t label;
double left;
double right;
tsk_id_t value;
size_t id;
struct segment_t_t *prev;
struct segment_t_t *next;
} segment_t;
typedef struct {
double position;
uint32_t value;
} node_mapping_t;
#define MSP_POP_STATE_INACTIVE 0
#define MSP_POP_STATE_ACTIVE 1
#define MSP_POP_STATE_PREVIOUSLY_ACTIVE 2
typedef struct {
/* The starting size and time for the current epoch for this population. */
double initial_size;
double start_time;
double growth_rate;
/* The lifecycle state machine. States go strictly from
* inactive -> active -> previously_active */
int state;
avl_tree_t *ancestors;
tsk_size_t num_potential_destinations;
tsk_id_t *potential_destinations;
} population_t;
#define MSP_MAX_PED_PLOIDY 2
/* Note: we might want to make a distinction here between "individual"
* and "pedigree individual". We might want to have a more generic
* individual type at some point which just consists of an array of
* ploidy segment pointers (one head for each strand). This struct
* is quite specialised for the purpose of running the pedigree
* simulation. */
typedef struct {
tsk_id_t id;
tsk_size_t ploidy;
double time;
population_id_t population;
tsk_id_t parents[MSP_MAX_PED_PLOIDY];
tsk_id_t nodes[MSP_MAX_PED_PLOIDY];
avl_tree_t common_ancestors[MSP_MAX_PED_PLOIDY];
} individual_t;
typedef struct {
individual_t *individuals;
individual_t **visit_order;
/* The number of nodes in the input pedigree. */
tsk_size_t num_nodes;
tsk_size_t num_individuals;
tsk_id_t next_individual;
} pedigree_t;
typedef struct {
double time;
tsk_id_t sample;
population_id_t population;
} sampling_event_t;
/* Simulation models */
typedef struct {
double alpha;
double truncation_point;
} beta_coalescent_t;
typedef struct {
double psi;
double c;
} dirac_coalescent_t;
/* Forward declaration */
struct _msp_t;
typedef struct {
/* TODO document these parameters.*/
double start_frequency;
double end_frequency;
double s;
double dt;
} genic_selection_trajectory_t;
typedef struct _sweep_t {
double position;
union {
/* Future trajectory simulation models would go here */
genic_selection_trajectory_t genic_selection_trajectory;
} trajectory_params;
int (*generate_trajectory)(struct _sweep_t *self, struct _msp_t *simulator,
size_t *num_steps, double **time, double **allele_frequency);
void (*print_state)(struct _sweep_t *self, FILE *out);
} sweep_t;
typedef struct _simulation_model_t {
int type;
union {
beta_coalescent_t beta_coalescent;
dirac_coalescent_t dirac_coalescent;
sweep_t sweep;
} params;
/* If the model allocates memory this function should be non-null. */
void (*free)(struct _simulation_model_t *model);
} simulation_model_t;
typedef struct {
double left;
uint32_t count;
} overlap_count_t;
typedef struct _msp_t {
gsl_rng *rng;
/* input parameters */
simulation_model_t model;
bool store_migrations;
bool store_full_arg;
uint32_t additional_nodes;
bool coalescing_segments_only;
double sequence_length;
bool discrete_genome;
rate_map_t recomb_map;
rate_map_t gc_map;
double gc_tract_length;
uint32_t num_populations;
uint32_t num_labels;
uint32_t ploidy;
double start_time;
pedigree_t pedigree;
/* Initial state for replication */
segment_t **root_segments;
overlap_count_t *initial_overlaps;
simulation_model_t initial_model;
double *initial_migration_matrix;
population_t *initial_populations;
/* allocation block sizes */
size_t avl_node_block_size;
size_t node_mapping_block_size;
size_t segment_block_size;
/* Counters for statistics */
size_t num_re_events;
size_t num_ca_events;
size_t num_gc_events;
size_t num_internal_gc_events;
double sum_internal_gc_tract_lengths;
size_t num_rejected_ca_events;
size_t *num_migration_events;
size_t num_trapped_re_events;
size_t num_multiple_re_events;
size_t num_noneffective_gc_events;
size_t num_fenwick_rebuilds;
/* sampling events */
sampling_event_t *sampling_events;
size_t num_sampling_events;
size_t next_sampling_event;
/* Demographic events */
struct demographic_event_t_t *demographic_events_head;
struct demographic_event_t_t *demographic_events_tail;
struct demographic_event_t_t *next_demographic_event;
/* algorithm state */
int state;
double time;
double *migration_matrix;
population_t *populations;
avl_tree_t non_empty_populations;
avl_tree_t breakpoints;
avl_tree_t overlap_counts;
/* We keep an independent Fenwick tree for each label */
fenwick_t *recomb_mass_index;
fenwick_t *gc_mass_index;
/* memory management */
object_heap_t avl_node_heap;
object_heap_t node_mapping_heap;
/* We keep an independent segment heap for each label */
object_heap_t *segment_heap;
/* The tables used to store the simulation state */
tsk_table_collection_t *tables;
tsk_bookmark_t input_position;
/* edges are buffered in a flat array until they are squashed and flushed */
tsk_edge_t *buffered_edges;
tsk_size_t num_buffered_edges;
tsk_size_t max_buffered_edges;
/* Methods for getting the waiting time until the next common ancestor
* event and the event are defined by the simulation model */
double (*get_common_ancestor_waiting_time)(
struct _msp_t *self, population_id_t pop, label_id_t label);
int (*common_ancestor_event)(
struct _msp_t *selt, population_id_t pop, label_id_t label);
} msp_t;
/* Demographic events */
typedef struct {
population_id_t population;
double initial_size;
double growth_rate;
} population_parameters_change_t;
typedef struct {
int matrix_index;
double migration_rate;
} migration_rate_change_t;
typedef struct {
population_id_t source;
population_id_t destination;
double proportion;
} mass_migration_t;
/* Arbitrary limit, saves us having to put in complex malloc/free
* logic in the demographic_events. Can easily be changed if
* needs be. */
#define MSP_MAX_EVENT_POPULATIONS 100
typedef struct {
population_id_t derived[MSP_MAX_EVENT_POPULATIONS];
population_id_t ancestral;
size_t num_derived;
} population_split_t;
typedef struct {
population_id_t population;
} deactivate_population_t;
typedef struct {
population_id_t population;
} activate_population_t;
typedef struct {
population_id_t derived;
population_id_t ancestral[MSP_MAX_EVENT_POPULATIONS];
double proportion[MSP_MAX_EVENT_POPULATIONS];
size_t num_ancestral;
} admixture_t;
typedef struct {
population_id_t population;
double proportion;
} simple_bottleneck_t;
typedef struct {
population_id_t population;
double strength;
} instantaneous_bottleneck_t;
typedef struct demographic_event_t_t {
double time;
int (*change_state)(msp_t *, struct demographic_event_t_t *);
void (*print_state)(msp_t *, struct demographic_event_t_t *, FILE *out);
union {
simple_bottleneck_t simple_bottleneck;
instantaneous_bottleneck_t instantaneous_bottleneck;
mass_migration_t mass_migration;
population_split_t population_split;
deactivate_population_t deactivate_population;
activate_population_t activate_population;
admixture_t admixture;
migration_rate_change_t migration_rate_change;
population_parameters_change_t population_parameters_change;
} params;
struct demographic_event_t_t *next;
} demographic_event_t;
/* The site_t and mutation_t are similar the equivalent tsk_ types,
* with some extra fields that we need for mutation generation. */
typedef struct mutation_t {
tsk_id_t id;
tsk_id_t site;
tsk_id_t node;
char *derived_state;
tsk_size_t derived_state_length;
char *metadata;
tsk_size_t metadata_length;
double time;
struct mutation_t *parent;
struct mutation_t *next;
bool new;
} mutation_t;
typedef struct {
double position;
char *ancestral_state;
tsk_size_t ancestral_state_length;
char *metadata;
tsk_size_t metadata_length;
mutation_t *mutations;
size_t mutations_length;
bool new;
} site_t;
typedef struct {
size_t num_alleles;
char **alleles;
tsk_size_t *allele_length;
double *root_distribution;
double *transition_matrix;
} mutation_matrix_t;
typedef struct {
int32_t mutation_type_id; // following SLiM's MutationMetadataRec
int64_t next_mutation_id; // following SLiM's slim_mutationid_t
int32_t slim_generation;
tsk_blkalloc_t allocator;
} slim_mutator_t;
typedef struct {
uint64_t start_allele;
uint64_t next_allele;
tsk_blkalloc_t allocator;
} infinite_alleles_t;
typedef struct _mutation_model_t {
union {
mutation_matrix_t mutation_matrix;
slim_mutator_t slim_mutator;
infinite_alleles_t infinite_alleles;
/* Other known mutation models */
} params;
void (*print_state)(struct _mutation_model_t *model, FILE *out);
int (*free)(struct _mutation_model_t *model);
int (*choose_root_state)(
struct _mutation_model_t *model, gsl_rng *rng, site_t *site);
int (*transition)(struct _mutation_model_t *model, gsl_rng *rng,
const char *parent_allele, tsk_size_t parent_allele_length,
const char *parent_metadata, tsk_size_t parent_metadata_length,
mutation_t *mutation);
} mutation_model_t;
typedef struct {
gsl_rng *rng;
tsk_table_collection_t *tables;
double start_time;
double end_time;
size_t block_size;
rate_map_t rate_map;
avl_tree_t sites;
tsk_blkalloc_t allocator;
mutation_model_t *model;
} mutgen_t;
int msp_alloc(msp_t *self, tsk_table_collection_t *tables, gsl_rng *rng);
int msp_set_simulation_model_hudson(msp_t *self);
int msp_set_simulation_model_smc(msp_t *self);
int msp_set_simulation_model_smc_prime(msp_t *self);
int msp_set_simulation_model_dtwf(msp_t *self);
int msp_set_simulation_model_fixed_pedigree(msp_t *self);
int msp_set_simulation_model_dirac(msp_t *self, double psi, double c);
int msp_set_simulation_model_beta(msp_t *self, double alpha, double truncation_point);
int msp_set_simulation_model_sweep_genic_selection(msp_t *self, double position,
double start_frequency, double end_frequency, double s, double dt);
int msp_set_start_time(msp_t *self, double start_time);
int msp_set_store_migrations(msp_t *self, bool store_migrations);
int msp_set_store_full_arg(msp_t *self, bool store_full_arg);
int msp_set_additional_nodes(msp_t *self, uint32_t additional_nodes);
int msp_set_coalescing_segments_only(msp_t *self, bool coalescing_segments_only);
int msp_set_ploidy(msp_t *self, int ploidy);
int msp_set_recombination_map(msp_t *self, size_t size, double *position, double *rate);
int msp_set_recombination_rate(msp_t *self, double rate);
int msp_set_gene_conversion_map(
msp_t *self, size_t size, double *position, double *rate);
int msp_set_gene_conversion_rate(msp_t *self, double rate);
int msp_set_gene_conversion_tract_length(msp_t *self, double tract_length);
int msp_set_discrete_genome(msp_t *self, bool is_discrete);
int msp_set_num_labels(msp_t *self, size_t num_labels);
int msp_set_node_mapping_block_size(msp_t *self, size_t block_size);
int msp_set_segment_block_size(msp_t *self, size_t block_size);
int msp_set_avl_node_block_size(msp_t *self, size_t block_size);
int msp_set_migration_matrix(msp_t *self, size_t size, double *migration_matrix);
int msp_set_population_configuration(msp_t *self, int population_id, double initial_size,
double growth_rate, bool initially_active);
int msp_add_population_parameters_change(
msp_t *self, double time, int population_id, double size, double growth_rate);
int msp_add_migration_rate_change(
msp_t *self, double time, int source_pop, int dest_pop, double migration_rate);
int msp_add_mass_migration(
msp_t *self, double time, int source, int dest, double proportion);
int msp_add_population_split(
msp_t *self, double time, size_t num_derived, int32_t *derived, int ancestral);
int msp_add_activate_population_event(msp_t *self, double time, int population_id);
int msp_add_deactivate_population_event(msp_t *self, double time, int population_id);
int msp_add_admixture(msp_t *self, double time, int derived, size_t num_ancestral,
int32_t *ancestral, double *proportions);
int msp_add_simple_bottleneck(
msp_t *self, double time, int population_id, double intensity);
int msp_add_instantaneous_bottleneck(
msp_t *self, double time, int population_id, double strength);
int msp_add_census_event(msp_t *self, double time);
int msp_initialise(msp_t *self);
int msp_run(msp_t *self, double max_time, unsigned long max_events);
int msp_debug_demography(msp_t *self, double *end_time);
int msp_finalise_tables(msp_t *self);
int msp_reset(msp_t *self);
int msp_print_state(msp_t *self, FILE *out);
int msp_free(msp_t *self);
void msp_verify(msp_t *self, int options);
int msp_get_ancestors(msp_t *self, segment_t **ancestors);
int msp_get_breakpoints(msp_t *self, size_t *breakpoints);
int msp_get_migration_matrix(msp_t *self, double *migration_matrix);
int msp_get_num_migration_events(msp_t *self, size_t *num_migration_events);
int msp_get_population_configuration(msp_t *self, size_t population_id,
double *initial_size, double *growth_rate, int *state);
int msp_compute_population_size(
msp_t *self, size_t population_id, double time, double *pop_size);
int msp_is_completed(msp_t *self);
simulation_model_t *msp_get_model(msp_t *self);
const char *msp_get_model_name(msp_t *self);
bool msp_get_store_migrations(msp_t *self);
double msp_get_time(msp_t *self);
size_t msp_get_num_samples(msp_t *self);
size_t msp_get_num_loci(msp_t *self);
size_t msp_get_num_populations(msp_t *self);
size_t msp_get_num_labels(msp_t *self);
size_t msp_get_num_population_ancestors(msp_t *self, tsk_id_t population);
size_t msp_get_num_ancestors(msp_t *self);
size_t msp_get_num_breakpoints(msp_t *self);
size_t msp_get_num_nodes(msp_t *self);
size_t msp_get_num_edges(msp_t *self);
size_t msp_get_num_migrations(msp_t *self);
size_t msp_get_num_avl_node_blocks(msp_t *self);
size_t msp_get_num_node_mapping_blocks(msp_t *self);
size_t msp_get_num_segment_blocks(msp_t *self);
size_t msp_get_num_common_ancestor_events(msp_t *self);
size_t msp_get_num_rejected_common_ancestor_events(msp_t *self);
size_t msp_get_num_recombination_events(msp_t *self);
size_t msp_get_num_gene_conversion_events(msp_t *self);
size_t msp_get_num_internal_gene_conversion_events(msp_t *self);
size_t msp_get_num_noneffective_gene_conversion_events(msp_t *self);
double msp_get_sum_internal_gc_tract_lengths(msp_t *self);
int matrix_mutation_model_factory(mutation_model_t *self, int model);
int matrix_mutation_model_alloc(mutation_model_t *self, size_t num_alleles,
char **alleles, size_t *allele_length, double *root_distribution,
double *transition_matrix);
int slim_mutation_model_alloc(mutation_model_t *self, int32_t mutation_type_id,
int64_t next_mutation_id, int32_t slim_generation, size_t block_size);
int infinite_alleles_mutation_model_alloc(
mutation_model_t *self, uint64_t start_allele, tsk_flags_t options);
int mutation_model_free(mutation_model_t *self);
void mutation_model_print_state(mutation_model_t *self, FILE *out);
int mutgen_alloc(mutgen_t *self, gsl_rng *rng, tsk_table_collection_t *tables,
mutation_model_t *model, size_t mutation_block_size);
int mutgen_set_time_interval(mutgen_t *self, double start_time, double end_time);
int mutgen_set_rate(mutgen_t *self, double rate);
int mutgen_set_rate_map(mutgen_t *self, size_t size, double *position, double *rate);
int mutgen_free(mutgen_t *self);
int mutgen_generate(mutgen_t *self, int flags);
void mutgen_print_state(mutgen_t *self, FILE *out);
/* Functions exposed here for unit testing. Not part of public API. */
int msp_multi_merger_common_ancestor_event(
msp_t *self, avl_tree_t *ancestors, avl_tree_t *Q, uint32_t k, uint32_t num_pots);
#endif /*__MSPRIME_H__*/