In [1]:
import time
from pathlib import Path

import msprime
import tskit

### Simulate ancestry and mutations along the ancestry for chr20 using human-like parameters, assuming a panmictic population (Ne = 10,000)

In [2]:
# Parameters of the msprime simulation
eff_pop_size = 10_000      # effective population size (Ne)
ploidy_level = 2           # ploidy of each sampled individual
full_cohort_size = 20_000  # number of individuals to sample

In [3]:
# Load the HapMap Phase II recombination map for chr20 (GRCh37).
# Source of the map files:
# https://ftp.ncbi.nlm.nih.gov/hapmap/recombination/2011-01_phaseII_B37/
#
# Notes:
# 1. No 'sequence_length' is passed, so it is automatically set to the end
#    coordinate in the HapMap file.
# 2. The default map is "flat", i.e., a uniform recombination rate along the
#    entire genome; the map loaded here is used instead of that default.
rate_map = msprime.RateMap.read_hapmap("../hapmap/genetic_map_GRCh37_chr20_reduced.txt")

In [4]:
# Simulate a tree sequence that contains both the ancestry and mutations along the ancestry.
#
# Notes:
# 1. In humans, the genome-wide recombination rate and mutation rate are roughly equal (~1e-8).
# 2. The Hudson coalescent approximates the discrete-time Wright-Fisher model.
# 3. Both simulation steps are seeded, so rerunning the notebook reproduces the
#    same tree sequence instead of a new random draw each time.
sim_seed = 42

# perf_counter() is monotonic, so the measured duration is not affected by
# adjustments of the system clock while the simulation runs.
tic = time.perf_counter()

# Step 1: simulate only the genealogical trees (no mutations yet)
ts = msprime.sim_ancestry(
    samples=full_cohort_size,  # i.e., individuals
    population_size=eff_pop_size,  # effective population size
    ploidy=ploidy_level,
    model="hudson",
    recombination_rate=rate_map,
    random_seed=sim_seed,
)

# Step 2: simulate mutations along the trees in the tree sequence
mts = msprime.sim_mutations(
    ts,
    rate=1e-8,  # human-like genome-wide mutation rate
    random_seed=sim_seed,
)

toc = time.perf_counter()
print(f"Simulating ancestry and mutations took {round(toc - tic)} seconds.")

The provenance information for the resulting tree sequence is 2.76MB. This is nothing to worry about as provenance is a good thing to have, but if you want to save this memory/storage space you can disable provenance recording by setting record_provenance=False


Simulating ancestry and mutations took 60 seconds.


In [5]:
# Summarise the simulated tree sequences: `ts` holds the ancestry only,
# `mts` is the same ancestry with mutations added.
print(f"Number of individuals of {ploidy_level}x ploidy is {ts.num_individuals}.")
print(f"Number of sample nodes is {ts.num_samples}.")
print(f"Number of all nodes is {ts.num_nodes}.")
# ts.num_nodes - ts.num_samples equals the number of non-sample nodes
print(f"Number of trees in the tree sequence is {ts.num_trees}.")

print(f"Number of mutations before mutation simulation is {ts.num_mutations}.")
print(f"Number of mutations after  mutation simulation is {mts.num_mutations}.")

print(f"Number of sites with genetic variation before mutation simulation is {ts.num_sites}.")
print(f"Number of sites with genetic variation after mutation simulation  is {mts.num_sites}.")

# nbytes is a byte count; dividing by 1e6 converts it to megabytes, so the
# message must report megabytes rather than bytes.
print(f"Number of megabytes to store the tree sequence without mutations is {ts.nbytes / 1e6}.")
print(f"Number of megabytes to store the tree sequence with mutations is {mts.nbytes / 1e6}.")

Number of individuals of 2x ploidy is 20000.
Number of sample nodes is 40000.
Number of all nodes is 371730.
Number of trees in the tree sequence is 438836.
Number of mutations before mutation simulation is 0.
Number of mutations after  mutation simulation is 281568.
Number of sites with genetic variation before mutation simulation is 0.
Number of sites with genetic variation after mutation simulation  is 280901.
Number of bytes to store the tree sequence without mutations is 86.381855.
Number of bytes to store the tree sequence with mutations is 103.823137.


In [6]:
# Store the simulated tree sequence (ancestry plus mutations) on disk
mts_file = "./data/simulated_tree_sequences/simulated_source_panmictic.trees"
# Create the output directory first: on a fresh checkout it may not exist yet,
# and the dump would then fail only after the long simulation has finished.
Path(mts_file).parent.mkdir(parents=True, exist_ok=True)
mts.dump(mts_file)