In [10]:
# Environment setup: export an environment variable and install the pinned requirements.
# NOTE(review): PYTHONX does not appear to be an environment variable that CPython reads
# (frozen modules are normally disabled with the `-X frozen_modules=off` interpreter
# option) - confirm that this line has the intended effect.
%env PYTHONX=-Xfrozen_modules=off
# Use the %pip magic rather than `! pip` so the packages are installed into the
# environment of the running kernel, not whichever `pip` is first on the shell PATH.
%pip install -r workflow/requirements.txt

Collecting pyslim==1.0.4 (from -r workflow/requirements.txt (line 5))
  Using cached pyslim-1.0.4-py3-none-any.whl.metadata (1.3 kB)
Collecting matplotlib==3.8.1 (from -r workflow/requirements.txt (line 7))
  Using cached matplotlib-3.8.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (5.8 kB)
Collecting Ipython==8.14.0 (from -r workflow/requirements.txt (line 8))
  Using cached ipython-8.14.0-py3-none-any.whl.metadata (5.8 kB)
Using cached pyslim-1.0.4-py3-none-any.whl (65 kB)
Using cached matplotlib-3.8.1-cp311-cp311-macosx_11_0_arm64.whl (7.5 MB)
Using cached ipython-8.14.0-py3-none-any.whl (798 kB)
Installing collected packages: matplotlib, Ipython, pyslim
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.8.2
    Uninstalling matplotlib-3.8.2:
      Successfully uninstalled matplotlib-3.8.2
Successfully installed Ipython-8.14.0 matplotlib-3.8.1 pyslim-1.0.4


In [11]:
import pyslim
import tskit
import msprime
import pandas as pd
import numpy as np
import io
import random
import matplotlib.pyplot as plt
from IPython.display import display,SVG

<!-- parameters -->
- sample_size: 50
- sequence_length: 100000
- mutation_rate: 3e-08
- recombination_rate: 3e-09
- N: 50000
- selection_coefficient : 1

In [12]:
# Draw a seed for this run and record it in the text log for this selection coefficient.
random_seed = random.randint(1, 1000000)
# NOTE: despite its name, output_pdf_path points at a plain-text (.txt) log file.
output_pdf_path = f"output/output_selection/selection_txt/selection_{selection_coefficient}.txt"
with open(output_pdf_path, "a") as log_handle:
    seed_line = f" the random seed used for this simulation is {random_seed} \n"
    log_handle.write(seed_line)

In this notebook we use the pyslim package. We have already simulated trees after a selective sweep, but the simulation ran for fewer than 2Ne generations, so there is a high chance that the trees have not yet reached their most recent common ancestor (MRCA).

pyslim lets us perform a coalescent burn-in on the data until the MRCA (most recent common ancestor) is found, and then put neutral mutations on the tree.

There are three steps in this case :
1. Recapitation 
2. Simplification
3. Neutral Mutation

# Recapitate
The population may not have coalesced entirely (i.e. reached demographic equilibrium). Recapitation performs the coalescent burn-in based on the parameters we provide.
# Simplification
It reduces the number of external nodes to a few, as we only need a sample rather than the whole population.
# Neutral Mutation
At the end, the neutral mutations are added to the tree sequence.




In [19]:
# Read the tree sequence file written for this selection coefficient.
trees_path = f"output/output_selection/selection_trees/selection_{selection_coefficient}.trees"
ts_load = tskit.load(trees_path)

In [18]:
# Recapitate the loaded tree sequence, then report the largest per-tree root
# count before and after, both on screen and in the text log.
rts = pyslim.recapitate(
    ts_load,
    recombination_rate=recombination_rate,
    ancestral_Ne=N,
    random_seed=random_seed,
)

orig_max_roots = max(tree.num_roots for tree in ts_load.trees())
recap_max_roots = max(tree.num_roots for tree in rts.trees())

roots_report = (f"Maximum number of roots before recapitation: {orig_max_roots}\n"
                f"After recapitation: {recap_max_roots}")
print(roots_report)
with open(output_pdf_path, "a") as txt_file:
    # The logged copy carries a trailing space and newline after the report.
    txt_file.write(roots_report + " \n")





Maximum number of roots before recapitation: 1
After recapitation: 1


In [20]:
# Simplify: draw `sample_size` individuals (without replacement) from those alive
# at time 0 and reduce the recapitated tree sequence to their nodes.

rng = np.random.default_rng(seed=random_seed)
alive_inds = pyslim.individuals_alive_at(rts, 0)
keep_indivs = rng.choice(alive_inds, sample_size, replace=False)
# Flatten the node IDs of every chosen individual into one list.
keep_nodes = [node for ind_id in keep_indivs for node in rts.individual(ind_id).nodes]

sts = rts.simplify(keep_nodes, keep_input_roots=True)

# Log the sample-node and individual counts before and after simplification.
with open(output_pdf_path, "a") as txt_file:
    simplify_summary = (
        f"Before, there were {rts.num_samples} sample nodes (and {rts.num_individuals} individuals)\n"
        f"in the tree sequence, and now there are {sts.num_samples} sample nodes\n"
        f"(and {sts.num_individuals} individuals).\n"
    )
    txt_file.write(simplify_summary)

NameError: name 'sample_size' is not defined

In [3]:
# Adding the mutations to the tree
# Mutations are overlaid on the simplified tree sequence with a SLiM-style mutation
# model; keep=True retains the mutations already present in `sts`.
next_id = pyslim.next_slim_mutation_id(sts)
# Pass the recorded seed so the mutation placement is reproducible from the
# seed written to the log (recapitation and sampling already use it).
ts = msprime.sim_mutations(sts,
                           rate=mutation_rate,
                           model=msprime.SLiMMutationModel(type=0, next_id=next_id),
                           keep=True,
                           random_seed=random_seed)
                           


NameError: name 'pyslim' is not defined

In [None]:
# Turning the tree sequences into the vcf file
# generate_nucleotides assigns nucleotides at random, so it is seeded with the
# recorded random_seed to keep the exported VCF/FASTA reproducible.
nts = pyslim.generate_nucleotides(ts, seed=random_seed)
nts = pyslim.convert_alleles(nts)

# Export the nucleotide-annotated tree sequence as VCF.
vcf_file_path = f"output/output_selection/vcf/selection_{selection_coefficient}.vcf"
with open(vcf_file_path, "w") as vcf_file:
    nts.write_vcf(vcf_file)
# Export the same tree sequence as FASTA.
fasta_file_path= f"output/output_selection/fasta/selection_{selection_coefficient}.fasta"
with open(fasta_file_path,"w") as fasta_file:
    nts.write_fasta(fasta_file)


In [None]:
# Summary statistics: segregating sites, the polarised site frequency spectrum
# (divided by the number of segregating sites) and Tajima's D.
ns = nts.segregating_sites(span_normalise=False)
site_afs = nts.allele_frequency_spectrum(mode="site", polarised=True, span_normalise=False)
# NOTE(review): the slice drops the first TWO entries of the spectrum, while the
# x-label below speaks of categories 1 to 100 - confirm that dropping index 1 is intended.
k = (site_afs / ns)[2:]

# Plot the spectrum into the first panel of a 2x2 grid with interactive mode off.
plt.ioff()
plt.subplot(2, 2, 1)
plt.stairs(k)
plt.xlabel("The categories from 1 to 100")
# NOTE(review): values were divided by ns, so they are fractions although the label says "Count".
plt.ylabel("Count")
plt.title("Allele frequency spectrum ")

# Append the spectrum and Tajima's D to the text log.
with open(output_pdf_path, "a") as txt_file:
    stats_text = (f" These are the population statistics related to this data : - \n"
                  f" The allele frequency numbers are :-  \n{k}\n"
                  f"1. The Tajima'D for the given sequence is {nts.Tajimas_D()}\n ")
    txt_file.write(stats_text)

plt.savefig(f"output/output_selection/plots/allele_frequency_spectrum_{selection_coefficient}.jpg")


In [None]:
# Append one tab-separated row (FASTA path, selection coefficient, Tajima's D,
# diversity, segregating sites) to the statistics table.
meta_file_statistics = "output/output_selection/meta_data/meta_Stats.tab"

with open(meta_file_statistics, "a") as stats_table:
    row_fields = (fasta_file_path, selection_coefficient, nts.Tajimas_D(), nts.diversity(), ns)
    stats_table.write("\t".join(f"{field}" for field in row_fields) + "\n")




In [None]:
# Windowed, span-normalised site frequency spectrum over equally sized windows,
# saved to a text file for this selection coefficient.
num_windows = 100
window_edges = np.linspace(0, ts.sequence_length, num_windows + 1)
k = nts.allele_frequency_spectrum(
    windows=window_edges,
    mode="site",
    polarised=True,
    span_normalise=True,
)

file_path = f"output/output_selection/arrays/array_data_{selection_coefficient}.txt"

np.savetxt(file_path, k)