## iso_dadi.py -- fit a two-population isolation (no-migration, exponential
## growth) model to a joint site-frequency spectrum with dadi.
import scipy
import numpy
import dadi
import matplotlib
import matplotlib.pyplot as pyplot
import sys
import re
import pylab
from dadi import Numerics, PhiManip, Integration, Spectrum
## ---- command-line arguments ----
## p1, p2: population ids (must match entries in the ids file)
## n1, n2: 2n sample sizes used for the initial projection
## vv:     path to the filtered VCF file
p1, p2, n1, n2, vv = sys.argv[1:6]
ns = numpy.array([int(n1), int(n2)])
## read data from filtered vcf and create downsampled/projected 2d sfs;
## the second argument maps VCF sample names to population ids
dd = dadi.Misc.make_data_dict_vcf(vv,"/uufs/chpc.utah.edu/common/home/gompert-group3/projects/timema_host_ri/dadi/ids.txt")
## polarized=False -> folded spectrum (no ancestral-state information assumed)
fs = dadi.Spectrum.from_data_dict(dd, [p1, p2], projections=ns, polarized=False)
## downsample to 80% of the requested sizes
downs = ns * 0.8
dns = numpy.array([int(downs[0]), int(downs[1])])
fs = fs.project(dns)
## output names encode the population pair and the "iso" model tag
of = "dd_80_"+p1+"_"+p2+"_iso.txt"
pngf = "dd_80_"+p1+"_"+p2+"_iso.png"
ofile = open(of, "w")
## after projection, take the sample sizes from the spectrum itself
ns = fs.sample_sizes
## define split model, similar to dadi website, includes exponential growth
def split(params, ns, pts):
    """Isolation model: an ancestral population splits and the two daughter
    populations change size exponentially with no migration between them.

    params: (s, nu1, nu2, T) -- s is the fraction of the ancestral
        population founding pop 1, nu1/nu2 the final relative sizes, and
        T the time since the split (in 2N generations).
    ns: sample sizes for the output spectrum.
    pts: grid size for the numerical integration.
    Returns the expected joint site-frequency spectrum.
    """
    s, nu1, nu2, T = params
    grid = Numerics.default_grid(pts)
    phi = PhiManip.phi_1D(grid)
    phi = PhiManip.phi_1D_to_2D(grid, phi)

    # Exponential size trajectories from the founding fractions (s, 1-s)
    # at t=0 to the final sizes (nu1, nu2) at t=T.
    def size1(t):
        return s * (nu1 / s) ** (t / T)

    def size2(t):
        return (1 - s) * (nu2 / (1 - s)) ** (t / T)

    # Integrate forward with no gene flow in either direction.
    phi = Integration.two_pops(phi, grid, T, size1, size2, m12=0, m21=0)
    return Spectrum.from_phi(phi, ns, (grid, grid))
# Grid point settings used for extrapolation: the smallest grid is a bit
# bigger than the mean (projected) sample size, as dadi recommends.
mnss = int(downs.mean())
pts_l = [mnss, mnss+10, mnss+20]
func = split
# Bounds for optimization. The optimizer occasionally tries wacky values;
# exclude very long times, tiny populations, and extreme split fractions,
# which are slow to evaluate. Parameters are: (s, nu1, nu2, T).
upper_bound = [.9, 10, 10, 10]
lower_bound = [.1, 0.1, 0.1, 0]
# Initial (somewhat arbitrary) parameter guess.
p0 = [0.5, 0.8, 1.2, 0.4]
# Make the extrapolating version of our demographic model function.
func_ex = dadi.Numerics.make_extrap_log_func(func)
# NOTE(fix): the output file was already opened above; re-opening it here
# truncated it and leaked the first handle, so the duplicate open was removed.
# Also print the output *filename* (the original print(ofile) printed the
# file handle's repr, which was clearly a debug artifact).
print(of)
# First output line: the observed Fst between the two populations.
ofile.write("{0}".format(fs.Fst()))
ofile.write("\n")
## initial defaults for tracking the best fit across all optimization rounds
ll_opt = -999999999999
theta = 1.0
mpopt = p0
## ---- Round 1: 20 searches from perturbed starting points (maxiter=20) ----
for x in range(20):
    # Perturb our parameters before optimization. This does so by taking each
    # parameter up to a factor of two up or down (fold=1), so each search
    # starts from a different point within the bounds.
    p0 = dadi.Misc.perturb_params(p0, fold=1, upper_bound=upper_bound,lower_bound=lower_bound)
    # Do the optimization. By default theta is a free parameter (profiled
    # out); add multinom=False to fix it. maxiter restricts how long the
    # optimizer runs per start -- kept low here for this coarse first round.
    popt = dadi.Inference.optimize_log(p0, fs, func_ex, pts_l, lower_bound=lower_bound, upper_bound=upper_bound,verbose=1, maxiter=20)
    # Evaluate the fitted model and its multinomial log-likelihood.
    model = func_ex(popt, dns, pts_l)
    ll_model = dadi.Inference.ll_multinom(model,fs)
    if(ll_model > ll_opt):
        # New best fit so far: record likelihood, parameters, and theta.
        ll_opt = ll_model
        mpopt = popt
        theta = dadi.Inference.optimal_sfs_scaling(model,fs)
## write round 1 max: log-likelihood, theta, then (s, nu1, nu2, T)
ofile.write("{0}".format(ll_opt))
ofile.write(" {0}".format(theta))
for a in range(4):
    ofile.write(" {0}".format(mpopt[a]))
ofile.write("\n")
## start round 2 at the round-1 optimum
p0 = mpopt
## ---- Round 2: 10 searches seeded at the round-1 optimum (maxiter=30) ----
for x in range(10):
    # Perturb our parameters before optimization. This does so by taking each
    # parameter up to a factor of two up or down (fold=1).
    p0 = dadi.Misc.perturb_params(p0, fold=1, upper_bound=upper_bound,lower_bound=lower_bound)
    # Same multinomial optimization as round 1, but with a larger iteration
    # budget (maxiter=30) for better convergence near the optimum.
    popt = dadi.Inference.optimize_log(p0, fs, func_ex, pts_l, lower_bound=lower_bound, upper_bound=upper_bound,verbose=1, maxiter=30)
    # Evaluate the fitted model and its multinomial log-likelihood.
    model = func_ex(popt, dns, pts_l)
    ll_model = dadi.Inference.ll_multinom(model,fs)
    if(ll_model > ll_opt):
        # New overall best: record likelihood, parameters, and theta.
        ll_opt = ll_model
        mpopt = popt
        theta = dadi.Inference.optimal_sfs_scaling(model,fs)
## write round 2 max: log-likelihood, theta, then (s, nu1, nu2, T)
ofile.write("{0}".format(ll_opt))
ofile.write(" {0}".format(theta))
for a in range(4):
    ofile.write(" {0}".format(mpopt[a]))
ofile.write("\n")
## start round 3 at the current overall optimum
p0 = mpopt
mtheta = theta
## ---- Round 3: 5 fine searches (maxiter=50); every result is logged ----
for x in range(5):
    # Perturb our parameters before optimization. This does so by taking each
    # parameter up to a factor of two up or down (fold=1).
    p0 = dadi.Misc.perturb_params(p0, fold=1, upper_bound=upper_bound,lower_bound=lower_bound)
    # Final-round optimization with the largest iteration budget (maxiter=50).
    popt = dadi.Inference.optimize_log(p0, fs, func_ex, pts_l, lower_bound=lower_bound, upper_bound=upper_bound,verbose=1, maxiter=50)
    # Evaluate the fitted model and its multinomial log-likelihood.
    model = func_ex(popt, dns, pts_l)
    ll_model = dadi.Inference.ll_multinom(model,fs)
    if(ll_model > ll_opt):
        # New overall best: record likelihood, parameters, and theta (mtheta).
        ll_opt = ll_model
        mpopt = popt
        mtheta = dadi.Inference.optimal_sfs_scaling(model,fs)
    ## write each round's result (not just the best), still inside the loop:
    ## log-likelihood, theta, then this round's (s, nu1, nu2, T)
    theta = dadi.Inference.optimal_sfs_scaling(model,fs)
    ofile.write("{0}".format(ll_model))
    ofile.write(" {0}".format(theta))
    for a in range(4):
        ofile.write(" {0}".format(popt[a]))
    ofile.write("\n")
## write round 3 max... this is the answer: best ll, theta, (s, nu1, nu2, T)
ofile.write("{0}".format(ll_opt))
ofile.write(" {0}".format(mtheta))
for a in range(4):
    ofile.write(" {0}".format(mpopt[a]))
ofile.write("\n")
## recompute the expected SFS at the overall best parameters and plot the
## model-vs-data 2D comparison (with residuals) to the PNG named above
model = func_ex(mpopt, dns, pts_l)
pylab.figure(figsize=(8,6))
dadi.Plotting.plot_2d_comp_multinom(model,fs,vmin=1,resid_range=10,show=False)
pylab.savefig(pngf, dpi=400)
ofile.close()