add seed option to lc sim functions

ywx649999311 · Dec 12, 2023 · e7e3b8c · e7e3b8c
1 parent 3646947
commit e7e3b8c
Show file tree

Hide file tree

Showing 4 changed files with 140 additions and 17 deletions.
diff --git a/src/eztao/ts/carma_sim.py b/src/eztao/ts/carma_sim.py
@@ -12,7 +12,27 @@
 __all__ = ["gpSimFull", "gpSimRand", "gpSimByTime", "addNoise", "pred_lc"]
 
 
-def gpSimFull(carmaTerm, SNR, duration, N, nLC=1, log_flux=True):
+def gp_sample(gp, size=None, seed=None):
+    """Sample Celerite GP with a fixed seed"""
+
+    if seed is not None:
+        rng = np.random.default_rng(seed=seed)
+    else:
+        rng = np.random.default_rng()
+
+    gp._recompute()
+    if size is None:
+        n = rng.standard_normal(len(gp._t))
+    else:
+        n = rng.standard_normal((len(gp._t), size))
+
+    n = gp.solver.dot_L(n)
+    if size is None:
+        return gp.mean.get_value(gp._t) + n[:, 0]
+    return gp.mean.get_value(gp._t)[None, :] + n.T
+
+
+def gpSimFull(carmaTerm, SNR, duration, N, nLC=1, log_flux=True, lc_seed=None):
     """
     Simulate CARMA time series using uniform sampling.
 
@@ -25,6 +45,7 @@ def gpSimFull(carmaTerm, SNR, duration, N, nLC=1, log_flux=True):
         nLC (int, optional): Number of time series to simulate. Defaults to 1.
         log_flux (bool): Whether the flux/y values are in astronomical magnitude.
             This argument affects how errors are assigned. Defaults to True.
+        lc_seed (int): Random seed for time series simulation. Defaults to None.
 
     Raises:
         RuntimeError: If the input CARMA term/model is not stable, thus cannot be
@@ -44,9 +65,14 @@ def gpSimFull(carmaTerm, SNR, duration, N, nLC=1, log_flux=True):
             "The covariance matrix of the provided CARMA term is not positive definite!"
         )
 
+    if lc_seed is not None:
+        rng = np.random.default_rng(seed=lc_seed)
+    else:
+        rng = np.random.default_rng()
+
     t = np.linspace(0, duration, N)
     noise = carmaTerm.get_rms_amp() / SNR
-    yerr = np.random.lognormal(0, 0.25, N) * noise
+    yerr = rng.lognormal(0, 0.25, N) * noise
     yerr = yerr[np.argsort(np.abs(yerr))]  # small->large
 
     # init GP and solve matrix
@@ -55,7 +81,7 @@ def gpSimFull(carmaTerm, SNR, duration, N, nLC=1, log_flux=True):
 
     # simulate, assign yerr based on y
     t = np.repeat(t[None, :], nLC, axis=0)
-    y = gp_sim.sample(size=nLC)
+    y = gp_sample(gp_sim, size=nLC, seed=lc_seed)
 
     # format yerr making it heteroscedastic
     yerr = np.repeat(yerr[None, :], nLC, axis=0)
@@ -77,7 +103,16 @@ def gpSimFull(carmaTerm, SNR, duration, N, nLC=1, log_flux=True):
 
 
 def gpSimRand(
-    carmaTerm, SNR, duration, N, nLC=1, log_flux=True, season=True, full_N=10_000
+    carmaTerm,
+    SNR,
+    duration,
+    N,
+    nLC=1,
+    log_flux=True,
+    season=True,
+    full_N=10_000,
+    lc_seed=None,
+    downsample_seed=None,
 ):
     """
     Simulate CARMA time series randomly downsampled from a much denser full time series.
@@ -95,12 +130,17 @@ def gpSimRand(
             to True.
         full_N (int, optional): The number of data points in the full time series
             (before downsampling). Defaults to 10_000.
+        lc_seed (int): Random seed for full time series simulation. Defaults to None.
+        downsample_seed (int): Random seed for downsampling the simulated full time
+            series. Defaults to None.
 
     Returns:
         (array(float), array(float), array(float)): Time stamps (default in day), y
         values and measurement errors of the simulated time series.
     """
-    t, y, yerr = gpSimFull(carmaTerm, SNR, duration, full_N, nLC=nLC, log_flux=log_flux)
+    t, y, yerr = gpSimFull(
+        carmaTerm, SNR, duration, full_N, nLC=nLC, log_flux=log_flux, lc_seed=lc_seed
+    )
     t = np.atleast_2d(t)
     y = np.atleast_2d(y)
     yerr = np.atleast_2d(yerr)
@@ -116,7 +156,7 @@ def gpSimRand(
             mask1 = add_season(t[i])
         else:
             mask1 = np.ones(t[i].shape[0], dtype=bool)
-        mask2 = downsample_byN(t[i, mask1], N)
+        mask2 = downsample_byN(t[i, mask1], N, seed=downsample_seed)
         tOut[i, :] = t[i, mask1][mask2]
         yOut[i, :] = y[i, mask1][mask2]
         yerrOut[i, :] = yerr[i, mask1][mask2]
@@ -127,7 +167,7 @@ def gpSimRand(
         return tOut, yOut, yerrOut
 
 
-def gpSimByTime(carmaTerm, SNR, t, factor=10, nLC=1, log_flux=True):
+def gpSimByTime(carmaTerm, SNR, t, factor=10, nLC=1, log_flux=True, lc_seed=None):
     """
     Simulate CARMA time series at desired time stamps.
 
@@ -147,6 +187,7 @@ def gpSimByTime(carmaTerm, SNR, t, factor=10, nLC=1, log_flux=True):
         nLC (int, optional): Number of time series to simulate. Defaults to 1.
         log_flux (bool): Whether the flux/y values are in astronomical magnitude.
             This argument affects how errors are assigned. Defaults to True.
+        lc_seed (int): Random seed for time series simulation. Defaults to None.
 
     Returns:
         (array(float), array(float), array(float)): Time stamps (default in day), y
@@ -158,7 +199,7 @@ def gpSimByTime(carmaTerm, SNR, t, factor=10, nLC=1, log_flux=True):
 
     # simulate full LC
     tFull, yFull, yerrFull = gpSimFull(
-        carmaTerm, SNR, duration, N=N, nLC=nLC, log_flux=log_flux
+        carmaTerm, SNR, duration, N=N, nLC=nLC, log_flux=log_flux, lc_seed=lc_seed
     )
     tFull = np.atleast_2d(tFull)
     yFull = np.atleast_2d(yFull)
@@ -177,20 +218,27 @@ def gpSimByTime(carmaTerm, SNR, t, factor=10, nLC=1, log_flux=True):
         return tOut, yOut, yerrOut
 
 
-def addNoise(y, yerr):
+def addNoise(y, yerr, seed=None):
     """
     Add (gaussian) noise to the input simulated time series given the measurement uncertainties.
 
     Args:
         y (array(float)): The 'clean' time series.
         yerr (array(float)): The measurement uncertainties for the input
             time series.
+    seed (int): Random seed for simulating noise. Defaults to None.
 
     Returns:
         array(float): A new time series with simulated (gaussian ) noise added
         on top.
     """
-    vec_norm = np.vectorize(np.random.normal, signature="(n),(n)->(n)")
+
+    if seed is not None:
+        rng = np.random.default_rng(seed=seed)
+    else:
+        rng = np.random.default_rng()
+
+    vec_norm = np.vectorize(rng.normal, signature="(n),(n)->(n)")
     noise = vec_norm(np.zeros_like(y), yerr)
 
     return y + noise

diff --git a/src/eztao/ts/utils.py b/src/eztao/ts/utils.py
@@ -9,21 +9,28 @@
 __all__ = ["downsample_byN", "add_season", "downsample_byTime", "median_clip"]
 
 
-def downsample_byN(t, nObs):
+def downsample_byN(t, nObs, seed):
     """
     Randomly choose N data points from a given time series.
 
     Args:
         t (array(float)): Time stamps of the original time series.
         N (int): The number of entries in the final time series.
+        seed (int): Random seed for downsampling. Defaults to None.
 
     Returns:
         A 1d array booleans indicating which data points to keep.
     """
+
+    if seed is not None:
+        rng = np.random.default_rng(seed=seed)
+    else:
+        rng = np.random.default_rng()
+
     # random choose index
     idx = np.arange(len(t))
     mask = np.zeros_like(idx, dtype=np.bool_)
-    true_idx = np.random.choice(idx, nObs, replace=False)
+    true_idx = rng.choice(idx, nObs, replace=False)
 
     # assign chosen index to 1/True
     mask[true_idx] = 1

diff --git a/tests/test_CARMAFit.py b/tests/test_CARMAFit.py
@@ -141,7 +141,9 @@ def test_drwFit():
 
 
 def test_dhoFit():
-    t1, y1, yerr1 = gpSimRand(dho1, 100, 365 * 10.0, 200, nLC=100, season=False)
+    t1, y1, yerr1 = gpSimRand(
+        dho1, 100, 365 * 10.0, 200, nLC=100, season=False, lc_seed=10
+    )
     best_fit_dho1 = np.array(
         Parallel(n_jobs=-1)(
             delayed(dho_fit)(t1[i], y1[i], yerr1[i]) for i in range(len(t1))
@@ -156,7 +158,9 @@ def test_dhoFit():
     assert np.percentile(diff1, 75) < 0.25
 
     # the second test will down scale lc by 1e6
-    t2, y2, yerr2 = gpSimRand(dho2, 100, 365 * 10.0, 200, nLC=100, season=False)
+    t2, y2, yerr2 = gpSimRand(
+        dho2, 100, 365 * 10.0, 200, nLC=100, season=False, lc_seed=10
+    )
     best_fit_dho2 = np.array(
         Parallel(n_jobs=-1)(
             delayed(dho_fit)(t2[i], y2[i] / 1e6, yerr2[i] / 1e6) for i in range(len(t2))
@@ -172,7 +176,9 @@ def test_dhoFit():
 
 
 def test_carmaFit():
-    t1, y1, yerr1 = gpSimRand(carma30, 200, 365 * 10.0, 1000, nLC=150, season=False)
+    t1, y1, yerr1 = gpSimRand(
+        carma30, 200, 365 * 10.0, 1000, nLC=150, season=False, lc_seed=10
+    )
     best_fit_carma1 = np.array(
         Parallel(n_jobs=-1)(
             delayed(carma_fit)(t1[i], y1[i] / 1e-6, yerr1[i] / 1e-6, 3, 0, n_opt=30)
@@ -189,7 +195,9 @@ def test_carmaFit():
     assert np.percentile(diff1, 75) < 0.4
 
     # the second test will down scale lc by 1e6
-    t2, y2, yerr2 = gpSimRand(carma31, 300, 365 * 10.0, 1000, nLC=150, season=False)
+    t2, y2, yerr2 = gpSimRand(
+        carma31, 300, 365 * 10.0, 1000, nLC=150, season=False, lc_seed=10
+    )
     best_fit_carma2 = np.array(
         Parallel(n_jobs=-1)(
             delayed(carma_fit)(t2[i], y2[i], yerr2[i], 3, 1, n_opt=50)

diff --git a/tests/test_CARMASim.py b/tests/test_CARMASim.py
@@ -26,9 +26,30 @@ def test_invalidSim():
         t, y, yerr = gpSimFull(carma_invalid, 20, 365 * 10.0, 10000)
 
 
+def test_simFullSeed():
+    """Test the lc_seed option of gpSimFull."""
+
+    # SNR = 20
+    nLC = 5
+    for kernel in test_kernels:
+        t, y1, yerr1 = gpSimFull(kernel, 20, 365 * 10.0, 1000, nLC=nLC, lc_seed=10)
+        t, y2, yerr2 = gpSimFull(kernel, 20, 365 * 10.0, 1000, nLC=nLC, lc_seed=10)
+
+        # agree between two separate calls
+        for i in range(nLC):
+            assert np.allclose(y1[i], y2[i])
+            assert np.allclose(yerr1[i], yerr2[i])
+
+        # disagree between different simulations from one call
+        assert not np.allclose(y1[0], y1[nLC - 1])
+        assert not np.allclose(y1[1], y1[nLC - 2])
+        assert not np.allclose(yerr1[0], yerr1[nLC - 1])
+        assert not np.allclose(yerr1[1], yerr1[nLC - 2])
+
+
 def test_simRand():
     """Test function gpSimRand."""
-    # SNR = 10
+    # SNR = 20
     for kernel in test_kernels:
         t, y, yerr = gpSimRand(kernel, 20, 365 * 10.0, 150, nLC=100, season=False)
         log_amp = np.log(kernel.get_rms_amp())
@@ -53,6 +74,26 @@ def test_simRand():
     assert (np.argsort(yF) == np.argsort(-yerrF)).all()
 
 
+def test_simRandSeed():
+    """Test the downsample_seed option of gpSimRand."""
+
+    # SNR = 20
+    nLC = 1
+    for kernel in test_kernels:
+        t1, y1, yerr1 = gpSimRand(
+            kernel, 20, 365 * 10.0, 150, nLC=nLC, lc_seed=10, downsample_seed=10
+        )
+        t2, y2, yerr2 = gpSimRand(
+            kernel, 20, 365 * 10.0, 150, nLC=nLC, lc_seed=10, downsample_seed=10
+        )
+        t3, y3, yerr3 = gpSimRand(
+            kernel, 20, 365 * 10.0, 150, nLC=nLC, lc_seed=10, downsample_seed=20
+        )
+
+        assert np.allclose(t1, t2)
+        assert not np.allclose(t1, t3)
+
+
 def test_simByTime():
     """Test function gpSimByTime."""
     t = np.sort(np.random.uniform(0, 3650, 5000))
@@ -72,3 +113,22 @@ def test_simByTime():
     # test single LC simulation
     tOut, yOut, yerrOut = gpSimByTime(dho2, SNR, t, nLC=1)
     assert tOut.shape[0] == yOut.shape[0] == yerrOut.shape[0] == t.shape[0]
+
+
+def test_addNoiseSeed():
+    """Test the seed option of addNoise."""
+
+    nLC = 1
+    t1, y1, yerr1 = gpSimRand(
+        test_kernels[-1], 20, 365 * 10.0, 150, nLC=nLC, lc_seed=10, downsample_seed=10
+    )
+    t2, y2, yerr2 = gpSimRand(
+        test_kernels[-1], 20, 365 * 10.0, 150, nLC=nLC, lc_seed=10, downsample_seed=10
+    )
+
+    noise_y1 = addNoise(y1, yerr1, seed=10)
+    noise_y2 = addNoise(y2, yerr2, seed=10)
+    noise_y3 = addNoise(y1, yerr1, seed=20)
+
+    assert np.allclose(noise_y1, noise_y2)
+    assert not np.allclose(noise_y1, noise_y3)