TODO: Split the data into train, validation, and test sets.

In [1]:
# Location of the preprocessed data that is handed to SimulationDataset.
# NOTE(review): this is a relative path, so it resolves against the kernel's
# current working directory — confirm the notebook is launched from the expected folder.
data_dir = "../nuclear-fusion/data/preprocessed/"

In [2]:
# Import the project-local dataset class and instantiate it on `data_dir`.
# Later cells read its `timesteps` and `cumulative_lengths` attributes, call
# `calculate_p_t(omega=...)`, and index it to obtain an (output, forcing) pair.
from dataset import SimulationDataset
simulationdataset = SimulationDataset(data_dir)

In [3]:
# Batch sampler implementing Time Adjusted Sampling (see Appendix B).
from sampler import TimeAdjustedSampler

sampler = TimeAdjustedSampler(
    simulationdataset,
    batch_size=16,
    omega=20,
)

In [4]:
# Wrap the dataset in a DataLoader; batch composition is delegated to `sampler`
# through `batch_sampler`, and loading runs in 4 worker processes.
from torch.utils.data import DataLoader

dataloader = DataLoader(
    simulationdataset,
    batch_sampler=sampler,
    num_workers=4,
)

In [5]:
# Smoke test: pull a single batch from the dataloader and stop immediately.
# NOTE(review): the recorded run of this cell fails with a ValueError raised by
# np.searchsorted inside the dataset's __getitem__ (dataset.py) — presumably the
# batch sampler yields entries that are not plain integer indices; confirm
# against sampler.py / dataset.py before relying on this loader.
for batch in dataloader:
    break

ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/coder/.local/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 309, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/home/coder/.local/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/coder/.local/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/coder/persistent/TokamakSimulation/dataset.py", line 123, in __getitem__
    temp_idx = np.searchsorted(self.cumulative_lengths, idx, side='right') - 1
  File "/home/coder/.local/lib/python3.10/site-packages/numpy/_core/fromnumeric.py", line 1534, in searchsorted
    return _wrapfunc(a, 'searchsorted', v, side=side, sorter=sorter)
  File "/home/coder/.local/lib/python3.10/site-packages/numpy/_core/fromnumeric.py", line 54, in _wrapfunc
    return _wrapit(obj, method, *args, **kwds)
  File "/home/coder/.local/lib/python3.10/site-packages/numpy/_core/fromnumeric.py", line 46, in _wrapit
    result = getattr(arr, method)(*args, **kwds)
ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (3,) + inhomogeneous part.


In [6]:
import numpy as np

# Manual walk-through of the sampling strategy:
#   1. draw a simulation length according to p_t,
#   2. uniformly pick one simulation of that length,
#   3. cut it into consecutive, non-overlapping (X, F, Y) windows.

batch_size = 16
omega = 20

# Available simulation lengths and the distribution used to choose between them
timesteps = simulationdataset.timesteps
p_t = simulationdataset.calculate_p_t(omega=omega)

# Draw a position in `timesteps` according to p_t
sample = np.random.choice(len(p_t), p=p_t)
timestep = timesteps[sample]

# Dataset index range [idx_start, idx_end) of the simulations with the sampled length
idx_start = simulationdataset.cumulative_lengths[sample]
idx_end = simulationdataset.cumulative_lengths[sample + 1]

# Uniformly choose one index in that range and load its simulation
# (plain indexing instead of calling the dunder method directly)
idx = np.random.randint(idx_start, idx_end)
output, forcing = simulationdataset[idx]

# Each window spans 2*omega steps: the first omega steps of `output` form X,
# the last omega steps form Y, and F is the forcing over the whole window.
# Trailing steps that do not fill a complete window are dropped.
window = 2 * omega
batches = []
for t_start in range(0, output.shape[0] - window + 1, window):
    t_middle = t_start + omega
    t_end = t_start + window

    X = output[t_start:t_middle,:,:]
    F = forcing[t_start:t_end,:,:]
    Y = output[t_middle:t_end,:,:]

    batches.append([X, F, Y])

print("timesteps available", timesteps)
print("probability distribution from which we sample length: ", p_t)
print(f"Sampled length in timesteps: {timestep} for sample outcome {sample}")
print(f"Correspondings idxs: {idx_start}-{idx_end}")
print("Sampled idx in range: ", idx)
print("Shape of output: ", output.shape)
print("Number of batches taken from simulation: ", len(batches))
# A simulation shorter than 2*omega yields no window; skip the shape print
# instead of failing with an IndexError.
if batches:
    print("\tShape of batch X: ", batches[0][0].shape)

timesteps available [100, 500, 1000]
probability distribution from which we sample length:  [0.0625 0.3125 0.625 ]
Sampled length in timesteps: 1000 for sample outcome 2
Correspondings idxs: 192-288
Sampled idx in range:  275
Shape of output:  (991, 500, 6)
Number of batches taken from simulation:  24
	Shape of batch X:  (20, 500, 6)


In [8]:
# Element 1 of each [X, F, Y] entry is the forcing window F, so label it as F
# (the previous label said "X"). This inspects the second batch.
print("\tShape of batch F: ", batches[1][1].shape)

	Shape of batch X:  (40, 500, 2)
