## New training data

## Step 1: Export new training data for tasks as separate files

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from utils import data_dir
import os

from tqdm import tqdm

2023-10-29 01:59:17.287016: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Task whose cloud is exported below; get_data documents the accepted
# values as "scalar1", "axion1", or "axion2".
task_name = "axion2"

In [3]:
def data_split(X, Y, n_test):
    """
    Split X and Y into train and test portions, preserving order.

    The last ``n_test`` examples form the test set; all examples before
    them form the training set. No shuffling is done here.

    Parameters
    ----------
    X, Y : sequences sliceable along their leading dimension (e.g. ndarrays
        where the leading dimension indexes examples). Must have equal length.
    n_test : int
        Number of trailing examples to hold out. Must satisfy
        ``0 <= n_test < len(X)``.

    Returns
    -------
    A 4-tuple ``(X_train, X_test, Y_train, Y_test)``.

    Raises
    ------
    AssertionError
        If X and Y differ in length, or n_test is negative or not smaller
        than the number of examples.
    """
    assert len(X) == len(Y), "X and Y should have same length."
    # A negative n_test would move the split point past the end of the data
    # and silently yield an empty test set, so reject it explicitly.
    assert n_test >= 0, "n_test should be non-negative."
    assert n_test < len(X), "n_test should comprise less than total."

    # Index of the first test example.
    train = len(X) - n_test

    X_train, Y_train = X[:train], Y[:train]
    X_test, Y_test = X[train:], Y[train:]

    return (X_train, X_test,
            Y_train, Y_test)

In [4]:
def get_data(task):
    """
    Load the pi0, gamma and ``task`` clouds, shuffle them together, and
    split the result into train and test sets.

    Parameters
    ----------
    task : str
        Should be one of "scalar1", "axion1", or "axion2"; selects the
        third cloud file, ``{task}_cloud.npy``.

    Returns
    -------
    A 4-tuple ``(X_train, X_test, Y_train, Y_test)`` as produced by
    ``data_split``, with 30% of the examples held out as the test set.
    Y holds the ``to_categorical`` encoding of the class labels
    0 (pi0), 1 (gamma) and 2 (task).

    Raises
    ------
    AssertionError
        If the concatenated clouds do not contain exactly 3 * N examples.
    """
    cloud_paths = ["pi0_cloud.npy", "gamma_cloud.npy", f"{task}_cloud.npy"]

    print("Fetching all clouds...")
    X = np.concatenate([
        np.load(f"{data_dir}/processed/{path}")
        for path in cloud_paths
    ], axis=0)

    N = 100000  # Size of each dataset
    # Only the total is checked; the labels below additionally assume each
    # individual file holds exactly N examples -- TODO confirm.
    assert len(X) == 3 * N, f"Expected {3 * N} examples, got {len(X)}."

    Y = to_categorical((0,) * N + (1,) * N + (2,) * N)

    # Scramble X and Y in the same order. The permutation must be drawn from
    # the seeded generator (not the global np.random state) so that the
    # exported train/test split is identical on every run.
    print("Scrambling order...")
    rng = np.random.default_rng(0)
    permutation = rng.permutation(3 * N)
    X = X[permutation]
    Y = Y[permutation]

    n_test = round(0.3 * 3 * N)

    return data_split(X, Y, n_test=n_test)

In [5]:
# Load, shuffle and split the clouds for the selected task (takes ~40 sec).
X_train, X_test, Y_train, Y_test = get_data(task_name)

Fetching all clouds...
Scrambling order...


## Alternative: save each split as a `tf.data.Dataset`

In [6]:
# Check whether TensorFlow is running in eager mode (the result is shown
# as the cell output).
tf.executing_eagerly()

True

In [7]:
# Write each split to its own tf.data.Dataset directory under
# {data_dir}/processed/tf_dataset/{task_name}/, named after the split.
splits = {
    "X_train": X_train,
    "Y_train": Y_train,
    "X_test": X_test,
    "Y_test": Y_test,
}
for name, arr in splits.items():
    dataset = tf.data.Dataset.from_tensor_slices(arr)
    dataset.save(f"{data_dir}/processed/tf_dataset/{task_name}/{name}")

2023-10-29 02:00:12.348761: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 6451200000 exceeds 10% of free system memory.
2023-10-29 02:00:16.702277: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 6451200000 exceeds 10% of free system memory.
2023-10-29 02:00:35.694473: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2764800000 exceeds 10% of free system memory.
2023-10-29 02:00:37.409059: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2764800000 exceeds 10% of free system memory.
