In [1]:
import torch
import random

# Single seed shared by every random number generator configured in this cell.
random_seed = 42

# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting
# it here presumably only affects child processes, not this kernel — confirm.
import os
os.environ['PYTHONHASHSEED']=str(random_seed)

# 2. Set the `python` built-in pseudo-random generator at a fixed value
import random
random.seed(random_seed)

# 3. Set the `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(random_seed)

# 4. Set the `tensorflow` pseudo-random generator at a fixed value
import tensorflow as tf
tf.random.set_seed(random_seed)

# Set a random seed for PyTorch (for GPU as well)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    # Seeds every visible CUDA device, not just the current one.
    torch.cuda.manual_seed_all(random_seed)

# Torch device handle: CUDA when available, otherwise CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', DEVICE)

2024-06-06 08:43:28.355651: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-06 08:43:28.404010: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.




Using device: cuda


In [2]:
from pan21_functions import *
import datetime
from keras.callbacks import CSVLogger
from pathlib import Path

In [3]:
# Pan21PyDataset("pan21/train", "pan21/train", "train_ds_uncompressed").to_file()
# Pan21PyDataset("pan21/validation", "pan21/validation", "val_ds_uncompressed").to_file()

In [4]:
# Make sure the files are good
def test_load(path, name):
    np.load(path)[name]

# _ = Parallel(n_jobs=-1)(delayed(test_load)(Path("train_ds") / f"{i}.npz", "batch_x") for i in tqdm(range(350)))

In [5]:
# Pan21FourierDataset("pan21/train", "pan21/train", "train_ds_uncompressed", num_fourier_features=512).to_file()
# Pan21FourierDataset("pan21/validation", "pan21/validation", "val_ds_uncompressed", num_fourier_features=512).to_file()

In [6]:
# _ = Parallel(n_jobs=-1)(delayed(test_load)(Path("train_ds") / "fourier" / f"{i}.npz", "fourier_batch_x") for i in tqdm(range(350)))

In [7]:
# import time

# start_idx = 0
# limit = 5
# # limit = len(train_ds)

# before = time.time()
# for i in range(start_idx, start_idx+limit):
#     batch_x, batch_y = train_ds.__getitem__(i, force_compute=True)
# after = time.time()
# for i in range(start_idx, start_idx+limit):
#     batch_x, batch_y = train_ds.__getitem__(i, force_compute=False)
# after_after = time.time()

# print(f"Compute: {round((after - before)/limit, 2)}s vs File read: {round((after_after - after)/limit, 2)}s")
# # Compute: 8.95s per batch
# # Compute with compression: ~17s
# # Uncompressed read: .7s per batch
# # Compressed read: 1.6s per batch
# # Compressed is 1/10 the size of uncompressed, but takes ~twice as long to precompute and save
# # Compressed 512D Fourier takes ~30s per batch
# # Compressed 512D Fourier is about 500-700MB per batch

In [8]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, Input, Flatten
from tensorflow.keras import optimizers, losses, metrics

# Builds the sequence classifier (named "rnn", but it stacks only Dense and Flatten layers)
def create_rnn_model(num_labels, embedding_dim, max_input_length):
    """Assemble and compile the Keras classifier used by the experiments.

    Only Dense and Flatten layers are stacked here; no recurrent layer is
    used, despite the function name.

    Args:
        num_labels: Number of units in the final sigmoid layer.
        embedding_dim: Size of the last axis of each input sample.
        max_input_length: Each input sample has ``2 * max_input_length`` rows.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential()

    layer_stack = [
        Input(shape=(max_input_length*2, embedding_dim)),
        # Dense layers act on the last axis, i.e. per row of the input.
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        # Collapse all rows into one vector before the output layer.
        Flatten(),
        Dense(num_labels, activation='sigmoid'),
    ]
    for layer in layer_stack:
        network.add(layer)

    compile_options = dict(
        optimizer=optimizers.RMSprop(),
        loss=losses.BinaryCrossentropy(),
        # Metrics reported during training and validation.
        metrics=[metrics.BinaryAccuracy(), metrics.AUC()],
        jit_compile=True,
    )
    network.compile(**compile_options)

    return network
 
# Module-level hyper-parameters; train_model_num_ff reads both as globals and
# forwards them to create_rnn_model.
# Width of the sigmoid output layer.
num_labels = 1
# Size of the last input axis — presumably the width of the upstream text
# embeddings; TODO confirm against pan21_functions.
embedding_dim = 768

In [13]:
def train_model_num_ff(train_ds, val_ds, model_name, epochs=5):
    """Train (or resume training) a classifier, checkpointing after every epoch.

    If ``model_name`` already contains ``*.weights.h5`` checkpoints, the most
    recently written one is loaded and training continues from the matching
    epoch, so checkpoint file names and CSV log rows keep counting up instead
    of restarting at epoch 1.

    Args:
        train_ds: Training data passed to ``model.fit``. Must expose a
            ``max_input_length`` attribute, which sizes the model input.
        val_ds: Validation data passed to ``model.fit``.
        model_name: Directory (``pathlib.Path`` or ``str``) receiving the
            per-epoch weight checkpoints. The CSV log and the final ``.keras``
            file are written beside it, using its path as a file-name prefix.
        epochs: Total number of epochs the model should end up trained for,
            including epochs already covered by existing checkpoints.

    Returns:
        The ``History`` object returned by ``model.fit``, or ``None`` when the
        existing checkpoints already cover ``epochs`` and nothing was trained.
    """
    output_dir = Path(model_name)
    max_input_length = train_ds.max_input_length

    print(model_name)
    time_string = datetime.datetime.now().strftime('%Y_%m_%d-%I_%M_%S_%p')

    # parents=True so a fresh experiment directory tree is created on demand.
    output_dir.mkdir(parents=True, exist_ok=True)

    checkpoint_name_format = time_string + "_cp-{epoch:02d}.weights.h5"
    checkpoint_path = output_dir / checkpoint_name_format
    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        checkpoint_path,
        verbose=1,
        save_weights_only=True,
        # Save weights, every epoch.
        save_freq='epoch')

    model = create_rnn_model(num_labels, embedding_dim, max_input_length)
    csv_logger = CSVLogger(f'{model_name}_{time_string}.log', separator=',', append=False)

    # One checkpoint is written per epoch, so the number of checkpoints is the
    # number of epochs already trained.
    checkpoints = list(output_dir.glob('*.weights.h5'))
    already_trained_epochs = len(checkpoints)
    if checkpoints:
        # glob() order is arbitrary and the 12-hour timestamp prefix does not
        # sort chronologically, so pick the newest file by modification time.
        latest_checkpoint = max(checkpoints, key=lambda p: p.stat().st_mtime)
        model.load_weights(str(latest_checkpoint))

    history = None
    if already_trained_epochs < epochs:
        with tf.device("/device:GPU:0"):
            history = model.fit(
                train_ds,
                # Resume the epoch counter so checkpoint names and log rows
                # continue from where the previous run stopped.
                initial_epoch=already_trained_epochs,
                epochs=epochs,
                validation_data=val_ds,
                verbose=1,
                callbacks=[csv_logger, cp_callback]
            )
    else:
        print(f"{already_trained_epochs} epoch(s) already trained; nothing left to train.")

    model.save(f"{model_name}.keras")

    return history

In [None]:
# Does using the frequency-domain spectra provide useful information?
model_dir = Path("models/num_fourier_features")
# exist_ok avoids the separate existence check (and its race) before creating.
model_dir.mkdir(parents=True, exist_ok=True)

# Sweep over the number of Fourier features passed to the dataset.
num_fourier_features = [0, 512//4, 512//2, 512]
for num_ff in tqdm(num_fourier_features):
    fourier_train_ds = Pan21FourierDataset("pan21/train", "pan21/train", "train_ds_uncompressed", num_fourier_features=num_ff)
    fourier_val_ds = Pan21FourierDataset("pan21/validation", "pan21/validation", "val_ds_uncompressed", num_fourier_features=num_ff)

    # Timestamped directory name, so every sweep run writes to a fresh location.
    model_name = model_dir / f"{num_ff}_{datetime.datetime.now().strftime('%Y_%m_%d-%I_%M_%S_%p')}"

    train_model_num_ff(fourier_train_ds, fourier_val_ds, model_name)

In [None]:
# How many epochs should be trained?
model_dir = Path("models/num_epochs")
# exist_ok avoids the separate existence check (and its race) before creating.
model_dir.mkdir(parents=True, exist_ok=True)

num_fourier_features = [0, 512//4]
for num_ff in tqdm(num_fourier_features):
    # train_model_num_ff requires the training set, the validation set and an
    # output directory; build them here the same way the feature sweep does.
    fourier_train_ds = Pan21FourierDataset("pan21/train", "pan21/train", "train_ds_uncompressed", num_fourier_features=num_ff)
    fourier_val_ds = Pan21FourierDataset("pan21/validation", "pan21/validation", "val_ds_uncompressed", num_fourier_features=num_ff)

    model_name = model_dir / f"{num_ff}_{datetime.datetime.now().strftime('%Y_%m_%d-%I_%M_%S_%p')}"

    train_model_num_ff(fourier_train_ds, fourier_val_ds, model_name, epochs=15)

In [15]:
# Reload the helper module under the alias p21 and show the reloaded module.
import importlib
import pan21_functions as p21
importlib.reload(p21)

<module 'pan21_functions' from '/home/ubuntu/tar/pan21_functions.py'>

In [16]:
# Does filtering help?
model_dir = Path("models/cutoffs")
# exist_ok avoids the separate existence check (and its race) before creating.
model_dir.mkdir(parents=True, exist_ok=True)

N = 4 # High pass, band stop, band stop, low pass
# Frequencies represented as a fraction of the Nyquist frequency:
# N equal-width (low, high) pairs covering 0.0 to 1.0.
cutoff_frequencies = [(i / N, (i + 1) / N) for i in range(N)]
for cutoff in tqdm(cutoff_frequencies):
    filter_train_ds = p21.Pan21FourierFilterDataset("pan21/train", "pan21/train", cutoff)
    filter_val_ds = p21.Pan21FourierFilterDataset("pan21/validation", "pan21/validation", cutoff)

    # No timestamp in the directory name, so re-running this cell resumes
    # from the checkpoints already saved for the same cutoff pair.
    model_name = model_dir / f"{cutoff[0]}_{cutoff[1]}"

    train_model_num_ff(filter_train_ds, filter_val_ds, model_name, epochs=3)

  0%|          | 0/4 [00:00<?, ?it/s]

models/cutoffs/0.0_0.25
Epoch 1/3
[1m 487/2186[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m34:15[0m 1s/step - auc_3: 0.5228 - binary_accuracy: 0.5374 - loss: 0.7037




[1m2185/2186[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m1s[0m 1s/step - auc_3: 0.5837 - binary_accuracy: 0.5689 - loss: 0.6764
Epoch 1: saving model to models/cutoffs/0.0_0.25/2024_06_06-08_48_44_AM_cp-01.weights.h5
[1m2186/2186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3396s[0m 2s/step - auc_3: 0.5837 - binary_accuracy: 0.5690 - loss: 0.6764 - val_auc_3: 0.7190 - val_binary_accuracy: 0.6515 - val_loss: 0.6172
Epoch 2/3


2024-06-06 09:45:33.350853: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:12: Filling up shuffle buffer (this may take a while): 7 of 8


[1m   1/2186[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m7:41:26[0m 13s/step - auc_3: 0.6727 - binary_accuracy: 0.6250 - loss: 0.6488

2024-06-06 09:45:34.840486: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:480] Shuffle buffer filled.


[1m2185/2186[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m1s[0m 1s/step - auc_3: 0.7193 - binary_accuracy: 0.6596 - loss: 0.6095
Epoch 2: saving model to models/cutoffs/0.0_0.25/2024_06_06-08_48_44_AM_cp-02.weights.h5
[1m2186/2186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3251s[0m 1s/step - auc_3: 0.7193 - binary_accuracy: 0.6596 - loss: 0.6095 - val_auc_3: 0.7363 - val_binary_accuracy: 0.6577 - val_loss: 0.6026
Epoch 3/3
[1m 167/2186[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m42:20[0m 1s/step - auc_3: 0.7695 - binary_accuracy: 0.6986 - loss: 0.5684