In [1]:
import torch
import random

# Single seed value passed to every random number generator configured in this cell.
random_seed = 42

# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
# NOTE(review): CPython reads PYTHONHASHSEED when the interpreter starts, so assigning it
# here presumably only affects child processes launched afterwards, not the hash
# randomization of this already-running kernel -- confirm this is the intent.
import os
os.environ['PYTHONHASHSEED']=str(random_seed)

# 2. Set the `python` built-in pseudo-random generator at a fixed value
import random
random.seed(random_seed)

# 3. Set the `numpy` (legacy global) pseudo-random generator at a fixed value
import numpy as np
np.random.seed(random_seed)

# 4. Set the `tensorflow` pseudo-random generator at a fixed value
import tensorflow as tf
tf.random.set_seed(random_seed)
 
# 5. Seed PyTorch, and explicitly seed every CUDA device when CUDA is available
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

# Torch device handle: 'cuda' when PyTorch can see a GPU, otherwise 'cpu'.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', DEVICE)

2024-06-04 18:35:13.477213: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-04 18:35:13.521109: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using device: cuda


In [2]:
from pan21_functions import *
import datetime
from keras.callbacks import CSVLogger
from pathlib import Path

In [3]:
# Pan21PyDataset("pan21/train", "pan21/train", "train_ds_uncompressed").to_file()
# Pan21PyDataset("pan21/validation", "pan21/validation", "val_ds_uncompressed").to_file()

In [4]:
# Make sure the files are good
def test_load(path, name):
    np.load(path)[name]

# _ = Parallel(n_jobs=-1)(delayed(test_load)(Path("train_ds") / f"{i}.npz", "batch_x") for i in tqdm(range(350)))

In [5]:
# Pan21FourierDataset("pan21/train", "pan21/train", "train_ds_uncompressed", num_fourier_features=512).to_file()
# Pan21FourierDataset("pan21/validation", "pan21/validation", "val_ds_uncompressed", num_fourier_features=512).to_file()

In [6]:
# _ = Parallel(n_jobs=-1)(delayed(test_load)(Path("train_ds") / "fourier" / f"{i}.npz", "fourier_batch_x") for i in tqdm(range(350)))

In [7]:
# import time

# start_idx = 0
# limit = 5
# # limit = len(train_ds)

# before = time.time()
# for i in range(start_idx, start_idx+limit):
#     batch_x, batch_y = train_ds.__getitem__(i, force_compute=True)
# after = time.time()
# for i in range(start_idx, start_idx+limit):
#     batch_x, batch_y = train_ds.__getitem__(i, force_compute=False)
# after_after = time.time()

# print(f"Compute: {round((after - before)/limit, 2)}s vs File read: {round((after_after - after)/limit, 2)}s")
# # Compute: 8.95s per batch
# # Compute with compression: ~17s
# # Uncompressed read: .7s per batch
# # Compressed read: 1.6s per batch
# # Compressed is 1/10 the size of uncompressed, but takes ~twice as long to precompute and save
# # Compressed 512D Fourier takes ~30s per batch
# # Compressed 512D Fourier is about 500-700MB per batch

In [8]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, Input, Flatten
from tensorflow.keras import optimizers, losses, metrics

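# Build and compile the classifier. Despite the "rnn" in the function name, the network below is a stack of Dense layers and contains no recurrent layers.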
def create_rnn_model(num_labels, embedding_dim, max_input_length):
    """Assemble and compile the Keras classifier used by the experiments.

    The network takes inputs of shape ``(max_input_length * 2, embedding_dim)``,
    applies two ReLU ``Dense`` layers (64 then 32 units), flattens the result
    and finishes with a sigmoid ``Dense`` layer of ``num_labels`` units.

    Args:
        num_labels: Number of sigmoid output units.
        embedding_dim: Size of the last axis of each input sample.
        max_input_length: Half of the length of the first (non-batch) input axis.

    Returns:
        The compiled ``Sequential`` model (RMSprop optimizer, binary
        cross-entropy loss, binary-accuracy and AUC metrics, XLA JIT enabled).
    """
    sequence_length = max_input_length * 2

    layer_stack = [
        Input(shape=(sequence_length, embedding_dim)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Flatten(),
        Dense(num_labels, activation='sigmoid'),
    ]
    model = Sequential(layer_stack)

    compile_options = dict(
        optimizer=optimizers.RMSprop(),
        loss=losses.BinaryCrossentropy(),
        # Metrics reported for every training / validation step.
        metrics=[metrics.BinaryAccuracy(), metrics.AUC()],
        jit_compile=True,
    )
    model.compile(**compile_options)

    return model
 
# Size of the last axis of every model input.
# NOTE(review): 768 presumably matches the hidden size of the upstream text encoder -- confirm.
embedding_dim = 768
# Number of sigmoid output units handed to create_rnn_model (one binary label).
num_labels = 1

In [9]:
def train_model_num_ff(train_ds, val_ds, model_name, epochs=5):
    """Train a fresh classifier on ``train_ds`` and persist it together with its log.

    A new model is built with ``create_rnn_model`` (using the module-level
    ``num_labels`` and ``embedding_dim``), fitted on the first GPU, and saved
    to ``"{model_name}.keras"``. Per-epoch metrics are written to
    ``"{model_name}.log"`` by a ``CSVLogger`` (the file is overwritten, not
    appended to).

    Args:
        train_ds: Training dataset; must expose a ``max_input_length``
            attribute and be accepted by ``model.fit``.
        val_ds: Validation dataset passed as ``validation_data``.
        model_name: Path prefix (``str`` or ``Path``) for the ``.log`` and
            ``.keras`` files; its parent directory must already exist.
        epochs: Number of training epochs.

    Returns:
        A ``(model, history)`` tuple: the trained model and the ``History``
        object returned by ``model.fit``. Earlier versions returned ``None``;
        callers that ignore the result are unaffected.
    """
    max_input_length = train_ds.max_input_length

    print(model_name)
    model = create_rnn_model(num_labels, embedding_dim, max_input_length)
    csv_logger = CSVLogger(f'{model_name}.log', separator=',', append=False)

    with tf.device("/device:GPU:0"):
        history = model.fit(
            train_ds,
            epochs=epochs,
            validation_data=val_ds,
            verbose=1,
            callbacks=[csv_logger]
        )

    model.save(f"{model_name}.keras")

    # Hand the results back so later cells can evaluate or plot them instead
    # of relying on variables that never leave this function.
    return model, history

In [None]:
# Does using the frequency-domain spectra provide useful information?
# Trains one model per Fourier-feature count and stores each run under model_dir.
model_dir = Path("models/num_fourier_features")
# exist_ok=True replaces the check-then-create pattern, so re-running the cell is safe.
model_dir.mkdir(parents=True, exist_ok=True)

num_fourier_features = [0, 512//4, 512//2, 512]
for num_ff in tqdm(num_fourier_features):
    # Release memory left over from the previous run before building new datasets.
    free_memory()

    fourier_train_ds = Pan21FourierDataset("pan21/train", "pan21/train", "train_ds_uncompressed", num_fourier_features=num_ff)
    fourier_val_ds = Pan21FourierDataset("pan21/validation", "pan21/validation", "val_ds_uncompressed", num_fourier_features=num_ff)

    # The timestamp suffix keeps repeated runs from overwriting each other.
    model_name = model_dir / f"{num_ff}_{datetime.datetime.now().strftime('%Y_%m_%d-%I_%M_%S_%p')}"

    train_model_num_ff(fourier_train_ds, fourier_val_ds, model_name)

In [None]:
# How many epochs should be trained?
# Repeats the Fourier-feature experiment for a subset of settings with a longer schedule.
model_dir = Path("models/num_epochs")
# exist_ok=True replaces the check-then-create pattern, so re-running the cell is safe.
model_dir.mkdir(parents=True, exist_ok=True)

num_fourier_features = [0, 512//4]
for num_ff in tqdm(num_fourier_features):
    # Release memory left over from the previous run before building new datasets.
    free_memory()

    # train_model_num_ff expects (train_ds, val_ds, model_name, epochs); the previous call
    # passed only the feature count and would raise TypeError, so build the datasets and
    # the output name here exactly as the num_fourier_features experiment does.
    fourier_train_ds = Pan21FourierDataset("pan21/train", "pan21/train", "train_ds_uncompressed", num_fourier_features=num_ff)
    fourier_val_ds = Pan21FourierDataset("pan21/validation", "pan21/validation", "val_ds_uncompressed", num_fourier_features=num_ff)

    model_name = model_dir / f"{num_ff}_{datetime.datetime.now().strftime('%Y_%m_%d-%I_%M_%S_%p')}"

    train_model_num_ff(fourier_train_ds, fourier_val_ds, model_name, epochs=15)

In [10]:
# Does filtering help?
# Trains one model per [low, high] cutoff pair and stores each run under model_dir.
model_dir = Path("models/cutoffs")
# exist_ok=True replaces the check-then-create pattern, so re-running the cell is safe.
model_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): the "High pass" / "Low Pass" labels are kept from the original; confirm
# against Pan21FourierFilterDataset whether each band is kept or removed.
cutoff_frequencies = [[1, 64], # High pass 
                    # Integer division keeps the bound an int (512/2 produced the float 256.0,
                    # which also leaked into the saved model name as "256.0").
                    [192, 512 // 2], # Low Pass
                    [128, 192],
                    [64, 128],
                    ]
for cutoff in tqdm(cutoff_frequencies):
    # Release memory left over from the previous run before building new datasets.
    free_memory()
    filter_train_ds = Pan21FourierFilterDataset("pan21/train", "pan21/train", "train_ds_uncompressed", cutoff)
    filter_val_ds = Pan21FourierFilterDataset("pan21/validation", "pan21/validation", "val_ds_uncompressed", cutoff)

    # The timestamp suffix keeps repeated runs from overwriting each other.
    model_name = model_dir / f"{cutoff[0]}_{cutoff[1]}_{datetime.datetime.now().strftime('%Y_%m_%d-%I_%M_%S_%p')}"

    train_model_num_ff(filter_train_ds, filter_val_ds, model_name)

  0%|          | 0/4 [00:00<?, ?it/s]

models/cutoffs/1_64_2024_06_04-06_36_43_PM


2024-06-04 18:36:43.231621: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-04 18:36:43.234811: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-04 18:36:43.236240: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

32
Epoch 1/5


2024-06-04 18:37:11.360466: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:2: Filling up shuffle buffer (this may take a while): 2 of 8
2024-06-04 18:37:21.510301: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:2: Filling up shuffle buffer (this may take a while): 3 of 8
2024-06-04 18:37:45.400298: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:2: Filling up shuffle buffer (this may take a while): 6 of 8
2024-06-04 18:37:58.897153: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:480] Shuffle buffer filled.
I0000 00:00:1717526279.070201  988027 service.cc:145] XLA service 0x7fea1c003e00 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1717526279.070239  988027 service.cc:153]   StreamExecutor device (0): NVIDIA A10, Compute Capability 8.6
2024-06-04 18:37:59.106894: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR cr

[1m  1/350[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m6:19:07[0m 65s/step - auc: 0.5667 - binary_accuracy: 0.5514 - loss: 1.4266

I0000 00:00:1717526281.461939  988027 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.



[1m  2/350[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m51:31[0m 9s/step - auc: 0.5677 - binary_accuracy: 0.5500 - loss: 8.6166   




[1m  3/350[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m41:57[0m 7s/step - auc: 0.5666 - binary_accuracy: 0.5425 - loss: 9.3758




[1m  4/350[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m42:34[0m 7s/step - auc: 0.5617 - binary_accuracy: 0.5397 - loss: 9.1171




[1m  5/350[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m43:32[0m 8s/step - auc: 0.5571 - binary_accuracy: 0.5354 - loss: 8.8079





[1m  6/350[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m45:11[0m 8s/step - auc: 0.5527 - binary_accuracy: 0.5325 - loss: 8.4487




[1m  7/350[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m43:34[0m 8s/step - auc: 0.5483 - binary_accuracy: 0.5276 - loss: 8.0866




[1m  8/350[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m44:40[0m 8s/step - auc: 0.5465 - binary_accuracy: 0.5260 - loss: 7.7525




[1m  9/350[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m46:56[0m 8s/step - auc: 0.5452 - binary_accuracy: 0.5254 - loss: 7.4268




[1m 10/350[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m45:12[0m 8s/step - auc: 0.5438 - binary_accuracy: 0.5243 - loss: 7.1252




[1m 11/350[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m44:19[0m 8s/step - auc: 0.5420 - binary_accuracy: 0.5229 - loss: 6.8510




[1m 12/350[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m45:17[0m 8s/step - auc: 0.5405 - binary_accuracy: 0.5217 - loss: 6.5962




[1m 13/350[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m46:00[0m 8s/step - auc: 0.5393 - binary_accuracy: 0.5209 - loss: 6.3646

In [None]:
# Evaluate the model on the test data using `evaluate`
# NOTE(review): `model`, `x_test` and `y_test` are not assigned at notebook level by any
# visible cell, so unless `pan21_functions` exports them this cell presumably raises
# NameError on a fresh kernel -- define them first or confirm where they come from.
print("Evaluate on test data")
results = model.evaluate(x_test, y_test, batch_size=128)
print("test loss, test acc:", results)

# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`
# Only the first three test samples are passed to `predict`.
print("Generate predictions for 3 samples")
predictions = model.predict(x_test[:3])
print("predictions shape:", predictions.shape)
