In [None]:
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline

*   YouTube video explaining Transformers: [https://www.youtube.com/watch?v=TQQlZhbC5ps&list=TLPQMDYwNzIwMjFuBc39xf3IYg&index=9&ab_channel=CodeEmporium](https://www.youtube.com/watch?v=TQQlZhbC5ps&list=TLPQMDYwNzIwMjFuBc39xf3IYg&index=9&ab_channel=CodeEmporium)
*   Original Transformers paper: [https://arxiv.org/pdf/1706.03762.pdf](https://arxiv.org/pdf/1706.03762.pdf)

# Import dependencies

In [None]:
# Data & storage
import os
import glob
import hashlib
from google.colab import drive
from torch.utils.data import random_split, DataLoader
from torch.utils.data.distributed import DistributedSampler 


# Analysis
import numpy as np
import pandas as pd
from pandas import read_csv

# Visualizations
from matplotlib import pyplot as plt
from tqdm import tqdm

# Deep learning
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Distributed training (TPUs)
!pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
import warnings
import torch_xla
import torch_xla.debug.metrics as met
import torch_xla.distributed.data_parallel as dp
import torch_xla.distributed.parallel_loader as pl
import torch_xla.utils.utils as xu
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.test.test_utils as test_utils
import warnings
warnings.filterwarnings("ignore")

# Miscellaneous
from typing import Optional, Union

Collecting torch-xla==1.9
  Downloading https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl (149.9 MB)
[K     |████████████████████████████████| 149.9 MB 50 kB/s 
[?25hCollecting cloud-tpu-client==0.10
  Downloading cloud_tpu_client-0.10-py3-none-any.whl (7.4 kB)
Collecting google-api-python-client==1.8.0
  Downloading google_api_python_client-1.8.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 2.3 MB/s 
Installing collected packages: google-api-python-client, torch-xla, cloud-tpu-client
  Attempting uninstall: google-api-python-client
    Found existing installation: google-api-python-client 1.12.8
    Uninstalling google-api-python-client-1.12.8:
      Successfully uninstalled google-api-python-client-1.12.8
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
earthengine-api 0.1.272 requir



# Download Data



To download the Kaggle dataset, we must first mount our Google Drive to this Colab notebook.

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# dataset and the Kaggle API token stored on Drive are reachable from this notebook.
drive.mount('/content/drive')

Mounted at /content/drive


Then, we specify the config path to our Kaggle API token (in the form of a `kaggle.json` file), and change the current working directory to that path.

In [None]:
# Directory holding the Kaggle API token (`kaggle.json`).
os.environ['KAGGLE_CONFIG_DIR'] = '/content/drive/MyDrive/Research/Dynamic Spectra Sequence Modeling/Data/Kaggle'
# Work from the parent `Data` folder (one level above the `Kaggle` config folder);
# the CSV files read by later cells are referenced relative to this directory.
%cd '/content/drive/MyDrive/Research/Dynamic Spectra Sequence Modeling/Data'

/content/drive/MyDrive/Research/Transformers/Code/Data


Finally, we copy and run the API command for the Breakthrough Listen (BL) Kaggle competition to download the dataset, and then unzip the downloaded archive.

In [None]:
if not os.listdir():
  # Note, if you're getting the error message "429 - Too Many Requests", try running the following commands before the API command:
  # !pip uninstall -y kaggle
  # !pip install --upgrade pip
  # !pip install kaggle==1.5.6
  !kaggle competitions download -c seti-breakthrough-listen
  !unzip seti-breakthrough-listen.zip

# Prep Data

We want to create lookup tables in the form of Python dictionaries, with ID-target key-value pairs, for both the training and test data. 

To do so for the training data is quite straightforward. Note however, that the test data IDs have been hashed for security purposes, hence we must go through some extra steps beforehand.

In [None]:
# --- Training lookup table: id -> target ---
train_labels = read_csv('train_labels.csv')
train_dict = dict(zip(train_labels.id, train_labels.target))

# --- Test lookup table: original id -> target ---
# `masked_labels.csv` is keyed by a hashed id, so every original test id has to be
# hashed with the secret keyword before its target can be looked up.
original_labels = read_csv('sample_submission.csv')['id']
hash_labels = read_csv('masked_labels.csv')

# Build the hashed-id -> target mapping once. Filtering the DataFrame with a boolean
# mask for every test id rescans the whole table on each iteration, which is what
# made this cell take minutes to run.
if not hash_labels['id'].is_unique:
  raise ValueError('masked_labels.csv contains duplicate hashed ids')
hashed_targets = dict(zip(hash_labels['id'].tolist(), hash_labels['target'].tolist()))

test_dict = {}
keyword = input('Enter keyword: ')
# MD5 state after consuming the keyword; copied per id so the keyword is hashed only once.
keyword_md5 = hashlib.md5(keyword.encode("utf-8"))
for labels in tqdm(original_labels):
  m = keyword_md5.copy()
  # A leading "0" is prepended before decoding because `bytes.fromhex` needs an
  # even number of hex digits.
  m.update(bytes.fromhex("0" + labels))
  hashed_id = m.hexdigest()
  # Raises KeyError if the hashed id is absent (e.g. wrong keyword).
  test_dict[labels] = hashed_targets[hashed_id]

Enter keyword: zach


100%|██████████| 39995/39995 [01:51<00:00, 359.27it/s]


Split the training set into two non-overlapping subsets, one for training and one for validation (a single hold-out split). `x_train` and `x_valid` hold the ID values, whereas `y_train` and `y_valid` hold the target values; the training and validation subsets have lengths 48000 and 12000, respectively. Since our model is self-supervised, we'll only use `y_train` and `y_valid` for validation using downstream tasks.

In [None]:
# 80/20 train/validation split.
# The validation length is computed as the remainder so the two lengths always sum
# to the dataset size (`random_split` raises otherwise).
n_samples = len(train_labels)
len_train = int(n_samples * 0.8)
len_valid = n_samples - len_train

# Split the ids once, then reuse the very same row indices for the targets.
# Calling `random_split` separately on ids and targets draws two independent
# permutations, which would pair each id with some other sample's target.
x_train, x_valid = random_split(train_labels['id'], (len_train, len_valid))
y_train = torch.utils.data.Subset(train_labels['target'], x_train.indices)
y_valid = torch.utils.data.Subset(train_labels['target'], x_valid.indices)
# NOTE(review): this split runs before `seed_everything()` is called further down,
# so it is not reproducible across kernel restarts — consider seeding first.

# Train

Initialize the random seed.

In [None]:
# Random Seed Initialize
RANDOM_SEED = 11
def seed_everything(seed=RANDOM_SEED):
    """Seed every random number generator used in this notebook.

    Seeds Python's hash randomisation variable, the `random` module, NumPy and
    PyTorch (CPU and CUDA), and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators. Defaults to `RANDOM_SEED`.
    """
    import random  # local import: `random` is not part of the notebook's import cell

    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)  # previously NumPy was seeded twice and `random` not at all
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes convolution algorithms per run, which can select
    # different kernels between runs; it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False
seed_everything()

Set the model parameters.

In [None]:
# --- Training configuration ---
checkpoint_path = '../Code/Checkpoints/.'
num_cores = 8
num_workers = 0
epochs = 30
batch_size = 128
learning_rate = 1e-4

# --- Model configuration ---
# Dimensions for data are (6, 273, 256), i.e. 6 snippets of 273 timesteps and 256 frequency channels.
# `t` was previously used without being defined, which raised a NameError.
# NOTE(review): 273 (one full snippet) is a placeholder; any 0 <= t <= 6*273 is
# allowed by the original comment — confirm the intended number of input timesteps.
t = 273

d_model = 256   # Latent dim
d_input = (6, t, 256)   # Input dim (from dataset), where 0<= t <= 6*273
d_output = (6, 1, 256)   # Output dim (from dataset)
q = 8   # Query size
v = 8   # Value size
h = 8   # Number of heads
N = 4   # Number of decoder blocks to stack
attention_size = 12   # Attention window size
dropout = 0.2   # Dropout rate
pe = None
chunk_mode = None

# Dictionary views of the same settings. Later cells index `training_params[...]`,
# so the dictionaries are built from the variables above to keep a single source
# of truth for every value.
training_params = {
    'checkpoint_path': checkpoint_path,
    'num_cores': num_cores,
    'num_workers': num_workers,
    'epochs': epochs,
    'batch_size': batch_size,
    'learning_rate': learning_rate,
}

model_params = {
    'd_model': d_model,
    'd_input': d_input,
    'd_output': d_output,
    'q': q,
    'v': v,
    'h': h,
    'N': N,
    'attention_size': attention_size,
    'dropout': dropout,
    'pe': pe,
    'chunk_mode': chunk_mode,
}

NameError: ignored

## Configuring Colab's Cloud TPUs



Colab provides a free Cloud TPU system (a remote CPU host + four TPU chips with two cores each). To gain access to a TPU on Colab, on the main menu, click Runtime > Change runtime type > set "TPU" as the hardware accelerator.

The PyTorch/XLA package lets PyTorch connect to Cloud TPUs (It's named PyTorch/XLA, not PyTorch/TPU, because XLA is the name of the TPU compiler), and makes TPU cores available as PyTorch devices, which lets PyTorch create and manipulate tensors on TPUs.

In [None]:
# Fail early with a readable message when no TPU runtime is attached.
# `.get` is used so that a missing variable triggers the assertion message below
# instead of an unexplained KeyError.
assert os.environ.get('COLAB_TPU_ADDR'), 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'

`torch.utils.data.distributed.DistributedSampler` distributes the training data evenly (with no replicas) to all 8 TPU cores that Colab provides. Note that `xm.xrt_world_size()` retrieves the number of devices that are taking part in the replication (basically the number of cores), and `xm.get_ordinal()` retrieves the replication ordinal of the current process. The ordinals range from `0` to `xrt_world_size()-1`.

In [None]:
# Replication settings shared by both samplers: the number of participating
# devices and the ordinal of the current process.
world_size = xm.xrt_world_size()
ordinal = xm.get_ordinal()

# Training indices are shuffled; validation indices keep their order.
train_sampler = DistributedSampler(x_train, num_replicas=world_size, rank=ordinal, shuffle=True)
valid_sampler = DistributedSampler(x_valid, num_replicas=world_size, rank=ordinal, shuffle=False)

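After the data has been distributed, we create a `DataLoader` for each split using the samplers defined above. (To feed the TPU cores, these loaders can then be wrapped in a `ParallelLoader`.)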

In [None]:
# Batch the ids of each split. Ordering (including shuffling of the training
# data) is delegated to the distributed samplers, so `shuffle` is not passed here:
# DataLoader rejects `shuffle=True` together with an explicit `sampler`.
# `batch_size` and `num_workers` come from the configuration cell.
train_loader = DataLoader(
    x_train,
    batch_size=batch_size,
    sampler=train_sampler,
    num_workers=num_workers,
    drop_last=True)

# The validation loader must draw from the validation sampler; the training
# sampler yields indices for the (larger) training split.
valid_loader = DataLoader(
    x_valid,
    batch_size=batch_size,
    sampler=valid_sampler,
    num_workers=num_workers,
    drop_last=True)

# drop_last = True drops the last incomplete batch if the dataset size is not divisible by the batch size
# drop_last = False will cause the last batch to be smaller if the size of dataset is not divisible by the batch size

Optimize parameters for distributed training on TPU cores (remember `xm.xrt_world_size()` returns the number of TPU cores, which for our case is 8).

In [None]:
# Scale learning rate to world size.
# `learning_rate` is the value set in the configuration cell.
lr = learning_rate * xm.xrt_world_size()

# Get loss function, optimizer, and model
device = xm.xla_device()  # XLA device assigned to the current process
# NOTE(review): `Transformer` and `OZELoss` are not defined in the visible part of
# the notebook; they must be defined or imported before this cell runs. The model is
# built with default arguments — confirm whether the model settings from the
# configuration cell (d_model, q, v, h, N, ...) should be passed here.
model = Transformer().to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
loss_function = OZELoss(alpha=0.3)

## Loop