In [1]:
%matplotlib inline

# Import Dependencies

In [2]:
# Data Wrangling
import os
from google.colab import drive

# Data Analysis
import numpy as np
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# ML
import torch
from torch.utils.data import Dataset, DataLoader

# Huggingface
!pip install transformers
from transformers import BertConfig, BertModel

# Miscellaneous
from typing import List

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 284 kB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 50.1 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 44.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 61.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

# Data Exploration

Mount our Google Drive to this Colab instance, as if it were a local file system, and switch to the directory where the data are stored.

In [3]:
drive._mount('/content/drive')
%cd '/content/drive/MyDrive/Research/Ongoing/Dynamic Spectra Sequence Modeling/Data'

Mounted at /content/drive
/content/drive/MyDrive/Research/Ongoing/Dynamic Spectra Sequence Modeling/Data


In [4]:
%ls

[0m[01;34mKaggle[0m/            sample_submission.csv  [01;34mtrain[0m/
masked_labels.csv  [01;34mtest[0m/                  train_labels.csv


Here, we have five items of interest:


* `train/` - a training set of cadence snippet files stored in `numpy` `float16` format (v1.20.1), one file per cadence snippet id, with corresponding labels found in the `train_labels.csv` file. Each file has dimension `(6, 273, 256)`, with the 1st dimension representing the 6 positions of the cadence, and the 2nd and 3rd dimensions representing the 2D spectrogram, i.e. frequency as a function of time.
* `test/` - the test set cadence snippet files; our goal is to predict whether or not the cadence contains a "needle".
* `train_labels.csv` - targets corresponding (by id) to the cadence snippet files found in the `train/` folder (1 if cadence contains a "needle", 0 if not).
* `sample_submission.csv` - a sample submission file in the correct format.
* `masked_labels.csv` - **[I FORGOT WHAT THIS IS, ASK YUHONG]**




Read in both csv files into a `pd.DataFrame`, and check for any missing values.

In [36]:
df_train = pd.read_csv('train_labels.csv')
df_train.head()

Unnamed: 0,id,target
0,0000799a2b2c42d,0
1,00042890562ff68,0
2,0005364cdcb8e5b,0
3,0007a5a46901c56,0
4,0009283e145448e,0


In [6]:
df_train.shape

(60000, 2)

In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      60000 non-null  object
 1   target  60000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 937.6+ KB


In [8]:
df_test = pd.read_csv('sample_submission.csv')
df_test.head()

Unnamed: 0,id,target
0,000bf832cae9ff1,0.5
1,000c74cc71a1140,0.5
2,000f5f9851161d3,0.5
3,000f7499e95aba6,0.5
4,00133ce6ec257f9,0.5


In [9]:
df_test.shape

(39995, 2)

In [10]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39995 entries, 0 to 39994
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      39995 non-null  object 
 1   target  39995 non-null  float64
dtypes: float64(1), object(1)
memory usage: 625.0+ KB


# Feature Engineering

Since there are no missing values to handle, collect the filepaths to each of the 60,000 train cadence snippets, and the 39,995 test cadence snippets.

In [39]:
train_filepaths, test_filepaths = [], []

for root, dirs, files in os.walk('train'):
  for filename in files:
    train_filepaths.append(os.path.join(root, filename))

for root, dirs, files in os.walk('test'):
  for filename in files:
    test_filepaths.append(os.path.join(root, filename))

# Verify that we have the expected number of filepaths
print(f'train: {len(train_filepaths)}\ntest: {len(test_filepaths)}')

  0%|          | 0/60000 [00:00<?, ?it/s]

train: 60000
test: 39995


Construct a new `pd.DataFrame` with "filepath" included alongside "id" and "target".


In [37]:
# Strip the ID from each filepath
train_id = [path[8:-4] for path in train_filepaths]
test_id = [path[7:-4] for path in test_filepaths]

# Look up target values from the previous DataFrame using ID values
train_target = [df_train.loc[df_train['id'] == id]['target'].values[0] for _, id in tqdm(enumerate(train_id), total=60000)]
test_target = [df_test.loc[df_test['id'] == id]['target'].values[0] for _, id in tqdm(enumerate(test_id), total=39995)]

# Verify that we have the expected number of ID and targets
print(f'train: {len(train_id)}, {len(train_target)}\ntest: {len(test_id)}, {len(test_target)}')

  0%|          | 0/60000 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [13]:
df_train = pd.DataFrame({'ID': train_id,
                         'Filepath': train_filepaths,
                         'Target': train_target})
df_train.head()

Unnamed: 0,ID,Filepath,Target
0,db805c5b0201ffd,train/d/db805c5b0201ffd.npy,0
1,db72b5d462d6dd4,train/d/db72b5d462d6dd4.npy,0
2,db77389bd6ab27f,train/d/db77389bd6ab27f.npy,0
3,db7c72aad96f575,train/d/db7c72aad96f575.npy,0
4,db7ea5d75980af2,train/d/db7ea5d75980af2.npy,0


In [14]:
df_test = pd.DataFrame({'ID': test_id,
                        'Filepath': test_filepaths,
                        'Target': test_target})
df_test.head()

Unnamed: 0,ID,Filepath,Target
0,99d159554adf56a,test/9/99d159554adf56a.npy,0.5
1,99b96f0033882f1,test/9/99b96f0033882f1.npy,0.5
2,99bce84b9aeb210,test/9/99bce84b9aeb210.npy,0.5
3,99c04284d16acf1,test/9/99c04284d16acf1.npy,0.5
4,99d7868c21e1ce1,test/9/99d7868c21e1ce1.npy,0.5


Finally, read in a single cadence snippet using `np.load()`, and check that its dimensions are indeed `(6, 273, 256)`.

In [15]:
ndarray = np.load(df_train['Filepath'][0])
ndarray.shape

(6, 273, 256)

# Dataset & DataLoader

Here we create a custom object with inheritance from PyTorch's `Dataset` class, to retrieve our data’s samples and corresponding labels, one at a time.

In [16]:
class KaggleDataset(Dataset):
  def __init__(self, 
               paths: List[str], 
               labels: List[int], 
               transform=None,
               target_transform=None):
    '''
    The __init__ function is called when you instantiate the class.

    Args:
    - self: the instance of the class.
    - paths: list of paths to the data, which are each cadence snippet files stored in np.float16 format, with dimensions (6, 273, 256).
    - labels: list of snippet labels to the cadence snippet files (1 if cadence contains a "needle", 0 if not).
    - transform: optional transformation to be performed on a cadence snippet.
    - target_transform: optional transformation to be performed on a snippet label.
    '''
    self.paths = paths
    self.labels = labels
    self.transform = transform
    self.target_transform = target_transform

  def __len__(self) -> int:
    '''
    The __len__ function returns the number of samples in our dataset.
    '''
    return len(self.labels)

  def __getitem__(self, 
                  idx: int) -> (torch.Tensor, int):
    '''
    The __getitem__ function loads a cadence snippet from the dataset at the given index, converts it into a torch.Tensor,
    retrieves the corresponding label from the list of labels provided when initializing the class, 
    calls the transform functions on them (if provided), and returns the tensor and corresponding label as a tuple.
    '''
    ndarray = np.load(self.paths[idx])
    x_np = torch.from_numpy(ndarray)
    label = self.labels[idx]
    if self.transform:
      x_np = self.transform(x_np)
    if self.target_transform:
      label = self.target_transform(label)
    return x_np, label

In [17]:
train_data = KaggleDataset(paths=df_train['Filepath'].tolist(), 
                           labels=df_train['Target'].tolist())

test_data = KaggleDataset(paths=df_test['Filepath'].tolist(), 
                          labels=df_test['Target'].tolist())

During training, we want to pass the cadence snippets in “minibatches”, reshuffle the data at every epoch to reduce model overfitting, and use Python’s `multiprocessing` to speed up data retrieval. `DataLoader` is an iterable that abstracts this complexity for us in an easy API.

In [19]:
train_dataloader = DataLoader(train_data, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

Now that we've loaded the data into the `DataLoader` and can iterate through the dataset as needed, each iteration below returns a batch of `train_features` and `train_labels` (containing `batch_size=64` features and labels, respectively). Note that because we specified `shuffle=True`, after we iterate over all batches, the data is shuffled.

In [20]:
train_features, train_labels = next(iter(train_dataloader))
print(f'Feature batch shape: {train_features.size()}')
print(f'Labels batch shape: {train_labels.size()}')

Feature batch shape: torch.Size([64, 6, 273, 256])
Labels batch shape: torch.Size([64])


Let's try visualizing some of the cadence snippets.

In [21]:
# Define function to plot samples (expected input: arrays of shape [6, 273, 256])
def plot_cadence_snippet(cad_snippet, title=None):
    num_obs, h, w = cad_snippet.shape
    fig, axes = plt.subplots(6, 1, figsize=(10, 5))
    axes = axes.flatten()
    if title:
        fig.suptitle(title)
    for i in range(6):
        axes[i].imshow(cad_snippet[i])
        axes[i].axis('off')
        axes[i].set_aspect('auto')
    plt.show()

In [34]:
img = train_features[0].numpy()
label = train_labels[0].numpy()
plot_cadence_snippet(img)

ValueError: ignored

<Figure size 720x360 with 6 Axes>

# Data Preprocessing

For preprocessing, we perform the following transformations:


1.   Concatenate the cadence snippets pointing-wise, i.e. from `(6, 273, 256)` to `(1638, 256)`.
2.   Downsample the time-axis by a factor of 4. **[SINCE 1638 IS NOT DIVISIBLE BY 4, IN THE FUTURE, SWITCH TO USING LINEAR INTERPOLATION]**



In [None]:
def preprocessing(filepath):

In [None]:
for idx, path in enumerate(train_filepaths):
  ndarray = np.load(path)
  # Step 1
  ndarray = ndarray.reshape((1638, 256))
  # Step 2
  ndarray = ndarray[::4, :]
  # Rewrite

# Set Hyperparameters

**[MOVE THIS SECTION UP MAYBE?]**

In [None]:
batch_size = 64

# Inference

Using [HuggingFace](https://huggingface.co/)'s pre-trained BERT model, we perform inference

In [None]:
# Initializing a custom BERT configuration, where the number of attention heads must be a multiple of the hidden size
configuration = BertConfig(hidden_size=256, num_attention_heads=16)

# Initializing a model from the custom BERT configuration
model = BertModel(configuration)

# Forward pass of the model
outputs = model.forward(inputs_embeds=)

# Collect the final classification tokens
pooler_outputs = outputs.pooler_output