In [1]:
%matplotlib inline

# Import Dependencies

In [45]:
# File management
import os
from google.colab import drive

# Data analysis
import numpy as np
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# ML
import torch
from torch.utils.data import Dataset, DataLoader

# Huggingface
!pip install transformers
from transformers import BertConfig, BertModel



# Data Exploration

Mount our Google Drive to this Colab instance, as if it were a local file system, and switch to the directory where the data are stored.

In [7]:
# Mount Google Drive at /content/drive, then make the data folder the working directory
# so that the relative paths used in the rest of the notebook resolve against it.
# NOTE(review): `_mount` is an underscore-prefixed (private) helper of `google.colab.drive`;
# the message printed by this cell points to the public `drive.mount` - confirm the private call is intentional.
drive._mount('/content/drive')
%cd '/content/drive/MyDrive/Research/Ongoing/Dynamic Spectra Sequence Modeling/Data'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Research/Ongoing/Dynamic Spectra Sequence Modeling/Data


In [8]:
%ls

[0m[01;34mKaggle[0m/            sample_submission.csv  [01;34mtrain[0m/
masked_labels.csv  [01;34mtest[0m/                  train_labels.csv


Here, we have five items of interest:


* `train/` - a training set of cadence snippet files stored in `numpy` `float16` format (v1.20.1), one file per cadence snippet id, with corresponding labels found in the `train_labels.csv` file. Each file has dimension `(6, 273, 256)`, with the 1st dimension representing the 6 positions of the cadence, and the 2nd and 3rd dimensions representing the 2D spectrogram, i.e. frequency as a function of time.
* `test/` - the test set cadence snippet files; our goal is to predict whether or not the cadence contains a "needle".
* `train_labels.csv` - targets corresponding (by id) to the cadence snippet files found in the `train/` folder (1 if cadence contains a "needle", 0 if not).
* `sample_submission.csv` - a sample submission file in the correct format.
* `masked_labels.csv` - **TODO:** document the contents and purpose of this file (ask Yuhong).




Read `train_labels.csv` and `sample_submission.csv` into `pd.DataFrame`s, and check for any missing values.

In [9]:
# Training labels: one row per cadence snippet, holding its `id` and its `target`
df_train = pd.read_csv('train_labels.csv')
df_train.head()

Unnamed: 0,id,target
0,0000799a2b2c42d,0
1,00042890562ff68,0
2,0005364cdcb8e5b,0
3,0007a5a46901c56,0
4,0009283e145448e,0


In [10]:
# (number of rows, number of columns) of the training label table
df_train.shape

(60000, 2)

In [11]:
# Per-column non-null counts and dtypes; used here to check for missing values
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      60000 non-null  object
 1   target  60000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 937.6+ KB


In [12]:
# The sample submission file is used as the list of test IDs.
# Its `target` column is 0.5 in the rows shown by head() - presumably a placeholder rather than a real label.
df_test = pd.read_csv('sample_submission.csv')
df_test.head()

Unnamed: 0,id,target
0,000bf832cae9ff1,0.5
1,000c74cc71a1140,0.5
2,000f5f9851161d3,0.5
3,000f7499e95aba6,0.5
4,00133ce6ec257f9,0.5


In [13]:
# (number of rows, number of columns) of the test table
df_test.shape

(39995, 2)

In [14]:
# Per-column non-null counts and dtypes; used here to check for missing values
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39995 entries, 0 to 39994
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      39995 non-null  object 
 1   target  39995 non-null  float64
dtypes: float64(1), object(1)
memory usage: 625.0+ KB


Since there are no missing values to handle, collect the filepaths to each of the 60,000 train cadence snippets, and the 39,995 test cadence snippets.

In [15]:
def collect_filepaths(root_dir):
  '''
  Recursively gather the path of every file found under a directory.

  Args:
  - root_dir: directory to walk; a relative path is resolved against the current working directory.

  Returns:
  - list of file paths, each starting with `root_dir`, in the order in which `os.walk` yields them.
    A directory that does not exist yields an empty list, since `os.walk` ignores listing errors by default.
  '''
  return [os.path.join(root, filename)
          for root, _, files in os.walk(root_dir)
          for filename in files]


train_filepaths = collect_filepaths('train')
test_filepaths = collect_filepaths('test')

# Verify that we have the expected number of filepaths
print(f'train: {len(train_filepaths)}\ntest: {len(test_filepaths)}')

train: 60000
test: 39995


Engineer a new `pd.DataFrame` with filepath included alongside id and target.


In [41]:
# Strip the ID from each filepath
train_id = [path[8:-4] for path in train_filepaths]
test_id = [path[7:-4] for path in test_filepaths]

# Look up target values from the previous DataFrame using ID values
train_target = [df_train.loc[df_train['id'] == id]['target'].values[0] for _, id in tqdm(enumerate(train_id))]
test_target = [df_test.loc[df_test['id'] == id]['target'].values[0] for _, id in tqdm(enumerate(test_id))]

# Verify that we have the expected number of ID and targets
print(f'train: {len(train_id)}, {len(train_target)}\ntest: {len(test_id)}, {len(test_target)}')

train: 60000, 60000
test: 39995, 39995


In [42]:
# Assemble the training table: one row per cadence snippet with its ID, file location and target
train_columns = {
  'ID': train_id,
  'Filepath': train_filepaths,
  'Target': train_target,
}
df_train = pd.DataFrame(data=train_columns)
df_train.head()

Unnamed: 0,ID,Filepath,Target
0,db805c5b0201ffd,train/d/db805c5b0201ffd.npy,0
1,db72b5d462d6dd4,train/d/db72b5d462d6dd4.npy,0
2,db77389bd6ab27f,train/d/db77389bd6ab27f.npy,0
3,db7c72aad96f575,train/d/db7c72aad96f575.npy,0
4,db7ea5d75980af2,train/d/db7ea5d75980af2.npy,0


In [43]:
# Assemble the test table: one row per cadence snippet with its ID, file location and target
test_columns = {
  'ID': test_id,
  'Filepath': test_filepaths,
  'Target': test_target,
}
df_test = pd.DataFrame(data=test_columns)
df_test.head()

Unnamed: 0,ID,Filepath,Target
0,99d159554adf56a,test/9/99d159554adf56a.npy,0.5
1,99b96f0033882f1,test/9/99b96f0033882f1.npy,0.5
2,99bce84b9aeb210,test/9/99bce84b9aeb210.npy,0.5
3,99c04284d16acf1,test/9/99c04284d16acf1.npy,0.5
4,99d7868c21e1ce1,test/9/99d7868c21e1ce1.npy,0.5


Finally, read in a single cadence snippet using `np.load()`, and check that its dimensions are indeed `(6, 273, 256)`.

In [44]:
# Load the first training snippet; the table built above is named `df_train` (there is no `train` variable)
ndarray = np.load(df_train['Filepath'][0])
ndarray.shape

(6, 273, 256)

# DataSet & DataLoader

In [None]:
class KaggleDataset(Dataset):
  '''
  Map-style dataset of cadence snippets: each item is loaded from its file when it is requested
  and returned together with its label.
  '''

  def __init__(self,
               paths: list[str],
               labels: list[int],
               transform=None,
               target_transform=None):
    '''
    The __init__ function is called when you instantiate the class.

    Args:
    - self: the instance of the class.
    - paths: list of paths to the data, which are cadence snippet files stored in np.float16 format, with dimensions (6, 273, 256).
    - labels: list of snippet labels to the cadence snippet files (1 if cadence contains a "needle", 0 if not),
      in the same order as `paths`.
    - transform: optional callable applied to a loaded cadence snippet (e.g. torch.from_numpy).
    - target_transform: optional callable applied to a snippet label.
    '''
    self.paths = paths
    self.labels = labels
    self.transform = transform
    self.target_transform = target_transform

  def __len__(self) -> int:
    '''
    The __len__ function returns the number of samples in our dataset, taken from the number of labels.
    '''
    return len(self.labels)

  def __getitem__(self,
                  idx: int) -> tuple[torch.Tensor, int]:
    '''
    The __getitem__ function loads a cadence snippet from the dataset at the given index,
    retrieves the corresponding label from the list of labels provided when initializing the class,
    calls the transform functions on them (if provided), and returns the snippet and corresponding label in a tuple.

    Without a `transform`, the snippet is returned as the np.ndarray produced by np.load;
    it is a torch.Tensor only if `transform` converts it.
    '''
    ndarray = np.load(self.paths[idx])
    label = self.labels[idx]
    # Compare against None explicitly so that a callable which happens to be falsy is still applied
    if self.transform is not None:
      ndarray = self.transform(ndarray)
    if self.target_transform is not None:
      label = self.target_transform(label)
    return ndarray, label

In [None]:
# Wrap the train and test tables in datasets.
# `transform` receives the function itself (not the result of calling it), so that the dataset
# can apply it to every snippet it loads.
train_data = KaggleDataset(paths=df_train['Filepath'].tolist(),
                           labels=df_train['Target'].tolist(),
                           transform=torch.from_numpy)

test_data = KaggleDataset(paths=df_test['Filepath'].tolist(),
                          labels=df_test['Target'].tolist(),
                          transform=torch.from_numpy)

In [None]:
# Number of samples per batch. Defined in this cell so that it runs on a fresh kernel;
# keep it in sync with the value in the "Set Hyperparameters" section.
batch_size = 64

# Shuffle the training samples on every pass over the data
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
# Keep the test samples in their original order so that predictions can be matched back to their IDs
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# Data Preprocessing

For preprocessing, we perform the following transformations:


1.   Concatenate the cadence snippets pointing-wise, i.e. from `(6, 273, 256)` to `(1638, 256)`.
2.   Downsample the time-axis by a factor of 4 by keeping every 4th row, i.e. from `(1638, 256)` to `(410, 256)`. **TODO:** since 1638 is not divisible by 4, switch to linear interpolation in the future.



In [None]:
def preprocess_cadence(ndarray, downsample_factor=4):
  '''
  Flatten a cadence snippet into a single 2D array and downsample its first axis.

  Args:
  - ndarray: array whose leading axes are merged into one, e.g. a cadence snippet of shape (6, 273, 256).
  - downsample_factor: keep every `downsample_factor`-th row of the flattened array.

  Returns:
  - 2D array; for a (6, 273, 256) input and the default factor, the shape is (410, 256).
  '''
  # Step 1: merge all leading axes, keeping the last axis as is, e.g. (6, 273, 256) -> (1638, 256)
  ndarray = ndarray.reshape((-1, ndarray.shape[-1]))
  # Step 2: strided slicing keeps rows 0, downsample_factor, 2 * downsample_factor, ...
  return ndarray[::downsample_factor, :]


for path in train_filepaths:
  ndarray = preprocess_cadence(np.load(path))
  # Rewrite
  # TODO: the preprocessed array is not stored anywhere yet, so it is discarded on the next iteration

# Set Hyperparameters

**TODO:** move this section above the "DataSet & DataLoader" section, which needs `batch_size` to construct the data loaders.

In [None]:
# Number of samples per batch; passed to the train and test DataLoader constructors
batch_size = 64

# Inference

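Using the BERT architecture from [HuggingFace](https://huggingface.co/)'s `transformers` library, we run a forward pass. Note that the model below is built from a custom configuration, so it does not load pre-trained weights.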

In [None]:
# Initializing a custom BERT configuration, where the number of attention heads must be a multiple of the hidden size
configuration = BertConfig(hidden_size=256, num_attention_heads=16)

# Initializing a model from the custom BERT configuration
model = BertModel(configuration)

# Forward pass of the model
outputs = model.forward(inputs_embeds=)

# Collect the final classification tokens
pooler_outputs = outputs.pooler_output