In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from tqdm.auto import tqdm
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from glob import glob

from sklearn.metrics import average_precision_score


import warnings


warnings.filterwarnings("ignore")

In [2]:
CUDA_DEV = "cuda:1"
NUM_TAGS = 256

In [3]:
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

In [4]:
track_idx2embeds = {}
for fn in tqdm(glob('../data/track_embeddings/*')):
    track_idx = int(fn.split('/')[3].split('.')[0])
    embeds = np.load(fn)
    track_idx2embeds[track_idx] = embeds

  0%|          | 0/76714 [00:00<?, ?it/s]

In [5]:
class TaggingDataset(Dataset):
    def __init__(self, df, testing=False):
        self.df = df
        self.testing = testing
        self.crop_size = 81
        self.stage = "train" if not testing else "val"
        
    def __len__(self):
        return self.df.shape[0]

    def __process_features(self, x: torch.Tensor):
        x = torch.from_numpy(x)
        x = x.permute(1, 0)
        x_len = x.shape[-1]
        if x_len > self.crop_size:
            start = np.random.randint(0, x_len - self.crop_size)
            x = x[..., start : start + self.crop_size]
        else:
            if self.stage == "train":
                i = (
                    np.random.randint(0, self.crop_size - x_len)
                    if self.crop_size != x_len
                    else 0
                )
            else:
                i = (self.crop_size - x_len) // 2
            pad_patern = (i, self.crop_size - x_len - i)
            x = torch.nn.functional.pad(x, pad_patern, "constant").detach()
        # x /= x.max()
        # x = (x - x.mean()) / x.std()
        x = x.permute(1, 0)
        return x

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        track_idx = row.track
        embeds = track_idx2embeds[track_idx]
        embeds = self.__process_features(embeds)
        if self.testing:
            return track_idx, embeds
        tags = [int(x) for x in row.tags.split(',')]
        target = np.zeros(NUM_TAGS)
        target[tags] = 1
        return track_idx, embeds, target


In [6]:
train_dataset = TaggingDataset(df_train)
test_dataset = TaggingDataset(df_test, True)

In [7]:
# class Network(nn.Module):
#     def __init__(
#         self,
#         num_classes = NUM_TAGS,
#         input_dim = 768,
#         hidden_dim = 512
#     ):
#         super().__init__()
#         self.num_classes = num_classes
#         self.bn = nn.LayerNorm(hidden_dim)
#         self.projector =  nn.Linear(input_dim, hidden_dim)
#         self.lin = nn.Sequential(
#             nn.Linear(hidden_dim, hidden_dim),
#             nn.ReLU(),
#             nn.Linear(hidden_dim, hidden_dim),
#             nn.LayerNorm(hidden_dim)
#         )
#         self.fc = nn.Linear(hidden_dim, num_classes)
        

#     def forward(self, embeds):
#         x = [self.projector(x) for x in embeds]  # 768 -> 512
#         x = [v.mean(0).unsqueeze(0) for v in x]
#         x = self.bn(torch.cat(x, dim = 0))
#         x = self.lin(x)
#         outs = self.fc(x)
#         return outs


class Network(nn.Module):
    def __init__(
        self,
        num_classes = NUM_TAGS,
        input_dim = 768,
        hidden_dim = 512
    ):
        super().__init__()
        self.num_classes = num_classes

        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                input_dim, 8, dim_feedforward=2048, dropout=0.2, batch_first=True
            ),
            num_layers=3
        )
        self.pooling = nn.AdaptiveAvgPool2d((1, input_dim))
        self.projector =  nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, x):
        x = self.transformer(x)
        x = self.pooling(x).squeeze()
        outs = self.projector(x)
        return outs

In [8]:
def train_epoch(model, loader, criterion, optimizer):
    model.train()
    running_loss = None
    alpha = 0.8
    score = []
    for iteration,data in enumerate(loader):
        optimizer.zero_grad()
        track_idxs, embeds, target = data
        # embeds = [x.to(CUDA_DEV) for x in embeds]
        embeds = embeds.to(CUDA_DEV)
        target = target.to(CUDA_DEV)
        pred_logits = model(embeds)
        pred_probs = torch.sigmoid(pred_logits)
        ce_loss = criterion(pred_logits, target)
        ce_loss.backward()
        optimizer.step()
        score.append(average_precision_score(target.cpu().numpy(), pred_probs.detach().cpu().numpy()))
        
        if running_loss is None:
            running_loss = ce_loss.item()
        else:
            running_loss = running_loss * ce_loss.item() + (1 - alpha) * ce_loss.item()
        if iteration % 100 == 0:
            print('   {} batch {} loss {}'.format(
                datetime.now(), iteration + 1, running_loss
            ))
    print(f"Train AP: {np.mean(score):.6f}")

In [9]:
def predict(model, loader):
    model.eval()
    track_idxs = []
    predictions = []
    with torch.no_grad():
        for data in tqdm(loader):
            track_idx, embeds = data
            # embeds = [x.to(CUDA_DEV) for x in embeds]
            embeds = embeds.to(CUDA_DEV)
            pred_logits = model(embeds)
            pred_probs = torch.sigmoid(pred_logits)
            pred_probs = torch.round(pred_probs, decimals=4)
            predictions.append(pred_probs.cpu().numpy())
            track_idxs.append(track_idx.numpy())
    predictions = np.vstack(predictions)
    track_idxs = np.vstack(track_idxs).ravel()
    return track_idxs, predictions


In [10]:
def collate_fn(b):
    track_idxs = torch.from_numpy(np.vstack([x[0] for x in b]))
    targets = torch.from_numpy(np.vstack([x[2] for x in b]))
    # embeds = [torch.from_numpy(x[1]) for x in b]
    embeds = torch.stack([x[1] for x in b])
    return track_idxs, embeds, targets

def collate_fn_test(b):
    track_idxs = torch.from_numpy(np.vstack([x[0] for x in b]))
    # embeds = [torch.from_numpy(x[1]) for x in b]
    embeds = torch.stack([x[1] for x in b])
    return track_idxs, embeds

In [11]:
train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=4, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=4, collate_fn=collate_fn_test)

In [13]:
model = Network()
criterion = nn.BCEWithLogitsLoss()

epochs = 5
model = model.to(CUDA_DEV)
criterion = criterion.to(CUDA_DEV)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

In [14]:
for epoch in tqdm(range(epochs)):
    train_epoch(model, train_dataloader, criterion, optimizer)


  0%|          | 0/5 [00:00<?, ?it/s]

   2023-11-01 01:52:31.948569 batch 1 loss 0.7539947039454633
   2023-11-01 01:53:26.068232 batch 101 loss 0.014536299839564733
Train AP: 0.037980
   2023-11-01 01:54:22.215248 batch 1 loss 0.06461087436876412
   2023-11-01 01:55:17.336835 batch 101 loss 0.013303131295626406
Train AP: 0.126772
   2023-11-01 01:56:13.674642 batch 1 loss 0.05495387297082388
   2023-11-01 01:57:09.058744 batch 101 loss 0.011191709961157611
Train AP: 0.204586
   2023-11-01 01:58:05.644205 batch 1 loss 0.050117459448933754
   2023-11-01 01:59:01.435471 batch 101 loss 0.010728354985651313
Train AP: 0.237603
   2023-11-01 01:59:58.522706 batch 1 loss 0.04701385704321548
   2023-11-01 02:00:54.019373 batch 101 loss 0.010098447817056067
Train AP: 0.264571


In [None]:
torch.save(model.state_dict(), "transformer_encoder.pt")

In [21]:
track_idxs, predictions = predict(model, test_dataloader)

  0%|          | 0/100 [00:00<?, ?it/s]

In [22]:
predictions_df = pd.DataFrame([
    {'track': track, 'prediction': ','.join([str(p) for p in probs])}
    for track, probs in zip(track_idxs, predictions)
])

In [23]:
predictions_df.to_csv('prediction_mlp.csv', index=False)