In [1]:
import os
import sys
import csv
import math
import random
import numpy as np
from time import time
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import (
    pack_padded_sequence,
    pad_packed_sequence
)
from torch.utils.data import (
    Dataset,
    DataLoader,
    Sampler,
    BatchSampler
)

from torchtext.vocab import build_vocab_from_iterator

from sklearn.metrics import roc_auc_score

from tokenizers import ByteLevelBPETokenizer

In [2]:
def seed_everything(seed: int):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(1234)

# avoids parallelism errors when both tokenizers and torch dataloaders use multiprocessing 
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [3]:
def read_data(path):
    with open(path, "r") as csvfile:
        train_data = list(csv.reader(csvfile))[1:]  # skip col name
        sents, lbls = [], []
        for s, l in train_data:
            sents.append(s)
            lbls.append(l)
    return sents, lbls


def apply_random_masking(seq, num_tokens):
    """
    Mask `num_tokens` as 1 (i.e. [UNK]) at random positions per sequence.
    """
    dist = torch.rand(seq.shape)
    m, _ = torch.topk(dist, num_tokens)
    return seq * (dist < m) + (dist == m) * 1


def regularized_auc(train_auc, dev_auc, threshold=0.0025):
    """
    Returns development AUC if overfitting is below threshold, otherwise 0.
    """
    return dev_auc if (train_auc - dev_auc) < threshold else 0


def save_metrics(*args, path):
    if not os.path.isfile(path):
        with open(path, "w", newline="\n") as f:
            f.write(
                ",".join(
                    [
                        "fold",
                        "epoch",
                        "train_loss",
                        "train_acc",
                        "train_auc",
                        "val_loss",
                        "val_acc",
                        "val_auc",
                    ]
                )
            )
            f.write("\n")
    if args:
        with open(path, "a", newline="\n") as f:
            f.write(",".join([str(arg) for arg in args]))
            f.write("\n")


def gelu(x):
    """
    Facebook Research implementation of the gelu activation function.
    
    For information: OpenAI GPT's gelu is slightly different
    (and gives slightly different results):
    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

def trainable_model_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def total_model_params(model):
    return sum(p.numel() for p in model.parameters())

In [4]:
class CleavageDataset(Dataset):
    def __init__(self, seq, lbl):
        self.seq = seq
        self.lbl = lbl

    def __getitem__(self, idx):
        return self.seq[idx], self.lbl[idx]

    def __len__(self):
        return len(self.lbl)


class BucketSampler(Sampler):
    def __init__(self, seqs, batch_size):

        # pair each sequence with their *tokenized* length
        indices = [(idx, len(tokenizer.encode(s).ids)) for idx, s in enumerate(seqs)]
        random.shuffle(indices)

        idx_pools = []
        # generate pseudo-random batches of (arbitrary) size batch_size * 100
        # each batch of size batch_size * 100 is sorted in itself by seq length
        for i in range(0, len(indices), batch_size * 100):
            idx_pools.extend(
                sorted(indices[i : i + batch_size * 100], key=lambda x: x[1])
            )

        # filter only indices
        self.idx_pools = [x[0] for x in idx_pools]

    def __iter__(self):
        return iter(self.idx_pools)

    def __len__(self):
        return len(self.idx_pools)


class TrainBatch:
    def __init__(self, batch):
        ordered_batch = list(zip(*batch))
        seq = torch.tensor(
            [s.ids for s in tokenizer.encode_batch(ordered_batch[0])], dtype=torch.int64
        )
        self.seq = apply_random_masking(seq, num_tokens=1)
        self.lbl = torch.tensor([int(l) for l in ordered_batch[1]], dtype=torch.float)
        self.lengths = torch.tensor([self.seq.shape[1]] * self.seq.shape[0], dtype=torch.int64)

    def pin_memory(self):
        self.seq = self.seq.pin_memory()
        self.lbl = self.lbl.pin_memory()
        return self


def train_wrapper(batch):
    return TrainBatch(batch)


class EvalBatch:
    def __init__(self, batch):
        ordered_batch = list(zip(*batch))
        self.seq = torch.tensor(
            [s.ids for s in tokenizer.encode_batch(ordered_batch[0])], dtype=torch.int64
        )
        self.lbl = torch.tensor([int(l) for l in ordered_batch[1]], dtype=torch.float)
        self.lengths = torch.tensor([self.seq.shape[1]] * self.seq.shape[0], dtype=torch.int64)

    def pin_memory(self):
        self.seq = self.seq.pin_memory()
        self.lbl = self.lbl.pin_memory()
        return self


def eval_wrapper(batch):
    return EvalBatch(batch)

In [5]:
class BiLSTM(nn.Module):
    def __init__(
        self,
        vocab_size,
        embedding_dim,
        rnn_size1,
        rnn_size2,
        hidden_size,
        dropout,
    ):
        super().__init__()

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=1
        )

        self.dropout = nn.Dropout(dropout)

        self.lstm1 = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=rnn_size1,
            bidirectional=True,
            batch_first=True,
        )

        self.lstm2 = nn.LSTM(
            input_size=2 * rnn_size1,
            hidden_size=rnn_size2,
            bidirectional=True,
            batch_first=True,
        )

        self.fc1 = nn.Linear(rnn_size2 * 2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, seq, lengths):
        # input shape: (batch_size, seq_len=10)
        embedded = self.dropout(self.embedding(seq))

        packed_embeddings = pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )

        # input shape: (batch_size, seq_len, embedding_dim)
        out, _ = self.lstm1(packed_embeddings)

        # input shape: (batch_size, seq_len, 2*rnn_size1)
        out, _ = self.lstm2(out)

        unpacked_output, _ = pad_packed_sequence(out, batch_first=True, padding_value=1)

        # input shape: (batch_size, seq_len, 2*hidden_size)
        pooled, _ = torch.max(unpacked_output, dim=1)

        # input shape; (batch_size, 2*hidden_size)
        out = self.dropout(gelu(self.fc1(pooled)))

        # input shape: (batch_size, hidden_size)
        # output shape: (batch_size)
        return self.fc2(out).squeeze()

In [6]:
def process(model, loader, criterion, optim=None):
    epoch_loss, num_correct, total = 0, 0, 0
    preds, lbls = [], []
    
    for batch in tqdm(
        loader,
        desc="Train: " if optim is not None else "Eval: ",
        file=sys.stdout,
        unit="batches"
    ):
        seq, lbl, lengths = batch.seq, batch.lbl, batch.lengths
        seq, lbl = seq.to(device), lbl.to(device)
        
        scores = model(seq, lengths)
        loss = criterion(scores, lbl)
        
        if optim is not None:
            optim.zero_grad()
            loss.backward()
            optim.step()
        
        epoch_loss += loss.item()
        num_correct += ((scores > 0) == lbl).sum().item()
        total += seq.shape[0]
        preds.extend(scores.detach().tolist())
        lbls.extend(lbl.detach().tolist())
    return epoch_loss / total, num_correct / total, roc_auc_score(lbls, preds)

In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
vocab_file = "../../params/n_term/bbpe_params/1k/vocab.json"
merge_file = "../../params/n_term/bbpe_params/1k/merges.txt"

# tokenizer serves as vocab at the same time
tokenizer = ByteLevelBPETokenizer.from_file(vocab_file, merge_file)
tokenizer.enable_padding(pad_token="<PAD>")

# load train and dev data
train_seqs, train_lbl = read_data("../../data/n_train.csv")
dev_seqs, dev_lbl = read_data("../../data/n_val.csv")

In [8]:
NUM_EPOCHS = 15
BATCH_SIZE = 512
VOCAB_SIZE = tokenizer.get_vocab_size()
EMBEDDING_DIM = 150
RNN_SIZE1 = 256
RNN_SIZE2 = 512
HIDDEN_SIZE = 128
DROPOUT = 0.5
LEARNING_RATE = 1e-4

model = BiLSTM(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_size1=RNN_SIZE1,
    rnn_size2=RNN_SIZE2,
    hidden_size=HIDDEN_SIZE,
    dropout=DROPOUT,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()

# create train and dev loader
train_data = CleavageDataset(train_seqs, train_lbl)
train_bucket_sampler = BucketSampler(train_seqs, BATCH_SIZE)
train_sampler = BatchSampler(train_bucket_sampler, BATCH_SIZE, drop_last=False)
train_loader = DataLoader(
    train_data,
    batch_sampler=train_sampler,
    collate_fn=train_wrapper,
    pin_memory=True,
    num_workers=10,
)

dev_data = CleavageDataset(dev_seqs, dev_lbl)
dev_bucket_sampler = BucketSampler(dev_seqs, BATCH_SIZE)
dev_sampler = BatchSampler(dev_bucket_sampler, BATCH_SIZE, drop_last=False)
dev_loader = DataLoader(
    dev_data,
    batch_sampler=dev_sampler,
    collate_fn=eval_wrapper,
    pin_memory=True,
    num_workers=10,
)

In [9]:
start = time()
print("Starting Training.")
logging_path = "../../params/n_term/bbpe1k_BiLSTM/results.csv"
highest_val_auc = 0
fold=None

for epoch in range(1, NUM_EPOCHS + 1):
    model.train()
    train_loss, train_acc, train_auc = process(model, train_loader, criterion, optimizer)

    model.eval()
    with torch.no_grad():
        val_loss, val_acc, val_auc = process(model, dev_loader, criterion)

    # save metrics
    save_metrics(
        fold,
        epoch,
        train_loss,
        train_acc,
        train_auc,
        val_loss,
        val_acc,
        val_auc,
        path=logging_path,
    )

    print(
        f"Training:   [Epoch {epoch:2d}, Loss: {train_loss:8.6f}, Acc: {train_acc:.4f}, AUC: {train_auc:.4f}]"
    )
    print(f"Evaluation: [Epoch {epoch:2d}, Loss: {val_loss:8.6f}, Acc: {val_acc:.4f}, AUC: {val_auc:.4f}]")

    reg_auc = regularized_auc(train_auc, val_auc, threshold=0)
    if reg_auc > highest_val_auc:
        highest_val_auc = reg_auc
        path = f"../../params/n_term/bbpe1k_BiLSTM/auc{reg_auc:.4f}_epoch{epoch}.pt"
        torch.save(model.state_dict(), path)

print("Finished Training.")
train_time = (time() - start) / 60
print(f"Cross-Validation took {train_time} minutes.")

Starting Training.
Train: 100%|███████████████████████████████████████████████████| 2243/2243 [00:13<00:00, 163.71batches/s]
Eval: 100%|██████████████████████████████████████████████████████| 281/281 [00:00<00:00, 302.18batches/s]
Training:   [Epoch  1, Loss: 0.000922, Acc: 0.8179, AUC: 0.5678]
Evaluation: [Epoch  1, Loss: 0.000888, Acc: 0.8179, AUC: 0.6510]
Train: 100%|███████████████████████████████████████████████████| 2243/2243 [00:13<00:00, 166.96batches/s]
Eval: 100%|██████████████████████████████████████████████████████| 281/281 [00:00<00:00, 315.52batches/s]
Training:   [Epoch  2, Loss: 0.000890, Acc: 0.8185, AUC: 0.6375]
Evaluation: [Epoch  2, Loss: 0.000854, Acc: 0.8198, AUC: 0.6981]
Train: 100%|███████████████████████████████████████████████████| 2243/2243 [00:13<00:00, 164.36batches/s]
Eval: 100%|██████████████████████████████████████████████████████| 281/281 [00:00<00:00, 311.98batches/s]
Training:   [Epoch  3, Loss: 0.000876, Acc: 0.8189, AUC: 0.6614]
Evaluation: [Epoch  

In [10]:
test_path = '../../data/n_test.csv'
test_seqs, test_lbls = read_data(test_path)

test_data = CleavageDataset(test_seqs, test_lbls)
test_bucket_sampler = BucketSampler(test_seqs, BATCH_SIZE)
test_sampler = BatchSampler(test_bucket_sampler, BATCH_SIZE, drop_last=False)
test_loader = DataLoader(
    test_data,
    batch_sampler=test_sampler,
    collate_fn=eval_wrapper,
    pin_memory=True,
    num_workers=10,
)

# load best model, evaluate on test set
best_model = sorted(
    [f for f in os.listdir("../../params/n_term/bbpe1k_BiLSTM/") if f.endswith(".pt")],
    reverse=True,
)[0]
print("Loaded model: ", best_model)
model.load_state_dict(torch.load('../../params/n_term/bbpe1k_BiLSTM/' + best_model))
model.eval()
test_loss, test_acc, test_auc = process(model, test_loader, criterion)
print(
    f"Test Set Performance: Loss: {test_loss:.6f}, Acc: {test_acc:.4f}, AUC: {test_auc:.4f}"
)
print(
    f"Total model params: {total_model_params(model)}, trainable model params: {trainable_model_params(model)}"
)

Loaded model:  auc0.7635_epoch15.pt
Eval: 100%|██████████████████████████████████████████████████████| 281/281 [00:00<00:00, 303.57batches/s]
Test Set Performance: Loss: 0.000788, Acc: 0.8288, AUC: 0.7656
Total model params: 5319409, trainable model params: 5319409
