In [1]:
import os
import sys
import csv
import pickle
import random
import math
import numpy as np
from time import time
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from torchtext.vocab import build_vocab_from_iterator

from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

In [2]:
def seed_everything(seed: int):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(1234)

In [3]:
def read_data(path):
    with open(path, "r") as csvfile:
        train_data = list(csv.reader(csvfile))[1:]  # skip col name
        sents, lbls = [], []
        for s, l in train_data:
            sents.append(s)
            lbls.append(l)
    return sents, lbls


def apply_random_masking(seq, num_tokens):
    """
    Mask `num_tokens` as 0 at random positions per sequence.
    """
    dist = torch.rand(seq.shape)
    m, _ = torch.topk(dist, num_tokens)
    return seq * (dist < m)


def regularized_auc(train_auc, dev_auc, threshold=0.0025):
    """
    Returns development AUC if overfitting is below threshold, otherwise 0.
    """
    return dev_auc if (train_auc - dev_auc) < threshold else 0


def save_metrics(*args, path):
    if not os.path.isfile(path):
        with open(path, "w", newline="\n") as f:
            f.write(
                ",".join(
                    [
                        "fold",
                        "epoch",
                        "train_loss",
                        "train_acc",
                        "train_auc",
                        "val_loss",
                        "val_acc",
                        "val_auc",
                    ]
                )
            )
            f.write("\n")
    if args:
        with open(path, "a", newline="\n") as f:
            f.write(",".join([str(arg) for arg in args]))
            f.write("\n")


def gelu(x):
    """
    Facebook Research implementation of the gelu activation function.
    
    For information: OpenAI GPT's gelu is slightly different
    (and gives slightly different results):
    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def trainable_model_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def total_model_params(model):
    return sum(p.numel() for p in model.parameters())

In [4]:
class CleavageDataset(Dataset):
    def __init__(self, seq, lbl):
        self.seq = seq
        self.lbl = lbl
    
    def __getitem__(self, idx):
        return self.seq[idx], self.lbl[idx]
    
    def __len__(self):
        return len(self.lbl)
    
class TrainBatch:
    def __init__(self, batch):
        ordered_batch = list(zip(*batch))
        seq = torch.tensor([encode_text(seq) for seq in ordered_batch[0]], dtype=torch.int64)
        self.seq = apply_random_masking(seq, num_tokens=1)
        self.lbl = torch.tensor([int(l) for l in ordered_batch[1]], dtype=torch.long)
        
    def pin_memory(self):
        self.seq = self.seq.pin_memory()
        self.lbl = self.lbl.pin_memory()
        return self
    
def train_wrapper(batch):
    return TrainBatch(batch)


class EvalBatch:
    def __init__(self, batch):
        ordered_batch = list(zip(*batch))
        self.seq = torch.tensor([encode_text(seq) for seq in ordered_batch[0]], dtype=torch.int64)
        self.lbl = torch.tensor([int(l) for l in ordered_batch[1]], dtype=torch.long)
        
    def pin_memory(self):
        self.seq = self.seq.pin_memory()
        self.lbl = self.lbl.pin_memory()
        return self
    
def eval_wrapper(batch):
    return EvalBatch(batch)

In [5]:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, rnn_size1, rnn_size2, hidden_size, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )
        
        self.dropout=nn.Dropout(dropout)
        
        self.lstm1 = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=rnn_size1,
            bidirectional=True,
            batch_first=True,
        )

        self.lstm2 = nn.LSTM(
            input_size=2 * rnn_size1,
            hidden_size=rnn_size2,
            bidirectional=True,
            batch_first=True,
        )
        
        self.fc1 = nn.Linear(rnn_size2 * 2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 2)
        
    def forward(self, seq):
        # input shape: (batch_size, seq_len=10)
        embedded = self.dropout(self.embedding(seq))
        
        # input shape: (batch_size, seq_len, embedding_dim)
        out, _ = self.lstm1(embedded)
        
        # input shape: (batch_size, seq_len, 2*rnn_size1)
        out, _ = self.lstm2(out)
        
        # input shape: (batch_size, seq_len, 2*hidden_size)
        pooled, _ = torch.max(out, dim=1)
        
        # input shape: (batch_size, 2*hidden_size)
        out = self.dropout(gelu(self.fc1(pooled)))
        
        # input shape: (batch_size, hidden_size)
        # output shape: (batch_size, 2)
        return self.fc2(out)

In [6]:
class NoiseAdaptation(nn.Module):
    def __init__(self, theta, k):
        super().__init__()
        self.theta = nn.Linear(k, k, bias=False)
        self.theta.weight.data = theta
        self.eye = torch.eye(k).to(device)
        
    def forward(self, x):
        theta = self.theta(self.eye)
        theta = torch.softmax(theta, dim=0)
        out = x @ theta
        return out

In [7]:
def process(model, loader, criterion, optim=None, conf=None):
    epoch_loss, num_correct, total = 0, 0, 0
    pos_preds, preds, lbls = [], [], []

    for batch in tqdm(
        loader,
        desc="Train: " if optim is not None else "Eval: ",
        file=sys.stdout,
        unit="batches",
    ):
        seq, lbl = batch.seq, batch.lbl
        seq, lbl = seq.to(device), lbl.to(device)

        scores = model(seq)
        loss = criterion(scores, lbl)

        if optim is not None:
            optim.zero_grad()
            loss.backward()
            optim.step()

        pred = scores.argmax(dim=1)
        epoch_loss += loss.item()
        num_correct += (pred == lbl).sum().item()
        total += seq.shape[0]
        preds.extend(pred.detach().tolist())
        pos_preds.extend(scores[:, 1].detach().tolist())
        lbls.extend(lbl.detach().tolist())
        
    if conf is not None:   
        return confusion_matrix(lbls, preds)
    return epoch_loss / total, num_correct / total, roc_auc_score(lbls, pos_preds)

In [8]:
def train_hybrid(
    model, noisemodel, optimizer, noise_optimizer, criterion, beta, loader
):
    epoch_loss, model_loss, noise_loss, num_correct, total = 0, 0, 0, 0, 0
    preds, lbls = [], []

    for batch in tqdm(loader, desc="Hybrid Train: ", file=sys.stdout, unit="batches"):

        seq, lbl = batch.seq, batch.lbl
        seq, lbl = seq.to(device), lbl.to(device)

        scores = model(seq)
        noise_scores = noisemodel(scores)

        model_loss = criterion(scores, lbl)
        noise_loss = criterion(noise_scores, lbl)

        loss = beta * noise_loss + (1 - beta) * model_loss

        optimizer.zero_grad()
        noise_optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        noise_optimizer.step()

        epoch_loss += loss.item()
        model_loss += model_loss.item()
        noise_loss += noise_loss.item()
        num_correct += (noise_scores.argmax(dim=1)  == lbl).sum().item()
        total += seq.shape[0]
        preds.extend(noise_scores[:, 1].detach().tolist())
        lbls.extend(lbl.detach().tolist())
        
    return (
        epoch_loss / total,
        model_loss / total,
        noise_loss / total,
        num_correct / total,
        roc_auc_score(lbls, preds),
    )

In [9]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# load train and dev data
train_seqs, train_lbl = read_data('../../data/n_train.csv')
dev_seqs, dev_lbl = read_data('../../data/n_val.csv')

# create vocab from train seqs
vocab = build_vocab_from_iterator(train_seqs, specials=['<UNK>'])
vocab.set_default_index(vocab['<UNK>'])
encode_text = lambda x: vocab(list(x))

In [10]:
NUM_EPOCHS = 15
NUM_WARMUP = 1
NUM_CLASSES = 2
BATCH_SIZE = 512
BETA=0.8
VOCAB_SIZE = len(vocab)
EMBEDDING_DIM = 76
RNN_SIZE1 = 252
RNN_SIZE2 = 518
HIDDEN_SIZE = 179
DROPOUT = 0.5
LEARNING_RATE = 3e-4

model = BiLSTM(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_size1=RNN_SIZE1,
    rnn_size2=RNN_SIZE2,
    hidden_size=HIDDEN_SIZE,
    dropout=DROPOUT
).to(device)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

# create train and dev loader
train_data = CleavageDataset(train_seqs, train_lbl)
train_loader = DataLoader(train_data, batch_size = BATCH_SIZE, shuffle=True, collate_fn=train_wrapper, pin_memory=True, num_workers=10)

dev_data = CleavageDataset(dev_seqs, dev_lbl)
dev_loader = DataLoader(dev_data, batch_size = BATCH_SIZE, shuffle=True, collate_fn=eval_wrapper, pin_memory=True, num_workers=10)

In [11]:
start = time()
print("Starting Training.")
highest_val_auc = 0

for epoch in range(1, NUM_EPOCHS + 1):
    if epoch < NUM_WARMUP + 1:
        model.train()
        train_loss, train_acc, train_auc = process(model, train_loader, criterion, optimizer)

        model.eval()
        with torch.no_grad():
            val_loss, val_acc, val_auc = process(model, dev_loader, criterion)
            
        print(
        f"Warmup Training:   [Epoch {epoch:2d}, Loss: {train_loss:8.6f}, Acc: {train_acc:.4f}, AUC: {train_auc:.4f}]"
        )
        print(f"Warmup Evaluation: [Epoch {epoch:2d}, Loss: {val_loss:8.6f}, Acc: {val_acc:.4f}, AUC: {val_auc:.4f}]")
        
        if epoch == NUM_WARMUP:
            # get conf matrix based on predictions on train data
            model.eval()
            with torch.no_grad():
                conf = process(model, train_loader, criterion, conf=True)
            theta = conf / conf.sum(axis=1, keepdims=True)
            theta = torch.from_numpy(np.log(theta + 1e-8)).to(torch.float) # avoid zeros with +1e-8

            # create noisemodel
            noisemodel = NoiseAdaptation(theta=theta, k=NUM_CLASSES).to(device)
            noise_optimizer = optim.Adam(noisemodel.parameters(), lr=LEARNING_RATE)
            print(f'Created NoiseModel in epoch {epoch}')

    else:
        # hybrid training
        model.train()
        noisemodel.train()
        hy_tr_loss, model_loss, noise_loss, hy_tr_acc, hy_tr_auc = train_hybrid(
            model=model,
            noisemodel=noisemodel,
            optimizer=optimizer,
            noise_optimizer=noise_optimizer,
            criterion=criterion,
            beta=BETA,
            loader=train_loader
        )

        model.eval()
        with torch.no_grad():
            val_loss, val_acc, val_auc = process(model, dev_loader, criterion)

        print(
            f"Hy-Training: [Epoch {epoch:2d}, Hy-Loss: {hy_tr_loss:.6f},\
            Model-Loss: {model_loss:.6f}, Noise-Loss: {noise_loss:.6f},\
            Acc: {hy_tr_acc:.4f}, AUC: {hy_tr_auc:.4f}]"
        )
        print(f"Evaluation:  [Epoch {epoch:2d}, Loss: {val_loss:8.6f}, Acc: {val_acc:.4f}, AUC: {val_auc:.4f}]")
        
        reg_auc = regularized_auc(hy_tr_auc, val_auc, threshold=0)
        if reg_auc > highest_val_auc:
            highest_val_auc = reg_auc
            path = f"../../params/n_term/BiLSTM_noise_layer/auc{reg_auc:.4f}_epoch{epoch}.pt"
            torch.save(model.state_dict(), path)

print("Finished Training.")
train_time = (time() - start) / 60
print(f"Training took {train_time} minutes.")

Starting Training.
Train: 100%|███████████████████████████████████████████████████| 2243/2243 [00:21<00:00, 105.08batches/s]
Eval: 100%|██████████████████████████████████████████████████████| 281/281 [00:01<00:00, 262.31batches/s]
Warmup Training:   [Epoch  1, Loss: 0.000836, Acc: 0.8216, AUC: 0.7085]
Warmup Evaluation: [Epoch  1, Loss: 0.000778, Acc: 0.8309, AUC: 0.7749]
Eval: 100%|████████████████████████████████████████████████████| 2243/2243 [00:06<00:00, 337.61batches/s]
Created NoiseModel in epoch 1
Hybrid Train: 100%|████████████████████████████████████████████| 2243/2243 [00:21<00:00, 104.77batches/s]
Eval: 100%|██████████████████████████████████████████████████████| 281/281 [00:01<00:00, 259.39batches/s]
Hy-Training: [Epoch  2, Hy-Loss: 0.000914,            Model-Loss: 0.000001, Noise-Loss: 0.000001,            Acc: 0.8252, AUC: 0.2561]
Evaluation:  [Epoch  2, Loss: 0.000919, Acc: 0.8315, AUC: 0.7807]
Hybrid Train: 100%|█████████████████████████████████████████████| 2243/2243 

In [12]:
test_path = '../../data/n_test.csv'
test_seqs, test_lbls = read_data(test_path)

test_data = CleavageDataset(test_seqs, test_lbls)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, collate_fn=eval_wrapper, pin_memory=True, num_workers=10)

# load best model, evaluate on test set
best_model = sorted(
    [f for f in os.listdir("../../params/n_term/BiLSTM_noise_layer/") if f.endswith(".pt")],
    reverse=True,
)[0]
print("Loaded model: ", best_model)
model.load_state_dict(torch.load('../../params/n_term/BiLSTM_noise_layer/' + best_model))
model.eval()
with torch.no_grad():
    test_loss, test_acc, test_auc = process(model, test_loader, criterion)
print(
    f"Test Set Performance: Loss: {test_loss:.6f}, Acc: {test_acc:.4f}, AUC: {test_auc:.4f}"
)
print(
    f"Total model params: {total_model_params(model)}, trainable model params: {trainable_model_params(model)}"
)

Loaded model:  auc0.7925_epoch12.pt
Eval: 100%|██████████████████████████████████████████████████████| 281/281 [00:01<00:00, 273.43batches/s]
Test Set Performance: Loss: 0.000754, Acc: 0.8343, AUC: 0.7948
Total model params: 5096315, trainable model params: 5096315
