In [None]:
# decoupling
# https://proceedings.neurips.cc/paper/2017/file/58d4d1e7b1e97b258c9ed0b37e02d087-Paper.pdf

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
cd /content/drive/MyDrive

/content/drive/MyDrive


In [None]:
ls

coteaching_BiLSTM.ipynb  [0m[01;34mdata[0m/  [01;34mparams[0m/  [01;34m__pycache__[0m/  week10ma.ipynb


In [6]:
import os
import sys
import csv
import pickle
import random
import numpy as np
from time import time
from tqdm import tqdm

from typing import List, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from torchtext.vocab import build_vocab_from_iterator

In [7]:
def seed_everything(seed: int):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(1234)

In [8]:
def read_data(path):
    with open(path, 'r') as csvfile:
        train_data = list(csv.reader(csvfile))[1:] # skip col name
        sents, lbls = [], []
        for s, l in train_data:
            sents.append(s)
            lbls.append(l)
    return sents, lbls

# number of trainable parameters in model
def get_total_model_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [9]:
class CleavageDataset(Dataset):
    def __init__(self, seq, lbl):
        self.seq = seq
        self.lbl = lbl
    
    def __getitem__(self, idx):
        return self.seq[idx], self.lbl[idx]
    
    def __len__(self):
        return len(self.lbl)
    
class CleavageBatch:
    def __init__(self, batch: List[Tuple[str, str]]):
        ordered_batch = list(zip(*batch))
        self.seq = torch.tensor([encode_text(seq) for seq in ordered_batch[0]], dtype=torch.int64)
        self.lbl = torch.tensor([int(l) for l in ordered_batch[1]], dtype=torch.float)
        
    def pin_memory(self):
        self.seq = self.seq.pin_memory()
        self.lbl = self.lbl.pin_memory()
        return self
    
def collate_wrapper(batch):
    return CleavageBatch(batch)

In [10]:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, rnn_size, hidden_size, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )
        
        self.dropout=nn.Dropout(dropout)
        
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=rnn_size,
            bidirectional=True,
            batch_first=True,
        )
        
        self.fc1 = nn.Linear(rnn_size * 2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)
        
    def forward(self, seq):
        # input shape: (batch_size, seq_len=10)
        embedded = self.dropout(self.embedding(seq))
        
        # input shape: (batch_size, seq_len, embedding_dim)
        out, _ = self.lstm(embedded)
        
        # input shape: (batch_size, seq_len, 2*hidden_size)
        pooled = torch.mean(out, dim=1)
        
        # input shape: (batch_size, 2*hidden_size)
        out = self.dropout(F.relu(self.fc1(pooled)))
        
        # input shape: (batch_size, hidden_size)
        # output shape: (batch_size)
        out = self.fc2(out).squeeze()
        return out 

In [11]:
def train(model1, model2, loader, optim):
    epoch_loss1, num_correct1, total = 0, 0, 0
    epoch_loss2, num_correct2 = 0, 0
    criterion = nn.BCEWithLogitsLoss(reduce=False)

    for batch in tqdm(
        loader,
        desc="Train: ",
        file=sys.stdout,
        unit="batches"
    ):
        seq, lbl = batch.seq, batch.lbl
        seq, lbl = seq.to(device), lbl.to(device)
        
        scores1 = model1(seq)
        #_, pred1 = torch.max(scores1)
        pred1 = scores1 > 0

        scores2 = model2(seq)
        #_, pred2 = torch.max(scores2)
        pred2 = scores2 > 0

        inds = torch.where(pred1 != pred2)
        loss1 = criterion(scores1[inds] , lbl[inds]).sum()
        loss2 = criterion(scores2[inds] , lbl[inds]).sum()
        
        optim.zero_grad()
        loss1.backward()
        loss2.backward()
        optim.step()
        
        epoch_loss1 += loss1.item()
        epoch_loss2 += loss2.item()
        num_correct1 += ((scores1 > 0) == lbl).sum()
        num_correct2 += ((scores2 > 0) == lbl).sum()
        total += len(seq)
    return epoch_loss1 / total, epoch_loss2 / total, num_correct1 / total, num_correct2 / total

In [12]:
def evaluate(model1, model2, loader):
    epoch_loss1, num_correct1, total = 0, 0, 0
    epoch_loss2, num_correct2 = 0, 0


    for batch in tqdm(
        loader,
        desc="Eval: ",
        file=sys.stdout,
        unit="batches"
    ):
        seq, lbl = batch.seq, batch.lbl
        seq, lbl = seq.to(device), lbl.to(device)
        
        scores1 = model1(seq)
        #_, pred1 = torch.max(scores1)
        pred1 = scores1 > 0

        scores2 = model2(seq)
        #_, pred2 = torch.max(scores2)
        pred2 = scores2 > 0

        inds = torch.where(pred1 != pred2)
        loss1 = criterion(scores1[inds] , lbl[inds]).sum()
        loss2 = criterion(scores2[inds] , lbl[inds]).sum()
        
        epoch_loss1 += loss1.item()
        epoch_loss2 += loss2.item()
        num_correct1 += ((scores1 > 0) == lbl).sum()
        num_correct2 += ((scores2 > 0) == lbl).sum()
        total += len(seq)
    return epoch_loss1 / total, epoch_loss2 / total, num_correct1 / total, num_correct2 / total

In [13]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# load train and dev data
train_seqs, train_lbl = read_data('data/n_train.csv')
dev_seqs, dev_lbl = read_data('data/n_val.csv')

# create vocab from train seqs
vocab = build_vocab_from_iterator(train_seqs, specials=['<UNK>'])
vocab.set_default_index(vocab['<UNK>'])
encode_text = lambda x: vocab(list(x))

In [14]:
NUM_EPOCHS = 10
BATCH_SIZE = 512
VOCAB_SIZE = len(vocab)
EMBEDDING_DIM = 100
RNN_SIZE = 512
HIDDEN_SIZE = 128
DROPOUT = 0.5
LEARNING_RATE = 1e-4



model1 = BiLSTM(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_size=RNN_SIZE,
    hidden_size=HIDDEN_SIZE,
    dropout=DROPOUT
).to(device)

optimizer = optim.Adam(model1.parameters(), lr=LEARNING_RATE)

model2 = BiLSTM(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_size=RNN_SIZE,
    hidden_size=HIDDEN_SIZE,
    dropout=DROPOUT
).to(device)

optimizer2 = optim.Adam(model1.parameters(), lr=LEARNING_RATE)

criterion = nn.BCEWithLogitsLoss()

# create train and dev loader
train_data = CleavageDataset(train_seqs, train_lbl)
train_loader = DataLoader(train_data, batch_size = BATCH_SIZE, shuffle=True, collate_fn=collate_wrapper, pin_memory=True, num_workers=10)

dev_data = CleavageDataset(dev_seqs, dev_lbl)
dev_loader = DataLoader(dev_data, batch_size = BATCH_SIZE, shuffle=True, collate_fn=collate_wrapper, pin_memory=True, num_workers=10)

print(f"Total trainable model1 parameters: {get_total_model_params(model1):,}")
print(f"Total trainable model2 parameters: {get_total_model_params(model1):,}")


Total trainable model1 parameters: 2,648,373
Total trainable model2 parameters: 2,648,373


  cpuset_checked))


In [15]:
start = time()
print("Starting Training.")
highest_val_acc1, highest_val_acc2 = 0,0
train_losses1, train_accuracies1= [], []
train_losses2, train_accuracies2= [], []
val_losses1, val_accuracies1 = [], []
val_losses2, val_accuracies2 = [], []

for epoch in range(1, NUM_EPOCHS + 1):
    model1.train()
    model2.train()
    train_loss1, train_loss2, train_acc1, train_acc2 = train(model1, model2, train_loader, optimizer)
    
    model1.eval()
    model2.eval()
    with torch.no_grad():
        val_loss1, val_loss2, val_acc1, val_acc2 = evaluate(model1, model2, dev_loader)
        
    # save current acc, loss model1
    train_losses1.append((epoch, train_loss1))
    train_accuracies1.append((epoch, train_acc1))
    val_losses1.append((epoch, val_loss1))
    val_accuracies1.append((epoch, val_acc1))
    
    if val_acc1 > highest_val_acc1:
        highest_val_acc1 = val_acc1
        path = f"params/model1_acc{val_acc1:.4f}_epoch{epoch}.pt"
        torch.save(model1.state_dict(), path)


    print(f"Model1 Training:   [Epoch {epoch:2d}, Loss: {train_loss1:8.4f}, Acc: {train_acc1:.4f}]")
    print(f"Model1 Evaluation: [Epoch {epoch:2d}, Loss: {val_loss1:8.4f}, Acc: {val_acc1:.4f}]")


    # save current acc, loss model2
    train_losses2.append((epoch, train_loss2))
    train_accuracies2.append((epoch, train_acc2))
    val_losses2.append((epoch, val_loss2))
    val_accuracies2.append((epoch, val_acc2))

    if val_acc2 > highest_val_acc2:
        highest_val_acc2 = val_acc2
        path = f"params/model2_acc{val_acc2:.4f}_epoch{epoch}.pt"
        torch.save(model2.state_dict(), path)

    print(f"Model2 Training:   [Epoch {epoch:2d}, Loss: {train_loss2:8.4f}, Acc: {train_acc2:.4f}]")
    print(f"Model2 Evaluation: [Epoch {epoch:2d}, Loss: {val_loss2:8.4f}, Acc: {val_acc2:.4f}]")
    
print("Finished Training.")
train_time = (time() - start) / 60
print(f"Training took {train_time} minutes.")



Starting Training.
Train: 100%|██████████| 2236/2236 [01:50<00:00, 20.31batches/s]
Eval: 100%|██████████| 280/280 [00:05<00:00, 52.83batches/s]
Model1 Training:   [Epoch  1, Loss:   0.1081, Acc: 0.5834]
Model1 Evaluation: [Epoch  1, Loss:   0.0013, Acc: 0.5997]
Model2 Training:   [Epoch  1, Loss:   0.1101, Acc: 0.5651]
Model2 Evaluation: [Epoch  1, Loss:   0.0014, Acc: 0.5795]
Train: 100%|██████████| 2236/2236 [01:51<00:00, 19.97batches/s]
Eval: 100%|██████████| 280/280 [00:05<00:00, 52.20batches/s]
Model1 Training:   [Epoch  2, Loss:   0.2104, Acc: 0.6198]
Model1 Evaluation: [Epoch  2, Loss:   0.0013, Acc: 0.6740]
Model2 Training:   [Epoch  2, Loss:   0.2189, Acc: 0.5649]
Model2 Evaluation: [Epoch  2, Loss:   0.0014, Acc: 0.5795]
Train: 100%|██████████| 2236/2236 [01:53<00:00, 19.73batches/s]
Eval: 100%|██████████| 280/280 [00:05<00:00, 52.11batches/s]
Model1 Training:   [Epoch  3, Loss:   0.2551, Acc: 0.6779]
Model1 Evaluation: [Epoch  3, Loss:   0.0013, Acc: 0.6740]
Model2 Training:

In [None]:
# save training stats
lsts = [train_losses, train_accuracies, val_losses, val_accuracies, train_time]
names = [
    "train_losses",
    "train_accuracies",
    "val_losses",
    "val_accuracies",
    "train_time",
]
to_save = dict()
for name, lst in zip(names, lsts):
    to_save[name] = lst

with open(f"../params/n_term/quadBiLSTM/metrics.pkl", "wb") as f:
    pickle.dump(to_save, f, pickle.HIGHEST_PROTOCOL)

print("Finished Saving Details.")