In [3]:
!pip install bio-embeddings[seqvec]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bio-embeddings[seqvec]
  Downloading bio_embeddings-0.2.2-py3-none-any.whl (105 kB)
[K     |████████████████████████████████| 105 kB 7.2 MB/s 
Collecting scikit-learn<0.25.0,>=0.24.0
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 85.8 MB/s 
[?25hCollecting python-slugify<6.0.0,>=5.0.2
  Downloading python_slugify-5.0.2-py2.py3-none-any.whl (6.7 kB)
Collecting umap-learn<0.6.0,>=0.5.1
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 9.4 MB/s 
[?25hCollecting biopython<2.0,>=1.79
  Downloading biopython-1.79-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 46.0 MB/s 
Collecting torch<=1.10.0,>=1.8.0
  Downloading torch-1.10.0-cp37-cp37m-manylinux1_x86_64.whl (881.9 MB)
[K     |████████

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import sys
import csv
import pickle
import random
import numpy as np
from time import time
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from allennlp.commands.elmo import ElmoEmbedder

In [3]:
def seed_everything(seed: int):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(1234)

In [4]:
def read_data(path):
    with open(path, 'r') as csvfile:
        train_data = list(csv.reader(csvfile))[1:] # skip col name
        sents, lbls = [], []
        for s, l in train_data:
            sents.append(s)
            lbls.append(l)
    return sents, lbls

# number of trainable parameters in model
def get_total_model_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [5]:
class CleavageDataset(Dataset):
    def __init__(self, seq, lbl):
        self.seq = seq
        self.lbl = lbl
    
    def __getitem__(self, idx):
        return self.seq[idx], self.lbl[idx]
    
    def __len__(self):
        return len(self.lbl)    
    
def collate_batch(batch):
    ordered_batch = list(zip(*batch))
    seq = [list(s) for s in ordered_batch[0]] # is still a string
    lbl = torch.tensor([int(l) for l in ordered_batch[1]], dtype=torch.float)
    return seq, lbl

In [6]:
class BiLSTM(nn.Module):
    def __init__(self, embedding_dim, rnn_size, hidden_size, dropout):
        super().__init__()

        self.dropout=nn.Dropout(dropout)
        
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=rnn_size,
            bidirectional=True,
            batch_first=True,
        )
        
        self.fc1 = nn.Linear(rnn_size * 2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)
        
    def forward(self, seq):
        # input is already embedded by ELMo
        # input shape: (batch_size, seq_len=10, embedding_dim)
        embedded = self.dropout(seq)

        # input shape: (batch_size, seq_len, embedding_dim)
        out, _ = self.lstm(embedded)
        
        # input shape: (batch_size, seq_len, 2*hidden_size)
        pooled = torch.mean(out, dim=1)
        
        # input shape: (batch_size, 2*hidden_size)
        out = self.dropout(F.relu(self.fc1(pooled)))
        
        # input shape: (batch_size, hidden_size)
        # output shape: (batch_size)
        out = self.fc2(out).squeeze()
        return out 

In [7]:
def process(model, loader, criterion, optim=None):
    epoch_loss, num_correct, total = 0, 0, 0
    
    for seq, lbl in tqdm(
        loader,
        desc="Train: " if optim is not None else "Eval: ",
        file=sys.stdout,
        unit="batches"
    ):
        seq, _ = embedder.batch_to_embeddings(seq) # is already on GPU
        seq = seq.sum(dim=1)
        lbl = lbl.to(device)
        
        scores = model(seq)
        loss = criterion(scores, lbl)
        
        if optim is not None:
            optim.zero_grad()
            loss.backward()
            optim.step()
        
        epoch_loss += loss.item()
        num_correct += ((scores > 0) == lbl).sum()
        total += len(seq)
    return epoch_loss / total, num_correct / total

In [8]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


embedder = ElmoEmbedder(
    options_file='./drive/MyDrive/data/seqvec/options.json',
    weight_file='./drive/MyDrive/data/seqvec/weights.hdf5',
    cuda_device=0 # use colab gpu
)

# load train and dev data
train_seqs, train_lbl = read_data('./drive/MyDrive/data/n_train.csv')
dev_seqs, dev_lbl = read_data('./drive/MyDrive/data/n_val.csv')

In [9]:
NUM_EPOCHS = 10
BATCH_SIZE = 512
EMBEDDING_DIM = 1024 # given by ELMo
RNN_SIZE = 512
HIDDEN_SIZE = 128
DROPOUT = 0.5
LEARNING_RATE = 1e-4

model = BiLSTM(
    embedding_dim=EMBEDDING_DIM,
    rnn_size=RNN_SIZE,
    hidden_size=HIDDEN_SIZE,
    dropout=DROPOUT
).to(device)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()

# create train and dev loader
train_data = CleavageDataset(train_seqs, train_lbl)
train_loader = DataLoader(train_data, batch_size = BATCH_SIZE, shuffle=True, collate_fn=collate_batch, num_workers=2)

dev_data = CleavageDataset(dev_seqs, dev_lbl)
dev_loader = DataLoader(dev_data, batch_size = BATCH_SIZE, shuffle=True, collate_fn=collate_batch, num_workers=2)

print(f"Total trainable model parameters: {get_total_model_params(model):,}")

Total trainable model parameters: 6,430,977


In [10]:
start = time()
print("Starting Training.")
highest_val_acc = 0
train_losses, train_accuracies= [], []
val_losses, val_accuracies = [], []

for epoch in range(1, NUM_EPOCHS + 1):
    model.train()
    train_loss, train_acc = process(model, train_loader, criterion, optimizer)
    
    model.eval()
    with torch.no_grad():
        val_loss, val_acc = process(model, dev_loader, criterion)
        
    # save current acc, loss
    train_losses.append((epoch, train_loss))
    train_accuracies.append((epoch, train_acc))
    val_losses.append((epoch, val_loss))
    val_accuracies.append((epoch, val_acc))
    
    if val_acc > highest_val_acc:
        highest_val_acc = val_acc
        path = f"./drive/MyDrive/data/n_term/seqvecBiLSTM/acc{val_acc:.4f}_epoch{epoch}.pt"
        torch.save(model.state_dict(), path)
        
    print(
        f"Training:   [Epoch {epoch:2d}, Loss: {train_loss:8.4f}, Acc: {train_acc:.4f}]"
    )
    print(f"Evaluation: [Epoch {epoch:2d}, Loss: {val_loss:8.4f}, Acc: {val_acc:.4f}]")
    
print("Finished Training.")
train_time = (time() - start) / 60
print(f"Training took {train_time} minutes.")

Starting Training.
Train: 100%|██████████| 2236/2236 [27:51<00:00,  1.34batches/s]
Eval: 100%|██████████| 280/280 [03:20<00:00,  1.39batches/s]
Training:   [Epoch  1, Loss:   0.0012, Acc: 0.6467]
Evaluation: [Epoch  1, Loss:   0.0011, Acc: 0.6797]
Train: 100%|██████████| 2236/2236 [27:51<00:00,  1.34batches/s]
Eval: 100%|██████████| 280/280 [03:20<00:00,  1.40batches/s]
Training:   [Epoch  2, Loss:   0.0011, Acc: 0.6844]
Evaluation: [Epoch  2, Loss:   0.0011, Acc: 0.6894]
Train: 100%|██████████| 2236/2236 [27:49<00:00,  1.34batches/s]
Eval: 100%|██████████| 280/280 [03:19<00:00,  1.40batches/s]
Training:   [Epoch  3, Loss:   0.0011, Acc: 0.6895]
Evaluation: [Epoch  3, Loss:   0.0011, Acc: 0.6933]
Train: 100%|██████████| 2236/2236 [27:51<00:00,  1.34batches/s]
Eval: 100%|██████████| 280/280 [03:21<00:00,  1.39batches/s]
Training:   [Epoch  4, Loss:   0.0011, Acc: 0.6915]
Evaluation: [Epoch  4, Loss:   0.0011, Acc: 0.6941]
Train: 100%|██████████| 2236/2236 [27:57<00:00,  1.33batches/s]
E

In [None]:
# save training stats
lsts = [train_losses, train_accuracies, val_losses, val_accuracies, train_time]
names = [
    "train_losses",
    "train_accuracies",
    "val_losses",
    "val_accuracies",
    "train_time",
]
to_save = dict()
for name, lst in zip(names, lsts):
    to_save[name] = lst

with open(f"../params/n_term/quadBiLSTM/metrics.pkl", "wb") as f:
    pickle.dump(to_save, f, pickle.HIGHEST_PROTOCOL)

print("Finished Saving Details.")