In [1]:
import sys

sys.path.insert(0, "../")

In [2]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader

In [3]:
from universe.dataset import PfamDataset
from universe.models import BiLSTM

## Dataset

In [4]:
NUM_CLASSES = 500
BATCH_SIZE = 512

In [5]:
train = pd.read_csv(f"../data/processed/{NUM_CLASSES}/train.csv")
dev = pd.read_csv(f"../data/processed/{NUM_CLASSES}/dev.csv")
test = pd.read_csv(f"../data/processed/{NUM_CLASSES}/test.csv")

In [6]:
train_dataset = PfamDataset(train, overwrite_cache=False, num_classes=NUM_CLASSES, cache_dir="../data/cache", split_name="train")

11/14/2021 16:08:59 - INFO - universe.dataset - PID: 18547 -  Loading features from cached file: ../data/cache/cached_dataset_500_train


In [7]:
train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True, collate_fn=PfamDataset.collate, drop_last=True)

## Model

In [8]:
VOCAB_SIZE = 22
EMBEDDING_SIZE = 300
HIDDEN_SIZE = 128
N_LAYERS = 2

In [9]:
model = BiLSTM(HIDDEN_SIZE, N_LAYERS, EMBEDDING_SIZE, NUM_CLASSES, VOCAB_SIZE)

## Training

In [10]:
EPOCHS = 50
LR = 0.01

In [11]:
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

In [12]:
device = torch.device("cuda:0")

In [13]:
model.to(device)

BiLSTM(
  (embedding): Embedding(22, 300, padding_idx=21)
  (lstm): LSTM(300, 128, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=256, out_features=500, bias=True)
)

In [14]:
global_step = 0

for epoch in range(EPOCHS):
    print(f"Starting epoch: {epoch+1}/{EPOCHS}")
    epoch_step = 0
    n_sequences_epoch = 0
    total_loss_epoch = 0

    for i, batch in enumerate(train_dataloader):
        seq, labels = tuple(t.to(device) for t in batch[:2])
        lengths = batch[-1]

        outputs = model(seq, lengths)

        loss = F.cross_entropy(outputs, labels)
        total_loss_epoch += loss.item()
        global_step += 1
        epoch_step += 1

        loss.backward()

        # nn.utils.clip_grad_norm_(
        #    model.parameters(), 1.0
        # )

        optimizer.step()

        optimizer.zero_grad()

        n_sequences_epoch += seq.size(0)

        if i % 1000 == 0:
            print(f"Epoch: {epoch+1} Step: {global_step} Last loss: {loss.item()}")

    print(f"Epoch: {epoch+1} Step: {global_step} Epoch loss: {total_loss_epoch/epoch_step}")
    print("\n-----------------\n")
        

Starting epoch: 1/50
Epoch: 1 Step: 1 Last loss: 6.214513301849365
Epoch: 1 Step: 1001 Last loss: 5.37968111038208
Epoch: 1 Step: 1151 Epoch loss: 5.3556341209378475

-----------------

Starting epoch: 2/50
Epoch: 2 Step: 1152 Last loss: 5.393555641174316
Epoch: 2 Step: 2152 Last loss: 5.484552383422852
Epoch: 2 Step: 2302 Epoch loss: 5.434138525474807

-----------------

Starting epoch: 3/50
Epoch: 3 Step: 2303 Last loss: 5.427124500274658
Epoch: 3 Step: 3303 Last loss: 5.393227577209473
Epoch: 3 Step: 3453 Epoch loss: 5.401614520158279

-----------------

Starting epoch: 4/50
Epoch: 4 Step: 3454 Last loss: 5.389823913574219
Epoch: 4 Step: 4454 Last loss: 5.3792524337768555
Epoch: 4 Step: 4604 Epoch loss: 5.3906560689858205

-----------------

Starting epoch: 5/50
Epoch: 5 Step: 4605 Last loss: 5.381128787994385
Epoch: 5 Step: 5605 Last loss: 5.464656829833984
Epoch: 5 Step: 5755 Epoch loss: 5.401040965427429

-----------------

Starting epoch: 6/50
Epoch: 6 Step: 5756 Last loss: 5.44

KeyboardInterrupt: 