In [1]:
import sys

# Put the parent directory on the import path so the `universe` package
# imported further down can be found (presumably the repository root —
# this is relative to the kernel's working directory).
# Guarded so that re-running this cell does not stack duplicate entries.
if "../" not in sys.path:
    sys.path.insert(0, "../")

In [2]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader

In [3]:
from universe.dataset import PfamDataset
from universe.models import BiLSTM

## Dataset

In [4]:
# Load the three pre-processed splits as DataFrames in one pass.
train, dev, test = map(
    pd.read_csv,
    (
        "../data/processed/train.csv",
        "../data/processed/dev.csv",
        "../data/processed/test.csv",
    ),
)

In [5]:
# Wrap the training frame in the project dataset. With overwrite_cache=False
# the log output shows features being loaded from the cache directory.
train_dataset = PfamDataset(
    train,
    overwrite_cache=False,
    cache_dir="../data/cache",
    split_name="train",
)

11/14/2021 01:18:49 - INFO - universe.dataset - PID: 11004 -  Loading features from cached file: ../data/cache/cached_dataset_train


In [23]:
# Shuffled mini-batches of 256 items, collated by PfamDataset.collate.
# drop_last=True discards a trailing incomplete batch, so every step sees
# exactly 256 items.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=256,
    shuffle=True,
    collate_fn=PfamDataset.collate,
    drop_last=True,
)

## Model

In [7]:
# Model dimensions.
# vocab_size: presumably the number of distinct input tokens (it matches the
#             22 rows of the embedding in the printed model) — TODO confirm.
# hidden_size: first positional argument passed to BiLSTM.
hidden_size, vocab_size = 128, 22

In [27]:
# Positional arguments follow BiLSTM's own signature, which is not visible in
# this notebook. Judging from the printed module they are presumably:
# LSTM hidden size, number of LSTM layers, embedding dimension, number of
# output classes, vocabulary size — TODO confirm against universe.models.
# The last argument uses the `vocab_size` constant instead of repeating the
# literal 22, so the two can no longer drift apart.
model = BiLSTM(hidden_size, 4, 128, 5000, vocab_size)

## Training

In [9]:
# Number of full passes over train_dataloader made by the training loop.
EPOCHS = 50

In [9]:
# Pull a single batch for the sanity checks that follow. next() raises
# StopIteration immediately if the loader yields nothing, instead of silently
# leaving `seq`/`labels` undefined (or stale from an earlier run).
batch = next(iter(train_dataloader))
seq, labels = batch

In [10]:
# Forward pass on the one sampled batch, as a quick check that the model runs.
out = model(seq)

In [11]:
# Loss of the untrained model on the sampled batch. Assuming the 5000 passed
# to BiLSTM is the number of classes, a uniform guess gives ln(5000) ≈ 8.52,
# so a value close to that is the expected starting point.
F.cross_entropy(out, labels)

tensor(8.5076, grad_fn=<NllLossBackward0>)

In [28]:
# Adam over every model parameter.
# NOTE(review): PyTorch's docs recommend moving the model to its target device
# before constructing the optimizer; in this notebook the move happens in a
# later cell — consider reordering.
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=0.01,
)

In [14]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so the notebook still runs (slowly) on machines without a GPU instead of
# failing when the model is moved to the device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [15]:
# Display the selected device as the cell output.
device

device(type='cuda', index=0)

In [29]:
# Move the model to `device`; the training loop moves each batch to the same
# device. The bare expression also displays the module structure as output.
model.to(device)

BiLSTM(
  (embedding): Embedding(22, 128, padding_idx=21)
  (lstm): LSTM(128, 128, num_layers=4, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=256, out_features=5000, bias=True)
)

In [30]:
# Train for EPOCHS passes over train_dataloader, printing the loss of every
# 1000th batch and the mean batch loss at the end of each epoch.
# NOTE(review): global_step restarts from 0 every time this cell is re-run.
global_step = 0

# The LSTM uses dropout, which is only applied in training mode; set the mode
# explicitly rather than relying on whatever state the model was left in.
model.train()

for epoch in range(EPOCHS):
    print(f"Starting epoch: {epoch+1}/{EPOCHS}")
    epoch_step = 0
    n_sequences_epoch = 0
    total_loss_epoch = 0

    for i, batch in enumerate(train_dataloader):
        seq, labels = tuple(t.to(device) for t in batch)

        # Clear gradients *before* the forward/backward pass. If a previous
        # run was interrupted between backward() and zero_grad(), this stops
        # the leftover gradients from being added to this batch's update.
        optimizer.zero_grad()

        outputs = model(seq)
        loss = F.cross_entropy(outputs, labels)
        loss.backward()

        # Cap the global gradient norm at 1.0 before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Read the scalar once; it is reused for both accounting and logging.
        batch_loss = loss.item()
        total_loss_epoch += batch_loss
        global_step += 1
        epoch_step += 1
        n_sequences_epoch += seq.size(0)

        if i % 1000 == 0:
            print(f"Epoch: {epoch+1} Last loss: {batch_loss}")

    if epoch_step == 0:
        # The loader drops incomplete batches, so a dataset smaller than one
        # batch yields nothing; fail with a clear message instead of a
        # ZeroDivisionError in the average below.
        raise RuntimeError("train_dataloader produced no batches; nothing to train on")

    print(f"Epoch: {epoch+1} Epoch loss: {total_loss_epoch / epoch_step}")

Starting epoch: 1/50
Epoch: 0 Last loss: 8.516499519348145
Epoch: 0 Last loss: 8.051719665527344
Epoch: 0 Last loss: 8.064003944396973
Epoch: 0 Last loss: 8.186814308166504
Epoch: 0 Epoch loss: 8.10400343592353
Starting epoch: 2/50
Epoch: 1 Last loss: 8.02542781829834
Epoch: 1 Last loss: 7.558437347412109
Epoch: 1 Last loss: 7.2548394203186035
Epoch: 1 Last loss: 8.165397644042969
Epoch: 1 Epoch loss: 7.618294047969638
Starting epoch: 3/50
Epoch: 2 Last loss: 7.270704746246338
Epoch: 2 Last loss: 7.494513034820557
Epoch: 2 Last loss: 8.000884056091309
Epoch: 2 Last loss: 7.1603851318359375
Epoch: 2 Epoch loss: 7.423967940052244
Starting epoch: 4/50
Epoch: 3 Last loss: 7.131890296936035
Epoch: 3 Last loss: 8.818958282470703
Epoch: 3 Last loss: 9.139876365661621
Epoch: 3 Last loss: 8.263298034667969
Epoch: 3 Epoch loss: 7.757469066564391
Starting epoch: 5/50
Epoch: 4 Last loss: 8.22375202178955
Epoch: 4 Last loss: 7.172489166259766
Epoch: 4 Last loss: 7.0342254638671875
Epoch: 4 Last los

KeyboardInterrupt: 