In [1]:
import torch
import random
import numpy as np

SEED = 1988

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # use all devices

In [2]:
print(torch.cuda.is_available())
print(torch.cuda.device_count())

True
2


In [3]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
init_token_id = tokenizer.cls_token_id
eos_token_id = tokenizer.sep_token_id
pad_token_id = tokenizer.pad_token_id
unk_token_id = tokenizer.unk_token_id
max_length_input = tokenizer.max_model_input_sizes['bert-base-uncased']

In [5]:
def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_length_input - 2]
    return tokens

In [6]:
from torchtext.data import Field, LabelField
from torchtext import datasets

TEXT  = Field(batch_first=True,
              use_vocab=False,
              tokenize=tokenize_and_cut,
              preprocessing=tokenizer.convert_tokens_to_ids,
              init_token=init_token_id,
              eos_token=eos_token_id,
              pad_token=pad_token_id,
              unk_token=unk_token_id)

LABEL = LabelField(dtype=torch.float)

train, test = datasets.IMDB.splits(TEXT, LABEL)

TEXT.build_vocab(train)
LABEL.build_vocab(train)



In [8]:
from torchtext.data import BucketIterator
from torchtext import data

BATCH_SIZE = 32

train_iter, test_iter = BucketIterator.splits(
    (train, test),
    batch_size=BATCH_SIZE,
    device=device
)



In [9]:
from transformers import BertModel

bert = BertModel.from_pretrained('bert-base-uncased')

In [10]:
from torch import nn

class BertGRU(nn.Module):
    def __init__(self, bert, hidden_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.bert = bert

        embed_dim = bert.config.to_dict()['hidden_size']

        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, 
                          batch_first=True, dropout=0 if n_layers < 2 else dropout)
        
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, 1)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text): # text: [BATCH_SIZE, SEQ_LENGTH]
        with torch.no_grad():
            embedded = self.bert(text)[0] # embedded: [BATCH_SIZE, SEQ_LENGTH, EMBED_DIM]

        _, hidden = self.gru(embedded) # hidden: [N_LAYERS * n_driections, BATCH_SIZE, EMBED_DIM]

        if self.gru.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])

        output = self.fc(hidden) # hidden: [BATCH_SIZE, 1]

        return output

In [11]:
HIDDEN_DIM = 768
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

model = BertGRU(bert, HIDDEN_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

for name, param in model.named_parameters():
    if name.startswith('bert'):
        param.requires_grad = False

In [17]:
from torch import optim

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

################################################
################ the only change ###############
model = nn.DataParallel(model) # use all devices
################################################
model = model.to(device)
criterion = criterion.to(device)

In [13]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [14]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        preds = model(batch.text).squeeze(1)
        loss = criterion(preds.squeeze(), batch.label)
        acc = binary_accuracy(preds.squeeze(), batch.label)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            preds = model(batch.text).squeeze(1)
            loss = criterion(preds.squeeze(), batch.label)
            acc = binary_accuracy(preds.squeeze(), batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [15]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - elapsed_mins * 60)
    return elapsed_mins, elapsed_secs

In [18]:
N_EPOCHS = 10

best_test_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_iter, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if test_loss < best_test_loss:
        best_test_loss = test_loss
        torch.save(model.state_dict(), 'Bert-model-multiple-GPU.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {test_loss:.3f} |  Val. Acc: {test_acc*100:.2f}%')

  result = _VF.gru(input, hx, self._flat_weights, self.bias, self.num_layers,


Epoch: 01 | Epoch Time: 17m 4s
	Train Loss: 0.343 | Train Acc: 84.96%
	 Val. Loss: 0.258 |  Val. Acc: 89.83%
Epoch: 02 | Epoch Time: 17m 12s
	Train Loss: 0.255 | Train Acc: 89.97%
	 Val. Loss: 0.241 |  Val. Acc: 90.84%
Epoch: 03 | Epoch Time: 17m 12s
	Train Loss: 0.234 | Train Acc: 90.82%
	 Val. Loss: 0.191 |  Val. Acc: 92.52%
Epoch: 04 | Epoch Time: 17m 12s
	Train Loss: 0.222 | Train Acc: 91.48%
	 Val. Loss: 0.200 |  Val. Acc: 92.63%
Epoch: 05 | Epoch Time: 17m 11s
	Train Loss: 0.206 | Train Acc: 92.03%
	 Val. Loss: 0.188 |  Val. Acc: 92.35%
Epoch: 06 | Epoch Time: 17m 13s
	Train Loss: 0.193 | Train Acc: 92.88%
	 Val. Loss: 0.242 |  Val. Acc: 91.28%
Epoch: 07 | Epoch Time: 17m 14s
	Train Loss: 0.180 | Train Acc: 93.26%
	 Val. Loss: 0.190 |  Val. Acc: 92.78%
Epoch: 08 | Epoch Time: 17m 13s
	Train Loss: 0.169 | Train Acc: 93.69%
	 Val. Loss: 0.197 |  Val. Acc: 92.74%
Epoch: 09 | Epoch Time: 17m 14s
	Train Loss: 0.161 | Train Acc: 94.09%
	 Val. Loss: 0.225 |  Val. Acc: 92.48%
Epoch: 10 |