In [1]:
!pip install -U torch
!pip install -U torchtext
!pip install -U transformers

Requirement already up-to-date: torch in /opt/anaconda3/lib/python3.7/site-packages (1.6.0)
Requirement already up-to-date: torchtext in /opt/anaconda3/lib/python3.7/site-packages (0.7.0)
Requirement already up-to-date: transformers in /opt/anaconda3/lib/python3.7/site-packages (3.0.2)


In [2]:
import torch
import random
import numpy as np

# Single seed shared by every random number generator seeded below.
SEED = 1988

# Seed Python's, NumPy's and PyTorch's generators with the same value.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Turn on cuDNN's deterministic mode.
torch.backends.cudnn.deterministic = True

In [3]:
from transformers import BertTokenizer

# Load the pretrained tokenizer published under the 'bert-base-uncased' name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
# Ids of the tokenizer's special tokens, used when building the text field:
# [CLS] opens every sequence and [SEP] closes it.
init_token_id = tokenizer.cls_token_id
eos_token_id = tokenizer.sep_token_id
# Ids used for padding and for unknown tokens.
pad_token_id = tokenizer.pad_token_id
unk_token_id = tokenizer.unk_token_id

In [5]:
# Maximum input size the tokenizer records for the 'bert-base-uncased' model.
max_length_input = tokenizer.max_model_input_sizes['bert-base-uncased']

In [6]:
def tokenize_and_cut(sentence):
    """Split a sentence into tokens and truncate it to the model's input budget.

    Uses the module-level ``tokenizer`` and keeps at most
    ``max_length_input - 2`` tokens; the two reserved slots leave room for the
    init and eos tokens that the text field adds around every example.

    Args:
        sentence: raw text to tokenize.

    Returns:
        The (possibly truncated) list of tokens.
    """
    pieces = tokenizer.tokenize(sentence)
    budget = max_length_input - 2
    return pieces[:budget]

In [7]:
from torchtext.data import Field, LabelField

# Text field. No torchtext vocabulary is built (use_vocab=False): each example
# is tokenized by tokenize_and_cut and the tokens are then converted to ids by
# the BERT tokenizer. The special tokens are therefore given as ids, not strings.
TEXT  = Field(batch_first=True,
              use_vocab=False,
              tokenize=tokenize_and_cut,
              preprocessing=tokenizer.convert_tokens_to_ids,
              init_token=init_token_id,
              eos_token=eos_token_id,
              pad_token=pad_token_id,
              unk_token=unk_token_id)

# Label field; labels are produced as float tensors.
LABEL = LabelField(dtype=torch.float)



In [8]:
from torchtext import datasets

# Load the IMDB train and test splits, processed through the TEXT and LABEL fields.
# NOTE(review): the name `train` is rebound further down to the training
# function (`def train(...)`), so any cell that expects the dataset under this
# name fails if it is (re-)run after that definition.
train, test = datasets.IMDB.splits(TEXT, LABEL)



In [9]:
# Build the label vocabulary from the training split.
LABEL.build_vocab(train)

In [10]:
from torchtext.data import BucketIterator

# Number of examples per batch for both iterators.
BATCH_SIZE = 64

# Batch iterators over the train and test splits.
# NOTE(review): no `device` argument is passed here and the model is never
# moved to a device in this notebook - confirm that CPU-only execution is intended.
train_iter, test_iter = BucketIterator.splits(
    (train, test),
    batch_size=BATCH_SIZE)



# 模型 & 训练

In [11]:
from transformers import BertTokenizer, BertModel

# Load the pretrained 'bert-base-uncased' encoder (same name as the tokenizer above).
bert = BertModel.from_pretrained('bert-base-uncased')

In [12]:
from torch import nn

class BertGRU(nn.Module):
    """Binary classifier: BERT embeddings fed to a GRU, then a linear layer.

    BERT is evaluated without gradient tracking, so only the GRU and the
    linear layer receive gradients through this module's forward pass.

    Args:
        bert: encoder module. It is called as
            ``bert(input_ids, attention_mask=...)`` and the first element of
            its output is used as the token embeddings. ``bert.config`` must
            expose ``hidden_size`` and may expose ``pad_token_id``.
        hidden_dim: size of the GRU hidden state.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU runs in both directions.
        dropout: dropout probability, used between GRU layers (only when
            ``n_layers >= 2``) and on the final hidden state before the
            linear layer.
    """

    def __init__(self, bert, hidden_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.bert = bert

        embed_dim = bert.config.hidden_size
        # Id that marks padding in the input; None disables masking in forward().
        self.pad_token_id = getattr(bert.config, 'pad_token_id', None)

        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional,
                          batch_first=True, dropout=0 if n_layers < 2 else dropout)

        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, 1)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):  # text: [BATCH_SIZE, SEQ_LENGTH]
        """Return one logit per example, shape [BATCH_SIZE, 1]."""
        # Tell BERT which positions are padding. Without the mask the padding
        # tokens are attended to, so the embeddings of a sentence depend on
        # how much padding its batch happens to need.
        attention_mask = None
        if self.pad_token_id is not None:
            attention_mask = (text != self.pad_token_id).long()

        with torch.no_grad():
            # embedded: [BATCH_SIZE, SEQ_LENGTH, EMBED_DIM]
            embedded = self.bert(text, attention_mask=attention_mask)[0]

        # hidden: [N_LAYERS * N_DIRECTIONS, BATCH_SIZE, HIDDEN_DIM]
        # NOTE(review): the GRU still steps over padded positions; packing the
        # sequence would be needed to skip them.
        _, hidden = self.gru(embedded)

        if self.gru.bidirectional:
            # Last two entries: final states of the top layer, one per direction.
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])

        output = self.fc(hidden)  # output: [BATCH_SIZE, 1]

        return output

In [13]:
# Hyper-parameters of the GRU head placed on top of BERT.
HIDDEN_DIM = 512
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

# Wrap the pretrained encoder loaded above together with the GRU classifier head.
model = BertGRU(bert, HIDDEN_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

In [14]:
# Freeze every parameter whose name starts with 'bert' (the encoder stored on
# the model as `self.bert`), leaving the other parameters trainable.
for name, param in model.named_parameters():
    if name.startswith('bert'):
        param.requires_grad = False

In [15]:
from torch import optim

# Adam with its default settings. All model parameters are passed in,
# including the ones whose requires_grad was switched off above.
optimizer = optim.Adam(model.parameters())

# Binary cross-entropy on raw logits (the model's forward applies no sigmoid).
criterion = nn.BCEWithLogitsLoss()

In [16]:
def binary_accuracy(preds, y):
    """Fraction of predictions that match the binary targets.

    Args:
        preds: raw logits, of any shape (including a 0-d tensor for a single
            example).
        y: targets (0 or 1) holding the same number of elements as ``preds``.

    Returns:
        A 0-d float tensor with the accuracy in [0, 1].
    """
    # Flatten both sides first. A 0-d tensor (a batch of one after squeeze())
    # has no len(), and an [N, 1] / [N] pair would otherwise broadcast to
    # [N, N] and report a wrong accuracy.
    preds = preds.reshape(-1)
    y = y.reshape(-1)

    # sigmoid turns logits into probabilities; rounding thresholds them at 0.5.
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    return correct.sum() / correct.numel()

In [17]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch over ``iterator``.

    Args:
        model: module mapping a batch of token ids to logits of shape
            [BATCH_SIZE, 1].
        iterator: iterable of ``(text, label)`` batches; must support ``len()``.
        optimizer: optimizer stepped once per batch.
        criterion: loss taking ``(logits, float targets)``.

    Returns:
        Tuple ``(mean loss per batch, mean accuracy per batch)``.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for text, label in iterator:
        optimizer.zero_grad()
        # Drop only the trailing logit dimension. A bare squeeze() would also
        # remove the batch dimension when a batch holds a single example,
        # leaving a 0-d prediction whose shape no longer matches the labels.
        preds = model(text).squeeze(-1)
        loss = criterion(preds, label.float())
        acc = binary_accuracy(preds, label)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [18]:
def evaluate(model, iterator, criterion):
    """Compute loss and accuracy over ``iterator`` without updating the model.

    Args:
        model: module mapping a batch of token ids to logits of shape
            [BATCH_SIZE, 1].
        iterator: iterable of ``(text, label)`` batches; must support ``len()``.
        criterion: loss taking ``(logits, float targets)``.

    Returns:
        Tuple ``(mean loss per batch, mean accuracy per batch)``.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for text, label in iterator:
            # Drop only the trailing logit dimension. A bare squeeze() would
            # also remove the batch dimension when a batch holds a single
            # example, leaving a 0-d prediction that mismatches the labels.
            preds = model(text).squeeze(-1)
            loss = criterion(preds, label.float())
            acc = binary_accuracy(preds, label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [19]:
import time

def epoch_time(start_time, end_time):
    """Split the time between two timestamps into whole minutes and seconds.

    Args:
        start_time: start timestamp, in seconds.
        end_time: end timestamp, in seconds.

    Returns:
        Tuple ``(minutes, seconds)``, both truncated to integers.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    # Whatever is left once the whole minutes are taken out.
    leftover = duration - whole_minutes * 60
    return whole_minutes, int(leftover)

In [20]:
# Number of full passes over the training iterator.
N_EPOCHS = 10

# Lowest evaluation loss seen so far; used to decide when to save a checkpoint.
best_test_loss = float('inf')

# NOTE(review): the checkpoint is selected on `test_iter` and its metrics are
# printed as "Val."; this notebook only creates train and test splits, so
# there is no separate validation set.
for epoch in range(N_EPOCHS):
    start_time = time.time()
    print(f'start to train {epoch+1:02}/{N_EPOCHS}...')
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    print(f'start to evaluate {epoch+1:02}/{N_EPOCHS}...')
    test_loss, test_acc = evaluate(model, test_iter, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the weights of the epoch with the lowest evaluation loss.
    if test_loss < best_test_loss:
        best_test_loss = test_loss
        torch.save(model.state_dict(), 'Bert-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {test_loss:.3f} |  Val. Acc: {test_acc*100:.2f}%')

start to train 01/10...




KeyboardInterrupt: 