In [1]:
import torch
import numpy as np
import transformers as pt
import pandas as pd
torch.__version__

'1.1.0'

In [2]:
# bert_model, bert_tokenizer, pretrained_weights = (pt.DistilBertModel, pt.DistilBertTokenizer, 'distilbert-base-uncased')
bert_model, bert_tokenizer, pretrained_weights = (pt.BertModel, pt.BertTokenizer, '/home/yzhao/data/bert/bert-base-uncased')

tokenizer = bert_tokenizer.from_pretrained(pretrained_weights)
bert_model = bert_model.from_pretrained(pretrained_weights)

In [3]:
print(len(tokenizer.vocab))
tokens = tokenizer.tokenize('Hello WORLD how ARE yoU?')
print(tokens)
indexes = tokenizer.convert_tokens_to_ids(tokens)
print(indexes)
print(tokenizer.encode('Hello WORLD how ARE yoU?', add_special_tokens=True))

init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token
print(init_token, eos_token, pad_token, unk_token)

init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)
print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id
print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']
print(max_input_length)

30522
['hello', 'world', 'how', 'are', 'you', '?']
[7592, 2088, 2129, 2024, 2017, 1029]
[101, 7592, 2088, 2129, 2024, 2017, 1029, 102]
[CLS] [SEP] [PAD] [UNK]
101 102 0 100
101 102 0 100
512


In [4]:
def load_data(path):
    df = pd.read_csv(path, delimiter='\t', header=None)
    tokenized = df[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
    y = np.array(df[1].values)
    max_len = max(map(lambda s:len(s), tokenized.values))
    X_padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
    return X_padded, y

In [5]:
X_train, y_train = load_data('SST2/train.tsv')
X_test, y_test = load_data('SST2/test.tsv')

print(X_train.shape, X_test.shape)

(6920, 67) (1821, 59)


In [6]:
dataset_train = torch.utils.data.TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
dataset_test = torch.utils.data.TensorDataset(torch.tensor(X_test), torch.tensor(y_test))
data_loader_train = torch.utils.data.DataLoader(dataset_train, batch_size=32, shuffle=True)
data_loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=32, shuffle=False)

In [12]:
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper

class BertGruClassifier(torch.nn.Module):
    
    def __init__(self, bert: torch.nn.Module, 
                 hidden_dim:int=128, output_dim:int=2, 
                 n_layers:int=2, bidirectional:bool=True, dropout=0.2):
        super().__init__()
        self.bert = bert
        self.embedding_dim = bert.config.to_dict()['hidden_size']
        self.hidden_dim=hidden_dim
        self.output_dim=output_dim
        
        self.encoder = PytorchSeq2VecWrapper(
            torch.nn.GRU(self.embedding_dim, self.hidden_dim, 
                         num_layers=n_layers, 
                         bidirectional=bidirectional, 
                         batch_first=True, 
                         dropout = 0 if n_layers < 2 else dropout))
        
        self.dropout = torch.nn.Dropout(dropout)
        self.out = torch.nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
    
    def forward(self, inputs: torch.Tensor, mask: torch.Tensor=None) -> torch.Tensor:
        embedded = self.bert(inputs, attention_mask=mask)[0]
        hidden = self.encoder(self.dropout(embedded), mask)
        output = self.out(hidden)
        return output

In [13]:
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
criterion = torch.nn.CrossEntropyLoss().to(device)
model = BertGruClassifier(bert_model).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 986,626 trainable parameters


In [15]:
for epoch in range(10):
    count, losses = 0, 0
    for i, (batch, y_batch) in enumerate(data_loader_train, start=1):
        count += batch.size(0)
        batch = batch.to(device)
        y_batch = y_batch.to(device)
        mask = (batch != pad_token_idx).float()
        optimizer.zero_grad()
        pred = model(batch, mask)
        loss = criterion(pred, y_batch)
        losses += loss.item()
        print('{:.2%}|loss:{:.4f}'.format(count / len(dataset_train), losses / i), end='\r')
        loss.backward()
        optimizer.step()
    
    with torch.no_grad():
        correct, count = 0, 0
        for (batch, y_batch) in data_loader_test:
            count += batch.size(0)
            batch = batch.to(device)
            y_batch = y_batch.to(device)
            mask = (batch != pad_token_idx).float()
            pred = model(batch, mask)
            correct += (pred.argmax(-1) == y_batch).long().sum().item()
            print('{:.2%}|acc:{:.4f}'.format(count / len(dataset_test), correct/len(dataset_test)), end='\r')
        print('epoch:{:2d}|acc:{:.4f}'.format(epoch, correct/len(dataset_test)))

epoch: 0|acc:0.8764
epoch: 1|acc:0.8704
epoch: 2|acc:0.8622
epoch: 3|acc:0.8781
epoch: 4|acc:0.8534
epoch: 5|acc:0.8797
epoch: 6|acc:0.8825
epoch: 7|acc:0.8682
epoch: 8|acc:0.8677
epoch: 9|acc:0.8655
