In [1]:
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
import pandas as pd
import torch
import numpy as np
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
from utils import loadData

In [8]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
MAX_LEN = 256
NUM_LABELS = 2
epochs = 50
evalModel = False
batch_size = 16
save_path = '/home/vk352/paraphraseDomainShift/savedModels/'
data_path = '/home/vk352/paraphraseDomainShift/data/'
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# task options: quora, msr, twitter
task = 'msr'
train_task = 'msr'

In [3]:
def preprocess(x, tokenizer):
    # Given two sentences, x["string1"] and x["string2"], this function returns BERT ready inputs.
    inputs = tokenizer.encode_plus(
            x["utt1"],
            x["utt2"],
            add_special_tokens=True,
            max_length=MAX_LEN,
            truncation=True
            )

    # First `input_ids` is a sequence of id-type representation of input string.
    # Second `token_type_ids` is sequence identifier to show model the span of "string1" and "string2" individually.
    input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
    attention_mask = [1] * len(input_ids)

    # BERT requires sequences in the same batch to have same length, so let's pad!
    padding_length = MAX_LEN - len(input_ids)

    pad_id = tokenizer.pad_token_id
    input_ids = input_ids + ([pad_id] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([pad_id] * padding_length)

    # Super simple validation.
    assert len(input_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(input_ids), MAX_LEN)
    assert len(attention_mask) == MAX_LEN, "Error with input length {} vs {}".format(len(attention_mask), MAX_LEN)
    assert len(token_type_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(token_type_ids), MAX_LEN)

    # Convert them into PyTorch format.
    label = torch.tensor(int(x["paraphrase"])).long()
    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)
    token_type_ids = torch.tensor(token_type_ids)

    # DONE!
    return {
            "label": label,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids
            }

In [4]:
def getDataloaders(data_path, task, evalModel=False):
    
    train, test, val = loadData(data_path, task)
    
    if evalModel:
        test_data = test.apply(preprocess, axis=1, args=[tokenizer])
        
        test_dataloader = DataLoader(
            list(test_data),
            sampler=SequentialSampler(list(test_data)),
            batch_size=batch_size
            )
        return test_dataloader


    train_data = train.apply(preprocess, axis=1, args=[tokenizer])
    val_data = val.apply(preprocess, axis=1, args=[tokenizer])
    test_data = test.apply(preprocess, axis=1, args=[tokenizer])

    train_dataloader = DataLoader(
                train_data,
                sampler=RandomSampler(list(train_data)),
                batch_size=batch_size
                )
    val_dataloader = DataLoader(
                val_data,
                sampler=SequentialSampler(list(val_data)),
                batch_size=batch_size
                )
    test_dataloader = DataLoader(
            list(test_data),
            sampler=SequentialSampler(list(test_data)),
            batch_size=batch_size
            )
    return train_dataloader, test_dataloader, val_dataloader

In [6]:
# get data and set up dataloaders
train_dataloader, test_dataloader, val_dataloader = getDataloaders(data_path, task)

In [7]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=NUM_LABELS, return_dict=True)
optimizer = AdamW(model.parameters(), lr=1e-5)
best_dev_acc = 0

# setup optimizer
no_decay = ["bias", "gamma", "beta"]
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

model.to(device)

for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    
    model.train()
    total_train_loss = 0
    total_train_accuracy = 0
    for step, batch in enumerate(train_dataloader):
        if step%1000==0:
            print('%d completed epochs, %d batches' % (epoch_i, step))
        labels = batch["label"].to(device)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)

        model.zero_grad()        

        # the loss (because we provided labels) and the "logits"--the model
        # outputs prior to activation.
        loss, logits = model(input_ids, 
                             token_type_ids=token_type_ids, 
                             attention_mask=attention_mask, 
                             labels=labels)[:2]
        total_train_loss += loss.item()
        preds = torch.argmax(logits, dim=1).flatten()
        total_train_accuracy += ((preds == labels).cpu().numpy().mean() * 100)
        loss.backward()
        optimizer.step()
    
    avg_train_loss = total_train_loss / len(train_dataloader)
    avg_train_accuracy = total_train_accuracy / len(train_dataloader)

    print("Training accuracy: {0:.2f}".format(avg_train_accuracy))
    print("Training loss: {0:.2f}".format(avg_train_loss))

    
    print('Validation...')
    model.eval()
    
    total_eval_accuracy = 0
    total_eval_loss = 0

    # Evaluate data for one epoch
    for batch in val_dataloader:
        labels = batch["label"].to(device)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        
        with torch.no_grad():        

            loss, logits = model(input_ids, 
                             token_type_ids=token_type_ids, 
                             attention_mask=attention_mask, 
                             labels=labels)[:2]
            
        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        preds = torch.argmax(logits, dim=1).flatten()
        total_eval_accuracy += ((preds == labels).cpu().numpy().mean() * 100)
        

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
    avg_val_loss = total_eval_loss / len(val_dataloader)
    print(" Val Accuracy: {0:.2f}".format(avg_val_accuracy))

    if avg_val_accuracy >= best_dev_acc:
            torch.save(model.state_dict(), save_path+'bert_'+task+'.pt')
            best_dev_acc = avg_val_accuracy

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


Training...
cuda:0
0 completed epochs, 0 batches
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0


KeyboardInterrupt: 

In [9]:
# load model
if evalModel:
    test_dataloader = getDataloaders(data_path, task, evalModel=evalModel)
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=NUM_LABELS, return_dict=True)
    model.load_state_dict(torch.load(save_path+'bert_'+train_task+'.pt'))

    model.to(device)

    # test
    model.eval()
    total_test_accuracy = 0
    total_test_loss = 0
    for batch in test_dataloader:

        labels = batch["label"].to(device)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)

        with torch.no_grad():        

            loss, logits = model(input_ids, 
                             token_type_ids=token_type_ids, 
                             attention_mask=attention_mask, 
                             labels=labels)[:2]

        # Accumulate the validation loss.
        total_test_loss += loss.item()

        preds = torch.argmax(logits, dim=1).flatten()
        total_test_accuracy += ((preds == labels).cpu().numpy().mean() * 100)


    # Report the final accuracy for this run.
    avg_test_accuracy = total_test_accuracy / len(test_dataloader)
    avg_test_loss = total_test_loss / len(test_dataloader)
    print(" Test Accuracy: {0:.2f}".format(avg_test_accuracy))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

 Test Accuracy: 83.20
