In [1]:
import json
import copy
import re
import numpy as np
import pandas as pd
import torch
from transformers import BertModel, BertTokenizer, BertForMaskedLM, AdamW, BertConfig, BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import random
from sklearn.model_selection import train_test_split
import pickle
from pytorchtools import EarlyStopping

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

## Make training & validation set with toeic part5 dataset

In [3]:
with open('Part5_training.txt', 'rb') as f:
    dataset = pickle.load(f)
training, val = train_test_split(dataset, test_size = 0.1, random_state = 2)

In [4]:
len(training)

4666

In [5]:
max_len = 64
toeic_training = []

for pset in training:
    q = pset['question'].strip().lower()
    ans = pset['answer'].strip().lower()
    for j in range(1, 5):
        if pset[str(j)].strip().lower() == ans:
            label = 1
        else:
            label = 0
        sentence = re.sub('_', ' '+pset[str(j)]+' ', q).lower()
        
        input_ids = tokenizer.encode(sentence, add_special_tokens=True)
        attn_mask = [1]*len(input_ids) + [0]*(max_len - len(input_ids))
        input_ids = input_ids + [0]*(max_len-len(input_ids))
        segment_ids = [0] * max_len
        if label == 1:
            for _ in range(3):
                toeic_training.append([input_ids, segment_ids, attn_mask, label])
        else:
            toeic_training.append([input_ids, segment_ids, attn_mask, label])
random.shuffle(toeic_training)


toeic_val = []

for pset in val:
    q = pset['question'].strip().lower()
    ans = pset['answer'].strip().lower()
    for j in range(1, 5):
        if pset[str(j)].strip().lower() == ans:
            label = 1
        else:
            label = 0
        sentence = re.sub('_', ' '+pset[str(j)]+' ', q).lower()
        input_ids = tokenizer.encode(sentence, add_special_tokens=True)
        attn_mask = [1]*len(input_ids) + [0]*(max_len - len(input_ids))
        input_ids = input_ids + [0]*(max_len-len(input_ids))
        segment_ids = [0] * max_len
        toeic_val.append([input_ids, segment_ids, attn_mask, label])
random.shuffle(toeic_val)

In [6]:
len(toeic_training)

27996

In [7]:
input_ids, segment_ids, attn_mask, label = zip(*toeic_training)
input_ids = torch.LongTensor(input_ids)
segment_ids = torch.LongTensor(segment_ids)
label = torch.LongTensor(label)
attn_mask = torch.LongTensor(attn_mask)
input_ids_loader = torch.utils.data.DataLoader(input_ids, batch_size=16)
segment_ids_loader = torch.utils.data.DataLoader(segment_ids, batch_size=16)
label_loader = torch.utils.data.DataLoader(label, batch_size=16)
attn_mask_loader = torch.utils.data.DataLoader(attn_mask, batch_size=16)

#validation
input_ids_val, segment_ids_val, attn_mask_val, label_val = zip(*toeic_val)
input_ids_val = torch.LongTensor(input_ids_val)
segment_ids_val = torch.LongTensor(segment_ids_val)
label_val = torch.LongTensor(label_val)
attn_mask_val = torch.LongTensor(attn_mask_val)
input_ids_loader_val = torch.utils.data.DataLoader(input_ids_val, batch_size=16)
segment_ids_loader_val = torch.utils.data.DataLoader(segment_ids_val, batch_size=16)
label_loader_val = torch.utils.data.DataLoader(label_val, batch_size=16)
attn_mask_loader_val = torch.utils.data.DataLoader(attn_mask_val, batch_size=16)

print('batch size : 16, num of batch_train : {}, num of batch_validation : {}'.format(len(input_ids_loader), len(input_ids_loader_val)))

batch size : 16, num of batch_train : 1750, num of batch_validation : 130


## Finetuning BertForSequenceClassification model(bert_grammer) with toeic part5 questions(4666)

In [8]:
bert_grammer = BertForSequenceClassification.from_pretrained('bert-base-uncased')
bert_grammer.cuda()

optimizer = AdamW(bert_grammer.parameters(),
                  lr =1e-5, 
                  eps = 1e-8 
                )
epochs = 5

#number of batches * epochs = total number of training step
total_steps = len(input_ids_loader) * epochs
#create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)


In [9]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
loss_train = []
loss_val = []
early_stopping = EarlyStopping(verbose = True)
for epoch in range(0, epochs):
    
    #train
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
    print('Training...')
    train_loss = 0
    val_loss = 0
    bert_grammer.train()
    step = 0
    for input_ids, segment_ids, label, attn_mask in zip(input_ids_loader, segment_ids_loader, label_loader, attn_mask_loader):
        
        if step % 300 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(input_ids_loader)))
        step += 1
        input_ids = input_ids.to('cuda')
        segment_ids = segment_ids.to('cuda')
        label = label.to('cuda')
        attn_mask = attn_mask.to('cuda')
        bert_grammer.zero_grad()
        outputs = bert_grammer(input_ids=input_ids ,token_type_ids=segment_ids, labels = label, attention_mask = attn_mask)
        clf_loss = outputs[0]
        train_loss += clf_loss.item()
        clf_loss.backward()
        
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(bert_grammer.parameters(), 1.0)
        optimizer.step()
        #Update the learning rate.
        scheduler.step()
        
    avg_train_loss = train_loss / len(input_ids_loader)
    loss_train.append(avg_train_loss)
    print("")
    print("Average training loss: {0:.2f}".format(avg_train_loss))

    # validation
    print("")
    print('Running Validaiton...')
    bert_grammer.eval()
    
    for input_ids_val, segment_ids_val, label_val, attn_mask_val in zip(input_ids_loader_val, segment_ids_loader_val, label_loader_val, attn_mask_loader_val):
        input_ids_val = input_ids_val.to('cuda')
        segment_ids_val = segment_ids_val.to('cuda')
        label_val = label_val.to('cuda')
        attn_mask_val = attn_mask_val.to('cuda')
        
        with torch.no_grad():
            outputs = bert_grammer(input_ids=input_ids_val ,token_type_ids=segment_ids_val, labels = label_val, attention_mask = attn_mask_val)
        clf_loss_val = outputs[0]
        val_loss += clf_loss_val.item()
    avg_val_loss = val_loss / len(input_ids_loader_val)
    loss_val.append(avg_val_loss)
    
    early_stopping(avg_val_loss, bert_grammer)
    print("Average validation loss: {0:.2f}".format(avg_val_loss))
    
    if early_stopping.early_stop:
        print("Early stopping executed")
        break
        
    bert_grammer.load_state_dict(torch.load('checkpoint.pt'))
    
print("Finetuning Bert for grammer is finished!")


Training...
  Batch   300  of  1,750.
  Batch   600  of  1,750.
  Batch   900  of  1,750.
  Batch 1,200  of  1,750.
  Batch 1,500  of  1,750.

Average training loss: 0.51

Running Validaiton...
Validation loss decreased ( inf --> 0.450405). Saving model ...
Average validation loss: 0.45

Training...
  Batch   300  of  1,750.
  Batch   600  of  1,750.
  Batch   900  of  1,750.
  Batch 1,200  of  1,750.
  Batch 1,500  of  1,750.

Average training loss: 0.32

Running Validaiton...
EarlyStopping counter: 1 out of 2
Average validation loss: 0.65

Training...
  Batch   300  of  1,750.
  Batch   600  of  1,750.
  Batch   900  of  1,750.
  Batch 1,200  of  1,750.
  Batch 1,500  of  1,750.

Average training loss: 0.32

Running Validaiton...
EarlyStopping counter: 2 out of 2
Average validation loss: 0.63
Early stopping executed
Finetuning Bert for grammer is finished!


## Test models with toeic part5 915 questions

In [5]:
#function to get score of Finetuned SequenceClassification model(bert_grammer)
def get_logit(model, input_ids, segment_ids):
    input_ids_tensor = torch.tensor(input_ids).unsqueeze(0).to('cuda')
    segment_ids_tensor = torch.tensor(segment_ids).to('cuda')
    outputs = model(input_ids = input_ids_tensor, token_type_ids = segment_ids_tensor)
    logit =outputs[0][0][1]

    return logit.item()

#function to get score or pretrained BertForMaskedLM from 
def get_score(model, tokenizer, question_tensors, segment_tensors, masked_index, candidate):
    
    question_tensors = torch.tensor(question_tensors).unsqueeze(0).to('cuda')
    segment_tensors = torch.tensor(segment_tensors).to('cuda')

    candidate_tokens = tokenizer.tokenize(candidate) # warranty -> ['warrant', '##y']
    candidate_ids = tokenizer.convert_tokens_to_ids(candidate_tokens)
    with torch.no_grad():
        predictions = model(input_ids = question_tensors, token_type_ids = segment_tensors)
        predictions_candidates = predictions[0][0][masked_index][candidate_ids].mean()

    return predictions_candidates.item()

In [29]:
#Testing BertForMaskedLM(only pretrained) + BertForSequenceClassification(Grammer finetuned)
bert_lm = BertForMaskedLM.from_pretrained('bert-base-uncased')
bert_lm.cuda()
bert_lm.eval()
bert_grammer.eval()
cnt_bert_mixed = 0
cnt_bert_base = 0
cnt_bert_grammer = 0

with open('Part5_test.txt', 'rb') as f:
    testset = pickle.load(f)
random.shuffle(testset)

print(f'\n{len(testset)} Toeic part5 questions are loaded! Our model will solve {len(testset)} questions like below.\n')
print('=================================================Quesiton Example==================================================')
for k, v in testset[random.randrange(1, len(testset))].items():
    if k == 'question':
        v = re.sub('_', '[ ? ]', v)
    print(f'{k} : {v}')
print('\n')
for i, pset in enumerate(testset):
    
    grammer_score = []
    if (i+1) % 100 == 0:
        print("Testing {} in {}".format(i+1, len(testset)))
    q = pset['question'].lower()
    ans = pset['answer'].lower() 
    sentence_lm = re.sub('_', ' [MASK] ', q)
    input_ids_lm = tokenizer.encode(sentence_lm, add_special_tokens=True)
    masked_index_lm = input_ids_lm.index(103)
    segment_ids_lm = [0] * len(input_ids_lm)
    lm_score = [get_score(bert_lm, tokenizer, input_ids_lm, segment_ids_lm, masked_index_lm, pset[str(j)])  for j in range(1, 5)]
    
    for k in range(1, 5):
        sentence_grammer = re.sub('_', ' '+pset[str(k)]+' ', q).lower()
        input_ids_grammer = tokenizer.encode(sentence_grammer, add_special_tokens=True)
        segment_ids_grammer = [0] * len(input_ids_grammer)
        grammer_score.append(get_logit(bert_grammer, input_ids_grammer, segment_ids_grammer))
    
    softmax = torch.nn.Softmax(dim=0)
    pred_tunedModel = np.argmax(grammer_score)+1
    pred_baseModel = torch.argmax(softmax(torch.tensor(lm_score))).item()+1
    
        
    if pset[str(pred_tunedModel)].lower() == ans:
        cnt_bert_grammer += 1
    if pset[str(pred_baseModel)].lower() == ans:
        cnt_bert_base += 1

    if 0<= i < 20:
        if i == 0:
            print('==================================================Predictions Example==================================================')
            print('{0:15s} {1:<30s} {2:<30s} {3:<30s}'.format('','Correct answer', 'Pretrained BertForMaskedLM', 'Finetuned BertForSeqClf'))
        print('{0:15s} {1:<30s} {2:<30s} {3:<30s}'.format('Question'+str(i+1), ans, pset[str(pred_baseModel)].lower(), pset[str(pred_tunedModel)].lower()))
    if i == 20:
        print('.\n.\n.')
print('=================================================Test finished=================================================\n')
print('Pretrained BertForMaskedLM : {}\nFinetuned BertForSequenceClassification(Bert_grammer) : {}'.format(cnt_bert_base/len(testset), cnt_bert_grammer/len(testset)))



915 Toeic part5 questions are loaded! Our model will solve 915 questions like below.

question : Since this is still under [ ? ] , you should return it to the manufacturer to be repaired.
answer : warranty
1 : warranty
2 : promise
3 : debate
4 : requirement


                Correct answer                 Pretrained BertForMaskedLM     Finetuned BertForSeqClf       
Question1       used                           used                           used                          
Question2       showed                         showed                         showed                        
Question3       competitive                    competitive                    competitive                   
Question4       decrease                       decrease                       decrease                      
Question5       that                           that                           that                          
Question6       upon                           upon                           upon   