In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import pickle
import random
from collections import Counter
from tqdm import tqdm
import itertools
import pandas as pd
from itertools import islice
import numpy as np
from keras.preprocessing.sequence import pad_sequences
import random
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.functional import softmax

# Setting device on GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

torch.backends.cudnn.deterministic = True

# Set the random seed manually for reproducibility.
torch.manual_seed(1234)
# Set the seed value all over the place to make this reproducible.

Using TensorFlow backend.


Using device: cuda

Tesla K80
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


<torch._C.Generator at 0x7fe7d9385110>

In [3]:
!pip install transformers
from transformers import BertTokenizer, BertModel, BertConfig, BertForSequenceClassification, AdamW



In [0]:
path = "drive/My Drive/FiQA/"

In [0]:
from evaluate import *

In [0]:
def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

def remove_empty(test_set):
    for index, row in enumerate(test_set):
        for doc in row[1]:
            if doc in empty_docs:
                del test_set[index]
    return test_set

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def save_pickle(path, data):
    with open(path, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
train_qid_rel = load_pickle(path + "new-data/qid_rel_train.pickle")
test_qid_rel = load_pickle(path + "new-data/qid_rel_test.pickle")
valid_qid_rel = load_pickle(path + "new-data/qid_rel_valid.pickle")

train_set = load_pickle(path + 'new-data/data_50/train_set_50.pickle')
valid_set = load_pickle(path + 'new-data/data_50/valid_set_50.pickle')

test_set = load_pickle(path + 'new-data/data_50/test_set_50.pickle')
test_set_full = load_pickle(path + 'new-data/data_50/test_set_full_50.pickle')

In [8]:
print("Number of training samples: {}".format(len(train_set)))
print("Number of validation samples: {}".format(len(valid_set)))
print("Number of test samples: {}".format(len(test_set)))

Number of training samples: 284050
Number of validation samples: 31600
Number of test samples: 333


In [9]:
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


In [0]:
collection = pd.read_csv("/content/drive/My Drive/Thesis/data/retrieval/collection_cleaned.tsv", sep="\t", header=None)
collection = collection.rename(columns={0: 'docid', 1: 'doc'})

def load_questions(path):
    """
    Returns a dataframe of cols: qid, question
    """
    # Question ID and Question text
    query_df = pd.read_csv(path, sep="\t")
    queries = query_df[['qid', 'question']]

    return queries

queries = load_questions(path + "FiQA_train_question_final.tsv")

# Question to question text
qid_to_text = {}

for index, row in queries.iterrows():
    qid_to_text[row['qid']] = row['question']

docid_to_text = {}

for index, row in collection.iterrows():
    docid_to_text[row['docid']] = row['doc']

In [0]:
def get_sequence_df(dataset):
    df = pd.DataFrame(dataset)
    df = df.rename(columns={0: 'qid', 1: 'pos', 2:'neg'})
    df_pos = df[['qid', 'pos']]
    df_pos = df_pos.rename(columns={'pos': 'docid'})
    df_pos['label'] = df_pos.apply(lambda x: 1, axis=1)
    df_pos = df_pos.drop_duplicates()

    df_neg = df[['qid', 'neg']]
    df_neg = df_neg.rename(columns={'neg': 'docid'})
    df_neg['label'] = df_neg.apply(lambda x: 0, axis=1)
    data_df = pd.concat([df_pos, df_neg]).sort_values(by=['qid'])

    data_df['question'] = data_df['qid'].apply(lambda x: qid_to_text[x])
    data_df['ans_cand'] = data_df['docid'].apply(lambda x: docid_to_text[x])

    return data_df

In [0]:
def get_pairwise_sequence_df(dataset):
    df = pd.DataFrame(dataset)
    df = df.rename(columns={0: 'qid', 1: 'pos_id', 2:'neg_id'})
    df['pos_label'] = df.apply(lambda x: 1, axis=1)
    df['neg_label'] = df.apply(lambda x: 0, axis=1)

    df['question'] = df['qid'].apply(lambda x: qid_to_text[x])
    df['pos_ans'] = df['pos_id'].apply(lambda x: docid_to_text[x])
    df['neg_ans'] = df['neg_id'].apply(lambda x: docid_to_text[x])

    return df

In [0]:
def get_input(questions, answers, max_seq_len):
    # Tokenize all of the sentences and map the tokens to thier word IDs.
    input_ids = []
    token_type_ids = []
    att_masks = []

    for i in tqdm(range(len(questions))):
        a = questions[i]
        b = answers[i]

        encoded_seq = tokenizer.encode_plus(a, b, 
                                            max_length=max_seq_len, 
                                            pad_to_max_length=True, 
                                            return_token_type_ids=True,
                                            return_attention_mask = True)

        input_id = encoded_seq['input_ids']
        token_type_id = encoded_seq['token_type_ids']
        att_mask = encoded_seq['attention_mask']

        assert len(input_id) == max_seq_len, "Input id dimension incorrect!"
        assert len(token_type_id) == max_seq_len, "Token type id dimension incorrect!"
        assert len(att_mask) == max_seq_len, "Attention mask dimension incorrect!"

        input_ids.append(input_id)
        token_type_ids.append(token_type_id)
        att_masks.append(att_mask)

    return input_ids, token_type_ids, att_masks

## **Pairwise**

In [0]:
trainset = get_pairwise_sequence_df(train_set)
train_questions = trainset.question.values
train_pos_answers = trainset.pos_ans.values
train_neg_answers = trainset.neg_ans.values

train_pos_labels = trainset.pos_label.values
train_neg_labels = trainset.neg_label.values

In [0]:
train_pos_input, train_pos_type_id, train_pos_att_mask = get_input(train_questions, train_pos_answers, 256)
train_neg_input, train_neg_type_id, train_neg_att_mask = get_input(train_questions, train_neg_answers, 256)

100%|██████████| 284050/284050 [20:36<00:00, 229.81it/s]
100%|██████████| 284050/284050 [27:30<00:00, 172.12it/s]


In [0]:
validset = get_pairwise_sequence_df(valid_set)
valid_questions = validset.question.values
valid_pos_answers = validset.pos_ans.values
valid_neg_answers = validset.neg_ans.values

valid_pos_labels = validset.pos_label.values
valid_neg_labels = validset.neg_label.values

valid_pos_input, valid_pos_type_id, valid_pos_att_mask = get_input(valid_questions, valid_pos_answers, 256)
valid_neg_input, valid_neg_type_id, valid_neg_att_mask = get_input(valid_questions, valid_neg_answers, 256)

100%|██████████| 31600/31600 [02:25<00:00, 217.24it/s]
100%|██████████| 31600/31600 [02:25<00:00, 217.83it/s]


In [0]:
save_pickle(path+'/data-bert/train_pos_labels.pickle', train_pos_labels)
save_pickle(path+'/data-bert/train_neg_labels.pickle', train_neg_labels)
save_pickle(path+'/data-bert/valid_pos_labels.pickle', valid_pos_labels)
save_pickle(path+'/data-bert/valid_neg_labels.pickle', valid_neg_labels)

save_pickle(path+'/data-bert/train_pos_input_256.pickle', train_pos_input)
save_pickle(path+'/data-bert/train_neg_input_256.pickle', train_neg_input)
save_pickle(path+'/data-bert/valid_pos_input_256.pickle', valid_pos_input)
save_pickle(path+'/data-bert/valid_neg_input_256.pickle', valid_neg_input)

save_pickle(path+'/data-bert/train_pos_type_id_256.pickle', train_pos_type_id)
save_pickle(path+'/data-bert/train_neg_type_id_256.pickle', train_neg_type_id)
save_pickle(path+'/data-bert/valid_pos_type_id_256.pickle', valid_pos_type_id)
save_pickle(path+'/data-bert/valid_neg_type_id_256.pickle', valid_neg_type_id)

save_pickle(path+'/data-bert/train_pos_mask_256.pickle', train_pos_att_mask)
save_pickle(path+'/data-bert/train_neg_mask_256.pickle', train_neg_att_mask)
save_pickle(path+'/data-bert/valid_pos_mask_256.pickle', valid_pos_att_mask)
save_pickle(path+'/data-bert/valid_neg_mask_256.pickle', valid_neg_att_mask)

In [0]:
train_pos_labels = load_pickle(path+'/data-bert/train_pos_labels.pickle')
train_neg_labels = load_pickle(path+'/data-bert/train_neg_labels.pickle')
valid_pos_labels = load_pickle(path+'/data-bert/valid_pos_labels.pickle')
valid_neg_labels = load_pickle(path+'/data-bert/valid_neg_labels.pickle')

train_pos_input = load_pickle(path+'/data-bert/train_pos_input_256.pickle')
train_neg_input = load_pickle(path+'/data-bert/train_neg_input_256.pickle')
valid_pos_input = load_pickle(path+'/data-bert/valid_pos_input_256.pickle')
valid_neg_input = load_pickle(path+'/data-bert/valid_neg_input_256.pickle')

train_pos_type_id = load_pickle(path+'/data-bert/train_pos_type_id_256.pickle')
train_neg_type_id = load_pickle(path+'/data-bert/train_neg_type_id_256.pickle')
valid_pos_type_id = load_pickle(path+'/data-bert/valid_pos_type_id_256.pickle')
valid_neg_type_id = load_pickle(path+'/data-bert/valid_neg_type_id_256.pickle')

train_pos_mask = load_pickle(path+'/data-bert/train_pos_mask_256.pickle')
train_neg_mask = load_pickle(path+'/data-bert/train_neg_mask_256.pickle')
valid_pos_mask = load_pickle(path+'/data-bert/valid_pos_mask_256.pickle')
valid_neg_mask = load_pickle(path+'/data-bert/valid_neg_mask_256.pickle')

In [0]:
train_pos_labels = train_pos_labels[:100000]
train_neg_labels = train_neg_labels[:100000]
train_pos_input = train_pos_input[:100000]
train_neg_input = train_neg_input[:100000]
train_pos_type_id = train_pos_type_id[:100000]
train_neg_type_id = train_neg_type_id[:100000]
train_pos_mask = train_pos_mask[:100000]
train_neg_mask = train_neg_mask[:100000]

valid_pos_labels = valid_pos_labels[:10000]
valid_neg_labels = valid_neg_labels[:10000]
valid_pos_input = valid_pos_input[:10000]
valid_neg_input = valid_neg_input[:10000]
valid_pos_type_id = valid_pos_type_id[:10000]
valid_neg_type_id = valid_neg_type_id[:10000]
valid_pos_mask = valid_pos_mask[:10000]
valid_neg_mask = valid_neg_mask[:10000]

In [0]:
train_pos_inputs = torch.tensor(train_pos_input)
train_neg_inputs = torch.tensor(train_neg_input)
valid_pos_inputs = torch.tensor(valid_pos_input)
valid_neg_inputs = torch.tensor(valid_neg_input)

train_pos_labels = torch.tensor(train_pos_labels)
train_neg_labels = torch.tensor(train_neg_labels)
valid_pos_labels = torch.tensor(valid_pos_labels)
valid_neg_labels = torch.tensor(valid_neg_labels)

train_pos_type_ids = torch.tensor(train_pos_type_id)
train_neg_type_ids = torch.tensor(train_neg_type_id)
valid_pos_type_ids = torch.tensor(valid_pos_type_id)
valid_neg_type_ids = torch.tensor(valid_neg_type_id)

train_pos_masks = torch.tensor(train_pos_mask)
train_neg_masks = torch.tensor(train_neg_mask)
valid_pos_masks = torch.tensor(valid_pos_mask)
valid_neg_masks = torch.tensor(valid_neg_mask)

In [14]:
print(len(train_pos_inputs))
print(len(valid_pos_inputs))

100000
10000


In [0]:
# The DataLoader needs to know our batch size for training, so we specify it 
# here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.

batch_size = 16

# Create the DataLoader for our training set.
train_data = TensorDataset(train_pos_inputs, train_pos_type_ids, train_pos_masks, train_pos_labels, train_neg_inputs, train_neg_type_ids, train_neg_masks, train_neg_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(valid_pos_inputs, valid_pos_type_ids, valid_pos_masks, valid_pos_labels, valid_neg_inputs, valid_neg_type_ids, valid_neg_masks, valid_neg_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [16]:
print(len(train_dataloader))
print(len(validation_dataloader))

6250
625


In [17]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", cache_dir=None, num_labels=2)

model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [0]:
def pairwise_loss(pos_scores, neg_scores):

    cross_entropy_loss = -torch.log(pos_scores) - torch.log(1 - neg_scores)

    margin = 1

    hinge_loss = torch.max(torch.tensor(0, dtype=torch.float).to(device), margin - pos_scores + neg_scores)

    loss = (0.5 * cross_entropy_loss + 0.5 * hinge_loss)

    return loss

In [0]:
def train_pairwise(model, train_dataloader, optimizer, scheduler):

    # Store the average loss after each epoch so we can plot them.
    loss_values = []

    # Reset the total loss for this epoch.
    total_loss = 0
    nb_eval_steps, nb_eval_examples = 0, 0

    model.train()

    # For each batch of training data...
    for step, batch in enumerate(tqdm(train_dataloader)):

        # `batch` contains three pytorch tensors:
        pos_input = batch[0].to(device)
        pos_type_id = batch[1].to(device)
        pos_mask = batch[2].to(device)
        pos_labels = batch[3].to(device)

        neg_input = batch[4].to(device)
        neg_type_id = batch[5].to(device)
        neg_mask = batch[6].to(device)
        neg_labels = batch[7].to(device)

        model.zero_grad()        

        pos_scores = softmax(model(pos_input, token_type_ids=pos_type_id, attention_mask=pos_mask, labels=pos_labels)[1], dim=1)[:,1]
        neg_scores = softmax(model(neg_input, token_type_ids=neg_type_id, attention_mask=neg_mask, labels=neg_labels)[1], dim=1)[:,1]
        
        loss = pairwise_loss(pos_scores, neg_scores).mean()
        
        # Track the number of batches
        nb_eval_steps += 1

        # Accumulate the training loss over all of the batches
        total_loss += loss.item()
    
        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    return avg_train_loss

In [0]:
def validate_pairwise(model, validation_dataloader):

    model.eval()

    # Tracking variables 
    total_loss = 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in tqdm(validation_dataloader):
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        pos_input, pos_type_ids, pos_mask, pos_labels, neg_input, neg_type_ids, neg_mask, neg_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        
            pos_scores = softmax(model(pos_input, token_type_ids=pos_type_ids, attention_mask=pos_mask, labels=pos_labels)[1], dim=1)[:,1]
            neg_scores = softmax(model(neg_input, token_type_ids=neg_type_ids, attention_mask=neg_mask, labels=neg_labels)[1], dim=1)[:,1]
        
        loss = pairwise_loss(pos_scores, neg_scores).mean()
        # Track the number of batches
        nb_eval_steps += 1

        total_loss += loss.item()

    avg_loss = total_loss / len(validation_dataloader) 

    return avg_loss

In [0]:
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup


# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-8, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [0]:
# Lowest validation lost
best_valid_loss = float('inf')

for epoch in range(epochs):

    # Evaluate training loss
    train_loss = train_pairwise(model, train_dataloader, optimizer, scheduler)
    # Evaluate validation loss
    valid_loss = validate_pairwise(model, validation_dataloader)
    
    # At each epoch, if the validation loss is the best
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
    torch.save(model.state_dict(), path + 'model/' + str(epoch+1)+'_pairwise6.pt')

    print("\n\n Epoch {}:".format(epoch+1))
    print("\t Train Loss: {}".format(round(train_loss, 3)))
    print("\t Validation Loss: {}\n".format(round(valid_loss, 3)))

 73%|███████▎  | 4586/6250 [1:01:08<22:10,  1.25it/s]

## **Pointwise**

In [17]:
trainset = get_sequence_df(train_set)
train_questions = trainset.question.values
train_answers = trainset.ans_cand.values
train_labels = trainset.label.values

train_input, train_type_id, train_att_mask = get_input(train_questions, train_answers, 256)

validset = get_sequence_df(valid_set)
valid_questions = validset.question.values
valid_answers = validset.ans_cand.values
valid_labels = validset.label.values

valid_input, valid_type_id, valid_att_mask = get_input(valid_questions, valid_answers, 256)

100%|██████████| 298662/298662 [28:54<00:00, 172.23it/s]
100%|██████████| 33195/33195 [02:31<00:00, 219.58it/s]


In [0]:
save_pickle(path+'/pointwise-data/train_labels.pickle', train_labels)
save_pickle(path+'/pointwise-data/valid_labels.pickle', valid_labels)

save_pickle(path+'/pointwise-data/train_input_256.pickle', train_input)
save_pickle(path+'/pointwise-data/valid_input_256.pickle', valid_input)

save_pickle(path+'/pointwise-data/train_type_id_256.pickle', train_type_id)
save_pickle(path+'/pointwise-data/valid_type_id_256.pickle', valid_att_mask)

save_pickle(path+'/pointwise-data/train_mask_256.pickle', train_att_mask)
save_pickle(path+'/pointwise-data/valid_mask_256.pickle', valid_att_mask)

In [0]:
train_labels = load_pickle(path+'/pointwise-data/train_labels.pickle')
valid_labels = load_pickle(path+'/pointwise-data/valid_labels.pickle')

train_input = load_pickle(path+'/pointwise-data/train_input_256.pickle')
valid_input = load_pickle(path+'/pointwise-data/valid_input_256.pickle')

train_type_id = load_pickle(path+'/pointwise-data/train_type_id_256.pickle')
valid_type_id = load_pickle(path+'/pointwise-data/valid_type_id_256.pickle')

train_att_mask = load_pickle(path+'/pointwise-data/train_mask_256.pickle')
valid_att_mask = load_pickle(path+'/pointwise-data/valid_mask_256.pickle')

In [20]:
print(len(train_input))
print(len(valid_input))

298662
33195


In [0]:
train_labels = train_labels[:10000]
train_input = train_input[:10000]
train_type_id = train_type_id[:10000]
train_att_mask = train_att_mask[:10000]

valid_labels = valid_labels[:1000]
valid_input = valid_input[:1000]
valid_type_id = valid_type_id[:1000]
valid_att_mask = valid_att_mask[:1000]

In [22]:
print(len(train_input))
print(len(valid_input))

10000
1000


In [0]:
# Convert all inputs and labels into torch tensors, the required datatype 
# for our model.
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(valid_labels)

train_inputs = torch.tensor(train_input)
validation_inputs = torch.tensor(valid_input)

train_type_ids = torch.tensor(train_type_id)
validation_type_ids = torch.tensor(valid_type_id)

train_masks = torch.tensor(train_att_mask)
validation_masks = torch.tensor(valid_att_mask)

In [0]:
# The DataLoader needs to know our batch size for training, so we specify it 
# here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.

batch_size = 32

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_type_ids, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_type_ids, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [25]:
print(len(train_dataloader))
print(len(validation_dataloader))

313
32


In [0]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

## **Model**

In [27]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# # Load BertForSequenceClassification, the pretrained BERT model with a single 
# # linear classification layer on top. 
# model_path = "/content/drive/My Drive/FiQA/model/fin_model"
# model = BertForSequenceClassification.from_pretrained(model_path, cache_dir=None, num_labels=2)

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", cache_dir=None, num_labels=2)
model.to(device)

HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…




BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [0]:
def train(model, train_dataloader, optimizer, scheduler):

    # Store the average loss after each epoch so we can plot them.
    loss_values = []

    # Reset the total loss for this epoch.
    total_loss = 0
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    model.train()

    # For each batch of training data...
    for step, batch in enumerate(tqdm(train_dataloader)):

        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: token_type_ids
        #   [2]: attention masks
        #   [3]: labels 
        b_input_ids = batch[0].to(device)
        b_token_type_ids = batch[1].to(device)
        b_input_mask = batch[2].to(device)
        b_labels = batch[3].to(device)

        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        outputs = model(b_input_ids, 
                    token_type_ids=b_token_type_ids, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        # The call to `model` always returns a tuple, so we need to pull the 
        # loss value out of the tuple.
        loss = outputs[0]

        logits = outputs[1]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

        # Accumulate the training loss over all of the batches
        total_loss += loss.item()
    
        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    acc = eval_accuracy/nb_eval_steps

    return avg_train_loss, acc

In [0]:
def validate(model, validation_dataloader):

    model.eval()

    # Tracking variables 
    total_loss = 0
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in tqdm(validation_dataloader):
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        b_input_ids, b_token_type_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        
            outputs = model(b_input_ids, 
                            token_type_ids=b_token_type_ids, 
                            attention_mask=b_input_mask,
                            labels=b_labels)
        
        loss = outputs[0]

        logits = outputs[1]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

        total_loss += loss.item()

    acc = eval_accuracy/nb_eval_steps
    avg_loss = total_loss / len(validation_dataloader) 

    return avg_loss, acc

In [0]:
from transformers import AdamW


# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [0]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs (authors recommend between 2 and 4)
epochs = 2

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [0]:
# Lowest validation lost
best_valid_loss = float('inf')

for epoch in range(epochs):

    # Evaluate training loss
    train_loss, train_acc = train(model, train_dataloader, optimizer, scheduler)
    
    # Evaluate validation loss
    valid_loss, valid_acc = validate(model, validation_dataloader)
    
    # At each epoch, if the validation loss is the best
    # if valid_loss < best_valid_loss:
    #     best_valid_loss = valid_loss
    torch.save(model.state_dict(), path + 'model/' + str(epoch+1)+'_finbert_test.pt')
    # torch.save(model.state_dict(), 'fin-bert_test2.pt')
    
    print("\n\n Epoch {}:".format(epoch+1))
    print("\t Train Loss: {} | Train Accuracy: {}%".format(round(train_loss, 3), round(train_acc*100, 2)))
    print("\t Validation Loss: {} | Validation Accuracy: {}%\n".format(round(valid_loss, 3), round(valid_acc*100, 2)))

100%|██████████| 313/313 [13:01<00:00,  2.15s/it]
100%|██████████| 32/32 [00:27<00:00,  1.43it/s]
  0%|          | 0/313 [00:00<?, ?it/s]



 Epoch 1:
	 Train Loss: 0.09 | Train Accuracy: 96.87%
	 Validation Loss: 0.634 | Validation Accuracy: 83.01%



 52%|█████▏    | 163/313 [06:47<06:14,  2.50s/it]

In [0]:
for row in test_set:
    row[2] = [x for x in row[2] if x is not 0]

for row in test_set_full:
    row[2] = [x for x in row[2] if x is not 0]

In [0]:
from torch.nn.functional import softmax

## **Eval**

In [0]:
def get_rank(model, test_set, qid_rel, max_seq_len):

    qid_pred_rank = {}

    model.eval()

    for i, seq in enumerate(tqdm(test_set)):
        
        qid, label, cands = seq[0], seq[1], seq[2]

        q_text = qid_to_text[qid]

        cands_id = np.array(cands)

        scores = []

        for docid in cands:

            ans_text = docid_to_text[docid]

            encoded_seq = tokenizer.encode_plus(q_text, ans_text, 
                                            max_length=max_seq_len, 
                                            pad_to_max_length=True, 
                                            return_token_type_ids=True,
                                            return_attention_mask = True)

            input_ids = torch.tensor([encoded_seq['input_ids']]).to(device)
            token_type_ids = torch.tensor([encoded_seq['token_type_ids']]).to(device)
            att_mask = torch.tensor([encoded_seq['attention_mask']]).to(device)

            with torch.no_grad():
            # Forward pass, calculate logit predictions
                outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=att_mask)

            logits = outputs[0]

            pred = softmax(logits, dim=1)
            # pred = torch.sigmoid(logits)

            # Move logits and labels to CPU
            pred = pred.detach().cpu().numpy()

            scores.append(pred[:,1][0])

        # print(scores)

        # Get the indices of the sorted similarity scores
        sorted_index = np.argsort(scores)[::-1]

        # Get the docid from the sorted indices
        ranked_ans = cands_id[sorted_index]

        # Dict - key: qid, value: ranked list of docids
        qid_pred_rank[qid] = ranked_ans

    return qid_pred_rank

In [0]:
toy_test_label = dict(itertools.islice(test_qid_rel.items(), 15))
toy_test = test_set[:15]
# toy_test = [[14, [398960], [84963, 14255, 398960]],
#             [68, [19183], [107584, 562777, 19183]],
#             [70, [327002], [107584, 327002, 19183]]]

In [28]:
model.load_state_dict(torch.load(path+'model/3_pairwise5.pt'))

# qid_pred_rank = get_rank(model, test_set, test_qid_rel, max_seq_len=512)
qid_pred_rank = get_rank(model, toy_test, toy_test_label, max_seq_len=256)

100%|██████████| 15/15 [02:27<00:00,  9.89s/it]


In [29]:
k = 10

num_q = len(test_set)

# MRR, average_ndcg, precision, rank_pos = evaluate(qid_pred_rank, test_qid_rel, k)
MRR, average_ndcg, precision, rank_pos = evaluate(qid_pred_rank, toy_test_label, k)

print("\n\nAverage nDCG@{} for {} queries: {}\n".format(k, num_q, average_ndcg))

print("MRR@{} for {} queries: {}\n".format(k, num_q, MRR))

print("Average Precision@{}: {}".format(1, precision))



Average nDCG@10 for 333 queries: 0.4510568402073561

MRR@10 for 333 queries: 0.44166666666666665

Average Precision@1: 0.4


In [0]:
k = 10

num_q = len(test_set)

# MRR, average_ndcg, precision, rank_pos = evaluate(qid_pred_rank, test_qid_rel, k)
MRR, average_ndcg, precision, rank_pos = evaluate(qid_pred_rank, toy_test_label, k)

print("\n\nAverage nDCG@{} for {} queries: {}\n".format(k, num_q, average_ndcg))

print("MRR@{} for {} queries: {}\n".format(k, num_q, MRR))

print("Average Precision@{}: {}".format(1, precision))



Average nDCG@10 for 333 queries: 0

MRR@10 for 333 queries: 0.0

Average Precision@1: 0.0


In [0]:
save_pickle(path+'rank/rank_bert_256_full.pickle', qid_pred_rank)