In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pickle
from collections import Counter
from tqdm import tqdm
import itertools
import pandas as pd
from itertools import islice
import numpy as np
from keras.preprocessing.sequence import pad_sequences
import random
!pip install transformers
import torch
from transformers import BertTokenizer, BertModel, BertConfig, BertForSequenceClassification
from torch.nn import CrossEntropyLoss
from transformers import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Setting device on GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

torch.backends.cudnn.deterministic = True

# Set the random seed manually for reproducibility.
torch.manual_seed(1234)

path = "drive/My Drive/FiQA/"

Using TensorFlow backend.


Using device: cuda

Tesla P100-PCIE-16GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [0]:
from evaluate import *

In [0]:
def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

def remove_empty(test_set):
    for index, row in enumerate(test_set):
        for doc in row[1]:
            if doc in empty_docs:
                del test_set[index]
    return test_set

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def save_pickle(path, data):
    with open(path, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def pad_seq(seq, max_seq_len):
    # Pad each seq to be the same length to process in batch.
    # pad_token = 0
    if len(seq) >= max_seq_len:
        seq = seq[:max_seq_len]
    else:
        seq += [0]*(max_seq_len - len(seq))
    return seq

In [0]:
# dict mapping of token to idx
# vocab = load_pickle(path + 'vocab_full.pickle')
# # dict mapping of docid to doc text
# docid_to_text = load_pickle(path + 'label_ans.pickle')

# # dict mapping of qid to question text
# qid_to_text = load_pickle(path + 'qid_text.pickle')

train_qid_rel = load_pickle(path + "new-data/qid_rel_train.pickle")
test_qid_rel = load_pickle(path + "new-data/qid_rel_test.pickle")
valid_qid_rel = load_pickle(path + "new-data/qid_rel_valid.pickle")

train_set = load_pickle(path + 'new-data/data_50/train_set_50.pickle')
valid_set = load_pickle(path + 'new-data/data_50/valid_set_50.pickle')

test_set = load_pickle(path + 'new-data/data_50/test_set_50.pickle')
test_set_full = load_pickle(path + 'new-data/data_50/test_set_full_50.pickle')

# empty_docs = load_pickle(path+'empty_docs.pickle')

In [0]:
# train_set = [x for x in train_set if x[1] not in empty_docs]
# valid_set = [x for x in valid_set if x[1] not in empty_docs]

# test_set = remove_empty(test_set)
# test_set_full = remove_empty(test_set_full)

print("Number of training samples: {}".format(len(train_set)))
print("Number of validation samples: {}".format(len(valid_set)))
print("Number of test samples: {}".format(len(test_set)))

Number of training samples: 284050
Number of validation samples: 31600
Number of test samples: 333


In [0]:
collection = pd.read_csv(path+"data-bert/collection_new.tsv", sep="\t", header=None)
collection = collection.rename(columns={0: 'docid', 1: 'doc'})

def load_questions(path):
    """
    Returns a dataframe of cols: qid, question
    """
    # Question ID and Question text
    query_df = pd.read_csv(path, sep="\t")
    queries = query_df[['qid', 'question']]

    return queries

queries = load_questions(path + "FiQA_train_question_final.tsv")

# Question to question text
qid_to_text = {}

for index, row in queries.iterrows():
    qid_to_text[row['qid']] = row['question']

docid_to_text = {}

for index, row in collection.iterrows():
    docid_to_text[row['docid']] = row['doc']

In [0]:
# # Load the BERT tokenizer.
# print('Loading BERT tokenizer...')
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [0]:
from tokenizers import BertWordPieceTokenizer
from tokenizers.processors import BertProcessing

tokenizer = BertWordPieceTokenizer(path+"bert-lm/lm_model/vocab.txt")
tokenizer._tokenizer.post_processor = BertProcessing(
    ("[SEP]", tokenizer.token_to_id("[CLS]")),
    ("[CLS]", tokenizer.token_to_id("[SEP]")),
)
tokenizer.enable_truncation(max_length=256)
tokenizer.enable_padding(max_length=256)

In [0]:
def add_ques_token(string):
    question = string + " [SEP] "

    return question

In [0]:
def get_sequence_df(dataset):
    df = pd.DataFrame(dataset)
    df = df.rename(columns={0: 'qid', 1: 'pos', 2:'neg'})
    df_pos = df[['qid', 'pos']]
    df_pos = df_pos.rename(columns={'pos': 'docid'})
    df_pos['label'] = df_pos.apply(lambda x: 1, axis=1)
    df_pos = df_pos.drop_duplicates()

    df_neg = df[['qid', 'neg']]
    df_neg = df_neg.rename(columns={'neg': 'docid'})
    df_neg['label'] = df_neg.apply(lambda x: 0, axis=1)
    data_df = pd.concat([df_pos, df_neg]).sort_values(by=['qid'])

    data_df['question'] = data_df['qid'].apply(lambda x: qid_to_text[x])
    data_df['ans_cand'] = data_df['docid'].apply(lambda x: docid_to_text[x])

    return data_df

In [0]:
# Get the lists of sentences and their labels.
train_data = get_sequence_df(train_set)

In [0]:
train_data.head(5)

Unnamed: 0,qid,docid,label,question,ans_cand
0,0,18850,1,What is considered a business expense on a bus...,The IRS Guidance pertaining to the subject. I...
21,0,245117,0,What is considered a business expense on a bus...,The dividend yield can be used to compare a st...
22,0,351137,0,What is considered a business expense on a bus...,I haven't seen anyone mention tax consideratio...
23,0,242850,0,What is considered a business expense on a bus...,"As a common shareholder, why would I want to a..."
24,0,355972,0,What is considered a business expense on a bus...,Will the bank be taxed on the $x received thro...


In [0]:
def get_data(data_set):# Get the lists of sentences and their labels.
    dataset = get_sequence_df(data_set)
    data_questions = dataset.question.values
    data_answers = dataset.ans_cand.values
    data_labels = dataset.label.values

    data_inputs = []
    data_type_ids = []
    data_att_masks = []


    for i in range(len(data_questions)):

        tokenized_seq = tokenizer.encode(data_questions[i], data_answers[i])

        data_inputs.append(tokenized_seq.ids)
        data_type_ids.append(tokenized_seq.type_ids)
        data_att_masks.append(tokenized_seq.attention_mask)

    return data_inputs, data_type_ids, data_att_masks, data_labels

In [0]:
train_inputs, train_type_ids, train_att_masks, train_labels = get_data(train_set)
valid_inputs, valid_type_ids, valid_att_masks, valid_labels = get_data(valid_set)

In [0]:
len(train_inputs)

29224

In [0]:
save_pickle(path+"data-finbert/train_inputs.pickle", train_inputs)
save_pickle(path+"data-finbert/train_type_id.pickle", train_type_id)
save_pickle(path+"data-finbert/train_att_masks.pickle", train_att_masks)
save_pickle(path+"data-finbert/train_labels.pickle", train_labels)

In [0]:
train_inputs = load_pickle(path+"data-finbert/train_inputs.pickle")
train_type_id = load_pickle(path+"data-finbert/train_type_id.pickle")
train_att_masks = load_pickle(path+"data-finbert/train_att_masks.pickle")
train_labels = load_pickle(path+"data-finbert/train_labels.pickle")

In [0]:
save_pickle(path+"data-finbert/valid_inputs.pickle", valid_inputs)
save_pickle(path+"data-finbert/valid_type_id.pickle", valid_type_ids)
save_pickle(path+"data-finbert/valid_att_masks.pickle", valid_att_masks)
save_pickle(path+"data-finbert/valid_labels.pickle", valid_labels)

In [0]:
valid_inputs = load_pickle(path+"data-finbert/valid_inputs.pickle")
valid_type_ids = load_pickle(path+"data-finbert/valid_type_id.pickle")
valid_att_masks = load_pickle(path+"data-finbert/valid_att_masks.pickle")
valid_labels = load_pickle(path+"data-finbert/valid_labels.pickle")

In [0]:
print(len(train_inputs))
print(len(valid_inputs))

29224
3190


In [0]:
train_inputs = train_inputs[:10000]
train_type_id = train_type_id[:10000]
train_att_masks = train_att_masks[:10000]
train_labels = train_labels[:10000]

valid_inputs = train_inputs[:1000]
valid_type_ids = valid_type_ids[:1000]
valid_att_masks = train_att_masks[:1000]
valid_labels = train_labels[:1000]

## **Pointwise**

In [0]:
# Convert all inputs and labels into torch tensors, the required datatype 
# for our model.
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(valid_labels)

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(valid_inputs)

train_type_ids = torch.tensor(train_type_id)
validation_type_ids = torch.tensor(valid_type_ids)

train_masks = torch.tensor(train_att_masks)
validation_masks = torch.tensor(valid_att_masks)

In [0]:
print(len(train_inputs))
print(len(validation_inputs))

1000
100


In [0]:
# The DataLoader needs to know our batch size for training, so we specify it 
# here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.

batch_size = 32

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_type_ids, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_type_ids, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [0]:
print(len(train_dataloader))
print(len(validation_dataloader))

914
100


In [0]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

## **Model**

In [0]:
# import torch.nn as nn

# class BertClassifier(nn.Module):
#     def __init__(self, bert):
        
#         super().__init__()

#         self.config = BertConfig.from_pretrained(path+"model/fin_model/config.json")
#         # self.config = BertConfig.from_pretrained("/content/drive/My Drive/FiQA/bert-lm/weights2/config.json")
#         # self.config = BertConfig()
#         self.num_labels = self.config.num_labels
#         self.bert = bert
#         self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
#         self.classifier = nn.Linear(self.config.hidden_size, self.config.num_labels)

#     def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None):

#         outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds)

#         pooled_output = outputs[1]

#         pooled_output = self.dropout(pooled_output)
#         logits = self.classifier(pooled_output)

#         outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

#         if labels is not None:
#             loss_fct = CrossEntropyLoss()
#             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
#             outputs = (loss,) + outputs

#         return outputs  # (loss), logits, (hidden_states), (attentions)

In [0]:
model_path = "/content/drive/My Drive/FiQA/bert-lm/lm_model"
model = BertForSequenceClassification.from_pretrained(model_path,cache_dir=None, num_labels=2)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28989, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [0]:
def train(model, train_dataloader, optimizer, scheduler):

    # Store the average loss after each epoch so we can plot them.
    loss_values = []

    # Reset the total loss for this epoch.
    total_loss = 0
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    model.train()

    # For each batch of training data...
    for step, batch in enumerate(tqdm(train_dataloader)):

        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: token_type_ids
        #   [2]: attention masks
        #   [3]: labels 
        b_input_ids = batch[0].to(device)
        b_token_type_ids = batch[1].to(device)
        b_input_mask = batch[2].to(device)
        b_labels = batch[3].to(device)

        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        outputs = model(b_input_ids, 
                    token_type_ids=b_token_type_ids, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        # The call to `model` always returns a tuple, so we need to pull the 
        # loss value out of the tuple.
        loss = outputs[0]

        logits = outputs[1]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

        # Accumulate the training loss over all of the batches
        total_loss += loss.item()
    
        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    acc = eval_accuracy/nb_eval_steps

    return avg_train_loss, acc

In [0]:
def validate(model, validation_dataloader):

    model.eval()

    # Tracking variables 
    total_loss = 0
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in tqdm(validation_dataloader):
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        b_input_ids, b_token_type_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        
            outputs = model(b_input_ids, 
                            token_type_ids=b_token_type_ids, 
                            attention_mask=b_input_mask,
                            labels=b_labels)
        
        loss = outputs[0]

        logits = outputs[1]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

        total_loss += loss.item()

    acc = eval_accuracy/nb_eval_steps
    avg_loss = total_loss / len(validation_dataloader) 

    return avg_loss, acc

In [0]:
from transformers import AdamW


# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-8, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [0]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [0]:
# Lowest validation lost
best_valid_loss = float('inf')

for epoch in range(epochs):

    # Evaluate training loss
    train_loss, train_acc = train(model, train_dataloader, optimizer, scheduler)
    
    # Evaluate validation loss
    valid_loss, valid_acc = validate(model, validation_dataloader)
    
    # At each epoch, if the validation loss is the best
    # if valid_loss < best_valid_loss:
    #     best_valid_loss = valid_loss
    torch.save(model.state_dict(), path + 'model/' + str(epoch+1)+'_lmm.pt')
    # torch.save(model.state_dict(), 'fin-bert_test2.pt')
    
    print("\n\n Epoch {}:".format(epoch+1))
    print("\t Train Loss: {} | Train Accuracy: {}%".format(round(train_loss, 3), round(train_acc*100, 2)))
    print("\t Validation Loss: {} | Validation Accuracy: {}%\n".format(round(valid_loss, 3), round(valid_acc*100, 2)))

100%|██████████| 313/313 [04:02<00:00,  1.47it/s]
100%|██████████| 32/32 [00:07<00:00,  4.20it/s]
  0%|          | 0/313 [00:00<?, ?it/s]



 Epoch 1:
	 Train Loss: 0.595 | Train Accuracy: 90.48%
	 Validation Loss: 0.574 | Validation Accuracy: 97.17%



 27%|██▋       | 84/313 [01:05<02:58,  1.29it/s]

KeyboardInterrupt: ignored

In [0]:
from scipy.special import softmax

In [0]:
def get_rank(model, test_set, qid_rel, max_seq_len):

    qid_pred_rank = {}

    model.eval()

    for i, seq in enumerate(tqdm(test_set)):
        
        qid, label, cands = seq[0], seq[1], seq[2]

        q_text = qid_to_text[qid]

        cands_id = np.array(cands)

        scores = []

        for docid in cands:

            ans_text = docid_to_text[docid]
            
            tokenized_seq = tokenizer.encode(q_text, ans_text)

            input_id = tokenized_seq.ids
            type_ids = tokenized_seq.type_ids
            att_masks = tokenized_seq.attention_mask
            
            input_ids = torch.tensor([input_id]).to(device)
            token_type_ids = torch.tensor([type_ids]).to(device)
            att_mask = torch.tensor([att_masks]).to(device)

            with torch.no_grad():
            # Forward pass, calculate logit predictions
                outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=att_mask)

            logits = outputs[0]

            # print(logits)

            # logits = logits.detach().cpu()

            # pred = softmax(logits, axis=1)

            # print(pred)

            pred = torch.sigmoid(logits)

            # Move logits and labels to CPU
            pred = pred.detach().cpu().numpy()

            scores.append(pred[:,1][0])

        print(scores)

    #     # Get the indices of the sorted similarity scores
    #     sorted_index = np.argsort(scores)[::-1]

    #     # Get the docid from the sorted indices
    #     ranked_ans = cands_id[sorted_index]

    #     # Dict - key: qid, value: ranked list of docids
    #     qid_pred_rank[qid] = ranked_ans

    # return qid_pred_rank

In [0]:
toy_test_label = dict(itertools.islice(test_qid_rel.items(), 2))
# toy_test = test_set[:2]
toy_test = [[14, [14255], [84963, 354716, 398960]],
            [68, [107584, 562777], [107584, 19183, 84963]]]

In [0]:
model.load_state_dict(torch.load(path+'model/1_lmm.pt'))

# qid_pred_rank = get_rank(model, test_set_full, test_qid_rel, max_seq_len=512)
qid_pred_rank = get_rank(model, toy_test, toy_test_label, max_seq_len=256)


  0%|          | 0/2 [00:00<?, ?it/s][A
100%|██████████| 2/2 [00:00<00:00, 19.85it/s][A

[0.46642703, 0.46642703, 0.46642703]
[0.46642703, 0.46642703, 0.46642703]


In [0]:
k = 10

num_q = len(test_set)

# MRR, average_ndcg, precision = evaluate(qid_pred_rank, test_qid_rel, k)
MRR, average_ndcg, precision = evaluate(qid_pred_rank, toy_test_label, k)

print("\n\nAverage nDCG@{} for {} queries: {}\n".format(k, num_q, average_ndcg))

print("MRR@{} for {} queries: {}\n".format(k, num_q, MRR))

print("Average Precision@{}: {}".format(1, precision))



Average nDCG@10 for 330 queries: 0.023979694423373953

MRR@10 for 330 queries: 0.010317460317460315

Average Precision@1: 0.0


In [0]:
save_pickle(path+'rank/rank_lm_bert_test_full.pickle', qid_pred_rank)