In [1]:
!pip install transformers==4.19.2
!pip install rouge_metric
!pip install nltk==3.6.5
!pip install sentencepiece
from google.colab import drive
drive.mount('./mydata')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.19.2
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 4.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 41.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.6 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 69.3 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uni

In [2]:
import sys

# Make the project packages that live on the mounted Drive importable.
sys.path.extend([
    './mydata/MyDrive/CSNLP_Project/Bert_model_COQA',
    './mydata/MyDrive/CSNLP_Project/T5_model_COQAR',
    './mydata/MyDrive/CSNLP_Project/T5_model_COQAR/rewriting',
])

In [3]:
import collections
import glob
import os
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange
from transformers import (AdamW, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup, BertTokenizer, BertModel, BertConfig)
from data.processors.coqa import Extract_Features, Processor, Result
from data.processors.metrics import get_predictions
from data.processors.evaluate import CoQAEvaluator, parse_args
from transformers import BertModel, BertPreTrainedModel
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
import csv
import numpy as np

import evaluation
import argparse
import qrdatasets
import models
from utils import *
import random
import t5small
import t5base
import nltk

import config
import json

# --- CoQA dataset files (opened relative to `input_dir`) ---
train_file = "coqa-train-v1.0.json"
predict_file = "coqa-dev-v1.0.json"

# --- Directories on the mounted Drive, resolved against the current working directory ---
cur_path = os.getcwd()
input_dir = cur_path + "/mydata/MyDrive/CSNLP_Project/Bert_model_COQA/data"
output_directory = input_dir + "/Bert_models"

# --- BERT checkpoint name: either BERT base or BERT large can be used ---
pretrained_model = "bert-base-uncased"
# pretrained_model = "bert-large-uncased"

# Fine-tuning BERT-base for 4 epochs works better than for a single epoch.
epochs = 4
train_batch_size = 2
evaluation_batch_size = 1

## Classes and Functions

In [4]:
class BertBaseUncasedModel(BertPreTrainedModel):
    """BERT encoder with rationale-gated span, yes/no and unknown heads.

    Adapted from the baseline model of https://arxiv.org/pdf/1909.10772.pdf.

    A single hidden projection (``fc``) is shared by every head and is followed
    by one of two small output layers: ``linear1`` (one score) or ``linear2``
    (two scores: start/end or yes/no).

    NOTE(review): the inline formulas describe ``W2 * act(W1 * h)``, but the
    code applies the activation *after* the second linear layer, i.e.
    ``act(W2 * (W1 * h))``. It is left unchanged so that checkpoints saved with
    this class keep producing the same outputs.
    """

    def __init__(self,config,activation='relu'):
        """Create the encoder and the head layers.

        Args:
            config: BERT configuration; ``config.hidden_size`` sizes all heads.
            activation: name of a function in ``torch.nn.functional`` that is
                applied after each output layer.
        """
        super().__init__(config)
        self.bert = BertModel(config)
        hidden_size = config.hidden_size
        self.fc = nn.Linear(hidden_size, hidden_size)
        self.linear1 = nn.Linear(hidden_size, 1)
        self.linear2 = nn.Linear(hidden_size, 2)
        self.activation = getattr(F, activation)
        self.init_weights()

    def forward(self,input_ids,token_type_ids=None,attention_mask=None,start_positions=None,end_positions=None,rational_mask=None,cls_idx=None,head_mask=None):
        """Run the encoder and all heads.

        Args:
            input_ids, attention_mask, head_mask: forwarded to ``BertModel``.
            token_type_ids: forwarded to ``BertModel`` and also used as the
                mask applied to the rationale scores; required.
            start_positions, end_positions: gold positions; when both are
                given the method returns the training loss.
            rational_mask: accepted for interface compatibility; not used.
            cls_idx: offset added to the gold positions before the loss
                (presumably accounts for the yes/no/unknown slots that are
                concatenated in front of the span logits — TODO confirm
                against the feature extractor). Required for training.
            
        Returns:
            The scalar loss ``(start_loss + end_loss) / 2`` when both
            ``start_positions`` and ``end_positions`` are given, otherwise the
            tuple ``(start_logits, end_logits, yes_logits, no_logits, unk_logits)``.

        Raises:
            ValueError: if ``token_type_ids`` is missing, or if ``cls_idx`` is
                missing while gold positions are supplied.
        """
        if token_type_ids is None:
            # Fail early with a clear message instead of an AttributeError on None below.
            raise ValueError("token_type_ids is required: it is used to mask the rationale scores")

        #   Bert-base outputs
        outputs = self.bert(input_ids,token_type_ids=token_type_ids,attention_mask=attention_mask,head_mask=head_mask)
        # Index instead of unpacking: indexing works both for plain tuples and for
        # dict-like ModelOutput objects, whose iteration yields field names rather than tensors.
        output_vector, bert_pooled_output = outputs[0], outputs[1]

        #   rationale score per token, squashed to (0, 1):  pr = sigmoid(head(h))
        rational_logits = self.fc(output_vector)
        rational_logits = self.activation(self.linear1(rational_logits))
        rational_logits = torch.sigmoid(rational_logits)

        #   h1 = pr x h : gate the encoder output with the rationale score
        output_vector = output_vector * rational_logits
        mask = token_type_ids.type(output_vector.dtype)
        rational_logits = rational_logits.squeeze(-1) * mask

        #   start and end logits computed from the gated representation h1
        start_end_logits = self.fc(output_vector)
        start_end_logits = self.activation(self.linear2(start_end_logits))

        start_logits, end_logits = start_end_logits.split(1, dim=-1)
        start_logits, end_logits = start_logits.squeeze(-1), end_logits.squeeze(-1)
        start_logits = start_logits * rational_logits
        end_logits = end_logits * rational_logits

        #   attention scores over h1, normalized with a softmax:  a = SoftMax(head(h1))
        attention = self.fc(output_vector)
        attention = (self.activation(self.linear1(attention))).squeeze(-1)
        attention = F.softmax(attention, dim=-1)
        attention_pooled_output = (attention.unsqueeze(-1) * output_vector).sum(dim=-2)

        #   "unknown" logit from BERT's own pooled output
        unk_logits = self.fc(bert_pooled_output)
        unk_logits = self.activation(self.linear1(unk_logits))

        #   yes and no logits from the attention-pooled output
        yes_no_logits = self.fc(attention_pooled_output)
        yes_no_logits = self.activation(self.linear2(yes_no_logits))
        yes_logits, no_logits = yes_no_logits.split(1, dim=-1)

        if start_positions is not None and end_positions is not None:
            if cls_idx is None:
                raise ValueError("cls_idx is required when start_positions and end_positions are given")
            start_positions, end_positions = start_positions + cls_idx, end_positions + cls_idx
            # yes / no / unknown are prepended so they compete with the span positions.
            start = torch.cat((yes_logits, no_logits, unk_logits, start_logits), dim=-1)
            end = torch.cat((yes_logits, no_logits, unk_logits, end_logits), dim=-1)
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)

            #   cross entropy loss for start and end logits
            Entropy_loss = CrossEntropyLoss()
            start_loss = Entropy_loss(start, start_positions)
            end_loss = Entropy_loss(end, end_positions)
            #   Training objective: minimize the mean of the start and end losses
            total_loss = (start_loss + end_loss) / 2
            return total_loss
        return start_logits, end_logits, yes_logits, no_logits, unk_logits

In [5]:
### load dataset for training or evaluation

def load_dataset(tokenizer, evaluate=True, cache_file_name=None, predict_file_name=None):
    '''
    Convert the raw CoQA json file into BERT features, caching the result on disk.

    Args:
        tokenizer: tokenizer handed to ``Extract_Features``.
        evaluate: True to process the dev file, False to process ``train_file``.
        cache_file_name: cache file name inside ``input_dir``; when None the
            default "bert-base-uncased_dev" / "bert-base-uncased_train" is used.
        predict_file_name: dev json file name inside ``input_dir``; when None
            the module-level ``predict_file`` is used. Ignored when
            ``evaluate`` is False.

    Returns:
        ``(dataset, examples, features)`` when ``evaluate`` is True,
        otherwise only ``dataset``.

    Raises:
        ValueError: if the name of the json file to process is empty.
    '''
    # use the user-defined cache_file_name or the default ones
    if cache_file_name is not None:
        cache_file = os.path.join(input_dir, cache_file_name)
    elif evaluate:
        cache_file = os.path.join(input_dir, "bert-base-uncased_dev")
    else:
        cache_file = os.path.join(input_dir, "bert-base-uncased_train")

    if os.path.exists(cache_file):
        print("Loading cache", cache_file)
        # NOTE: torch.load unpickles the file — only load cache files created by this notebook.
        features_and_dataset = torch.load(cache_file)
        features, dataset, examples = (
            features_and_dataset["features"], features_and_dataset["dataset"], features_and_dataset["examples"])
    else:
        print("Creating features from dataset file at", input_dir)
        # Resolve the source file into a *new* local name. Assigning to `predict_file`
        # here would make it function-local and raise UnboundLocalError whenever
        # predict_file_name is None.
        if evaluate:
            source_file = predict_file_name if predict_file_name is not None else predict_file
        else:
            source_file = train_file
        if not source_file:
            raise ValueError("predict_file or train_file not found")

        processor = Processor()
        # get_examples(data_dir, history_len, filename=None, threads=1):
        # only the two previous conversation turns are kept as history.
        # For the training file the number of examples equals the number of QA pairs (108647).
        examples = processor.get_examples(input_dir, 2, filename=source_file, threads=1)

        # max_seq_length is the total length of BERT's input sequence
        features, dataset = Extract_Features(examples=examples, tokenizer=tokenizer, max_seq_length=512, doc_stride=128, max_query_length=64, is_training=not evaluate, threads=1)
        # cache the processed data so the next call can skip feature extraction
        torch.save({"features": features, "dataset": dataset, "examples": examples}, cache_file)
    if evaluate:
        return dataset, examples, features
    return dataset

In [6]:
### Load the two fine-tuned models (each is restored with a different method)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# load bert: restored from a save directory with from_pretrained
bert_path = './mydata/MyDrive/CSNLP_Project/Bert_model_COQA/data/Bert_models/Bert_from_original_Surya_epoch4'
bert_model = BertBaseUncasedModel.from_pretrained(bert_path)
bert_tokenizer = BertTokenizer.from_pretrained(bert_path, do_lower_case=True)
bert_model.to(device)

# load t5: restored as one object with torch.load
# t5_path = './mydata/MyDrive/CSNLP_Project/T5_model_COQAR/trained_models/t5_small_with_story_batch16_hist_3_mixed/epoch10'
t5_path = './mydata/MyDrive/CSNLP_Project/T5_model_COQAR/trained_models/t5_base_with_story_batch4_hist_20/epoch6.zip'

# map_location lets a checkpoint that was saved on a GPU load on a CPU-only runtime.
# NOTE: torch.load unpickles arbitrary objects — only load checkpoints you trust.
t5_model = torch.load(t5_path, map_location=device)
# t5_input_tokenizer = t5small.get_input_tokenizer()
# t5_output_tokenizer = t5small.get_output_tokenizer()
t5_input_tokenizer = t5base.get_input_tokenizer()
t5_output_tokenizer = t5base.get_output_tokenizer()
# The trailing semicolon keeps the model repr out of the cell output.
t5_model.to(device);

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.81k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/605 [00:00<?, ?B/s]

In [7]:
# Default hyper-parameters for T5; this dict is handed to the T5 dataset builder
# and its 'batch_size' is used for the generation DataLoader.
# NOTE(review): the checkpoint loaded in this notebook is named
# "t5_base_with_story_batch4_hist_20", while these defaults say model_size 'small'
# and history_size 3 — confirm the values match the checkpoint.
hparams = dict(
    epochs=3,
    learning_rate=5e-5,
    batch_size=16,
    weight_decay=0.0,
    history_size=3,
    dropout_rate=0.1,
    include_story=True,
    model_size='small',
)
# Fetch the WordNet corpus for NLTK (presumably required by the imported
# evaluation code — TODO confirm which metric needs it).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [8]:
# get rewritten questions from T5

def generate_new_dataset(input_dir,file_type='train',t5_type='base',t5_model=t5_model,method='append'):
    """Rewrite every CoQA question with T5 and save the result as a new json file.

    The history for turn i is [q0, a0, q1, a1, ..., q_i]; T5 generates one
    rewritten question per turn, which is then appended to, or substituted
    for, the original question text.

    Args:
        input_dir: directory containing the CoQA json files; the new file is
            written there as 'coqa-{file_type}-v1.0-{method}_with_T5.json'.
        file_type: 'train' (reads ``train_file``) or 'dev' (reads ``predict_file``).
        t5_type: 'base' or 'small'; selects the module whose ``make_dataset`` is used.
        t5_model: generation model. The default is the module-level ``t5_model``
            as it was bound when this function was defined.
        method: 'append' adds the rewrite after the original question
            (separated by a space), 'replace' overwrites the question.

    Raises:
        Exception: for an unknown ``file_type``, ``t5_type`` or ``method``, or
            when the number of generated questions does not match the dataset.

    Notes:
        Requires a CUDA device (``cuda=True`` and ``t5_model.cuda()``).
        Decoding uses the module-level ``t5_output_tokenizer``.
    """
    # Validate all arguments first so that a typo does not surface only after
    # the expensive T5 generation pass.
    if file_type == 'train':
        source_file = train_file
    elif file_type == 'dev':
        source_file = predict_file
    else:
        raise Exception("must specify a file type: train/dev")

    if t5_type == 'base':
        make_dataset = t5base.make_dataset
    elif t5_type == 'small':
        # The original called t5base.make_dataset here as well (copy-paste slip).
        # NOTE(review): assumes t5small exposes make_dataset with the same signature — confirm.
        make_dataset = t5small.make_dataset
    else:
        raise Exception("must specify a t5 model type: base/small")

    if method not in ('append', 'replace'):
        raise Exception("must specify a method: append/replace")

    ### open the COQA dataset
    with open(os.path.join(input_dir, source_file), "r", encoding="utf-8") as reader:
        input_data = json.load(reader)

    # create dataset for T5: one entry per question turn
    t5_data = {'input':[], 'references':[], 'context':[]}
    for story in input_data["data"]:
        story_questions, story_answers = story['questions'], story['answers']
        assert len(story_questions) == len(story_answers)
        history = []
        for turn, question in enumerate(story_questions):
            if turn > 0:
                # the previous answer goes in front of the current question
                history.append(story_answers[turn - 1]['input_text'])
            history.append(question['input_text'])
            t5_data['input'].append(history.copy())
            t5_data['references'].append([''])
            t5_data['context'].append(story['story'])

    dataset = make_dataset(t5_data, hparams, cuda = True)

    # generate rewritten questions
    loader = DataLoader(dataset=dataset, batch_size=hparams['batch_size'])
    t5_model.cuda()
    t5_model.train(False)
    rewritten_qs = []
    for dic in loader:
        output = t5_model.generate(input_ids = dic['input_ids'], attention_mask = dic['attention_mask'])
        rewritten_qs += t5_output_tokenizer.batch_decode(output, skip_special_tokens = True)

    # Rewrites are matched to questions purely by position, so a count mismatch
    # would silently shift every later question.
    expected_count = len(t5_data['input'])
    if len(rewritten_qs) != expected_count:
        raise Exception("expected {} rewritten questions, got {}".format(expected_count, len(rewritten_qs)))

    ### Create new training/dev dataset json file for Bert
    rewritten_iter = iter(rewritten_qs)
    for story in input_data["data"]:
        for question in story['questions']:
            rewritten = next(rewritten_iter)
            if method == 'append':
                question['input_text'] += (' ' + rewritten)
            else:
                question['input_text'] = rewritten

    # save as new dataset json file
    file_name = 'coqa-{}-v1.0-{}_with_T5.json'.format(file_type,method)
    with open(os.path.join(input_dir, file_name), 'w', encoding="utf-8") as outfile:
        json.dump(input_data, outfile)


# Build the T5-augmented dev set: every dev question gets its T5 rewrite appended.
generate_new_dataset(
    input_dir,
    file_type='dev',
    t5_type='base',
    t5_model=t5_model,
    method='append',
)

Token indices sequence length is longer than the specified maximum sequence length for this model (539 > 512). Running this sequence through the model will result in indexing errors


In [12]:
### Writing predictions with the fine-tuned model

def convert_to_list(tensor):
    """Return the values of ``tensor`` as plain (nested) Python lists.

    The tensor is first detached from the autograd graph and copied to the CPU.
    """
    host_tensor = tensor.detach().cpu()
    return host_tensor.tolist()


def Write_predictions(model, tokenizer, device, variant_name,cache_file_name=None,predict_file_name=None,method=''):
    """Run ``model`` over the evaluation set and write its predictions to disk.

    Args:
        model: model returning ``(start, end, yes, no, unk)`` logits per batch.
        tokenizer: tokenizer used for feature extraction and by ``get_predictions``.
        device: torch device every batch is moved to.
        variant_name: sub-directory of ``output_directory`` that receives the file.
        cache_file_name, predict_file_name: forwarded to ``load_dataset``.
        method: suffix inserted into the file name, "predictions{method}.json".
    """
    # build (or load from cache) the features processed from the json dataset
    dataset, examples, features = load_dataset(tokenizer, evaluate=True, cache_file_name=cache_file_name, predict_file_name=predict_file_name)

    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
    variant_dir = os.path.join(output_directory, variant_name)
    os.makedirs(variant_dir, exist_ok=True)

    evaluation_sampler = SequentialSampler(dataset)
    evaluation_dataloader = DataLoader(dataset, sampler=evaluation_sampler, batch_size=evaluation_batch_size)

    # Switching to eval mode is loop-invariant, so it is done once up front.
    model.eval()
    mod_results = []
    for batch in tqdm(evaluation_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            # each batch has 4 elements, the last one holds the example indices
            inputs = {"input_ids": batch[0],"token_type_ids": batch[1],"attention_mask": batch[2]}
            example_indices = batch[3]
            outputs = model(**inputs)
        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            # row i of each of the five output tensors, as plain Python lists
            start_logits, end_logits, yes_logits, no_logits, unk_logits = (
                convert_to_list(head_logits[i]) for head_logits in outputs)
            mod_results.append(Result(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits, yes_logits=yes_logits, no_logits=no_logits, unk_logits=unk_logits))

    # Get predictions for the evaluated dataset and store them in predictions{method}.json
    output_prediction_file = os.path.join(variant_dir, "predictions{}.json".format(method))
    # NOTE(review): the positional arguments 20, 30, True, False are presumably n-best size,
    # maximum answer length, lower-casing and verbose logging — confirm in data.processors.metrics.
    get_predictions(examples, features, mod_results, 20, 30, True, output_prediction_file, False, tokenizer)

## Prediction

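Run a fine-tuned BERT model on the CoQA dev set (here, the variant with T5-rewritten questions appended) and write its predictions.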

In [14]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Every sub-directory of the output directory is a candidate model directory.
model_parameter_directory = list(
    entry.path for entry in os.scandir(output_directory) if entry.is_dir()
)

# Suffix inserted into the prediction / evaluation file names of this run.
method = '_append'

# Feature cache and dev json of the dataset with T5 rewrites appended.
cache_file_name = 'bert-base-uncased_dev_with_T5_append'
predict_file_name = 'coqa-dev-v1.0-append_with_T5.json'
# To fall back to the default cache and dev file, set both names to None:
# cache_file_name = None
# predict_file_name = None

model_parameter_directory

['/content/mydata/MyDrive/CSNLP_Project/Bert_model_COQA/data/Bert_models/Bert_from_original_Surya_epoch4',
 '/content/mydata/MyDrive/CSNLP_Project/Bert_model_COQA/data/Bert_models/model_weights0',
 '/content/mydata/MyDrive/CSNLP_Project/Bert_model_COQA/data/Bert_models/Bert_with_T5_rewritten_epoch4']

In [15]:
# Pick the model directory by name, not by list position: os.scandir yields entries
# in arbitrary order, so an index such as [2] can silently select a different model.
variant_name = 'Bert_with_T5_rewritten_epoch4'
m = next((d for d in model_parameter_directory if os.path.basename(d) == variant_name), None)
if m is None:
    raise FileNotFoundError("no model directory named {!r} in {}".format(variant_name, output_directory))
print(variant_name)

model = BertBaseUncasedModel.from_pretrained(m)
tokenizer = BertTokenizer.from_pretrained(m, do_lower_case=True)
model.to(device)
Write_predictions(model, tokenizer, device, variant_name, cache_file_name, predict_file_name,method=method)

Bert_with_T5_rewritten_epoch4
Loading cache /content/mydata/MyDrive/CSNLP_Project/Bert_model_COQA/data/bert-base-uncased_dev_with_T5_append


Evaluating: 100%|██████████| 8775/8775 [03:18<00:00, 44.27it/s]
Writing preditions: 100%|██████████| 7983/7983 [00:31<00:00, 250.61it/s]


## Evaluation

In [None]:
# Score against the original dev file: the T5 rewriting step only changes question text.
evaluator = CoQAEvaluator(os.path.join(input_dir, predict_file))

variant_name = 'Bert_with_T5_rewritten_epoch4'
variant_dir = os.path.join(output_directory, variant_name)

pre_file_bert = os.path.join(variant_dir, 'predictions{}.json'.format(method))

# evaluate: preds_to_dict takes the file path itself, so no file handle is opened here
pred_data = CoQAEvaluator.preds_to_dict(pre_file_bert)
# compute the metrics once and reuse them for both the file and the printout
performance = evaluator.model_performance(pred_data)

# write evaluate results
with open(os.path.join(variant_dir, 'evaluation{}.json'.format(method)), 'w') as f:
    json.dump(performance, f, indent=2)

# show
print(json.dumps(performance, indent=2))