In [None]:
%%capture
!pip install nlp simpletransformers unidecode -q

In [None]:
import logging
import json
import nlp
import math
import random
from itertools import chain
import string
from datasets import load_dataset
import argparse
import collections
import os
import re
import unidecode
import sys
from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs

In [None]:
# Mount Google Drive at /content/drive (passing force_remount=True); the following
# cell changes into a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [None]:
cd drive/MyDrive/Model Fine Tuning/

/content/drive/MyDrive/Model Fine Tuning


# Logging Initialization

In [None]:
# Configure the root logger at INFO, then raise the "transformers" logger to
# WARNING so that its INFO/DEBUG messages are not emitted.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
# Run configuration. `ablation` tags this experiment and names its output folders
# (it is interpolated into "outputs/{ablation}" in the model arguments further down).
# NOTE(review): the tag says POLICYQA, but the training file loaded later in this
# notebook is TechQA_AUGMENTED_ONE_SHOT_REDO.json — confirm the label matches the data.
ablation = "POLICYQA_BASELINE_ORIGINAL"
# `run_size` is not referenced by any of the visible cells.
run_size = "Full"

In [None]:
ablation

'POLICYQA_BASELINE_ORIGINAL'

# Dataset Loading

## Fine-Tuning Dataset

In [None]:
def read_json_as_list(dir, filename: str):
  """Load ``{dir}/{filename}.json`` and return the values of its top-level object.

  Args:
    dir: Directory that contains the file (joined with a "/").
    filename: File name without the ".json" extension.

  Returns:
    A list with the values of the parsed JSON object, in the object's key order.
  """
  json_path = f"{dir}/{filename}.json"
  with open(json_path, 'r') as handle:
      payload = json.load(handle)
  return list(payload.values())

def flatten_list_of_lists(lst):
  """Concatenate the items of every inner iterable into one flat list (one level deep)."""
  return [item for inner in lst for item in inner]

def create_split(data, train_size, val_size=0.1):
  """Split `data` into contiguous train / validation / test slices.

  The validation slice starts exactly where the training slice ends. The
  previous version started it at a hard-coded index 16000, which only lined up
  with the training slice when ``train_size * len(data)`` happened to be 16000;
  for any other size the slices overlapped or skipped records.

  Args:
    data: Sliceable sequence of records.
    train_size: Fraction of `data` to put in the training slice.
    val_size: Fraction of `data` to put in the validation slice (default 0.1,
      the fraction the original code used).

  Returns:
    A ``(train, validation, test)`` tuple; the test slice is whatever remains.
  """
  total = len(data)
  train_end = int(math.floor(train_size * total))
  val_end = train_end + int(math.floor(val_size * total))

  return data[:train_end], data[train_end:val_end], data[val_end:]

# Matches the English articles as whole words; used by normalize_answer.
ARTICLES_REGEX = re.compile(r"\b(a|an|the)\b", re.UNICODE)

OPTS = None

def normalize_answer(s):
    """Canonicalise an answer string for comparison.

    Steps, in order: lower-case, delete every character in
    ``string.punctuation``, replace the articles a/an/the with a space, and
    collapse all whitespace runs to single spaces (trimming both ends).
    """
    lowered = s.lower()
    without_punct = lowered.translate(str.maketrans("", "", string.punctuation))
    without_articles = ARTICLES_REGEX.sub(" ", without_punct)
    return " ".join(without_articles.split())

def preprocess_eval(data):
  """Normalise the text of the first answer of every question, in place.

  Each record in `data` is expected to carry a "qas" list whose items have an
  "answers" list. Questions with an empty "answers" list are left untouched.
  The same `data` object is returned.
  """
  for record in data:
    for qa in record["qas"]:
      answers = qa["answers"]
      if len(answers) > 0:
        first_answer = answers[0]
        first_answer["text"] = normalize_answer(first_answer["text"])

  return data

def preprocess_data(data):
  """Normalise the first answer text of every question in place and return `data`.

  NOTE: a second `preprocess_data` defined further down in this file rebinds
  the name, so this version is not the one callers get once the whole cell has
  run. Its body was a line-for-line copy of `preprocess_eval` (plus an unused
  `passage` local), so it now delegates to that function instead of repeating it.
  """
  return preprocess_eval(data)

def normalize_string(s):
    """Return `s` with accents removed (via unidecode), non-word characters stripped, and lower-cased.

    Whitespace counts as a non-word character here, so the result contains no
    spaces; underscores are word characters and are kept.
    """
    without_accents = unidecode.unidecode(s)
    word_chars_only = re.sub(r'\W+', '', without_accents)
    return word_chars_only.lower()

def create_mapping(original, normalized):
    """Map each index of `normalized` to the index of the `original` character that produced it.

    `normalized` is expected to be ``normalize_string(original)``. The previous
    version compared ``original[i].lower()`` with ``normalized[j]`` directly,
    which breaks as soon as normalisation changes a character (an accented
    letter) or expands it to several characters: that position never matches
    and every later index is misaligned or missing. This version aligns using
    each character's own normalised form instead.

    Args:
        original: The untouched text.
        normalized: The normalised form of `original`.

    Returns:
        A dict ``{index in normalized: index in original}``. When one original
        character yields several normalised characters, all of them map to that
        original index.
    """
    mapping = {}
    j = 0
    for i, char in enumerate(original):
        if j >= len(normalized):
            break
        if ord(char) < 128:
            # ASCII fast path: a word character just lower-cases, anything else
            # is stripped — the same result normalize_string gives for it.
            piece = char.lower() if (char.isalnum() or char == "_") else ""
        else:
            piece = normalize_string(char)
        # Skip characters that normalisation drops, or that do not line up with
        # the expected text at the current position.
        if not piece or not normalized.startswith(piece, j):
            continue
        for _ in piece:
            mapping[j] = i
            j += 1
    return mapping

def find_answer_in_passage(passage, answer):
    """Find the version of the answer with correct punctuation and casing in the passage.

    Both strings are normalised with `normalize_string`, the normalised answer
    is searched for in the normalised passage, and the hit is mapped back to a
    slice of the original passage.

    Args:
        passage: The original passage text.
        answer: The answer text to look for; punctuation, spacing and casing
            may differ from the passage.

    Returns:
        The matching substring of `passage`, or None when the answer has no
        word characters, is not found, or cannot be mapped back to the passage.
    """
    normalized_passage = normalize_string(passage)
    normalized_answer = normalize_string(answer)

    # An answer without word characters normalises to "", and "".find("") is 0,
    # which previously produced a span covering almost the whole passage.
    if not normalized_answer:
        return None

    answer_start = normalized_passage.find(normalized_answer)

    if answer_start == -1:
        return None  # answer not found in passage

    answer_end = answer_start + len(normalized_answer)

    # create a mapping from each character in the normalized passage to the original passage
    mapping = create_mapping(passage, normalized_passage)

    # Locate the first and last matched characters in the original passage.
    # A missing entry means the alignment failed; report "not found" rather than
    # silently falling back to the start/end of the passage (a wrong span).
    original_start = mapping.get(answer_start)
    original_last = mapping.get(answer_end - 1)
    if original_start is None or original_last is None:
        return None

    # add 1 because the slice end is exclusive
    return passage[original_start:original_last + 1]

def preprocess_answer(answer,passage):
  """Locate `answer` in `passage`, dropping leading words until a match is found.

  Each attempt calls `find_answer_in_passage`. When an attempt fails and the
  candidate still contains a space, everything up to and including the first
  space is removed and the search is retried. Returns the first truthy match,
  or the last (falsy) result once the candidate has no space left.
  """
  candidate = answer
  while True:
    match = find_answer_in_passage(passage, candidate)
    if match or " " not in candidate:
      return match
    candidate = candidate.split(' ', 1)[1]

def remove_indices(lst, indices):
    """Return a new list with the items of `lst` whose positions are not in `indices`.

    `indices` is materialised into a set first: membership checks become
    constant-time, and a one-shot iterable (generator/iterator) is consumed
    once instead of being partially drained by every ``in`` test.
    """
    excluded = set(indices)
    return [value for index, value in enumerate(lst) if index not in excluded]

def get_answer_index(paragraph: str, answer: str):
    """Return the character offset of `answer` in `paragraph`, or -1 if absent.

    An exact (case-sensitive) match is preferred so that the returned offset
    points at text identical to `answer`; the previous purely case-insensitive
    search could return an earlier occurrence that differs in casing. The
    case-insensitive search is kept as a fallback.
    """
    exact = paragraph.find(answer)
    if exact != -1:
        return exact
    return paragraph.lower().find(answer.lower())

def preprocess_data(data):
  """Re-anchor the first answer of every question to its passage, in place.

  For each record, every question that has answers gets its first answer
  looked up in the record's "context" with `preprocess_answer`. On success the
  answer's "text" is replaced by the recovered passage text and its
  "answer_start" by `get_answer_index` of that text; on failure the question is
  removed from the record. Questions with an empty "answers" list are kept
  unchanged. The same `data` object is returned.
  """
  for record in data:
    context = record["context"]
    surviving_qas = []

    for qa in record["qas"]:
      answers = qa["answers"]

      if len(answers) == 0:
        # Nothing to re-anchor; the question stays as it is.
        surviving_qas.append(qa)
        continue

      first_answer = answers[0]
      recovered = preprocess_answer(first_answer["text"], context)
      if not recovered:
        # Answer could not be located in the passage: drop the question.
        continue

      first_answer["text"] = recovered
      first_answer["answer_start"] = get_answer_index(context, recovered)
      surviving_qas.append(qa)

    record["qas"] = surviving_qas

  return data

def preprocess_squad(data):
  """Flatten topic -> paragraphs nesting into a flat list of {"context", "qas"} records.

  Only the "context" and "qas" entries of each paragraph are carried over; the
  "qas" value is the same list object as in the input, not a copy.
  """
  return [
      {"context": paragraph["context"], "qas": paragraph["qas"]}
      for topic in data
      for paragraph in topic["paragraphs"]
  ]


In [None]:
# Read the training JSON and keep only its top-level "data" list.
with open("../Low Resource/TechQA_AUGMENTED_ONE_SHOT_REDO.json", "r") as f:
    train_data = json.load(f)["data"]

In [None]:
# Make sure every answer's "answer_start" is an int (assumption: the JSON may
# store the offsets as strings — confirm against the data file).
# All answers of a question are converted, not just the first, and a question
# with an empty "answers" list is skipped instead of raising IndexError.
for topic in train_data:
    for para in topic["paragraphs"]:
        for qa in para["qas"]:
            for answer in qa["answers"]:
                answer["answer_start"] = int(answer["answer_start"])


In [None]:
# Flatten the topic -> paragraphs nesting into a flat list of {"context", "qas"} records.
# NOTE: this rebinds train_data, so the cell is not idempotent — running it a second
# time feeds the flattened records back in and fails on the missing "paragraphs" key.
train_data = preprocess_squad(train_data)

In [None]:
# Read the evaluation (dev) JSON and keep only its top-level "data" list.
with open("../Low Resource/dev.json", "r") as f:
    eval_data = json.load(f)["data"]


In [None]:
# Make sure every answer's "answer_start" is an int (assumption: the JSON may
# store the offsets as strings — confirm against the data file).
# All answers of a question are converted, not just the first, and a question
# with an empty "answers" list is skipped instead of raising IndexError.
for topic in eval_data:
    for para in topic["paragraphs"]:
        for qa in para["qas"]:
            for answer in qa["answers"]:
                answer["answer_start"] = int(answer["answer_start"])


In [None]:
# Flatten the topic -> paragraphs nesting into a flat list of {"context", "qas"} records.
# NOTE: this rebinds eval_data, so the cell is not idempotent — running it a second
# time feeds the flattened records back in and fails on the missing "paragraphs" key.
eval_data = preprocess_squad(eval_data)

# Model

## Model Arguments

In [None]:
# Training configuration handed to QuestionAnsweringModel below.
# Checkpoints and the best model are written under outputs/{ablation}/.
# NOTE(review): "do_lower_steps" is not an option name I recognise — possibly a
#   typo for "do_lower_case"; confirm against the simpletransformers docs before
#   relying on it (roberta-base is a cased checkpoint).
# NOTE(review): "reprocess_input_data": False presumably allows previously cached
#   features to be reused — confirm a stale cache cannot leak between datasets.
# NOTE(review): "evaluate_during_training_steps" (1006900) is far larger than the
#   16080 total steps shown in the saved training output, so the evaluations seen
#   there presumably come from a per-epoch mechanism — confirm.
model_args = {
    "learning_rate": 3e-5,
    "num_train_epochs":5,
    "output_dir": f'outputs/{ablation}',
    "best_model_dir": f'outputs/{ablation}/best_model',
    "max_seq_length": 384,
    "doc_stride": 128,
    "overwrite_output_dir": True,
    "reprocess_input_data": False,
    "train_batch_size": 16,
    "save_steps": 100000,
    "do_eval": True,
    "do_train": True,
    "do_lower_steps": True,
    "verbose": False,
    "evaluate_during_training_steps": 1006900,
    "evaluate_during_training_silent": True,
    "evaluate_during_training": True,
}


# Alternative starting point: resume from this run's previously saved best model.
# model_name = f"outputs/{ablation}/best_model"
model_name = "roberta-base"
model_type = "roberta"

# Build the QA model from the checkpoint named above.
# NOTE(review): use_cuda = True presumably requires a GPU runtime — confirm.
model = QuestionAnsweringModel(
    model_type, model_name , args=model_args, use_cuda = True
)


Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

## Model Training/Evaluation

In [None]:
# Fine-tune on train_data, passing eval_data for evaluation during training.
# As the cell's last expression, the call's return value is displayed as the cell output.
model.train_model(train_data, eval_data = eval_data)

convert squad examples to features: 100%|██████████| 51267/51267 [00:45<00:00, 1120.12it/s]
add example index and unique id: 100%|██████████| 51267/51267 [00:00<00:00, 890136.50it/s]


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/3216 [00:00<?, ?it/s]



Running Evaluation:   0%|          | 0/477 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/3216 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/477 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/3216 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/477 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/3216 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/477 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/3216 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/477 [00:00<?, ?it/s]

(16080,
 {'global_step': [3216, 6432, 9648, 12864, 16080],
  'correct': [627, 678, 691, 685, 666],
  'similar': [2088, 2022, 2043, 2047, 2065],
  'incorrect': [1094, 1109, 1075, 1077, 1078],
  'train_loss': [1.840348720550537,
   1.1970703601837158,
   1.2169616222381592,
   0.8346544504165649,
   0.8656442761421204],
  'eval_loss': [-7.357941889412998,
   -7.847738142033543,
   -8.068494496855346,
   -8.634687827568134,
   -8.981009237421384]})