# This notebook fine-tunes the model and does some data exploration. The model has already been fine-tuned and pushed to the Hugging Face Hub, so it can be used without running this notebook again.

# Library requirements

In [None]:
# Shell commands (notebook `!` syntax) that install the libraries imported below.
# `-U` upgrades accelerate if an older version is already installed.
# NOTE(review): accelerate is never imported directly here; presumably it is
# required by the transformers Trainer - confirm.
! pip install datasets
! pip install transformers
! pip install accelerate -U

# Logging in to HuggingFace

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face login. The training cell below sets push_to_hub=True
# and the last cell calls trainer.push_to_hub(), so credentials are needed.
notebook_login()

# Data exploration and splitting

In [None]:
from datasets import load_dataset_builder, load_dataset, get_dataset_split_names, get_dataset_config_names

In [None]:
# Create only the dataset builder first, so the dataset's metadata can be
# inspected before the full dataset is loaded.
dataset_object = load_dataset_builder(path="mlqa", name="mlqa.en.en")

In [None]:
# Show the builder's complete info record (the notebook renders the value of
# the cell's last expression).
dataset_object.info

In [None]:
# Show just the description and the feature schema from the dataset info,
# separated by blank lines.
builder_info = dataset_object.info
display(builder_info.description)
print("\n")
display(builder_info.features)

In [None]:
# Load the whole dataset (every split) for the English/English MLQA config.
full_dataset = load_dataset(path="mlqa", name="mlqa.en.en")

In [None]:
# List the names of the splits this dataset config provides.
available_splits = get_dataset_split_names("mlqa", "mlqa.en.en")
display(available_splits)

In [None]:
# The original "test" split already holds around 11.5k instances, so it is the
# only split used here; it is carved into train, validation, and test sets.

# First cut: hold out 30% as the final test set. The fixed seed keeps the
# split reproducible.
original_test_split = full_dataset["test"]
split_dataset_traintest = original_test_split.train_test_split(
    test_size=0.3,
    seed=42,
)
split_dataset_traintest

In [None]:
# Second cut: split the remaining 70% again, holding out 30% of it for
# validation. Overall that is roughly 49% train / 21% validation / 30% test.
remaining_train_pool = split_dataset_traintest["train"]
split_dataset_trainval = remaining_train_pool.train_test_split(
    test_size=0.3,
    seed=42,
)
split_dataset_trainval

# Train model

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from google.colab import files

In [None]:
# Load the tokenizer and the question-answering model from the same
# Hugging Face checkpoint so the vocabulary and the weights match.
base_checkpoint = "huggingface-course/bert-finetuned-squad"

tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(base_checkpoint)

In [None]:
# function from huggingface docs- https://huggingface.co/docs/transformers/tasks/question_answering

def preprocess_function(all_data):
  """Tokenize a batch of question/context pairs and label the answer spans.

  Each question is stripped of surrounding whitespace and tokenized together
  with its context using the module-level ``tokenizer``. Sequences are padded
  to 384 tokens, and only the context (the second sequence) is truncated when
  the pair is too long. Only the first answer of each example is used.

  Args:
    all_data: A batch with "question", "context" and "answers" columns. Each
      entry of "answers" provides parallel "answer_start" (character offsets
      into the context) and "text" lists.

  Returns:
    The tokenizer output with "offset_mapping" removed and two added lists,
    "start_positions" and "end_positions", holding the token indices of each
    answer. An example is labelled (0, 0) when it has no answer or when its
    answer is not completely inside the (possibly truncated) context.
  """
  questions = [q.strip() for q in all_data["question"]]
  inputs = tokenizer(
      questions,
      all_data["context"],
      max_length=384,
      truncation="only_second",
      return_offsets_mapping=True,
      padding="max_length",
  )

  offset_mapping = inputs.pop("offset_mapping")
  answers = all_data["answers"]
  start_positions = []
  end_positions = []

  for i, offset in enumerate(offset_mapping):
    answer = answers[i]

    # Examples without any annotated answer cannot be indexed with [0];
    # give them the same (0, 0) label as answers that were truncated away.
    if not answer["answer_start"] or not answer["text"]:
      start_positions.append(0)
      end_positions.append(0)
      continue

    start_char = answer["answer_start"][0]
    end_char = start_char + len(answer["text"][0])
    sequence_ids = inputs.sequence_ids(i)

    # Find the first and last token that belong to the context (sequence id 1)
    idx = 0
    while sequence_ids[idx] != 1:
      idx += 1
    context_start = idx
    while idx < len(sequence_ids) and sequence_ids[idx] == 1:
      idx += 1
    context_end = idx - 1

    # If the answer is not fully inside the context, label it (0, 0).
    # The answer must start at or after the first context character AND end
    # at or before the last one. Comparing the opposite ends (as the earlier
    # version did) let half-truncated answers through and set their end label
    # to the token just past the context.
    if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
      start_positions.append(0)
      end_positions.append(0)
    else:
      # Otherwise it's the start and end token positions.
      # Walk forward to the last token that starts at or before start_char.
      idx = context_start
      while idx <= context_end and offset[idx][0] <= start_char:
        idx += 1
      start_positions.append(idx - 1)

      # Walk backward to the first token that ends at or after end_char.
      idx = context_end
      while idx >= context_start and offset[idx][1] >= end_char:
        idx -= 1
      end_positions.append(idx + 1)

  inputs["start_positions"] = start_positions
  inputs["end_positions"] = end_positions
  return inputs

In [None]:
# Run the preprocessing over each split in batched mode, in the order
# train, validation, test.

tokenized_dataset_train, tokenized_dataset_val, tokenized_dataset_test = (
    raw_split.map(preprocess_function, batched=True)
    for raw_split in (
        split_dataset_trainval["train"],
        split_dataset_trainval["test"],
        split_dataset_traintest["test"],
    )
)

In [None]:
# Show a summary of each tokenized split (train, validation, test).
for tokenized_split in (
    tokenized_dataset_train,
    tokenized_dataset_val,
    tokenized_dataset_test,
):
    display(tokenized_split)

In [None]:
from transformers import DefaultDataCollator
# Collator handed to the Trainer below. The preprocessing step already pads
# every example to max_length, so no padding is needed at batch time.
data_collator = DefaultDataCollator()

In [None]:
# Hyperparameters for fine-tuning: evaluate after every epoch, 3 epochs,
# batch size 16 per device for both training and evaluation, and push the
# result to the Hub under the "diff_model" output directory name.
# NOTE(review): newer transformers releases may expect `eval_strategy` instead
# of `evaluation_strategy` and `processing_class` instead of `tokenizer` -
# confirm against the installed version.
hyperparameters = {
    "output_dir": "diff_model",
    "evaluation_strategy": "epoch",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    "push_to_hub": True,
}
training_args = TrainingArguments(**hyperparameters)

# Build the trainer: train on the training split, evaluate on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
)

# Run the fine-tuning.
trainer.train()

In [None]:
# Show the trainer's log history (per-epoch training and validation losses)
# and append the same record to the results file.

log_history = trainer.state.log_history
display(log_history)

heading = 'These are losses per epoch for training and validation'
# The file is opened in append mode, so re-running the cell adds another copy.
with open('diff_training_results.csv', 'a') as results_file:
    results_file.write(f"{heading}\n{str(log_history)}\n\n")

In [None]:
# Evaluate on the trainer's eval dataset (the validation split), show the
# metrics, and append them to the results file.
overall_loss = trainer.evaluate()
display(overall_loss)

heading = 'This is the overall evaluation loss on validation set'
with open('diff_training_results.csv', 'a') as results_file:
    results_file.write(f"{heading}\n{str(overall_loss)}\n\n")

In [None]:
# model.eval() puts the model in evaluation mode and returns the model itself,
# so what is displayed and written below is the model's printed structure.
model_in_eval_mode = model.eval()
display(model_in_eval_mode)

heading = 'This is the model evaluation info'
with open('diff_training_results.csv', 'a') as results_file:
    results_file.write(f"{heading}\n{str(model_in_eval_mode)}\n\n")

In [None]:
# Download the accumulated results file from the Colab runtime
# (`files` comes from google.colab).
files.download("diff_training_results.csv")

# Upload model to hub

In [None]:
# Push the fine-tuned model to the Hub so it can be loaded directly later,
# without repeating the fine-tuning on our dataset.
trainer.push_to_hub()