The GPT-2 based system implementation is based on the [gpt-2-simple package by Max Woolf](https://minimaxir.com/2019/09/howto-gpt2/).

## Importing libraries

In [None]:
%tensorflow_version 1.x
!pip install -q gpt-2-simple
import collections
from collections import Counter
from datetime import datetime

import gpt_2_simple as gpt2

## Initializing the model

The GPT-2 model with 774 million parameters was used.

In [None]:
# Fetch the pretrained 774M GPT-2 model; the fine-tuning cell below refers to it by the same name.
gpt2.download_gpt2(model_name="774M")

## Mounting Google Drive

In [None]:
# Colab-only: attach the user's Google Drive to the runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Later it will be possible to save the GPT-2 weights to the Drive.

In [None]:
# Mount Google Drive through gpt-2-simple's own helper.
# NOTE(review): this looks redundant with the drive.mount('/content/drive') call in the
# previous cell — confirm whether both mounts are needed.
gpt2.mount_gdrive()

## Model fine-tuning

Defining the file with the training data

In [None]:
# Training corpus handed to gpt2.finetune() as `dataset`.
# NOTE(review): relative path — presumably train.txt has been uploaded to the runtime's
# working directory beforehand; confirm.
file_name = "train.txt"

Setting the fine-tuning parameters and starting the training session

In [None]:
# Open the TensorFlow session that the fine-tuning run will use.
sess = gpt2.start_tf_sess()

# All fine-tuning settings gathered in one place so they are easy to review and tweak.
finetune_options = dict(
    dataset=file_name,     # path to the training data
    model_name='774M',     # GPT-2 with 774 million parameters was chosen
    steps=3000,            # number of training steps
    restore_from='fresh',  # start from the base model rather than resuming a checkpoint
    run_name='run1',       # name under which the checkpoints of this run are stored
    print_every=10,        # report training progress every 10 steps
    sample_every=200,      # print example output every 200 steps
    save_every=500,        # write a checkpoint every 500 steps
)

gpt2.finetune(sess, **finetune_options)

Saving the weights to the Drive

In [None]:
# Archive the fine-tuned checkpoint and copy it to the mounted Drive.
# `run_name` must be the run identifier passed to finetune() ('run1'), not the name of
# the resulting archive: gpt-2-simple looks up the checkpoint folder from the run name
# and derives the .tar file name itself.
gpt2.copy_checkpoint_to_gdrive(run_name='run1')

## Model initialization

Getting the model weights from Google Drive

In [None]:
# Bring the saved 'run1' checkpoint back from the mounted Drive into the runtime,
# so the model can be loaded without repeating the fine-tuning.
gpt2.copy_checkpoint_from_gdrive(run_name='run1')

Starting the session and loading the model

In [None]:
# Open a TensorFlow session and load the fine-tuned 'run1' weights into it.
# NOTE(review): this rebinds `sess`; if the fine-tuning cell was executed in the same
# kernel, a second session may clash with the already-built graph — confirm whether the
# runtime has to be restarted (or the session reset) before running this cell.
sess = gpt2.start_tf_sess()
gpt2.load_gpt2(sess, run_name='run1')

## Obtaining the results

Splitting the validation set into question and answer sets

In [None]:
# `val` is expected to be defined by an earlier cell (it is not created in this one).
# It is read in groups of three consecutive items: the first item of each group is a
# question and the second is its answer; the third item is never read.
# NOTE(review): presumably the third item is a separator line — confirm against the
# code that loads `val`.
stride = 3

# Items at positions 0, 3, 6, ...
val_questions = [val[pos] for pos in range(0, len(val), stride)]

# Items at positions 1, 4, 7, ...
val_answers = [val[pos] for pos in range(1, len(val), stride)]

Generating the answers to questions from the validation set

In [None]:
# Generate one answer per validation question with the loaded model.
# NOTE(review): the question is passed as `prefix`, so the returned text may start with
# the question itself — confirm whether it should be stripped before scoring.
eval_answers = []
for question in val_questions:
  generated = gpt2.generate(sess,
                length=300,
                temperature=0.7,
                top_k=40,
                prefix=question,
                nsamples=1,
                batch_size=1,         # generate() requires nsamples to be a multiple of batch_size
                return_as_list=True,  # return the texts instead of printing them (otherwise None is returned)
                )
  # A single sample was requested, so the list holds exactly one generated text.
  eval_answers.append(generated[0])

Computing Precision, Recall, and F1-Score

In [None]:
# Token-overlap precision / recall / F1 between gold and generated answers,
# micro-averaged: counts are summed over all answer pairs before dividing.
num_c = []  # per pair: number of tokens shared between gold and predicted answers
num_p = []  # per pair: number of predicted tokens
num_g = []  # per pair: number of gold tokens

# zip() pairs each gold answer with its prediction and stops at the shorter list, so a
# trailing question without a gold answer is ignored instead of raising IndexError.
for gold_answer, predicted_answer in zip(val_answers, eval_answers):
  # Whitespace tokenisation, done once per answer and reused for every count below.
  gold_tokens = gold_answer.split()
  predicted_tokens = predicted_answer.split()

  # Counter & Counter keeps each token with the smaller of its two counts, i.e. the
  # multiset of tokens the two answers have in common.
  common = collections.Counter(gold_tokens) & collections.Counter(predicted_tokens)

  num_c.append(sum(common.values()))
  num_p.append(len(predicted_tokens))
  num_g.append(len(gold_tokens))

total_common = sum(num_c)
total_predicted = sum(num_p)
total_gold = sum(num_g)

# Each ratio falls back to 0.0 when its denominator is zero (no predicted tokens,
# no gold tokens, or no overlap at all) instead of raising ZeroDivisionError.
precision = 1.0 * total_common / total_predicted if total_predicted else 0.0  # shared tokens / predicted tokens
recall = 1.0 * total_common / total_gold if total_gold else 0.0  # shared tokens / gold tokens
f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0