# Install Dependencies

In [1]:
!pip install torch



In [2]:
!pip install pandas



In [3]:
!pip install transformers



## Import Packages

In [4]:
import pandas as pd 
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import GPT2Tokenizer
from transformers import GPT2LMHeadModel, GPT2Config, TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
from transformers.tokenization_utils_base import BatchEncoding
import torch.optim as optim
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


# Obtain and Process Dataset

In [5]:
# Load the training JSON into a DataFrame; the path is relative to the
# notebook's working directory.
train_dataframe = pd.read_json('train-v2.0.json')

In [6]:
# Display the frame to check which columns the JSON produced.
train_dataframe

Unnamed: 0,version,data
0,v2.0,"{'title': 'Beyoncé', 'paragraphs': [{'qas': [{..."
1,v2.0,"{'title': 'Frédéric_Chopin', 'paragraphs': [{'..."
2,v2.0,{'title': 'Sino-Tibetan_relations_during_the_M...
3,v2.0,"{'title': 'IPod', 'paragraphs': [{'qas': [{'qu..."
4,v2.0,{'title': 'The_Legend_of_Zelda:_Twilight_Princ...
...,...,...
437,v2.0,"{'title': 'Infection', 'paragraphs': [{'qas': ..."
438,v2.0,"{'title': 'Hunting', 'paragraphs': [{'qas': [{..."
439,v2.0,"{'title': 'Kathmandu', 'paragraphs': [{'qas': ..."
440,v2.0,"{'title': 'Myocardial_infarction', 'paragraphs..."


### Finding Number of Rows

In [7]:
# Number of rows in the loaded frame.
len(train_dataframe)

442

## Dataset Preparation

### Row Preview

In [8]:
# Inspect the nested record stored in the "data" column of the row labelled 4.
train_dataframe.loc[4, "data"]

{'title': 'The_Legend_of_Zelda:_Twilight_Princess',
 'paragraphs': [{'qas': [{'question': 'What category of game is Legend of Zelda: Twilight Princess?',
     'id': '56cd8a5f62d2951400fa668e',
     'answers': [{'text': 'action-adventure', 'answer_start': 128}],
     'is_impossible': False},
    {'question': 'What consoles can be used to play Twilight Princess?',
     'id': '56cd8a5f62d2951400fa668f',
     'answers': [{'text': 'GameCube and Wii', 'answer_start': 194}],
     'is_impossible': False},
    {'question': 'When was Twilight Princess launched in North America?',
     'id': '56cd8a5f62d2951400fa6691',
     'answers': [{'text': 'November 2006', 'answer_start': 569}],
     'is_impossible': False},
    {'question': 'When could GameCube owners purchase Twilight Princess?',
     'id': '56cd8a5f62d2951400fa6692',
     'answers': [{'text': 'December 2006', 'answer_start': 688}],
     'is_impossible': False},
    {'question': 'What company developed Legend of Zelda: Twilight Princess?',

### Extracting Question and Answers

In [9]:
# Two independent, initially empty lists: one for question strings and one
# for the matching answer strings.
questions, answers = [], []

### Remove questions with blank answers and append to arrays

Logic: iterate over every row → every paragraph → every `qas` entry; if the entry has at least one answer, append its question and the text of its first answer to the two lists.

In [10]:
# Rebuild both lists from scratch inside this cell, so that re-running it does
# not append a second copy of every pair to lists created by an earlier cell.
# Entries whose 'answers' list is empty are skipped; for the remaining entries
# only the first listed answer text is kept.
qa_pairs = [
    (qas['question'], qas['answers'][0]['text'])
    for row in train_dataframe["data"]
    for paragraph in row["paragraphs"]
    for qas in paragraph["qas"]
    if len(qas['answers']) > 0
]
questions = [question for question, _ in qa_pairs]
answers = [answer for _, answer in qa_pairs]

In [11]:
# Preview only the first few questions; the full list has tens of thousands of
# entries and would flood the notebook output.
questions[:20]

['When did Beyonce start becoming popular?',
 'What areas did Beyonce compete in when she was growing up?',
 "When did Beyonce leave Destiny's Child and become a solo singer?",
 'In what city and state did Beyonce  grow up? ',
 'In which decade did Beyonce become famous?',
 'In what R&B group was she the lead singer?',
 'What album made her a worldwide known artist?',
 "Who managed the Destiny's Child group?",
 'When did Beyoncé rise to fame?',
 "What role did Beyoncé have in Destiny's Child?",
 'What was the first album Beyoncé released as a solo artist?',
 'When did Beyoncé release Dangerously in Love?',
 'How many Grammy awards did Beyoncé win for her first solo album?',
 "What was Beyoncé's role in Destiny's Child?",
 "What was the name of Beyoncé's first solo album?",
 'After her second solo album, what other entertainment venture did Beyonce explore?',
 'Which artist did Beyonce marry?',
 'To set the record for Grammys, how many did Beyonce win?',
 'For what movie did Beyonce rec

In [12]:
# Preview only the first few answers; the full list has tens of thousands of
# entries and would flood the notebook output.
answers[:20]

['in the late 1990s',
 'singing and dancing',
 '2003',
 'Houston, Texas',
 'late 1990s',
 "Destiny's Child",
 'Dangerously in Love',
 'Mathew Knowles',
 'late 1990s',
 'lead singer',
 'Dangerously in Love',
 '2003',
 'five',
 'lead singer',
 'Dangerously in Love',
 'acting',
 'Jay Z',
 'six',
 'Dreamgirls',
 '2010',
 'Beyoncé',
 'Cadillac Records',
 'June 2005',
 "B'Day",
 'Dreamgirls',
 'Jay Z',
 'Sasha Fierce',
 'love, relationships, and monogamy',
 'influential',
 'Forbes',
 '2000s',
 'Forbes',
 'modern-day feminist',
 '2013 and 2014',
 '118 million',
 '60 million',
 '118 million',
 '20',
 'Forbes',
 "Destiny's Child",
 "her mother's maiden name",
 'African-American',
 'Methodist',
 'Xerox',
 'hairdresser and salon owner',
 'Solange',
 'Joseph Broussard',
 'Xerox',
 'salon',
 'Solange',
 'Joseph Broussard.',
 'Methodist',
 'Fredericksburg',
 'Darlette Johnson',
 'Houston',
 'dance instructor Darlette Johnson',
 "St. John's United Methodist Church",
 'music magnet school',
 'Imagine'

In [13]:
# Print both list lengths, one per line; they should match because every
# question is stored together with exactly one answer.
print(len(questions), len(answers), sep="\n")

86821
86821


## Tokenize Question and Answers

### Initialize Tokenizer and Use the End-of-Sequence Token for Padding

In [14]:
# Load the pretrained 'gpt2' tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token so that padded
# encodings can be requested from the tokenizer later on.
tokenizer.pad_token = tokenizer.eos_token

Sample Question

In [15]:
# First collected question; used as the running example in the next cells.
questions[0]

'When did Beyonce start becoming popular?'

Tokenized Version

In [16]:
# Token ids the tokenizer produces for the sample question.
tokenizer.encode(questions[0])

[2215, 750, 37361, 344, 923, 5033, 2968, 30]

Decoding after Tokenizing

In [17]:
# Round trip: decoding the encoded sample should give back the original text.
tokenizer.decode(tokenizer.encode(questions[0]))

'When did Beyonce start becoming popular?'

Encode Question/Answer Pairs using Batch Encode to obtain attention mask, input_ids etc.

In [18]:
# Pass the answers explicitly as `text_pair`. In `batch_encode_plus` the second
# positional parameter is `add_special_tokens`, so the previous call
# `batch_encode_plus(questions, answers, ...)` encoded the questions only.
# Padding to the longest pair (instead of "max_length", which without an
# explicit max_length means the model maximum) keeps the encoded arrays small,
# and truncation guards against pairs longer than the model accepts.
# NOTE(review): the GPT-2 tokenizer appears to join the two segments without a
# separator token - confirm this is the intended training format.
input_sequences = tokenizer(
    questions,
    text_pair=answers,
    padding="longest",
    truncation=True,
)

In [19]:
class BatchEncodingDataset(torch.utils.data.Dataset):
    """Dataset view over an already-tokenized batch.

    Each item is a dict with the ``input_ids`` and ``attention_mask`` entries
    of one encoded example, converted to ``torch`` tensors on access.
    """

    def __init__(self, batch_encoding: BatchEncoding):
        """Store a reference to the tokenizer output; nothing is copied."""
        self.batch_encoding = batch_encoding

    def __len__(self):
        """Return the number of encoded examples (length of ``input_ids``)."""
        input_ids = self.batch_encoding['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        """Return the ``input_ids`` / ``attention_mask`` tensors of example ``idx``."""
        fields = ('input_ids', 'attention_mask')
        return {
            name: torch.tensor(self.batch_encoding[name][idx])
            for name in fields
        }

In [20]:
# Wrap the tokenizer output so it can be indexed one example at a time; this
# object is what the Trainer receives as its training dataset.
train_dataset = BatchEncodingDataset(input_sequences)

# Obtain Model

In [21]:
# Load the pretrained 'gpt2' checkpoint into the LM-head model class; this is
# the model that gets fine-tuned below.
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [22]:
# Collator that batches dataset items for the Trainer. mlm=False turns off
# masked-language-model masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training the model

In [23]:
# Fine-tune the model
training_args = TrainingArguments(
    output_dir='./results',  # output directory
    num_train_epochs=1,  # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir='./logs',  # directory for storing logs
    logging_steps=1000,
    # TrainingArguments has no `optimizer_args` field (passing one raises
    # TypeError), so the optimizer hyper-parameters use their dedicated fields.
    learning_rate=5e-5,  # initial learning rate of the optimizer
    adam_epsilon=1e-8,  # epsilon term of the Adam-family optimizer
    # NOTE(review): the former `correct_bias=False` has no TrainingArguments
    # counterpart; build the optimizer yourself and pass it through
    # Trainer(optimizers=...) if that setting is really required.
)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    data_collator=data_collator,          # data collator
)

trainer.train()

***** Running training *****
  Num examples = 86821
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5427
  Number of trainable parameters = 124439808

KeyboardInterrupt



In [None]:
# Save the fine-tuned model
trainer.save_model('./model-1')
# The Trainer was created without a tokenizer, so write the tokenizer files
# (including the customised pad token) next to the weights; the directory can
# then be reloaded on its own with from_pretrained('./model-1').
tokenizer.save_pretrained('./model-1')