# Fine Tune GPT-2 Model

Open Questions:
- Is it useful to add \<bot> statement as preparing step?

Hint: Use the dialogs.txt file to train the model on google colab.

### Imports

In [2]:
!python --version

Python 3.9.1


In [4]:
#!python -m pip install torch
#!python -m pip install transformers

In [32]:
#from transformers import GPT2Tokenizer, GPT2LMHeadModel
import pandas as pd
import os

import json

import transformers
import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam

### Load and Prepare the data

In [47]:
class Dialog_Data(Dataset):
    
    def __init__(self, tokenizer, data_dir_path="./data", read_one_file=False):
        self.tokenizer = tokenizer
        self.data_dir_path = data_dir_path
        self.read_data(read_one_file, read_one_file)

    def read_data(self, data_dir_path, read_one_file, should_save_as_one_file=True):
        data = []
        if read_one_file:
            with open("./dialogs.txt", "r") as f:
                raw = f.read()
            for dialog in raw.split(";"):
                data += [dialog]
        else:
            for dialog in os.listdir(self.data_dir_path):
                    with open(f"{self.data_dir_path}/{dialog}", "r") as f:
                        for idx, line in enumerate(f.read().split("\n\n")):
                            content = ":".join(line.split(":")[1:]).strip()
                            if idx == 0:
                                data += [f"Create me a unique interactive story to calm with the topic: {content}"]
                            else:
                                data += [content]
            if should_save_as_one_file:
                save_data = ""
                for idx, elem in enumerate(data):
                    if idx == 0:
                        save_data += f"{elem}"
                    else:
                        save_data += f";{elem}"
                with open("./dialogs.txt", "w") as f:
                    f.write(save_data)

        # add markers: 
        for idx in range(0, len(data)-1):    # last elem should be skipped
            data[idx] = f"<startofstring> {data[idx]} <bot>:{data[idx+1]} <endofstring>"

        self.data = data[:-1]
        self.encoded_data = self.tokenizer(self.data, truncation=True)

        self.input_ids = self.encoded_data['input_ids']
        self.attention_mask = self.encoded_data['attention_mask']
 
    def __len__(self):
        return len(self.data)
 
    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_mask[idx])

In [42]:
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({  "pad_token": "<pad>",
                                "bos_token": "<startofstring>",
                                "eos_token": "<endofstring>"})
tokenizer.add_tokens(["<bot>:"])

1

In [48]:
data = Dialog_Data(tokenizer=tokenizer)
data = DataLoader(data, batch_size=64)

### Load pretrained model

In [36]:
model = transformers.GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

Embedding(50261, 768)

In [9]:
config = transformers.GPT2Config.from_pretrained("gpt2")
config.do_sample = config.task_specific_params['text-generation']['do_sample']
config.max_length = config.task_specific_params['text-generation']['max_length']
model = transformers.GPT2LMHeadModel.from_pretrained("gpt2", config=config)

### First test

In [12]:
tokenizer.decode(model.generate(tokenizer.encode("Create me an interactive story to calm me down.", return_tensors="pt"))[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"Create me an interactive story to calm me down.\n\nI'm an experienced writer and I've been writing for a long time and it's something to take a deep breath in. You always say you can't write your own dialogue without having done"

In [None]:
enc = tokenizer(['The elf queen'], return_tensors='pt')
print('enc =', enc)
print(tokenizer.batch_decode(enc['input_ids']))

In [None]:
out = model.generate(input_ids=enc['input_ids'],
attention_mask=enc['attention_mask'], max_length=20)
print('out=', out)
print(tokenizer.batch_decode(out))

### Fine Tune Model

In [None]:

optimizer = Adam(model.parameters(), lr=0.0001)
epochs=10

loss_hist = []
steps = 0

for cur_epoch in range(0, epochs):
    model.train()
    for input_ids, attention_mask in data:
        # shift labels?
        optimizer.zero_grad()
        loss = model(input_ids, attention_maks=attention_mask, labels=input_ids).loss
        loss_hist += [loss.item()]
        loss.backward()
        optimizer.step()
        steps += 1
    torch.save(model.state_dict(), "./model_state.pt")
    print(f'Epoch {cur_epoch+1}/{epochs}, Training Loss: {loss.item():.4f}, Steps: {steps}')

# plot loss
fig, ax = plt.subplots(1, 1, figsize=(16, 8))
ax.plot(np.arange(len(loss_hist)), loss_hist, label='Loss')
ax.set_xlabel('Learning progress')
ax.set_ylabel('Loss (normalized mean absolute error)')
ax.set_title('Loss over time')
ax.legend()
ax.grid()

In [None]:
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

### Save model

### Use the model

In [None]:
def inference(prompt:str, tokenizer):
    prompt = f"<startofstring> {data[idx]} <bot>:"
    prompt = tokenizer(prompt)
    output = model.generate(prompt)
    return tokenizer.decode(output[0])