# PapuGaPT2

Useful resources:<br>
https://towardsdatascience.com/how-to-fine-tune-gpt-2-for-text-generation-ae2ea53bc272 <br>
https://towardsdatascience.com/conditional-text-generation-by-fine-tuning-gpt-2-11c1a9fc639d <br>

In [1]:
from transformers import TextDataset, AutoTokenizer, AutoModelForCausalLM, \
pipeline, set_seed, Trainer, AdamW, get_linear_schedule_with_warmup, GPT2Config, \
GPT2LMHeadModel

import datasets
import time
import torch
import random
import matplotlib.pyplot as plt
import seaborn as sn
import numpy as np

from tqdm import tqdm
from datetime import datetime
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler

In [2]:
# Notebook-wide configuration.
# NOTE(review): in the cells visible here `path_to_file` is only referenced by
# the commented-out `datasets.load_dataset` experiment; the corpus-loading cell
# opens its own hard-coded relative path instead.
path_to_file = "data/songs_data.txt"
BATCH_SIZE = 16  # batch size handed to the training DataLoader
RANDOM_SEED = 123  # value passed to every RNG seeding call
MAX_LEN = 512  # token length each example is truncated/padded to
EPOCHS = 2  # number of passes over the training DataLoader

In [3]:
# Load the pretrained papuGaPT2 causal language model and its tokenizer.
model = AutoModelForCausalLM.from_pretrained('flax-community/papuGaPT2')
tokenizer = AutoTokenizer.from_pretrained('flax-community/papuGaPT2')
# Special tokens used to delimit and pad the training examples.
special_tokens_dict = {
     'bos_token': '<BOS>', 
     'eos_token': '<EOS>', 
     'unk_token': '<UNK>',
     'pad_token': '<PAD>'}
# The return value is the number of tokens that were new to the vocabulary;
# the model's embedding matrix has to be resized to match (done in the next cell).
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
print(f"Added {num_added_tokens} new tokes.")

# Seed every random number generator the notebook uses so runs are repeatable.
set_seed(RANDOM_SEED) # reproducibility
torch.cuda.manual_seed_all(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
# Last expression of the cell: its return value (a torch Generator) is what
# the notebook displays as the cell output.
torch.manual_seed(RANDOM_SEED)

Added 4 new tokes.


<torch._C.Generator at 0x7fa61656ebd0>

In [4]:
# Grow the embedding matrix so it matches the tokenizer's current vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# Start from a fully frozen model, then switch gradients back on only for
# the parts that should be fine-tuned.
model.requires_grad_(False)

# Transformer blocks with index 6 and above are trainable; blocks 0-5 stay frozen.
for block in model.transformer.h[6:]:
    block.requires_grad_(True)

# The final layer norm and the LM head (which outputs the scores over the
# vocabulary) are trainable as well.
for module in (model.transformer.ln_f, model.lm_head):
    module.requires_grad_(True)

In [5]:
# Read the lyrics corpus; every line that is not a bare newline becomes one
# training sentence (the trailing newline of each line is kept).
with open("./../../data/songs_data.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()
sentences = [row for row in lines if row != "\n"]

print(f"number of input sentences = {len(sentences)}")

# Keep only the first half of the sentences for training.
half = len(sentences) // 2
sentences = sentences[:half]

number of input sentences = 190585


In [6]:
class DiscoDataset(Dataset):
    """In-memory dataset of tokenized text examples for causal LM fine-tuning.

    Each element of ``data`` is wrapped as ``<BOS>`` + text + ``<EOS>``,
    tokenized, and truncated/padded to ``max_length`` at construction time.

    Args:
        data: Iterable of raw text strings, one training example each.
        tokenizer: Callable tokenizer returning a mapping with ``'input_ids'``
            and ``'attention_mask'``. It is assumed to already know the
            ``<BOS>``/``<EOS>`` tokens and to have a padding token set.
        gpt2_type: Unused; kept so existing callers keep working.
        max_length: Length every example is truncated/padded to.
    """

    def __init__(self, data, tokenizer, gpt2_type='gpt2', max_length=MAX_LEN):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []

        for text in tqdm(data):
            # Encode ONE sequence per example. A second positional string is
            # interpreted by the tokenizer as `text_pair` and appended to the
            # first, which would put the same lyrics into every example twice.
            encodings_dict = tokenizer('<BOS>' + text + '<EOS>',
                                       truncation=True,
                                       max_length=max_length,
                                       padding='max_length')

            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensors of example ``idx``."""
        return self.input_ids[idx], self.attn_masks[idx]
        
# Tokenize the selected sentences once, up front.
songs_dataset = DiscoDataset(sentences, tokenizer, max_length=MAX_LEN)

# Serve the tokenized examples in randomly ordered mini-batches.
songs_dataset_train_dataloader = DataLoader(
    dataset=songs_dataset,
    batch_size=BATCH_SIZE,
    sampler=RandomSampler(songs_dataset),
)

100%|██████████| 95292/95292 [00:47<00:00, 2007.11it/s]


In [7]:
# Pull a single batch from the training DataLoader to inspect its structure.
a = next(iter(songs_dataset_train_dataloader))

In [8]:
# Number of tensors in the batch (the dataset yields input ids and attention masks).
len(a)

2

In [9]:
# Shape of the batched input ids.
a[0].shape

torch.Size([16, 512])

In [10]:
# Shape of the batched attention masks.
a[1].shape

torch.Size([16, 512])

In [None]:
# helper function for logging time
def format_time(elapsed):
    """Format a duration given in seconds as an ``H:MM:SS`` string.

    Args:
        elapsed: Duration in seconds (int or float); rounded to whole seconds.

    Returns:
        ``str(timedelta)`` of the rounded duration, e.g. ``'1:01:01'``.
    """
    # Imported locally because the file-level name `datetime` is the
    # `datetime.datetime` class, which has no `timedelta` attribute.
    from datetime import timedelta

    return str(timedelta(seconds=int(round(elapsed))))

# hyperparameters
learning_rate = 1e-4
eps = 1e-8
warmup_steps = 50


optimizer = AdamW(model.parameters(), lr=learning_rate, eps=eps)
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
total_steps = len(songs_dataset_train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)

start_time = time.time()
total_steps_epoch = total_steps // EPOCHS
# Run on whichever device the model already lives on (a no-op move on CPU).
device = next(model.parameters()).device

for epoch_i in range(0, EPOCHS):

    print(f'Epoch {epoch_i + 1} of {EPOCHS}')

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(songs_dataset_train_dataloader):

        # Progress log every 10 batches.
        if step % 10 == 0:
            curr_time = datetime.now().strftime("%H:%M:%S")
            print(f"Step {step} out of {total_steps_epoch} steps ({curr_time}, epoch = {epoch_i}).")

        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)

        # Labels are the inputs themselves (causal LM), except that padded
        # positions are set to -100 so they are excluded from the loss.
        # Without this the loss is also computed on the <PAD> tokens that
        # fill every example up to the maximum length.
        b_labels = b_input_ids.clone()
        b_labels[b_masks == 0] = -100

        model.zero_grad()

        outputs = model(b_input_ids,
                        labels=b_labels,
                        attention_mask=b_masks,
                        token_type_ids=None)

        # With labels supplied, the first output element is the loss.
        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(songs_dataset_train_dataloader)
    training_time = format_time(time.time() - t0)

    print(f'Average Training Loss: {avg_train_loss}. Epoch Training Time: {training_time}')

print(f'Total Training Time: {format_time(time.time()-start_time)}')

#torch.save(model.state_dict(), "models/PapuGaPT2_finetuned/PapuGaPT2_finetuned.pth")



Epoch 1 of 2
Step 0 out of 5956 steps (12:54:48, epoch = 0).


In [None]:
# Scratch expression; its value is not assigned or used by any other cell.
2+2

In [None]:
#dataset = datasets.load_dataset("text", data_files=path_to_file)

In [None]:
#dataset

In [None]:
# train_data = dataset.map(
#     lambda example: tokenizer(example['text'], padding=True, truncation=True, max_length=512),#, return_tensors="pt"
#     batched=True,
#     batch_size=16
# )
# train_data = train_data.remove_columns(["text"])
# train_data.set_format("torch")

In [None]:
#model.resize_token_embeddings(len(tokenizer))

In [None]:
#model.resize_token_embeddings(len(tokenizer))

In [None]:
# from transformers import DataCollatorForTokenClassification
# data_collator = DataCollatorForTokenClassification(tokenizer)


# trainer = Trainer(
#     model=model,
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     #args=TrainingArguments(output_dir="/content/drive/MyDrive/NLP/model"),
#     train_dataset=train_data["train"]
# )
# trainer.train()

In [None]:
"""
preprocessing - one song is one input 


songs = []
with open("data/songs_data.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()
    song = ""
    for line in lines:
        if line == "\n":
            songs.append(song)
            song = ""
        else:
            song += line
            song += " "
songs.append(song)
print(f"number of songs = {len(songs)}")

ls = [len(i) for i in songs]
#fig, ax = plt.subplots(1, 1, figsize=(4,4))
sn.displot(ls)
plt.show()

old_songs = songs.copy()
songs = []
for i in old_songs:
    if len(i)>0:
        songs.append(i)
        
print(f"number of songs afer removing 0s = {len(songs)}")
ls = [len(i) for i in songs]
sn.displot(ls)
plt.show()
"""