In [None]:
#Install libraries
!pip install datasets
!pip install transformers
!pip install wandb

In [None]:
# Weights & Biases: create an account at https://wandb.ai/ first, then paste
# your access token when wandb.login() asks for it.
import wandb
wandb.login()


In [None]:
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
import torch
from torch.utils.data import random_split

In [None]:
# Moral Stories corpus from the Hugging Face Hub, "full" configuration:
# https://huggingface.co/datasets/demelin/moral_stories
dataset = load_dataset(path='demelin/moral_stories', name='full')

In [None]:
# Read the "norm" column once, then carve out two NON-overlapping pieces:
# the first 10,000 norms for training and (up to) the last 2,000 of the
# remaining rows for testing. The previous `[:-2000]` slice selected everything
# *except* the last 2,000 rows, so the test set contained the training data.
norms = dataset['train']['norm']
X_train = norms[:10000]
X_test = norms[10000:][-2000:]
print("Total Dataset - ", len(dataset['train']))
print("Train Dataset - ",len(X_train),"Test Dataset - ",len(X_test))

In [None]:
#Define tokenizer and model
# Register the start/end/pad markers used by this notebook on the tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of failing on a hard-coded .cuda() call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
# The tokenizer gained new tokens above, so the embedding matrix must grow to match.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Longest training example, measured on exactly the string the dataset class
# tokenizes (the text wrapped in the start/end markers). Measuring the bare text
# undercounts by the marker tokens, so truncation to max_length would cut the
# end-of-text token off the longest examples.
max_length = max(len(tokenizer.encode('<|startoftext|>' + x + '<|endoftext|>')) for x in X_train)

In [None]:
class moral():
    """Pre-tokenized texts, indexable as ``(input_ids, attention_mask)`` pairs.

    Every text is wrapped in the '<|startoftext|>' / '<|endoftext|>' markers and
    handed to ``tokenizer`` with truncation and padding to ``max_length``; the
    returned id and mask sequences are stored as tensors.
    """

    def __init__(self, x, tokenizer, max_length):
        """Tokenize all texts up front.

        Args:
            x: iterable of strings to encode.
            tokenizer: callable returning a mapping that has 'input_ids' and
                'attention_mask' entries.
            max_length: forwarded to the tokenizer as the truncation/padding length.
        """
        # `labels` is created but never filled by this class.
        self.input_ids, self.attn_masks, self.labels = [], [], []
        for text in x:
            wrapped = ''.join(('<|startoftext|>', text, '<|endoftext|>'))
            encoded = tokenizer(wrapped, truncation=True, max_length=max_length,
                                padding="max_length")
            self.input_ids.append(torch.tensor(encoded['input_ids']))
            self.attn_masks.append(torch.tensor(encoded['attention_mask']))

    def __len__(self):
        """Number of encoded texts."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensors stored at ``idx``."""
        ids = self.input_ids[idx]
        mask = self.attn_masks[idx]
        return ids, mask


In [None]:
# Keep the tokenized data under its own name: rebinding `dataset` here would
# shadow the Hugging Face dataset loaded earlier and break any re-run of the
# cell that slices it into X_train / X_test.
moral_dataset = moral(X_train, tokenizer, max_length=max_length)
train_size = int(0.8 * len(moral_dataset))
# Seed the split so the 80/20 train/validation partition is the same on every run.
train_dataset, val_dataset = random_split(
    moral_dataset,
    [train_size, len(moral_dataset) - train_size],
    generator=torch.Generator().manual_seed(42),
)

In [None]:
#Define Training parameters
# The learning rate was written as 50**-5, which evaluates to 3.2e-9 and leaves
# the weights practically unchanged; 5e-5 (presumably what was meant, and the
# usual fine-tuning rate) is used instead.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    logging_steps=100,
    save_steps=5000,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=5e-5,
    warmup_steps=10,
    weight_decay=0.05,
    logging_dir='./logs',
    report_to="wandb",
    run_name="gpt-2-test",
)

In [None]:
# %env WANDB_WATCH=all
# %env WANDB_SILENT=true

In [None]:
#Train the model and track the run in wandb. Checkpoints go to the output_dir
#configured in training_args (every save_steps steps).
def collate_batch(batch):
    """Stack ``(input_ids, attention_mask)`` pairs into the dict the model consumes.

    Labels are a copy of the input ids in which every padded position
    (attention mask == 0) is replaced by -100, the index the language-model
    loss ignores, so the model is not trained to predict padding.
    """
    input_ids = torch.stack([ids for ids, _ in batch])
    attention_mask = torch.stack([mask for _, mask in batch])
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset,
                  eval_dataset=val_dataset, data_collator=collate_batch)
trainer.train()
wandb.finish()


In [None]:
#Inference using sample data
# Place the prompt ids on whatever device the model lives on instead of assuming CUDA.
tokens = tokenizer("It's responsible to", return_tensors="pt").input_ids.to(model.device)

In [None]:
#Sample five continuations of the prompt (generate returns token ids, not logits)
# Training leaves the model in train mode; switch to eval so dropout is off while sampling.
model.eval()
# pad_token_id is passed explicitly so shorter samples in the batch are padded
# with the tokenizer's own pad token.
predToken = model.generate(tokens, do_sample=True, top_k=50,
                           max_length=300, top_p=0.95, temperature=1.9,
                           num_return_sequences=5, pad_token_id=tokenizer.pad_token_id)

In [None]:
#Decoding predicted sentence
# Use a separate loop variable: reusing `predToken` would overwrite the batch of
# generated sequences with its last row, so re-running this cell would decode
# single tokens instead of whole sentences.
for i, sequence in enumerate(predToken):
    print("{}: {}".format(i, tokenizer.decode(sequence, skip_special_tokens=True)))

In [None]:
#Evaluation - generate one continuation per sentence, prompted with the first half of its words
def text_generation(test_data):
  """Complete each sentence in ``test_data`` from its first half.

  Each sentence is split on single spaces; the first half of the words (never
  fewer than one) is the prompt, and one sampled continuation is produced with
  the notebook-level ``model`` and ``tokenizer``.

  Args:
    test_data: iterable of strings (a list or a column of sentences; iterating
      a whole DataFrame would yield column names, not rows).

  Returns:
    list of str: decoded text (prompt followed by the continuation), one entry
    per input sentence, in input order.
  """
  model.eval()  # dropout must be off while sampling
  generated = []
  for sentence in tqdm(test_data):
    words = sentence.split(" ")
    # Keep at least one word: a one-word sentence would otherwise give an
    # empty prompt, which generate() cannot start from.
    prompt = ' '.join(words[:max(1, len(words) // 2)])
    if not prompt.strip():
      # Blank sentence: start from the tokenizer's start-of-text marker.
      prompt = tokenizer.bos_token
    # Send the prompt to the model's own device rather than assuming CUDA.
    tokens = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    output = model.generate(tokens, do_sample=True, top_k=50, max_length=300,
                            top_p=0.95, temperature=1.9, num_return_sequences=1)
    generated.append(tokenizer.decode(output[0], skip_special_tokens=True))
  return generated

#Generate a continuation for each of the first 20 test sentences
TestGen = text_generation(test_data=X_test[:20])

In [None]:
#Using BLEU score to compare the real sentence endings with the generated ones
import statistics
from nltk.translate.bleu_score import sentence_bleu

scores=[]

for generated, test in tqdm(zip(TestGen, X_test), total=len(TestGen)):
  # The generation prompt was the first half of the real sentence's words
  # (never fewer than one word), so both texts start with that prefix.
  # Only what follows the prefix is scored.
  testWords = test.split(" ")
  promptLen = max(1, len(testWords) // 2)
  reference = testWords[promptLen:]              # real ending (ground truth)
  candidate = generated.split(" ")[promptLen:]   # ending written by the model
  if not reference or not candidate:
    continue  # nothing to compare for this pair
  # sentence_bleu expects lists of tokens (a plain string would be scored
  # character by character); weights=(1,) scores unigram precision only.
  scores.append(sentence_bleu([reference], candidate, weights=(1,)))

if scores:
  print('Bleu score - ',statistics.mean(scores))
else:
  print('Bleu score - ', 'n/a (no sentence pairs to score)')