In [1]:
#Install libraries
!pip install datasets
!pip install transformers
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 4.8 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 66.2 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 65.5 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 68.4 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 72.5 MB/s 
Installing collected p

In [2]:
#Wandb - Create and login to https://wandb.ai/ and paste the access token
import wandb
wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [3]:
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
import torch
from torch.utils.data import random_split

In [4]:
#Load dataset from hugging face - https://huggingface.co/datasets/demelin/moral_stories
dataset=load_dataset('demelin/moral_stories','full')

Downloading builder script:   0%|          | 0.00/11.4k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/69.2k [00:00<?, ?B/s]

Downloading and preparing dataset moral_stories/full (download: 7.64 MiB, generated: 6.37 MiB, post-processed: Unknown size, total: 14.02 MiB) to /root/.cache/huggingface/datasets/demelin___moral_stories/full/1.1.0/47de080a74d44a1d4785a2c16fe6c7a978ef218fb0dc319d8392d22337f7b806...


Downloading data:   0%|          | 0.00/8.02M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12000 [00:00<?, ? examples/s]

Dataset moral_stories downloaded and prepared to /root/.cache/huggingface/datasets/demelin___moral_stories/full/1.1.0/47de080a74d44a1d4785a2c16fe6c7a978ef218fb0dc319d8392d22337f7b806. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
X_train = dataset['train']['norm'][:10000]
X_test = dataset['train']['norm'][-2000:]
print("Total Dataset - ", len(dataset['train']))
print("Train Dataset - ",len(X_train),"Test Dataset - ",len(X_test))

Total Dataset -  12000
Train Dataset -  10000 Test Dataset -  2000


In [7]:
X_test[1]

"You shouldn't delight in other people's misfortune"

In [8]:
#Define tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained('gpt2').cuda()
model.resize_token_embeddings(len(tokenizer))

Downloading vocab.json:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/523M [00:00<?, ?B/s]

Embedding(50259, 768)

In [9]:
max_length = max([len(tokenizer.encode(x)) for x in X_train])

In [10]:
class moral():
    def __init__(self, x, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in x:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]


In [11]:
dataset = moral(X_train, tokenizer, max_length=max_length)
train_size = int(0.8 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [15]:
#Define Training parameters
training_args = TrainingArguments(output_dir='./results', num_train_epochs=1, logging_steps=100, save_steps=5000,
                                  per_device_train_batch_size=1, per_device_eval_batch_size=1,
                                  warmup_steps=10, weight_decay=0.05, logging_dir='./logs',report_to="wandb",  run_name="gpt-2-test" )

PyTorch: setting up devices


In [16]:
# %env WANDB_WATCH=all
# %env WANDB_SILENT=true

In [17]:
#Train the model and tracking the performance in wandb. After training the model is saved in results folder
Trainer(model=model, args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()
wandb.finish()


***** Running training *****
  Num examples = 8000
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 8000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
100,7.7182
200,2.0175
300,1.8183
400,2.104
500,1.7865
600,1.8617
700,1.6825
800,1.7498
900,1.6572
1000,1.7619


Saving model checkpoint to ./results/checkpoint-5000
Configuration saved in ./results/checkpoint-5000/config.json
Model weights saved in ./results/checkpoint-5000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▁▁▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▃▄▄▄▁▁▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇███
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁██▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁
train/loss,█████████████▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,1.0
train/global_step,8000.0
train/learning_rate,0.0
train/loss,1.5747
train/total_flos,69405696000000.0
train/train_loss,1.69659
train/train_runtime,739.58
train/train_samples_per_second,10.817
train/train_steps_per_second,10.817


In [18]:
#Inference using sample data
tokens = tokenizer("Parents are allowed to", return_tensors="pt").input_ids.cuda()

In [19]:
#Predicted logits with inference parameters
predToken = model.generate(tokens, do_sample=True, top_k=50, 
                          max_length=300, top_p=0.95, temperature=1.9, num_return_sequences=5)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [20]:
#Decoding predicted sentence
for i, predToken in enumerate(predToken):
    print("{}: {}".format(i, tokenizer.decode(predToken, skip_special_tokens=True)))

0: Parents are allowed to look at children to their will.
1: Parents are allowed to be open in personal disputes.
2: Parents are allowed to stay within marriage agreement over a living situation.
3: Parents are allowed to stay within normal orders they like from each parent.
4: Parents are allowed to drive in an old family tradition at gatherings.


In [23]:
#Evaluation - Function to generate multiple sentences. Test data should be a dataframe
def text_generation(test_data):
  generated = []
  for sentence in tqdm(test_data):
    sentenceList = sentence.split(" ")
    partialSentence = sentenceList[:(len(sentenceList)//2)]
    partialSentence = ' '.join(partialSentence)
    tokens = tokenizer(partialSentence, return_tensors="pt").input_ids.cuda()
    output = model.generate(tokens, do_sample=True, top_k=50,max_length=300, top_p=0.95, temperature=1.9,num_return_sequences=1)
    genText = tokenizer.decode(output[0], skip_special_tokens=True)
    generated.append(genText)
  return generated

#Run the functions to generate the lyrics
TestGen = text_generation(X_test)

  0%|          | 0/2000 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 1/2000 [00:00<08:01,  4.15it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 2/2000 [00:00<09:01,  3.69it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 3/2000 [00:00<06:44,  4.94it/s]The attention mask and the pad token id were not set. As a con

In [25]:
#Using BLEU score to compare the real sentences with the generated ones
import statistics
from nltk.translate.bleu_score import sentence_bleu

scores=[]

for generated,test in tqdm(zip(TestGen,X_test)):
  splitGenerated=generated.split(" ")
  reference = [item for item in splitGenerated if item not in splitGenerated[:len(splitGenerated)//2]]
  reference = ' '.join(reference)
  splitTest = test.split(" ")
  candidate = [item for item in test if item not in test[:len(test)//2]]
  candidate = ' '.join(candidate)
  bleu = sentence_bleu([reference], candidate, weights = [1])
  scores.append(bleu)

print('Bleu score - ',statistics.mean(scores))

2000it [00:00, 16399.12it/s]

Bleu score -  0.19337660002437043



