### Installing required packages

In [1]:
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!pip install transformers
!pip install rouge
!pip install sentencepiece
!pip install evaluate

### Importing libraries for further use

In [2]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"

from transformers import set_seed
set_seed(42)

import time
import textwrap
import sentencepiece

import nltk
nltk.download('punkt')
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from evaluate import load

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Text generation using pre-trained large language models

This notebook is the final practical project for the course "Natural Language Processing" at the Faculty of Mathematics and Informatics, Vilnius University, 2023. The main goal of the project is to generate text conditioned on the beginning of a sentence. Several pre-trained models (`GPT2`, `GPT2-xl`, `T5` and `XL-Net`) are employed for this task. First, different decoding methods are explored to find good generation parameters. The text is then generated with each transformer model using the best parameters found, and the quality of the generated results is evaluated using the perplexity score.

### Initialization of the models and tokenizers

In [41]:
# Define a function to initialize 1 of 4 pre-trained models; the function
# returns the model and the tokenizer

def initialize_model(name, device):
  match name:
    # gpt2-mini model
    case 'gpt2':
      from transformers import GPT2Tokenizer, GPT2LMHeadModel

      tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
                                                padding_side="left")
      tokenizer.pad_token = tokenizer.eos_token
      model = GPT2LMHeadModel.from_pretrained('gpt2',
                                              pad_token_id=tokenizer.eos_token_id).to(device) # Add the EOS token as PAD token to avoid warnings
    # gpt2 model
    case 'gpt2-xl':
      from transformers import GPT2Tokenizer, GPT2LMHeadModel

      tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl',
                                                padding_side="left")
      tokenizer.pad_token = tokenizer.eos_token
      model = GPT2LMHeadModel.from_pretrained('gpt2-xl',
                                              pad_token_id=tokenizer.eos_token_id).to(device)
    # T5 model
    case 't5':
      from transformers import T5Tokenizer, T5ForConditionalGeneration

      tokenizer = T5Tokenizer.from_pretrained('t5-base')
      model = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)

    # xl-net model
    case 'xl-net':
      from transformers import XLNetTokenizer, XLNetLMHeadModel

      tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
      model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased').to(device)

    case '_':
      tokenizer = []
      model = []
      print('Model and Tokenizer not initialized. Returning empty lists')

  return tokenizer, model

### Additional functions for printing the results in a type-like effect

In [46]:
# A function that delays the output to make the type-like effect
def type_like_effect(text, delay=0.025):
    """Print `text` character by character to imitate typing.

    Args:
        text: The characters to print, in order.
        delay: Seconds to sleep after each printed character.
    """
    for symbol in text:
        # No newline, and flush right away so each character appears on its own
        print(symbol, end='', flush=True)
        time.sleep(delay)

    # Terminate the line once everything has been "typed"
    print()

# A function to decode the generated output of a model
def decode_result(output, tokenizer):
    """Turn generated token ids into text without a dangling unfinished sentence.

    Args:
        output: The generated token ids of one sequence.
        tokenizer: Tokenizer whose `decode` method is used; special tokens are skipped.

    Returns:
        The decoded text after it has been passed through `truncate_until_period`.
    """
    return truncate_until_period(
        tokenizer.decode(output, skip_special_tokens=True)
    )

# A function that prints the output with a type-like effect surrounded by a box
def print_result(output, tokenizer):
    """Decode one generated sequence and print it inside a 120-character-wide box.

    The top and bottom borders are printed instantly; every text row is printed
    with the type-like effect.

    Args:
        output: The generated token ids of one sequence.
        tokenizer: Tokenizer passed on to `decode_result`.
    """
    box_width = 120
    text_width = box_width - 4  # leave room for the "| " and " |" edges
    horizontal_rule = '-' * box_width

    # Decode, then wrap so that every row fits between the box edges
    boxed_text = textwrap.fill(decode_result(output, tokenizer), text_width)

    print(horizontal_rule)
    for row in boxed_text.split('\n'):
        # Left-align and pad each row to the full inner width
        type_like_effect(f"| {row:<{text_width}} |")
    print(horizontal_rule)

# A function that truncates the sentence if it does not end on a period
def truncate_until_period(sentence):
    """Drop a trailing unfinished sentence from `sentence`.

    The text is split on whitespace and scanned backwards for the last word
    ending in '.', '?' or '!'. Everything after that word is removed.

    The scan is a single loop. The earlier recursive version recursed once per
    trailing word, so a long unfinished tail could hit Python's recursion
    limit and cost quadratic time in re-splitting and re-joining.

    Args:
        sentence: The text to clean up.

    Returns:
        * `sentence` unchanged if it is empty, contains no word ending in
          '.', '?' or '!', or already ends on such a word.
        * Otherwise the words up to and including the last one that ends in
          '.', '?' or '!', joined by single spaces.
    """
    sentence_endings = '.?!'
    words = sentence.split()
    last_position = len(words) - 1

    # Walk backwards to the last word that closes a sentence
    for position in range(last_position, -1, -1):
        if words[position][-1] in sentence_endings:
            if position == last_position:
                # Already ends on a terminator: keep the original text untouched
                return sentence
            return ' '.join(words[:position + 1])

    # Empty input or no terminator anywhere: nothing sensible to cut
    return sentence


## Searching for the best decoding method for language generation with GPT2

Initializing the `GPT2` model and its tokenizer. Entering beginning of a sentence and encoding the text input.

In [6]:
# Beginning of the sentence that the model is asked to continue
input_text = "My favorite food is "

# Load the 'gpt2' checkpoint together with its tokenizer
tokenizer, model = initialize_model('gpt2', device)

# Encode the prompt as a PyTorch tensor of token ids and move it to the chosen device
input_tokens = tokenizer.encode(input_text, padding=True, return_tensors='pt')
input_tokens = input_tokens.to(device)

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

### 1. **Greedy search**

**Greedy search** is a simple text generation strategy. It selects the word with the highest probability as its next word at each time step.

In [7]:
# Greedy search: only the length limit and the number of returned sequences are set
greedy_settings = dict(max_new_tokens=25, num_return_sequences=1)

with torch.no_grad():  # inference only, no gradients needed
  greedy_output = model.generate(input_tokens, **greedy_settings)

print_result(greedy_output[0], tokenizer)

------------------------------------------------------------------------------------------------------------------------
| My favorite food is iced tea. I love it. I love it. I love it. I love it. I love it.                                 |
------------------------------------------------------------------------------------------------------------------------


The generated text seems to have a reasonable context, but starts *repeating* quickly. This is a frequent problem with these simple text generation models. We will try to mitigate this problem shortly. Major drawback of Greedy search is that it misses high probability words hidden behind a low probability word.

### 2. **Beam search**

**Beam search** reduces this problem by keeping the `num_beams` most likely hypotheses at each time step and eventually choosing the hypothesis with the highest overall probability.

In [8]:
# Beam search with 5 beams and early_stopping enabled
beam_settings = dict(max_new_tokens=25, num_beams=5, early_stopping=True)

with torch.no_grad():  # inference only, no gradients needed
  beam_output = model.generate(input_tokens, **beam_settings)

print_result(beam_output[0], tokenizer)

------------------------------------------------------------------------------------------------------------------------
| My favorite food is iced tea. I love it. I love it. I love it. I love it. I love it.                                 |
------------------------------------------------------------------------------------------------------------------------


The result is noticeably more fluent, however, the repetition still appears after some time steps. One way to reduce repetition is to introduce **n-gram penalties**.

### 3. **n-gram penalties**

The most common one makes sure that no n-gram appears twice, by manually setting to 0 the probability of any next word that would create an already seen n-gram.

In [9]:
# Beam search with the no_repeat_ngram_size constraint added (n = 2)
ngram_beam_settings = dict(
    max_new_tokens=40,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True,
)

with torch.no_grad():  # inference only, no gradients needed
  beam_output = model.generate(input_tokens, **ngram_beam_settings)

print_result(beam_output[0], tokenizer)

------------------------------------------------------------------------------------------------------------------------
| My favorite food is iced tea.  I'm not sure if I'll ever be able to make it again, but I'm going to try it and see   |
| what I can come up with.                                                                                             |
------------------------------------------------------------------------------------------------------------------------


The resulting text finally has no repetition in it! Nevertheless, n-grams should be used with care, as the penalty does not allow the repetition of any n-word sequences.

**Beam search** is effective in tasks where the length of the generation is predictable, i.e. machine translations or summarization. However, it struggles with generating text without repetition,  which is very hard to control using n-gram penalties.

High quality human language does not follow a distribution of highest
probability words [Ari Holtzman et al. 2019] (quite paradoxical), which indicates that blindly picking the most probable word in a sequence may not yield the best result.

### 4. **Sampling**

In order to generate more random output, instead of picking the most probable word, we need to **sample** the word from its (conditional) probability mass function (distribution).

In [10]:
# Pure sampling: do_sample switches sampling on, top_k=0 deactivates top-k
sampling_settings = dict(max_new_tokens=50, do_sample=True, top_k=0)

with torch.no_grad():  # inference only, no gradients needed
  sample_output = model.generate(input_tokens, **sampling_settings)

print_result(sample_output[0], tokenizer)

------------------------------------------------------------------------------------------------------------------------
| My favorite food is iced tea. Often it's the same story. Love it, love making it and laugh at it with my 6 year old  |
| Auctie. My year is always awesome!! This place is great *I absolutely adore Mr.                                      |
------------------------------------------------------------------------------------------------------------------------


The generated text does not seem to make much semantic sense. Models often generate incoherent text when words are sampled directly from their full distribution.

### 5. **Temperature**

To overcome this, distribution could be made sharper (decrease the probability of low probability candidates) by lowering the **temperature** parameter of the softmax function.

In [11]:
# Sampling with top-k deactivated and a temperature of 0.7
temperature_settings = dict(
    max_new_tokens=40,
    do_sample=True,
    top_k=0,
    temperature=0.7,
)

with torch.no_grad():  # inference only, no gradients needed
  sample_output = model.generate(input_tokens, **temperature_settings)

print_result(sample_output[0], tokenizer)

------------------------------------------------------------------------------------------------------------------------
| My favorite food is iced coffee, and I love that I can order fresh. I usually order a spicy or spicy-but-not-sour    |
| mixed with a cup of vanilla ice cream.                                                                               |
------------------------------------------------------------------------------------------------------------------------


The output seems less gibberish with the temperature applied. When setting temperature -> 0, the sampling becomes identical to that of Greedy search, making the same problems appear once again.

### 6. **Top-K Sampling**

Top-K sampling is a sampling scheme, which filters and redistributes the
probability mass among the K most likely next words [Fan et al. 2018].

In [12]:
# Top-K sampling with K = 45
top_k_settings = dict(max_new_tokens=40, do_sample=True, top_k=45)

with torch.no_grad():  # inference only, no gradients needed
  sample_output = model.generate(input_tokens, **top_k_settings)

print_result(sample_output[0], tokenizer)

------------------------------------------------------------------------------------------------------------------------
| My favorite food is iced. I have never been able to take a shower and just lay down, without putting anything down.  |
| My favorite way to pee in the bath is outside.                                                                       |
------------------------------------------------------------------------------------------------------------------------


The text sounds more human than the previous results. However, top-K sampling always keeps a fixed number of K candidate words; it does not adapt the size of the candidate set to the shape of the next-word probability distribution.

### 7. **Top-P Sampling**

Instead of sampling from K words, top-P sampling chooses from the smallest possible set of words whose cumulative probability exceeds P. The mass is then redistributed between these words.

In [13]:
# Top-P sampling with P = 0.92; top_k=0 keeps top-k deactivated
top_p_settings = dict(
    max_new_tokens=40,
    do_sample=True,
    top_p=0.92,
    top_k=0,
)

with torch.no_grad():  # inference only, no gradients needed
  sample_output = model.generate(input_tokens, **top_p_settings)

print_result(sample_output[0], tokenizer)

------------------------------------------------------------------------------------------------------------------------
| My favorite food is iced alfredo. Great price. I didn't have much to eat and was very impressed with the quality of  |
| the fresh ingredients. The service was great and it arrived quickly.                                                 |
------------------------------------------------------------------------------------------------------------------------


### 8. **Final Result**

To compare the quality of generated text, let us get multiple independently sampled outputs using the parameter **num_return_sequences**. Also, let's add a **repetition_penalty** parameter, set **no_repeat_ngram_size** to 5, and alter the **temperature**, **top-k** and **top-p** parameters.

In [14]:
# Sample 3 sequences with all the decoding parameters combined
final_settings = dict(
    max_new_tokens=50,
    do_sample=True,
    repetition_penalty=1.3,
    no_repeat_ngram_size=5,
    temperature=0.95,
    top_p=0.8,
    top_k=40,
    num_return_sequences=3,
)

with torch.no_grad():  # inference only, no gradients needed
  sample_outputs = model.generate(input_tokens, **final_settings)

# Print every returned sequence in its own box
for output in sample_outputs:
  print_result(output, tokenizer)

------------------------------------------------------------------------------------------------------------------------
| My favorite food is that we can eat together.                                                                        |
------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------
| My favorite food is iced tea.                                                                                        |
------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------
| My favorite food is iced tea. It's a great alternative to eating plain vanilla ice cream or regular sugar cookies    |
| because the sweetness and flav

## Using the best decoding parameters for other pre-trained models

Four transformer-based models are used: `GPT2`, `GPT2-xl`, `T5` and `XL-Net`. Text is generated with each model using decoding parameters based on the previous manual search.

In [48]:
# Beginning of the sentence that every model is asked to continue
input_text = "My favorite food is "

# Load each model together with its matching tokenizer
tokenizer_gpt2, model_gpt2 = initialize_model('gpt2', device)
tokenizer_gpt2xl, model_gpt2xl = initialize_model('gpt2-xl', device)
tokenizer_t5, model_t5 = initialize_model('t5', device)
tokenizer_xlnet, model_xlnet = initialize_model('xl-net', device)

# Encode the prompt separately with every tokenizer and move the ids to the device.
# The two GPT-2 tokenizers are called with padding=True, the XL-Net tokenizer
# without special tokens.
input_gpt2 = tokenizer_gpt2.encode(input_text, padding=True, return_tensors='pt').to(device)
input_gpt2xl = tokenizer_gpt2xl.encode(input_text, padding=True, return_tensors='pt').to(device)
input_t5 = tokenizer_t5.encode(input_text, return_tensors='pt').to(device)
input_xlnet = tokenizer_xlnet.encode(input_text, return_tensors="pt", add_special_tokens=False).to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### 1. **GPT2** Model

In [6]:
# Decoding parameters for the GPT2 model
gpt2_settings = dict(
    max_new_tokens=50,
    min_length=30,
    do_sample=True,
    repetition_penalty=1.3,
    no_repeat_ngram_size=5,
    temperature=0.95,
    top_p=0.8,
    top_k=100,
    num_return_sequences=1,
)

# Generate without gradient tracking, then decode and print every returned sequence
with torch.no_grad():
  output_gpt2 = model_gpt2.generate(input_gpt2, **gpt2_settings)

for output in output_gpt2:
  print_result(output, tokenizer_gpt2)

------------------------------------------------------------------------------------------------------------------------
| My favorite food is iced tea. It's something I really enjoy to have on hand, and it also has some good things going  |
| for you (this will be the most helpful part of any meal in my opinion).                                              |
------------------------------------------------------------------------------------------------------------------------


### 2. **GPT2-XL** Model

In [17]:
# Decoding parameters for the GPT2-xl model
gpt2xl_settings = dict(
    max_new_tokens=50,
    min_length=30,
    do_sample=True,
    repetition_penalty=1.3,
    no_repeat_ngram_size=5,
    temperature=0.95,
    top_p=0.8,
    top_k=100,
    num_return_sequences=1,
)

# Generate without gradient tracking, then decode and print every returned sequence
with torch.no_grad():
  output_gpt2xl = model_gpt2xl.generate(input_gpt2xl, **gpt2xl_settings)

for output in output_gpt2xl:
  print_result(output, tokenizer_gpt2xl)

------------------------------------------------------------------------------------------------------------------------
| My favorite food is iced tea (it was once my go-to dessert!) This recipe for Pumpkin Spice Cakes and Pancake         |
| Crumbles I made back in 2009, uses a little pumpkin spice.                                                           |
------------------------------------------------------------------------------------------------------------------------


### 3. **T5** Model

In [58]:
# Decoding parameters for the T5 model
t5_settings = dict(
    min_length=30,
    max_new_tokens=50,
    do_sample=True,
    repetition_penalty=1.3,
    no_repeat_ngram_size=5,
    temperature=0.95,
    top_p=0.8,
    top_k=100,
    num_return_sequences=1,
)

# Generate without gradient tracking, then decode and print every returned sequence
with torch.no_grad():
  output_t5 = model_t5.generate(input_t5, **t5_settings)

for output in output_t5:
  print_result(output, tokenizer_t5)

------------------------------------------------------------------------------------------------------------------------
| food. My favorite food is fish. My favorite food I am most fond of is sushi. I like to eat sushi all the time.       |
------------------------------------------------------------------------------------------------------------------------


### 4. **XL-Net** Model

In [66]:
# Decoding parameters for the XL-Net model (note: max_length here, not max_new_tokens)
xlnet_settings = dict(
    max_length=50,
    temperature=1.0,
    top_k=50,
    top_p=0.96,
    repetition_penalty=1.2,
    do_sample=True,
    num_return_sequences=1,
)

# Generate without gradient tracking, then decode and print every returned sequence
with torch.no_grad():
  output_xlnet = model_xlnet.generate(input_xlnet, **xlnet_settings)

for output in output_xlnet:
  print_result(output, tokenizer_xlnet)

------------------------------------------------------------------------------------------------------------------------
| My favorite food is as good and as a once be by when some how in all everything because of kind or back either which |
| the better it to get for you on any front no ones always back around again though up more time even if there has     |
------------------------------------------------------------------------------------------------------------------------


## Evaluating the quality of generated text using automated metric

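The **perplexity** metric is used to evaluate the text generated by the models. It is defined as the exponentiated average negative log-likelihood of a sequence of tokens: $\mathrm{PPL}(X) = \exp\left(-\frac{1}{t}\sum_{i=1}^{t}\log p_\theta(x_i \mid x_{<i})\right)$. Because every token's negative log-likelihood is non-negative, the metric lies in $[1, \infty)$. A lower score is better.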




In [50]:
# Initialize the perplexity metric.
# `load` is imported from the `evaluate` package at the top of the notebook;
# module_type="metric" explicitly requests the metric module named "perplexity".
perplexity_metric = load("perplexity", module_type="metric")

Downloading builder script:   0%|          | 0.00/8.46k [00:00<?, ?B/s]

### 1. **GPT2 Perplexity**

In [51]:
# Score the GPT2 generation: decode it, then compute its perplexity with 'gpt2'
# as the scoring model
decoded_gpt2 = decode_result(output_gpt2[0], tokenizer_gpt2)

perplexity_gpt2 = perplexity_metric.compute(
    model_id='gpt2',
    predictions=[decoded_gpt2],
)['perplexities'][0]  # one prediction in, so take the single score

# Report the prompt, the boxed generation and its score
print(f'User Input: {input_text}')
print('Generated Output:')
print_result(output_gpt2[0], tokenizer_gpt2)
print(f"Perplexity: {perplexity_gpt2}")

  0%|          | 0/1 [00:00<?, ?it/s]

User Input: My favorite food is 
Generated Output:
------------------------------------------------------------------------------------------------------------------------
| My favorite food is iced tea. It's something I really enjoy to have on hand, and it also has some good things going  |
| for you (this will be the most helpful part of any meal in my opinion).                                              |
------------------------------------------------------------------------------------------------------------------------
Perplexity: 16.853282928466797


### 2. **GPT2-XL Perplexity**

In [None]:
# Score the GPT2-xl generation: decode it, then compute its perplexity with
# 'gpt2-xl' as the scoring model
decoded_gpt2xl = decode_result(output_gpt2xl[0], tokenizer_gpt2xl)

perplexity_gpt2xl = perplexity_metric.compute(
    model_id='gpt2-xl',
    predictions=[decoded_gpt2xl],
)['perplexities'][0]  # one prediction in, so take the single score

# Report the prompt, the boxed generation and its score
print(f'User Input: {input_text}')
print('Generated Output:')
print_result(output_gpt2xl[0], tokenizer_gpt2xl)
print(f"Perplexity: {perplexity_gpt2xl}")

### 3. **T5 Perplexity**

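The perplexity score is not compatible with the T5 model: the metric used here loads its scoring model as a causal language model, whereas T5 is an encoder-decoder model.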

### 4. **XL-Net Perplexity**

In [67]:
# Score the XL-Net generation: decode it, then compute its perplexity with
# 'xlnet-large-cased' as the scoring model
decoded_xlnet = decode_result(output_xlnet[0], tokenizer_xlnet)

perplexity_xlnet = perplexity_metric.compute(
    model_id='xlnet-large-cased',
    predictions=[decoded_xlnet],
)['perplexities'][0]  # one prediction in, so take the single score

# Report the prompt, the boxed generation and its score
print(f'User Input: {input_text}')
print('Generated Output:')
print_result(output_xlnet[0], tokenizer_xlnet)
print(f"Perplexity: {perplexity_xlnet}")

  0%|          | 0/1 [00:00<?, ?it/s]

User Input: My favorite food is 
Generated Output:
------------------------------------------------------------------------------------------------------------------------
| My favorite food is as good and as a once be by when some how in all everything because of kind or back either which |
| the better it to get for you on any front no ones always back around again though up more time even if there has     |
------------------------------------------------------------------------------------------------------------------------
Perplexity: 561.1319580078125
