# Generative LLMs: Decoder Models



**References:**
 - Standard language generation: https://huggingface.co/blog/how-to-generate
 - Constrained language generation: https://huggingface.co/blog/constrained-beam-search


In [18]:
import os
import sys

import transformers
from transformers import GenerationConfig, AutoTokenizer, AutoModel, utils, BartForConditionalGeneration 
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

utils.logging.set_verbosity_error()  # Remove line to see warnings

def cuda_info():
    print()
    print("cuda.is_available: \t", torch.cuda.is_available())
    if torch.cuda.is_available():
        print("cuda.device_count: \t", torch.cuda.device_count())
        print("cuda.current_device: \t", torch.cuda.current_device())
        print("cuda.device: \t\t", torch.cuda.device(torch.cuda.current_device()))
        print()
        print("cuda.get_device_name: \t", torch.cuda.get_device_name(torch.cuda.current_device()))
        print("total memory: \t\t", torch.cuda.get_device_properties(0).total_memory)
        print("reserved memory:\t", torch.cuda.memory_reserved(0))
        print("allocated memory:\t", torch.cuda.memory_allocated(0))


    device = "cuda:" + str(torch.cuda.current_device()) if torch.cuda.is_available() else "cpu"
    print()
    print("device name: \t\t", device)
    print("transformers: \t\t", transformers.__version__)
    print("pytorch: \t\t", torch.__version__)
    
def decode_and_print(model, config, sentence):

    encoded_input_ids_1 = tokenizer(sentence, return_tensors="pt", add_special_tokens=False).input_ids.to(device)
    
    with torch.no_grad():
        generation_output = model.generate(
            input_ids = encoded_input_ids_1,
            generation_config = generation_config,
            return_dict_in_generate = True,
            output_scores = True
        )

    for s in generation_output.sequences:
        output = tokenizer.decode(s, skip_special_tokens=True)
        print(output)
        

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

cuda_info()


cuda.is_available: 	 True
cuda.device_count: 	 1
cuda.current_device: 	 0
cuda.device: 		 <torch.cuda.device object at 0x7f0a4ecff560>

cuda.get_device_name: 	 NVIDIA GeForce RTX 3050 Ti Laptop GPU
total memory: 		 4294508544
reserved memory:	 3388997632
allocated memory:	 3218541056

device name: 		 cuda:0
transformers: 		 4.47.0
pytorch: 		 2.6.0+cu124


# Decoder models

## DialogGPT

https://huggingface.co/microsoft/DialoGPT-large


In [47]:
model_name = "microsoft/DialoGPT-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
cuda_info()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



cuda.is_available: 	 True
cuda.device_count: 	 1
cuda.current_device: 	 0
cuda.device: 		 <torch.cuda.device object at 0x7f0ab79f23c0>

cuda.get_device_name: 	 NVIDIA GeForce RTX 3050 Ti Laptop GPU
total memory: 		 4294508544
reserved memory:	 6794772480
allocated memory:	 8519680

device name: 		 cuda:0
transformers: 		 4.47.0
pytorch: 		 2.6.0+cu124


In [56]:
# Let's chat for 5 lines
chat_history_ids = None
user_input = ["Hello, how are you?", "I'm ok too. Today is a good day.", "What will you be doing today?", "I like to watch movies with my friends.", "Was Shakespear a good writer?"]
for step in user_input:
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(step + tokenizer.eos_token, return_tensors='pt').to(device)

    # append the new user input tokens to the chat history
    bot_input_ids = new_user_input_ids if chat_history_ids is None else torch.cat([chat_history_ids, new_user_input_ids], dim=-1).to(device)

    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)

    # pretty print last ouput tokens from bot
    print("User     : {}".format(step))
    print("DialoGPT : {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))


User     : Hello, how are you?
DialoGPT : I'm good, you?
User     : I'm ok too. Today is a good day.
DialoGPT : Good to hear.
User     : What will you be doing today?
DialoGPT : I'm going to be doing nothing.
User     : I like to watch movies with my friends.
DialoGPT : That sounds fun.
User     : Was Shakespear a good writer?
DialoGPT : He was a great writer.


## BART

In [21]:
# Initialize tokenizer and model. Be sure to set output_attentions=True.
# Load BART fine-tuned for summarization on CNN/Daily Mail dataset
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name, output_attentions=True).to(device)
cuda_info()




cuda.is_available: 	 True
cuda.device_count: 	 1
cuda.current_device: 	 0
cuda.device: 		 <torch.cuda.device object at 0x7f0ab792ff50>

cuda.get_device_name: 	 NVIDIA GeForce RTX 3050 Ti Laptop GPU
total memory: 		 4294508544
reserved memory:	 6595543040
allocated memory:	 1633885184

device name: 		 cuda:0
transformers: 		 4.47.0
pytorch: 		 2.6.0+cu124


# Decoding Strategies


## Decoding parameters and example

In [22]:
generation_config = model.generation_config

generation_config.temperature = 0.4
generation_config.top_p = 0.8
generation_config.top_k = 10
generation_config.num_beams = 4
generation_config.max_new_tokens = 150

print(generation_config)


GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "max_new_tokens": 150,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "output_attentions": true,
  "pad_token_id": 1,
  "temperature": 0.4,
  "top_k": 10,
  "top_p": 0.8
}



In [23]:

# create ids of encoded input vectors
sentence = 'The House Budget Committee passed a spending bill.'

decode_and_print(model, generation_config, sentence)




House Budget Committee passes a spending bill. House Budget Committee passed a spending bills. House budget committee passed a bill to fund the government. The spending bill was passed by the House of Representatives. The Senate will vote on the spending bill later this month. The bill is expected to be approved by the Senate on Thursday.


## Greedy Decoding

In [24]:
generation_config = model.generation_config
generation_config.do_sample = False
generation_config.num_beams = 1
generation_config.max_new_tokens = 150


In [25]:
sentence = 'The House Budget Committee passed a spending bill.'

decode_and_print(model, generation_config, sentence)




House Budget Committee passed a spending bill. House Budget Committee passing a spendingBill. House budget Committee passed spending bill, passed bill. Bill passed by House Budget committee. House passed spending Bill. House passes spending bill; bill passed by Senate. House votes on bill. Senate votes on spending bill and passes bill.


## Sampling

### Multinomial Sampling

### Top-k Sampling

In [26]:
sentence = 'The House Budget Committee passed a spending bill.'

generation_config = model.generation_config
generation_config.do_sample = True
generation_config.num_beams = 1
generation_config.temperature = 1

print(generation_config)


GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "do_sample": true,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "max_new_tokens": 150,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "output_attentions": true,
  "pad_token_id": 1,
  "top_k": 10,
  "top_p": 0.8
}



In [27]:
for n in range(1,6):
    
    print("## Top k ", n*10)
    generation_config.top_k = n*10
    decode_and_print(model, generation_config, sentence)
    print()


## Top k  10
House Budget Committee passed a spending bill. House Budget Committee pass a spending measure. House budget committee passed a bill to fund the government. House Speaker John Boehner says the bill is a way of funding the government and doesn't mean a huge increase. The spending bill would fund the U.S. government for the next five years.

## Top k  20
House Budget Committee passed a spending bill. House Budget Committee Passed a spending Act. House passed a bill to fund the government. House spent $1.1 billion on the budget. House budget committee voted down $200 million for the military. House vote on a spending plan to fund government.

## Top k  30
House Budget Committee passed a spending bill. House Budget CommitteePassed a spending legislation. House has yet to pass a budget. It would be the first time the House has passed a budget since 2010. It was the first vote since a failed effort in the spring to pass the spending bill in 2010.

## Top k  40
House Budget Commit

### Top-p sampling

In [28]:
sentence = 'The House Budget Committee passed a spending bill.'

generation_config = model.generation_config
generation_config.do_sample = True
generation_config.num_beams = 1
generation_config.temperature = 1

print(generation_config)


GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "do_sample": true,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "max_new_tokens": 150,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "output_attentions": true,
  "pad_token_id": 1,
  "top_p": 0.8
}



In [29]:
for n in range(1,6):
    generation_config.top_p = 0.2*n-0.05
    print("## Top p ", generation_config.top_p)
    decode_and_print(model, generation_config, sentence)
    print()


## Top p  0.15000000000000002
House Budget Committee passed a spending bill. House Budget Committee passing a spendingBill. House budget Committee passed spending bill, passed bill. Bill passed by House Budget committee. House passed spending Bill. House passes spending bill; bill passed by Senate. House pass spending bill with no amendments.

## Top p  0.35000000000000003
House Budget Committee passed a spending bill. House Budget Committee passing a spending bills. House budget committee passed a bill. Senate Budget Committee passes a spendingBill. House passes a bill to spend money. House passed a Spending Bill. House votes on the bill. Bill passes.

## Top p  0.55
House Budget Committee passed a spending bill. House Budget Committee Passed a spending Bill. House budget committee passed a bill. Senate Budget Committee passes a spending measure. Senate budget committee passes a bill with spending provisions. Senate passed a spend bill. The House budget bill passed. The Senate budget 

### Return sequences

In [30]:
sentence = 'The House Budget Committee passed a spending bill.'

encoded_input_ids_1 = tokenizer(sentence, return_tensors="pt", add_special_tokens=False).input_ids.to(device)

with torch.no_grad():
    generation_output = model.generate(
        input_ids = encoded_input_ids_1,
        num_return_sequences=5, 
        generation_config = generation_config,
        return_dict_in_generate = True,
        output_scores = True
    )

for s in generation_output.sequences:
    print("Output: ")
    output = tokenizer.decode(s, skip_special_tokens=True)
    print(output)
    print()


Output: 
House Budget Committee passed a spending bill, and House Budget Committee also passed the Senate version. House Budget House passed spending bill. House Speaker Paul Ryan passes a spending measure. House has passed a budget. Speaker Ryan says they're going to pass a bill. In exchange, they say it's likely the President will increase the deficit by a few billion dollars.

Output: 
House Budget Committee passed a spending bill. House Budget Committee pass a spendingBill. House Speaker John Boehner votes for spending bill to pass. House and Senate committees consider budgets for the fiscal year 2014-15, 2015-16. First approval of legislation necessary. Next item: Second passing a spending Bill.

Output: 
House Budget Committee passed spending bill. House Budget Committeepassed bill with majority vote. House could pass new spending bill next week. House Speaker John Boehner says bill is just a start. They are all waiting for more money. That bill may be introduced again. They have

## Beam Search

In [31]:
sentence = 'The House Budget Committee passed a spending bill.'

generation_config = model.generation_config
generation_config.do_sample = False
generation_config.num_beams = 1



In [32]:
for n in range(1,6):

    print("## Beam size of ", n)
    generation_config.num_beams = n
    decode_and_print(model, generation_config, sentence)
    print()


## Beam size of  1




House Budget Committee passed a spending bill. House Budget Committee passing a spendingBill. House budget Committee passed spending bill, passed bill. Bill passed by House Budget committee. House passed spending Bill. House passes spending bill; bill passed by Senate. House votes on bill. Senate votes on spending bill and passes bill.

## Beam size of  2
House Budget Committee passed a spending bill. House Budget Committee passing a bill to fund the government. House budget committee passed a bill that would fund the U.S. government through 2018. House passed a budget bill that will fund the country's government through 2019. The bill was passed by the House of Representatives and the Senate.

## Beam size of  3
House Budget Committee passes a spending bill. House Budget Committee passed a spending bills. House budget committee passed a bill to fund the government. The bill was passed by the House of Representatives. The Senate will vote on the bill later this month. The spending bill

# Decoding with Constraints



In [33]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import transformers
import torch 

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

device = "cpu"

def cuda_info():
    print()
    print("cuda.is_available: \t", torch.cuda.is_available())
    if torch.cuda.is_available():
        print("cuda.device_count: \t", torch.cuda.device_count())
        print("cuda.current_device: \t", torch.cuda.current_device())
        print("cuda.device: \t\t", torch.cuda.device(torch.cuda.current_device()))
        print()
        print("cuda.get_device_name: \t", torch.cuda.get_device_name(torch.cuda.current_device()))
        print("total memory: \t\t", torch.cuda.get_device_properties(0).total_memory)
        print("reserved memory:\t", torch.cuda.memory_reserved(0))
        print("allocated memory:\t", torch.cuda.memory_allocated(0))


    device = "cuda:" + str(torch.cuda.current_device()) if torch.cuda.is_available() else "cpu"
    print()
    print("device name: \t\t", device)
    print("transformers: \t\t", transformers.__version__)
    print("pytorch: \t\t", torch.__version__)

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

cuda_info()

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]


cuda.is_available: 	 True
cuda.device_count: 	 1
cuda.current_device: 	 0
cuda.device: 		 <torch.cuda.device object at 0x7f0a80763260>

cuda.get_device_name: 	 NVIDIA GeForce RTX 3050 Ti Laptop GPU
total memory: 		 4294508544
reserved memory:	 6794772480
allocated memory:	 178054144

device name: 		 cuda:0
transformers: 		 4.47.0
pytorch: 		 2.6.0+cu124


## Repetitions and word lists
### n-gram Repetitions

In [34]:
sentence = 'The House Budget Committee passed a spending bill'

encoded_input_ids_1 = tokenizer(sentence, return_tensors="pt", add_special_tokens=False).input_ids.to(device)

with torch.no_grad():
    generation_output = model.generate(
        input_ids = encoded_input_ids_1,
        no_repeat_ngram_size=1,
        return_dict_in_generate = True,
        output_scores = True
    )

for s in generation_output.sequences:
    print("Output: ")
    output = tokenizer.decode(s, skip_special_tokens=True)
    print(output)
    print()


Output: 
The House Budget Committee passed a spending bill on Thursday that would cut the deficit by $1.3 trillion over 10 years, or about 1



### Force words and bad words


In [35]:
sentence = 'The soldiers'
input_ids = tokenizer(sentence, return_tensors="pt", add_special_tokens=False).input_ids.to(device)

## Forced words
force_disjunctive = ["day two", "day one"]
force_phrasal = "leave now or die"

force_words_ids = [ tokenizer(force_disjunctive, add_special_tokens=False).input_ids,
                    tokenizer(force_phrasal, add_special_tokens=False).input_ids
                  ]

print("## Force word ids:")
for word_ids in force_words_ids:
    if isinstance(word_ids[0], list):
        print("  DisjunctiveConstraint: ", word_ids)
    else:
        print("  PhrasalConstraint: ", word_ids)


## Force word ids:
  DisjunctiveConstraint:  [[820, 734], [820, 530]]
  PhrasalConstraint:  [47408, 783, 393, 4656]


In [36]:
## Bad words
bad_words_set = ["whom", "year"]
bad_words_ids = tokenizer(bad_words_set, add_special_tokens=False).input_ids

print("## Bad word ids:")
for word_ids in bad_words_ids:
    if isinstance(word_ids[0], list):
        print("DisjunctiveConstraint: ", word_ids)
    else:
        print("PhrasalConstraint: ", word_ids)


## Bad word ids:
PhrasalConstraint:  [1929, 296]
PhrasalConstraint:  [1941]


In [37]:

generation_output = model.generate(
    input_ids = input_ids,
    force_words_ids=force_words_ids,
    bad_words_ids=bad_words_ids,
    num_beams = 10,
    num_return_sequences=1,
    no_repeat_ngram_size=6,
    remove_invalid_values=True,
    output_scores = True
)

for s in generation_output:
    print("## Output: ")
    output = tokenizer.decode(s, skip_special_tokens=True)
    print(output)
    print()




## Output: 
The soldiers in the field were not the only ones who were injured.

day twoleave now or die



## Constraints



### Phrasal Constraint

In [38]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, PhrasalConstraint

#tokenizer = AutoTokenizer.from_pretrained("t5-base")
#model = AutoModelForSeq2SeqLM.from_pretrained("t5-base").to(device)

encoder_input_str = "The soldiers"
input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids.to(device)


force_flexible_set = 'at the base'
tk_list = tokenizer(force_flexible_set, add_special_tokens=False).input_ids

constraints = [
    PhrasalConstraint(tk_list)
]

outputs = model.generate(
    input_ids,
    constraints=constraints,
    num_beams=10,
    num_return_sequences=1,
    no_repeat_ngram_size=5,
    max_length = 30,
    remove_invalid_values=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
The soldiers, who had been stationed at the base, had been ordered to leave the area.

The soldiers, who were stationedat the base


### Disjunctive Constraints

In [39]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, PhrasalConstraint, DisjunctiveConstraint

encoder_input_str = "The soldiers"
input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids.to(device)

force_words_set1 = [" stationed", "night"]
words_ids_set1 = tokenizer(force_words_set1, add_special_tokens=False).input_ids
print(words_ids_set1)

constraints = [
    DisjunctiveConstraint(words_ids_set1)
]


[[25967], [3847]]


In [40]:
tokenizer.convert_ids_to_tokens(25967)

'Ġstationed'

In [41]:
outputs = model.generate(
    input_ids,
    constraints=constraints,
    num_beams=10,
    num_return_sequences=1,
    max_length = 30,
    no_repeat_ngram_size=6,
    remove_invalid_values=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
The soldiers, who had been stationed at the base, were taken to a nearby hospital, where they were treated for minor injuries and released.




### List of Constraints

In [42]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, PhrasalConstraint, DisjunctiveConstraint

# The prompt
encoder_input_str = "The soldiers"
input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids.to(device)

# First constraint
force_words_set1 = [" stationed", "in the field"]
words_ids_set1 = tokenizer(force_words_set1, add_special_tokens=False).input_ids
constraint_1 = DisjunctiveConstraint(words_ids_set1)

print()
print(force_words_set1)
print(constraint_1.trie.trie)

# Second constraint
force_words_set2 = [" hospital"]
words_ids_set2 = tokenizer(force_words_set2, add_special_tokens=False).input_ids
constraint_2 = DisjunctiveConstraint(words_ids_set2)

print()
print(force_words_set2)
print(constraint_2.trie.trie)

# Third constraint
force_flexible_set = " at the battle"
phrasal_constraints = tokenizer(force_flexible_set, add_special_tokens=False).input_ids
constraint_3 = PhrasalConstraint(phrasal_constraints)

print()
print(force_flexible_set)
print(constraint_3.token_ids)

# The list of constraints
constraints = [ constraint_1, constraint_2,constraint_3 ]



[' stationed', 'in the field']
{25967: {}, 259: {262: {2214: {}}}}

[' hospital']
{4436: {}}

 at the battle
[379, 262, 3344]


In [43]:
outputs = model.generate(
    input_ids,
    constraints=constraints,
    num_beams=10,
    num_return_sequences=1,
    max_length = 30,
    no_repeat_ngram_size=5,
    remove_invalid_values=True
)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
The soldiers stationed at the base were not allowed to leave the base until the end of the war.

"We were told at the battle hospital
