In [None]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, set_seed

In [None]:
# Hub identifier of the dialogue-summarization dataset used throughout the notebook.
dataset_name = "knkarthick/dialogsum"
# Later cells index the result by split name (e.g. dataset['test']) and then by row.
dataset = load_dataset(path=dataset_name)

In [None]:
# Rows of the test split that are shown as examples here and in the cells below.
example_indices = [40, 200]
# Horizontal separator used by every printing cell: 99 dashes.
dash_line = '-' * 99

for i, index in enumerate(example_indices):
    # Take the dialogue and its reference summary from the SAME record of the
    # test split; mixing splits would pair a dialogue with an unrelated summary.
    example = dataset['test'][index]

    print(dash_line)
    print(f"Example {i+1}")
    print(dash_line)
    print("INPUT DIALOGUE:")
    print(example['dialogue'])
    print(dash_line)
    print("BASELINE SUMMARY:")
    print(example['summary'])
    print(dash_line)
    print()

In [None]:
# One checkpoint name drives both loads so the tokenizer always matches the model.
model_name = 'google/flan-t5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Sanity check of the tokenizer: text -> token ids (PyTorch tensors) -> text.
sentence = "What is the capital of India?"

sentence_encoded = tokenizer(sentence, return_tensors='pt')
# The encoding is batched; row 0 is the only sequence we passed in.
first_sequence_ids = sentence_encoded['input_ids'][0]
sentence_decoded = tokenizer.decode(first_sequence_ids, skip_special_tokens=True)

print("Encoded: {}".format(sentence_encoded))
print("Decoded: {}".format(sentence_decoded))

In [None]:
# Baseline run: the raw dialogue is the whole model input, with no instruction around it.
for example_number, index in enumerate(example_indices, start=1):
    record = dataset['test'][index]
    dialogue = record['dialogue']
    summary = record['summary']

    inputs = tokenizer(dialogue, return_tensors='pt')
    # generate() returns a batch of sequences; we sent one input, so keep row 0.
    generated_ids = model.generate(inputs['input_ids'], max_new_tokens=50)
    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    print(dash_line)
    print(f"Example {example_number}")
    print(dash_line)
    print(f"INPUT PROMPT:\n{dialogue}")
    print(dash_line)
    print(f"BASELINE SUMMARY:\n{summary}")
    print(dash_line)
    print(f"MODEL GENERATED SUMMARY:\n{output}")

In [None]:
# Zero-shot run: wrap each dialogue in an explicit "Summarize ..." instruction.
for example_number, index in enumerate(example_indices, start=1):
    record = dataset['test'][index]
    dialogue = record['dialogue']
    summary = record['summary']

    prompt = f"""
    Summarize the following dialogue:
    {dialogue}
    
    Summary:
    """

    inputs = tokenizer(prompt, return_tensors='pt')
    # generate() returns a batch of sequences; we sent one prompt, so keep row 0.
    generated_ids = model.generate(inputs['input_ids'], max_new_tokens=50)
    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    # Only the dialogue is echoed under "INPUT PROMPT", not the instruction wrapper.
    print(dash_line)
    print(f"Example {example_number}")
    print(dash_line)
    print(f"INPUT PROMPT:\n{dialogue}")
    print(dash_line)
    print(f"BASELINE SUMMARY:\n{summary}")
    print(dash_line)
    print(f"MODEL GENERATED SUMMARY:\n{output}")

In [None]:
# Zero-shot run with an alternative template: dialogue first, then a question.
for position, index in enumerate(example_indices):
    test_row = dataset['test'][index]
    dialogue, summary = test_row['dialogue'], test_row['summary']

    prompt = f"""
    Dialogue:
    {dialogue}
    
    What was going on?
    """

    inputs = tokenizer(prompt, return_tensors='pt')
    # Decode the single generated sequence (row 0 of the returned batch).
    generated = model.generate(inputs['input_ids'], max_new_tokens=50)
    output = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Only the dialogue is echoed under "INPUT PROMPT", not the full template.
    for section in (
        f"Example {position + 1}",
        f"INPUT PROMPT:\n{dialogue}",
        f"BASELINE SUMMARY:\n{summary}",
        f"MODEL GENERATED SUMMARY:\n{output}",
    ):
        print(dash_line)
        print(section)

In [None]:
def make_prompt(example_indices_full, example_index_to_summarize):
    """Build a one-shot / few-shot prompt from rows of the test split.

    Each index in ``example_indices_full`` contributes one solved example
    (dialogue, the question "What was going on?", and its reference summary).
    The dialogue at ``example_index_to_summarize`` is then appended exactly
    once, followed by the same question and no answer, so the model is left
    to complete it.

    Reads the notebook-level ``dataset`` variable.

    Args:
        example_indices_full: Iterable of row indices in ``dataset['test']``
            to use as solved examples. May be empty, in which case the result
            is a zero-shot prompt for the target dialogue.
        example_index_to_summarize: Row index in ``dataset['test']`` of the
            dialogue the model should summarize.

    Returns:
        The assembled prompt string.
    """
    prompt = ""
    for index in example_indices_full:
        example = dataset['test'][index]
        dialogue = example['dialogue']
        summary = example['summary']

        prompt += f"""
        Dialogue:
        {dialogue}
        
        What was going on?
        {summary}
        
        """

    # The unanswered target goes AFTER all solved examples, once. Appending it
    # inside the loop would interleave it with every example and would produce
    # an empty prompt when no examples are given.
    # The text keeps the same indentation as the example blocks so the template
    # is uniform across the whole prompt.
    dialogue = dataset['test'][example_index_to_summarize]['dialogue']
    prompt += f"""
        Dialogue:
        {dialogue}
        
        What was going on?
        """
    return prompt

In [None]:
# One-shot setup: row 40 is the single solved example, row 200 is the dialogue to summarize.
example_index_to_summarize = 200
example_indices_full = [40]

one_shot_prompt = make_prompt(
    example_indices_full=example_indices_full,
    example_index_to_summarize=example_index_to_summarize,
)

print(one_shot_prompt)

In [None]:
# Run the one-shot prompt and compare against the reference summary of the target row.
summary = dataset['test'][example_index_to_summarize]['summary']

inputs = tokenizer(one_shot_prompt, return_tensors='pt')
# generate() returns a batch of sequences; decode the only one (row 0).
generated_ids = model.generate(inputs['input_ids'], max_new_tokens=50)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print(dash_line)
print(f"BASELINE SUMMARY:\n{summary}")
print(dash_line)
print(f"MODEL GENERATED SUMMARY:\n{output}")

### Few-Shot inference

In [None]:
# Few-shot setup: four solved examples precede the dialogue to summarize (row 200).
example_index_to_summarize = 200
example_indices_full = [20, 40, 80, 120]

few_shot_prompt = make_prompt(
    example_indices_full=example_indices_full,
    example_index_to_summarize=example_index_to_summarize,
)

print(few_shot_prompt)

In [None]:
# Run the few-shot prompt and compare against the reference summary of the target row.
summary = dataset['test'][example_index_to_summarize]['summary']

inputs = tokenizer(few_shot_prompt, return_tensors='pt')
# generate() returns a batch of sequences; decode the only one (row 0).
generated_ids = model.generate(inputs['input_ids'], max_new_tokens=50)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print(dash_line)
print(f"BASELINE SUMMARY:\n{summary}")
print(dash_line)
print(f"MODEL GENERATED SUMMARY:\n{output}")

### Testing the inference parameters

In [None]:
# Sampling is stochastic: seed the RNGs right before generating so that this
# cell prints the same summary on every Restart & Run All.
SEED = 42
set_seed(SEED)

# Sampling-based decoding; edit these values to explore their effect on the output.
generation_config = GenerationConfig(
    max_new_tokens=50,
    do_sample=True,
    top_k=3,
    top_p=0.1,
    temperature=0.8,
)

summary = dataset['test'][example_index_to_summarize]['summary']

inputs = tokenizer(few_shot_prompt, return_tensors='pt')
# generate() returns a batch of sequences; decode the only one (row 0).
generated_ids = model.generate(inputs['input_ids'], generation_config=generation_config)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print(dash_line)
print("BASELINE SUMMARY:\n{}".format(summary))
print(dash_line)
print("MODEL GENERATED SUMMARY:\n{}".format(output))