# Dialogue Summarization With Flan T5 Base LLM

# 1.0 Installations and Libraries

In [1]:
!pip install --upgrade pip
!pip install --disable-pip-version-check
!pip install torch==1.13.1
!pip install torchdata==0.5.1 --quiet

!pip install transformers==4.27.2
!pip install datasets==2.11.0 --quiet

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
[31mERROR: You must give at least one requirement to install (see "pip help install")[0m[31m
[0mLooking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting torch==1.13.1
  Downloading torch-1.13.1-cp310-cp310-manylinux1_x86_64.whl (887.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m?[0m eta [36m0:00:00[0m[36m0:00:01[0m00:02[0m
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==1.13.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m73.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cudnn-cu11==8.5.0.96 (from torch==1.13.1)
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl (557.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/557.

In [2]:
from datasets import load_dataset
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig

# 2.0 Summarizing Dialogue Without Prompt Engineering

## 2.1 Loading Dialogue Dataset

In [3]:
# Load the "knkarthick/dialogsum" dialogue dataset (downloaded on first use, then reused from the local cache).
# The cells below read its "test" split and the "dialogue" / "summary" fields of each example.
dataset = load_dataset("knkarthick/dialogsum")

Downloading readme:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

Downloading and preparing dataset csv/knkarthick--dialogsum to /home/ec2-user/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-c8fac5d84cd35861/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/ec2-user/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-c8fac5d84cd35861/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Show a few test-set dialogues next to their baseline summaries.
example = [5, 10, 15, 20]

# Separator of 99 dashes (joining 100 empty strings with "-" yields exactly this).
dash_line = "-" * 99

for i, index in enumerate(example):
    record = dataset["test"][index]
    # One print call, one item per output line; the trailing "" gives the blank spacer line.
    print(
        dash_line,
        f"Example  {i + 1}",
        dash_line,
        "DIALOGUE:",
        record["dialogue"],
        dash_line,
        "BASELINE SUMMARY:",
        record["summary"],
        dash_line,
        "",
        sep="\n",
    )

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
DIALOGUE:
#Person1#: You're finally here! What took so long?
#Person2#: I got stuck in traffic again. There was a terrible traffic jam near the Carrefour intersection.
#Person1#: It's always rather congested down there during rush hour. Maybe you should try to find a different route to get home.
#Person2#: I don't think it can be avoided, to be honest.
#Person1#: perhaps it would be better if you started taking public transport system to work.
#Person2#: I think it's something that I'll have to consider. The public transport system is pretty good.
#Person1#: It would be better for the environment, too.
#Person2#: I know. I feel bad about how much my car is adding to the pollution problem in this city.
#Person1#: Taking the subway would be a lot less stressful than driving as wel

## 2.2 Loading T5 Model

In [5]:
# Checkpoint identifier; the tokenizer cell reuses it so model and tokenizer come from the same checkpoint.
model_name = "google/flan-t5-base"
# Load the pretrained sequence-to-sequence model for that checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

## 2.3 Loading Tokenizer

In [6]:
# Load the tokenizer that belongs to the same checkpoint as the model (model_name);
# use_fast = True requests the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast = True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [7]:
# Round-trip a short sentence through the tokenizer to see what it produces.
example_sentence = "This is an example"

# Text -> token ids, returned as PyTorch tensors.
encoded_example_sentence = tokenizer(example_sentence, return_tensors="pt")
# First (and only) row of the batch of token ids.
example_token_ids = encoded_example_sentence["input_ids"][0]

# Token ids -> text, leaving out special tokens.
decoded_example_sentence = tokenizer.decode(example_token_ids, skip_special_tokens=True)

print(f"Example Sentence: {example_sentence}")
print(f"Encoded Sentence: {example_token_ids}")
print(f"Decoded Sentence: {decoded_example_sentence}")

Example Sentence: This is an example
Encoded Sentence: tensor([100,  19,  46, 677,   1])
Decoded Sentence: This is an example


## 2.4 Generate Summaries

In [10]:
# Baseline run: each raw dialogue goes to the model as-is, with no instruction around it.

for i, index in enumerate(example):
    record = dataset["test"][index]
    dialogue = record["dialogue"]
    summary = record["summary"]

    # Tokenize the raw dialogue into PyTorch tensors.
    inputs = tokenizer(dialogue, return_tensors="pt")
    # Generate at most 50 new tokens and take the first returned sequence.
    generated_ids = model.generate(inputs["input_ids"], max_new_tokens=50)[0]
    # Convert the generated token ids back to text without special tokens.
    output = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Report the input, the baseline summary and the model's summary.
    print(dash_line)
    print(f"Example  {i + 1}")
    print(dash_line)
    print(f"INPUT DIALOGUE: {dialogue}")
    print(dash_line)
    print(f"BASELINE SUMMARY: {summary}")
    print(dash_line)
    print(f"GENERATED SUMMARY: {output}")

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT DIALOGUE: #Person1#: You're finally here! What took so long?
#Person2#: I got stuck in traffic again. There was a terrible traffic jam near the Carrefour intersection.
#Person1#: It's always rather congested down there during rush hour. Maybe you should try to find a different route to get home.
#Person2#: I don't think it can be avoided, to be honest.
#Person1#: perhaps it would be better if you started taking public transport system to work.
#Person2#: I think it's something that I'll have to consider. The public transport system is pretty good.
#Person1#: It would be better for the environment, too.
#Person2#: I know. I feel bad about how much my car is adding to the pollution problem in this city.
#Person1#: Taking the subway would be a lot less stressful than driving 

# 3.0 Summarizing Dialogue With An Instruction Prompt

## 3.1 Zero Shot Inference With An Instruction Prompt

Take the dialogue and convert it into an instruction prompt. Wrap the dialogue in a descriptive instruction. The following is one of the pre-built prompts for this particular language model.

In [12]:
# Zero-shot run: same loop as the baseline, but each dialogue is wrapped in an instruction prompt.

for i, index in enumerate(example):
    record = dataset["test"][index]
    dialogue = record["dialogue"]
    summary = record["summary"]

    # Instruction template, spelled out line by line so the whitespace-only lines
    # and the trailing indentation of the prompt stay exactly as they are.
    prompt = (
        "\n"
        "    Dialogue:\n"
        "    \n"
        "    {}\n"
        "    \n"
        "    What was going on?\n"
        "    "
    ).format(dialogue)

    # Tokenize the full prompt into PyTorch tensors.
    inputs = tokenizer(prompt, return_tensors="pt")
    # Generate at most 50 new tokens and take the first returned sequence.
    generated_ids = model.generate(inputs["input_ids"], max_new_tokens=50)[0]
    # Convert the generated token ids back to text without special tokens.
    output = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Report the prompt, the baseline summary and the model's zero-shot summary.
    print(dash_line)
    print(f"Example  {i + 1}")
    print(dash_line)
    print(f"INPUT PROMPT: {prompt}")
    print(dash_line)
    print(f"BASELINE SUMMARY: {summary}")
    print(dash_line)
    print(f"GENERATED SUMMARY - ZERO SHOT: {output}")

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT PROMPT: 
    Dialogue:
    
    #Person1#: You're finally here! What took so long?
#Person2#: I got stuck in traffic again. There was a terrible traffic jam near the Carrefour intersection.
#Person1#: It's always rather congested down there during rush hour. Maybe you should try to find a different route to get home.
#Person2#: I don't think it can be avoided, to be honest.
#Person1#: perhaps it would be better if you started taking public transport system to work.
#Person2#: I think it's something that I'll have to consider. The public transport system is pretty good.
#Person1#: It would be better for the environment, too.
#Person2#: I know. I feel bad about how much my car is adding to the pollution problem in this city.
#Person1#: Taking the subway would be a lot less s

# 4.0 Summarizing Dialogue with One Shot Inference

**One-shot** inference means giving the LLM a single example prompt-response pair for the task you want solved, followed by the new input you want it to complete. Providing examples inside the prompt like this is called "**In-Context Learning**".

## 4.1 One Shot Inference

In [15]:
def make_prompt(examples_full, examples_to_summarize, split=None):
    """
    Build an in-context-learning prompt from examples of a dialogue dataset split.

    Every index in examples_full contributes a solved example (the dialogue followed by its
    summary). Every index in examples_to_summarize contributes a dialogue whose summary is
    left open so the model completes it.

    Inputs:
        examples_full (List[int]): Indices of the examples shown to the model together with their summaries.
        examples_to_summarize (List[int]): Indices of the dialogues that we want the model to summarize.
        split (optional): Indexable collection whose items expose "dialogue" and "summary" keys.
            When omitted, the notebook-level dataset["test"] is used, which is what this function
            always read before the parameter existed.
    Outputs:
        prompt (str): The solved examples followed by the dialogues without a summary attached.
        An empty string is returned when both index lists are empty.
    """
    if split is None:
        # Looked up at call time so existing two-argument calls behave exactly as before.
        split = dataset["test"]

    # Templates are written with explicit escapes so that the whitespace-only lines and the
    # trailing indentation of the prompt cannot be silently stripped by an editor.
    solved_template = (
        "\n"
        "    Dialogue:\n"
        "        \n"
        "    {}\n"
        "        \n"
        "    What was going on?\n"
        "    {}\n"
        "    "
    )
    open_template = (
        "\n"
        "    Dialogue:\n"
        "        \n"
        "    {}\n"
        "        \n"
        "    What was going on?\n"
        "    "
    )

    parts = []
    for index in examples_full:
        row = split[index]
        parts.append(solved_template.format(row["dialogue"], row["summary"]))

    for index in examples_to_summarize:
        parts.append(open_template.format(split[index]["dialogue"]))

    return "".join(parts)

In [16]:
# Build the one-shot prompt: test example 5 is included together with its summary,
# and test example 15 is appended without a summary for the model to complete.
examples_full = [5]
examples_to_summarize = [15]

one_shot_prompt = make_prompt(examples_full, examples_to_summarize)

print(one_shot_prompt)


    Dialogue:
        
    #Person1#: You're finally here! What took so long?
#Person2#: I got stuck in traffic again. There was a terrible traffic jam near the Carrefour intersection.
#Person1#: It's always rather congested down there during rush hour. Maybe you should try to find a different route to get home.
#Person2#: I don't think it can be avoided, to be honest.
#Person1#: perhaps it would be better if you started taking public transport system to work.
#Person2#: I think it's something that I'll have to consider. The public transport system is pretty good.
#Person1#: It would be better for the environment, too.
#Person2#: I know. I feel bad about how much my car is adding to the pollution problem in this city.
#Person1#: Taking the subway would be a lot less stressful than driving as well.
#Person2#: The only problem is that I'm going to really miss having the freedom that you have with a car.
#Person1#: Well, when it's nicer outside, you can start biking to work. That will gi

In [17]:
# Perform One-Shot Inference with the generated prompt

# The prompt does not depend on the loop variable, so tokenize and generate once up front
# instead of repeating the identical (and expensive) model call on every iteration.
# NOTE: the tokenizer warns that this prompt exceeds the model's specified maximum
# sequence length (834 > 512 tokens); it is passed to the model untruncated here.
inputs = tokenizer(one_shot_prompt, return_tensors = "pt")
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens = 50,
    )[0],
    skip_special_tokens = True
)

# Print the baseline summary of each target dialogue next to the generated summary.
for index in examples_to_summarize:
    summary = dataset["test"][index]["summary"]

    print(dash_line)
    print("BASELINE SUMMARY: {}".format(summary))
    print(dash_line)
    print("GENERATED SUMMARY: {}".format(output))

Token indices sequence length is longer than the specified maximum sequence length for this model (834 > 512). Running this sequence through the model will result in indexing errors


---------------------------------------------------------------------------------------------------
BASELINE SUMMARY: #Person1# wants to create a company and is going to write a business plan. #Person2# gives #Person1# suggestions on how to summarise business ideas, describe the service, differ from competitors and attract investment in a good business plan. #Person1# decides to stick to the old job.
---------------------------------------------------------------------------------------------------
GENERATED SUMMARY: Person1 is going to start a business. He is going to write a business plan and get some investors. He will write a market analysis and market analysis. He will also write a financial analysis.


# 5.0 Summarizing Dialogue with Few Shot Inference

Similar to one-shot inference, **few-shot** inference gives the model several example prompt-response pairs, instead of just one, before asking it to perform the task on a new input.

## 5.1 Few Shot Inference

In [18]:
# Build the few-shot prompt: test examples 5, 10 and 20 are included together with their
# summaries, and test example 15 is appended without a summary for the model to complete.

examples_full = [5, 10, 20]
examples_to_summarize = [15]

few_shot_prompt = make_prompt(examples_full, examples_to_summarize)

print(few_shot_prompt)


    Dialogue:
        
    #Person1#: You're finally here! What took so long?
#Person2#: I got stuck in traffic again. There was a terrible traffic jam near the Carrefour intersection.
#Person1#: It's always rather congested down there during rush hour. Maybe you should try to find a different route to get home.
#Person2#: I don't think it can be avoided, to be honest.
#Person1#: perhaps it would be better if you started taking public transport system to work.
#Person2#: I think it's something that I'll have to consider. The public transport system is pretty good.
#Person1#: It would be better for the environment, too.
#Person2#: I know. I feel bad about how much my car is adding to the pollution problem in this city.
#Person1#: Taking the subway would be a lot less stressful than driving as well.
#Person2#: The only problem is that I'm going to really miss having the freedom that you have with a car.
#Person1#: Well, when it's nicer outside, you can start biking to work. That will gi

In [19]:
# Perform Few-Shot Inference with the generated prompt

# The prompt does not depend on the loop variable, so tokenize and generate once up front
# instead of repeating the identical (and expensive) model call on every iteration.
# NOTE(review): this prompt holds three solved examples, so it is presumably well past the
# 512-token maximum the tokenizer reports for this model - confirm the token count.
inputs = tokenizer(few_shot_prompt, return_tensors = "pt")
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens = 50,
    )[0],
    skip_special_tokens = True
)

# Print the baseline summary of each target dialogue next to the generated summary.
for index in examples_to_summarize:
    summary = dataset["test"][index]["summary"]

    print(dash_line)
    print("BASELINE SUMMARY: {}".format(summary))
    print(dash_line)
    print("GENERATED SUMMARY - Few Shot: {}".format(output))

---------------------------------------------------------------------------------------------------
BASELINE SUMMARY: #Person1# wants to create a company and is going to write a business plan. #Person2# gives #Person1# suggestions on how to summarise business ideas, describe the service, differ from competitors and attract investment in a good business plan. #Person1# decides to stick to the old job.
---------------------------------------------------------------------------------------------------
GENERATED SUMMARY - Few Shot: Person1 is going to start a business. He is going to write a business plan and get some investors.


# 6.0 Generative Configuration Parameters For Inference

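We can change the configuration parameters of the generate() method:
* max_new_tokens - the maximum number of tokens to generate, not counting the tokens in the prompt.
* do_sample - whether to sample the next token from the model's probability distribution (True) instead of always picking the most likely token (False).
* temperature - controls the randomness of sampling: lower values make the output more predictable, higher values make it more varied ("creative"). It only has an effect when do_sample is True.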

In [None]:
# Active setting: at most 50 new tokens, every other generation option left at its default.
# To experiment, comment this line out and enable exactly one of the alternatives below.
generation_config = GenerationConfig(max_new_tokens=50)
# generation_config = GenerationConfig(max_new_tokens=10)
# generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=0.1)
# generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=0.5)
# generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=1.0)

# Reuse the few-shot prompt so the result is comparable with the few-shot cell.
inputs = tokenizer(few_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        generation_config=generation_config,
    )[0],
    skip_special_tokens=True
)

# Display the result; without this the cell computes a summary but shows nothing.
print(dash_line)
print("GENERATED SUMMARY - Few Shot: {}".format(output))
print(dash_line)