# Application-oriented use of LLMs

Outline
- datasets
    - for summarization
    - for question answering

- models
    - instruction-finetuned models

- customizing prompts for the corresponding tasks
    - summarization
    - question answering

In [None]:
!pip install transformers datasets

In [13]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# datasets

Summarization dataset: DialogSum (https://huggingface.co/datasets/knkarthick/dialogsum), License: cc-by-nc-sa-4.0<br>
Question-answering dataset: SQuAD 2.0 (https://huggingface.co/datasets/squad_v2), License: cc-by-sa-4.0

## load summarization dataset

In [2]:
# Hub repository id of the DialogSum summarization corpus.
DIALOGSUM_REPO_ID = "knkarthick/dialogsum"
summarization_dataset = load_dataset(DIALOGSUM_REPO_ID)

Found cached dataset csv (/home/tslab/phusaeng/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd36827d3490488d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Bare expression: the notebook renders the DatasetDict (splits, features, row counts).
summarization_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

## load question-answering dataset

In [4]:
# Hub dataset id of SQuAD 2.0, the question-answering corpus.
SQUAD_V2_REPO_ID = "squad_v2"
qa_dataset = load_dataset(SQUAD_V2_REPO_ID)
qa_dataset  # last expression -> rendered as the cell output

Found cached dataset squad_v2 (/home/tslab/phusaeng/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

# load model

Instruction-finetuned Flan-T5 models (paper: https://arxiv.org/pdf/2210.11416.pdf):
- Flan-T5 base (https://huggingface.co/google/flan-t5-base): Apache License 2.0
- Flan-T5 large (https://huggingface.co/google/flan-t5-large): Apache License 2.0

In [5]:
# base model
BASE_CHECKPOINT = 'google/flan-t5-base'
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU instead
# of failing on a hard-coded 'cuda' device.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer_base = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)
# Load the weights in bfloat16 rather than fp32 (chosen by the author for speed).
t5_base = AutoModelForSeq2SeqLM.from_pretrained(BASE_CHECKPOINT, torch_dtype=torch.bfloat16)
t5_base = t5_base.to(DEVICE)

In [6]:
# large model
LARGE_CHECKPOINT = 'google/flan-t5-large'
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU instead
# of failing on a hard-coded 'cuda' device.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer_large = AutoTokenizer.from_pretrained(LARGE_CHECKPOINT)
# Load the weights in bfloat16 rather than fp32 (chosen by the author for speed).
t5_large = AutoModelForSeq2SeqLM.from_pretrained(LARGE_CHECKPOINT, torch_dtype=torch.bfloat16)
t5_large = t5_large.to(DEVICE)

In [7]:
# count params
def count_parameters(model):
    """Summarize how many parameters of ``model`` are trainable.

    Args:
        model: Any object whose ``named_parameters()`` yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag.

    Returns:
        str: Three newline-separated entries: the trainable parameter count,
        the total parameter count (both with thousands separators), and the
        percentage of parameters that are trainable. A model without any
        parameters reports ``0.0%`` instead of raising ``ZeroDivisionError``.
    """
    all_params = 0
    trainable_params = 0
    for _, param in model.named_parameters():
        size = param.numel()
        all_params += size
        if param.requires_grad:
            trainable_params += size

    # Guard the ratio: a parameter-free model would otherwise divide by zero.
    percentage = 100 * trainable_params / all_params if all_params else 0.0
    return (
        f"trainable_params: {trainable_params:,}\n"
        f"all_params: {all_params:,}\n"
        f"percentage of trainable params: {percentage}%"
    )
# Report the parameter summary of both checkpoints, base model first.
for size_label, flan_model in (("base", t5_base), ("large", t5_large)):
    print(f"{size_label} model parameters: {count_parameters(flan_model)}")

base model parameters: trainable_params: 247,577,856
all_params: 247,577,856
percentage of trainable params: 100.0%
large model parameters: trainable_params: 783,150,080
all_params: 783,150,080
percentage of trainable params: 100.0%


# customizing prompts for the corresponding tasks

## summarization task

In [8]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=1000):
    """Run ``model.generate`` on a single prompt and return the decoded text.

    Args:
        text: Prompt string to encode.
        model: Object exposing ``.device`` and
            ``.generate(input_ids=..., max_length=...)``.
        tokenizer: Object exposing ``.encode(...)`` and ``.batch_decode(...)``.
        max_input_tokens: ``max_length`` handed to the tokenizer; truncation
            is enabled, so longer prompts are cut to this length.
        max_output_tokens: ``max_length`` handed to ``model.generate``.

    Returns:
        The first decoded sequence, with special tokens skipped.
    """
    # Encode the prompt as a "pt" tensor, truncated to the input budget.
    encoded_prompt = tokenizer.encode(
        text,
        max_length=max_input_tokens,
        truncation=True,
        return_tensors="pt",
    )

    # The input ids are moved to the model's device before generation.
    target_device = model.device
    output_ids = model.generate(
        input_ids=encoded_prompt.to(target_device),
        max_length=max_output_tokens,
    )

    # batch_decode returns one string per generated sequence; only the first is used.
    decoded_sequences = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded_sequences[0]

def summarize(
    dialogue,
    model,
    tokenizer,
    prompt_format=None,
):
    """Fill a prompt template with ``dialogue`` and run it through ``inference``.

    Args:
        dialogue: Text substituted for the ``{dialogue}`` placeholder.
        model: Forwarded unchanged to ``inference``.
        tokenizer: Forwarded unchanged to ``inference``.
        prompt_format: Template rendered with ``str.format(dialogue=...)``.
            ``None`` selects the built-in "Summarize the following
            conversation." template.

    Returns:
        Whatever ``inference`` returns for the rendered prompt.
    """
    # Built-in template; the indentation inside the string is part of the prompt.
    default_template = (
        "\n"
        "    Summarize the following conversation.\n"
        "\n"
        "    {dialogue}\n"
        "\n"
        "    Summary:\n"
        "    "
    )
    template = default_template if prompt_format is None else prompt_format
    return inference(template.format(dialogue=dialogue), model, tokenizer)

### default prompt

In [9]:
idx = 1
# Slice the first 100 training rows once and read both columns from that slice.
train_head = summarization_dataset['train'][:100]
dialogue_test = train_head['dialogue']
baseline_output = train_head['summary']

# Summarize the same dialogue with each checkpoint using summarize()'s built-in prompt.
output_base = summarize(dialogue_test[idx], t5_base, tokenizer_base, prompt_format=None)
output_large = summarize(dialogue_test[idx], t5_large, tokenizer_large, prompt_format=None)

# One print call; sep='\n' puts every section and separator on its own line.
separator = '-' * 100
print(
    f"INPUT:\n{dialogue_test[idx]}",
    separator,
    f'BASELINE OUTPUT:\n{baseline_output[idx]}',
    separator,
    f'BASE MODEL OUTPUT:\n{output_base}',
    separator,
    f'LARGE MODEL OUTPUT:\n{output_large}',
    sep='\n',
)

INPUT:
#Person1#: Hello Mrs. Parker, how have you been?
#Person2#: Hello Dr. Peters. Just fine thank you. Ricky and I are here for his vaccines.
#Person1#: Very well. Let's see, according to his vaccination record, Ricky has received his Polio, Tetanus and Hepatitis B shots. He is 14 months old, so he is due for Hepatitis A, Chickenpox and Measles shots.
#Person2#: What about Rubella and Mumps?
#Person1#: Well, I can only give him these for now, and after a couple of weeks I can administer the rest.
#Person2#: OK, great. Doctor, I think I also may need a Tetanus booster. Last time I got it was maybe fifteen years ago!
#Person1#: We will check our records and I'll have the nurse administer and the booster as well. Now, please hold Ricky's arm tight, this may sting a little.
----------------------------------------------------------------------------------------------------
BASELINE OUTPUT:
Mrs Parker takes Ricky for his vaccines. Dr. Peters checks the record and then gives Ricky a vacci

### customize prompt

In [10]:
# Custom template; {dialogue} is filled in by summarize() via str.format.
new_prompt_format = """
### Input:
Summarize the following conversation step-by-step:
{dialogue}

### Output:
What is the name of person2?
"""
# (author's note, kept verbatim) What is #person2# trying to do in the conversation

# Same dialogue and models as the default-prompt cell; only the template differs.
output_base = summarize(dialogue_test[idx], t5_base, tokenizer_base, prompt_format=new_prompt_format)
output_large = summarize(dialogue_test[idx], t5_large, tokenizer_large, prompt_format=new_prompt_format)

# One print call; sep='\n' puts every section and separator on its own line.
separator = '-' * 100
print(
    f"INPUT:\n{dialogue_test[idx]}",
    separator,
    f'BASELINE OUTPUT:\n{baseline_output[idx]}',
    separator,
    f'BASE MODEL OUTPUT:\n{output_base}',
    separator,
    f'LARGE MODEL OUTPUT:\n{output_large}',
    sep='\n',
)

INPUT:
#Person1#: Hello Mrs. Parker, how have you been?
#Person2#: Hello Dr. Peters. Just fine thank you. Ricky and I are here for his vaccines.
#Person1#: Very well. Let's see, according to his vaccination record, Ricky has received his Polio, Tetanus and Hepatitis B shots. He is 14 months old, so he is due for Hepatitis A, Chickenpox and Measles shots.
#Person2#: What about Rubella and Mumps?
#Person1#: Well, I can only give him these for now, and after a couple of weeks I can administer the rest.
#Person2#: OK, great. Doctor, I think I also may need a Tetanus booster. Last time I got it was maybe fifteen years ago!
#Person1#: We will check our records and I'll have the nurse administer and the booster as well. Now, please hold Ricky's arm tight, this may sting a little.
----------------------------------------------------------------------------------------------------
BASELINE OUTPUT:
Mrs Parker takes Ricky for his vaccines. Dr. Peters checks the record and then gives Ricky a vacci

## question-answering tasks

### default prompt

In [21]:
idx = 0
# Index the row first: qa_dataset['train']['question'][idx] reads the whole
# column just to pick one element.
qa_example = qa_dataset['train'][idx]
test_qa = qa_example['question']
# SQuAD 2.0 contains unanswerable questions whose answer list is empty, so
# guard the [0] lookup instead of raising IndexError for such rows.
reference_answers = qa_example['answers']['text']
answer_qa = reference_answers[0] if reference_answers else '<no answer>'

# Only the question (no context passage) is sent to each checkpoint.
output_base = inference(test_qa, t5_base, tokenizer_base)
output_large = inference(test_qa, t5_large, tokenizer_large)

# One print call; sep='\n' puts every section and separator on its own line.
separator = '-' * 100
print(
    f"INPUT:\n{test_qa}",
    separator,
    f'BASELINE ANSWER:\n{answer_qa}',
    separator,
    f"BASE MODEL OUTPUT:\n{output_base}",
    separator,
    f"LARGE MODEL OUTPUT:\n{output_large}",
    sep='\n',
)

INPUT:
When did Beyonce start becoming popular?
----------------------------------------------------------------------------------------------------
BASELINE ANSWER:
in the late 1990s
----------------------------------------------------------------------------------------------------
BASE MODEL OUTPUT:
in the early 1980s
----------------------------------------------------------------------------------------------------
LARGE MODEL OUTPUT:
1990


### customize prompt

In [25]:
# Index the row first: qa_dataset['train']['question'][idx] reads the whole
# column just to pick one element.
qa_example = qa_dataset['train'][idx]
# Append an extra instruction to the raw question to customize the prompt.
test_qa = qa_example['question'] + " explain in step-by-step"
# SQuAD 2.0 contains unanswerable questions whose answer list is empty, so
# guard the [0] lookup instead of raising IndexError for such rows.
reference_answers = qa_example['answers']['text']
answer_qa = reference_answers[0] if reference_answers else '<no answer>'

output_base = inference(test_qa, t5_base, tokenizer_base)
output_large = inference(test_qa, t5_large, tokenizer_large)

# One print call; sep='\n' puts every section and separator on its own line.
separator = '-' * 100
print(
    f"INPUT:\n{test_qa}",
    separator,
    f'BASELINE ANSWER:\n{answer_qa}',
    separator,
    f"BASE MODEL OUTPUT:\n{output_base}",
    separator,
    f"LARGE MODEL OUTPUT:\n{output_large}",
    sep='\n',
)

INPUT:
When did Beyonce start becoming popular? explain in step-by-step
----------------------------------------------------------------------------------------------------
BASELINE ANSWER:
in the late 1990s
----------------------------------------------------------------------------------------------------
BASE MODEL OUTPUT:
Beyonce started becoming popular in the early 1980s.
----------------------------------------------------------------------------------------------------
LARGE MODEL OUTPUT:
Beyonce became popular in the late 1990s.
