In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig, TrainingArguments, Trainer
import torch
import evaluate
import pandas as pd
import numpy as np

In [None]:
def print_number_of_model_parameters(model):
    """Count the parameters of ``model``.

    Despite the name, nothing is printed; the counts are returned so the
    caller can format them.

    Args:
        model: Any object exposing ``parameters()`` that yields items with a
            ``numel()`` method and a ``requires_grad`` attribute.

    Returns:
        A ``(total_params, trainable_params)`` tuple, where the second entry
        only counts parameters whose ``requires_grad`` is truthy.
    """
    total = 0
    trainable = 0
    for param in model.parameters():
        count = param.numel()
        total += count
        if param.requires_grad:
            trainable += count
    return total, trainable


In [None]:
# Identifiers passed to load_dataset / from_pretrained in the loading cell.
DATASET_NAME = "knkarthick/dialogsum"
MODEL_NAME = "google/flan-t5-base"

In [6]:
# Load the dataset, the seq2seq model (weights requested in bfloat16) and the
# matching tokenizer (fast implementation requested via use_fast=True).
dataset = load_dataset(DATASET_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

In [None]:
# Report how many of the model's parameters are currently trainable.
total_params, trainable_params = print_number_of_model_parameters(model)
trainable_pct = trainable_params / total_params * 100
print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")
print(f"Percentage of trainable parameters: {trainable_pct:.2f}%")

Total parameters: 247577856
Trainable parameters: 247577856
Percentage of trainable parameters: 100.00%


In [13]:
def make_n_shot_summary_prompt(example_ids=None, summarize_id=0, data=dataset, my_set='test'):
    """Build a zero- or few-shot summarization prompt from one split of ``data``.

    Each index in ``example_ids`` contributes a solved example (dialogue
    followed by its reference summary). The dialogue at ``summarize_id`` is
    appended last with an empty ``SUMMARY:`` section for the model to fill in.

    Args:
        example_ids: Optional iterable of integer indices used as in-context
            examples. ``None`` or an empty sequence yields a zero-shot prompt.
        summarize_id: Index of the dialogue to be summarized.
        data: Object indexed as ``data[split][column][index]``; defaults to
            the module-level ``dataset`` as it existed when this function was
            defined.
        my_set: Name of the split to read from.

    Returns:
        The assembled prompt string.
    """
    split = data[my_set]
    # Look the column up once instead of on every access; depending on the
    # datasets version a column lookup may materialize the whole column.
    dialogues = split['dialogue']

    parts = []
    if example_ids:
        # Summaries are only needed for the solved examples, so the column is
        # not touched for zero-shot prompts.
        summaries = split['summary']
        for i in example_ids:
            parts.append(f"\nDIALOGUE:\n\n{dialogues[i]}\n\nSUMMARY:\n\n{summaries[i]}\n")

    parts.append(f"\nDIALOGUE:\n\n{dialogues[summarize_id]}\n\nSUMMARY:\n")
    return ''.join(parts)
    

def get_model_completion(prompt, tokenizer=tokenizer, model=model, gen_config=None):
    """Generate a completion for ``prompt`` and return it as decoded text.

    Args:
        prompt: Input text to feed to the model.
        tokenizer: Tokenizer used to encode the prompt and decode the output;
            defaults to the module-level ``tokenizer``.
        model: Model whose ``generate`` method is called; defaults to the
            module-level ``model``.
        gen_config: Optional generation configuration. When given it is used
            as-is. When ``None`` the notebook defaults apply: sampling with
            ``num_beams=1`` and ``max_new_tokens=1000``.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    # Keep the inputs on the same device as the model so the call also works
    # after the model has been moved off the CPU.
    encoded = tokenizer(prompt, return_tensors='pt').to(model.device)

    if gen_config is None:
        # Sampling is stochastic: repeated calls can return different text
        # unless the RNG is seeded by the caller.
        gen_kwargs = dict(num_beams=1, do_sample=True, max_new_tokens=1000)
    else:
        # Explicit keyword arguments to generate() take precedence over the
        # generation config, so none are passed here; otherwise the caller's
        # settings would be silently overridden.
        gen_kwargs = dict(generation_config=gen_config)

    # Unpacking the encoding forwards the attention mask along with input_ids.
    completion = model.generate(**encoded, **gen_kwargs)[0]
    return tokenizer.decode(completion, skip_special_tokens=True)
    

In [14]:
# Zero-shot prompt (no example_ids) for the dialogue at index 200 of the
# default 'test' split.
prompt = make_n_shot_summary_prompt(summarize_id=200, data=dataset)
print(prompt)


DIALOGUE:

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

SUMMARY:



In [15]:
# Completion from the not-yet-fine-tuned model for the zero-shot prompt above.
get_model_completion(prompt)

'#Person2#: What do you want to do as someone else mentioned?'

In [23]:
# Inspect how the tokenizer splits the prompt: encode it, then map the ids of
# the first (only) sequence back to their token strings.
sentence_encoded = tokenizer(prompt, return_tensors='pt')
tokens = tokenizer.convert_ids_to_tokens(sentence_encoded.input_ids[0])
print(tokens)


['▁', 'DIA', 'LOG', 'UE', ':', '▁#', 'P', 'erson', '1', '#', ':', '▁Have', '▁you', '▁considered', '▁upgrading', '▁your', '▁system', '?', '▁#', 'P', 'erson', '2', '#', ':', '▁Yes', ',', '▁but', '▁I', "'", 'm', '▁not', '▁sure', '▁what', '▁exactly', '▁I', '▁would', '▁need', '.', '▁#', 'P', 'erson', '1', '#', ':', '▁You', '▁could', '▁consider', '▁adding', '▁', 'a', '▁painting', '▁program', '▁to', '▁your', '▁software', '.', '▁It', '▁would', '▁allow', '▁you', '▁to', '▁make', '▁up', '▁your', '▁own', '▁fly', 'ers', '▁and', '▁banner', 's', '▁for', '▁advertising', '.', '▁#', 'P', 'erson', '2', '#', ':', '▁That', '▁would', '▁be', '▁', 'a', '▁', 'definite', '▁bonus', '.', '▁#', 'P', 'erson', '1', '#', ':', '▁You', '▁might', '▁also', '▁want', '▁to', '▁upgrade', '▁your', '▁hardware', '▁because', '▁it', '▁is', '▁pretty', '▁outdated', '▁now', '.', '▁#', 'P', 'erson', '2', '#', ':', '▁How', '▁can', '▁we', '▁do', '▁that', '?', '▁#', 'P', 'erson', '1', '#', ':', '▁You', "'", 'd', '▁probably', '▁need', '▁

In [25]:
# Show the available splits, their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [27]:
def tokenize_function(example):
    """Tokenize a batch of dialogues and summaries for seq2seq training.

    Each dialogue is wrapped in a fixed instruction prompt and encoded into
    ``input_ids``; each summary is encoded into ``labels``. Both are truncated
    and padded with ``padding='max_length'``.

    Args:
        example: A batch mapping with ``'dialogue'`` and ``'summary'`` entries,
            each an iterable of strings (as supplied by ``map(batched=True)``).

    Returns:
        The same mapping, with ``'input_ids'`` and ``'labels'`` added.
    """
    start_prompt = "Summarize the following conversation.\n\n"
    end_prompt = "\n\nSummary: "
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]
    # NOTE(review): only input_ids is kept, so the attention mask is dropped
    # even though the inputs are padded — confirm whether training should
    # also receive it.
    example['input_ids'] = tokenizer(prompt, truncation=True, padding='max_length', return_tensors='pt').input_ids

    labels = tokenizer(example['summary'], truncation=True, padding='max_length', return_tensors='pt').input_ids
    # Padding positions must not contribute to the loss: -100 is the index
    # ignored by the cross-entropy loss used for seq2seq training.
    labels[labels == tokenizer.pad_token_id] = -100
    example['labels'] = labels
    return example


In [29]:
# Tokenize every split in batches and drop the original columns so only the
# fields added by tokenize_function remain.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['id', 'topic', 'dialogue', 'summary'])

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

In [39]:
# Subsample for quick experiments: keep only rows whose index is a multiple
# of 100 in each split.
tokenized_dataset_small = tokenized_dataset.filter(lambda example, index: index % 100 == 0, with_indices=True)

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [40]:
# Show the remaining columns and row counts of the subsampled splits.
tokenized_dataset_small

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})