In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig, TrainingArguments, Trainer
import torch
import evaluate
import pandas as pd
import numpy as np
import datetime

In [2]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS device")
else:
    device = torch.device("cpu")
    print("Using CPU device")

Using MPS device


In [3]:
def print_number_of_model_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total_params, trainable_params


In [4]:
DATASET_NAME = "knkarthick/dialogsum"
MODEL_NAME = "google/flan-t5-base"

In [5]:
dataset = load_dataset(DATASET_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16) # might want float32 for MPS
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

In [6]:
total_params, trainable_params = print_number_of_model_parameters(model)
print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")
print(f"Percentage of trainable parameters: {trainable_params / total_params * 100:.2f}%")

Total parameters: 247577856
Trainable parameters: 247577856
Percentage of trainable parameters: 100.00%


In [38]:
def make_n_shot_summary_prompt(example_ids=None, summarize_id=0, data=dataset, my_set='test'):
    prompt = ''
    if example_ids:
        for i in example_ids:
            dialogue = data[my_set]['dialogue'][i]
            human_summary = data[my_set]['summary'][i]
    
            prompt += f"""
Summarize the following conversation.

{dialogue}

Summary:

{human_summary}
"""
        
    dialogue = data[my_set]['dialogue'][summarize_id]

    prompt += f"""
Summarize the following conversation.

{dialogue}

Summary:
"""
    return prompt
    

def get_model_completion(prompt, tokenizer=tokenizer, model=model, gen_config=None, 
                         do_sample=False, max_new_tokens=1000, num_beams=1):
    sentence_encoded = tokenizer(prompt, return_tensors='pt').to('mps') 
    if not hasattr(model, 'base_model'):
        completion = model.generate(sentence_encoded.input_ids,
                               num_beams=num_beams,
                               do_sample=do_sample,
                               max_new_tokens=max_new_tokens,
                               generation_config=gen_config)[0]
    else: # need only kwargs for PEFT models
        completion = model.generate(input_ids=sentence_encoded.input_ids,
                               num_beams=num_beams,
                               do_sample=do_sample,
                               max_new_tokens=max_new_tokens,
                               generation_config=gen_config)[0]
    return tokenizer.decode(completion, skip_special_tokens=True)


In [9]:
prompt = make_n_shot_summary_prompt(summarize_id=200, data=dataset)
print(prompt)


Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:



In [10]:
get_model_completion(prompt)

"#Person1#: I'm thinking of upgrading my computer."

In [12]:
sentence_encoded = tokenizer(prompt, return_tensors='pt')
tokens = tokenizer.convert_ids_to_tokens(sentence_encoded.input_ids[0])
print(tokens)


['▁', 'DIA', 'LOG', 'UE', ':', '▁#', 'P', 'erson', '1', '#', ':', '▁Have', '▁you', '▁considered', '▁upgrading', '▁your', '▁system', '?', '▁#', 'P', 'erson', '2', '#', ':', '▁Yes', ',', '▁but', '▁I', "'", 'm', '▁not', '▁sure', '▁what', '▁exactly', '▁I', '▁would', '▁need', '.', '▁#', 'P', 'erson', '1', '#', ':', '▁You', '▁could', '▁consider', '▁adding', '▁', 'a', '▁painting', '▁program', '▁to', '▁your', '▁software', '.', '▁It', '▁would', '▁allow', '▁you', '▁to', '▁make', '▁up', '▁your', '▁own', '▁fly', 'ers', '▁and', '▁banner', 's', '▁for', '▁advertising', '.', '▁#', 'P', 'erson', '2', '#', ':', '▁That', '▁would', '▁be', '▁', 'a', '▁', 'definite', '▁bonus', '.', '▁#', 'P', 'erson', '1', '#', ':', '▁You', '▁might', '▁also', '▁want', '▁to', '▁upgrade', '▁your', '▁hardware', '▁because', '▁it', '▁is', '▁pretty', '▁outdated', '▁now', '.', '▁#', 'P', 'erson', '2', '#', ':', '▁How', '▁can', '▁we', '▁do', '▁that', '?', '▁#', 'P', 'erson', '1', '#', ':', '▁You', "'", 'd', '▁probably', '▁need', '▁

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [7]:
def tokenize_function(example):
    """Tokenizes the input and output text for the model, 
    including a hardcoded prompt to summarize the conversation."""
    
    start_prompt = "Summarize the following conversation.\n\n"
    end_prompt = "\n\nSummary: "
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]
    output = tokenizer(prompt, truncation=True, padding='max_length', return_tensors='pt')
    output['labels'] = tokenizer(example['summary'], truncation=True, padding='max_length', return_tensors='pt').input_ids
    return output


In [8]:
tokenized_dataset = dataset.map(tokenize_function, batched=True, 
                                remove_columns=['id', 'topic', 'dialogue', 'summary']
                                )

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [9]:
tokenized_dataset_small = tokenized_dataset.filter(lambda example, index: index % 10 == 0, with_indices=True)

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [18]:
tokenized_dataset_small

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1246
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 50
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 150
    })
})

# LoRA

In [10]:
from peft import LoraConfig, get_peft_model, TaskType
lora_config = LoraConfig(
    r=32,
    lora_alpha=32, #16,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

In [None]:
peft_model = get_peft_model(model, lora_config) # modifies base model?
total_params, trainable_params = print_number_of_model_parameters(peft_model)
print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")
print(f"Percentage of trainable parameters: {trainable_params / total_params * 100:.2f}%")

Total parameters: 251116800
Trainable parameters: 3538944
Percentage of trainable parameters: 1.41%


In [12]:
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_dir = f"../models/peft-dialogue-summary-training-{timestamp}"

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    num_train_epochs=1,
    learning_rate=1e-3,
    logging_steps=20,
    per_device_train_batch_size=2,
    # fp16=True,  # for mixed-precision training, but doesn't work on apple silicon
    max_steps=-1, 
    label_names=["labels"],
    include_num_input_tokens_seen=True
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_dataset['train'],
    #eval_dataset=tokenized_dataset_small,
)

In [13]:
peft_trainer._train_batch_size

2

In [None]:
peft_trainer.train()
# CPU: Took 24 minutes to do 1 step which is 8 samples
# GPU: Took < 1 minute to do 125 samples
# GPU: Took 8:27 minutes to do 1250 samples

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
20,26.4236
40,4.5
60,2.6328
80,0.8387
100,0.4448
120,0.3432
140,0.3287
160,0.3284
180,0.3319
200,0.2815


KeyboardInterrupt: 

In [None]:
#peft_model.save_pretrained('../models/peft-dialogue-summary-training-{timestamp}_lora_results')

In [41]:
my_id = 200
prompt = make_n_shot_summary_prompt(summarize_id=my_id, data=dataset)
print(prompt)


Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:



In [42]:
torch.mps.manual_seed(42)
sentence_encoded = tokenizer(prompt, return_tensors='pt').to('mps')  # Move the entire batch to MPS
peft_model = peft_model.to(device)
model_orig = model_orig.to(device) 
completion = peft_model.generate(input_ids=sentence_encoded.input_ids,
                            num_beams=1,
                            do_sample=True,
                            max_new_tokens=1000,
                            generation_config=None)[0]  # No need to call .to('mps') again
tokenizer.decode(completion, skip_special_tokens=True)


"#Person2# thinks upgrading to #Person1#'s system like being able to make up Flyers for advertising has something to do with #Person2#'s hardware, and #Person2#'s hardware."

Checkpoint 3000, ID 201

"#Person2# invites #Person2# from Mexico to the Holiday Inn. #Person1# thinks it has something from #Person1#'s career in foreign country. #Person2#'s daughter the name of #Person1# and #Person1# has back Columbias heritage."

In [None]:
next(peft_model.parameters()).device

In [39]:
completion = get_model_completion(prompt, model=model_orig)
print(completion)

The flight is scheduled to arrive at the Holiday Inn in China.


In [24]:
peft_model.eval()
completion = get_model_completion(prompt, model=peft_model)
print(completion)

RuntimeError: Placeholder storage has not been allocated on MPS device!

In [36]:
dataset['test'][my_id]['summary']

'#Person1# helps #Person2# to choose a new phone.'

In [40]:
for name, _ in model.named_parameters():
    print(name)

shared.weight
encoder.block.0.layer.0.SelfAttention.q.base_layer.weight
encoder.block.0.layer.0.SelfAttention.q.lora_A.default.weight
encoder.block.0.layer.0.SelfAttention.q.lora_B.default.weight
encoder.block.0.layer.0.SelfAttention.k.weight
encoder.block.0.layer.0.SelfAttention.v.base_layer.weight
encoder.block.0.layer.0.SelfAttention.v.lora_A.default.weight
encoder.block.0.layer.0.SelfAttention.v.lora_B.default.weight
encoder.block.0.layer.0.SelfAttention.o.weight
encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight
encoder.block.0.layer.0.layer_norm.weight
encoder.block.0.layer.1.DenseReluDense.wi_0.weight
encoder.block.0.layer.1.DenseReluDense.wi_1.weight
encoder.block.0.layer.1.DenseReluDense.wo.weight
encoder.block.0.layer.1.layer_norm.weight
encoder.block.1.layer.0.SelfAttention.q.base_layer.weight
encoder.block.1.layer.0.SelfAttention.q.lora_A.default.weight
encoder.block.1.layer.0.SelfAttention.q.lora_B.default.weight
encoder.block.1.layer.0.SelfAttention.k.we

In [12]:
from copy import deepcopy
model_orig = deepcopy(model)

In [25]:
from peft import PeftModel
peft_model = PeftModel.from_pretrained(model,
    "../models/peft-dialogue-summary-training-2025-05-08_21-14-28/checkpoint-3000/",
    torch_device=device)



In [34]:
peft_model.load_adapter("../models/peft-dialogue-summary-training-2025-05-08_21-14-28/checkpoint-2000/",
                        adapter_name="chk-2000")

<All keys matched successfully>

In [35]:
peft_model.peft_config.keys()
peft_model.set_adapter("chk-2000")

In [None]:
# 15:06 in the video


# Need to clean up code (especially re: setting MPS if possible, and use throughout). And functions for prompt completion.
# Need to figure out why base model is modified (it now has lora params and a base_model attribute). Why? Is it peft?

# Why does a lot of the test data repeat? Check this right after loading.