In [None]:
# On utilise la méthode de PEFT LoRA sur du résumé de dialogues

In [None]:
# 1 Set up Kernel Load required dependencies, dataset and LLMs

In [None]:
# 1.1 Set up Kernel and required dependencies

In [1]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install --upgrade pip setuptools wheel
#!{sys.executable} -m pip install --disable-pip-version-check torch==1.13.1 torchdata==0.5.1
!{sys.executable} -m pip install --disable-pip-version-check torch torchdata
!{sys.executable} -m pip install transformers==4.27.2 datasets==2.11.0 \
    evaluate==0.4.0 rouge_score==0.1.2 loralib==0.1.1 peft==0.3.0 
# Ces 4 modules sont nouveaux par rapport à Week1



In [2]:
# Importation des composants nécessaires
from datasets import load_dataset
# AutoModelForSeq2SeqLM pour accéder à FLAN-T5
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

In [None]:
# 1.2 Load dataset and LLM

In [3]:
huggingface_dataset_name="knkarthick/dialogsum"
dataset=load_dataset(huggingface_dataset_name)
dataset

Found cached dataset csv (C:/Users/V/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd36827d3490488d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
})

In [4]:
# On peut choisir la taille du modèle en fonction de la performance de l'environnement
model_name='google/flan-t5-small'
original_model=AutoModelForSeq2SeqLM.from_pretrained(model_name,torch_dtype=torch.bfloat16)
tokenizer=AutoTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [5]:
# On crée une fonction pour montrer les différents paramètres du modèles, en particulier les trainables
def print_number_of_trainable_model_parameters(model):
    trainable_model_params=0
    all_model_params=0
    for _,param in model.named_parameters():
        all_model_params+=param.numel()
        if param.requires_grad:
            trainable_model_params+=param.numel()
    return f"trainable model parameters : {trainable_model_params} \nall model parameters : {all_model_params} \npercentage of trainable model parameters : {trainable_model_params/all_model_params}"

In [6]:
print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters : 76961152 
all model parameters : 76961152 
percentage of trainable model parameters : 1.0


In [None]:
# 1.3 Test the model with zero-shot inferency

In [7]:
index=200
dash_line='-'.join('' for x in range(100))
dialogue=dataset['test'][index]['dialogue']
summary=dataset['test'][index]['summary']

prompt=f"""
Summarize the following conversation.

{dialogue}

Summary:
    """
    
# Input constructed prompt instead of the dialogue
inputs=tokenizer(prompt,return_tensors='pt')
output=tokenizer.decode(original_model.generate(inputs['input_ids'],max_new_tokens=200)[0],skip_special_tokens=True)
    
print(dash_line)
print('Exemple ')
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}\n')


---------------------------------------------------------------------------------------------------
Exemple 
---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person

In [None]:
# 2 Perform full fine-tuning

In [None]:
# 2.1 Pre-process the dialog-Summary dataset

In [None]:
# On convertit ici les paires prompt-réponses en input d'entraînement pour le modèle
# On souhaite que le format du training prompt soit :
# Summarize the following conversation.
# {dialog}
# Summary:
# On définit ci-dessous une fonction qui construit et tokenize cette instruction-type

In [8]:
def tokenize_function(example):
    start_prompt="Summarize the following conversation.\n\n"
    end_prompt="\n\nSummary: "
    prompt=[start_prompt+dialogue+end_prompt for dialogue in example['dialogue']]
    example['input_ids']=tokenizer(prompt,padding='max_length',truncation=True, return_tensors='pt').input_ids
    example['labels']=tokenizer(example['summary'],padding='max_length',truncation=True, return_tensors='pt').input_ids
    return example
# Le dataset contient 3 splits : train, validation, test
# La fonction ci-dessus s'applique à l'ensemble
tokenized_datasets=dataset.map(tokenize_function,batched=True)
tokenized_datasets=tokenized_datasets.remove_columns(['id','topic','dialogue','summary'])
print(tokenized_datasets)

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 500
    })
})


In [9]:
# On n'utilise qu'une partie du dataset pour préserver les ressources
tokenized_datasets=tokenized_datasets.filter(lambda example, index : index % 1000 ==0, with_indices=True)
print(tokenized_datasets)

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 13
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 2
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1
    })
})


In [10]:
# Format des datasets
print(f"Shapes of the datasets:")
print(f"Training : {tokenized_datasets['train'].shape}")
print(f"Validation : {tokenized_datasets['validation'].shape}")
print(f"Test : {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training : (13, 2)
Validation : (1, 2)
Test : (2, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 13
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 2
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1
    })
})


In [None]:
# 2.2 Fine-tune the model with the pre-processed dataset

In [11]:
# On utilise le module Training de HuggingFace
# On fixe un certain nombre d'hyper-paramètres selon des valeurs conventionnelles

output_dir=f'./dialogue-summary-training-{str(int(time.time()))}'

training_args=TrainingArguments(output_dir=output_dir,
                                learning_rate=1e-5,
                                num_train_epochs=1,
                                weight_decay=0.01,
                                logging_steps=1,
                                max_steps=1)

trainer=Trainer(model=original_model,
               args=training_args,
               train_dataset=tokenized_datasets['train'],
               eval_dataset=tokenized_datasets['validation'])

In [12]:
# Avec 1 epoch et 1 max_steps, le modèle ne sera que très peu amélioré, 
# mais ça ne prend que "quelques minutes" à faire tourner
trainer.train()



Step,Training Loss
1,54.0


TrainOutput(global_step=1, training_loss=54.0, metrics={'train_runtime': 7038.5145, 'train_samples_per_second': 0.001, 'train_steps_per_second': 0.0, 'total_flos': 1487124037632.0, 'train_loss': 54.0, 'epoch': 0.5})

In [13]:
# On sauvegarde le modèle qu'on vient d'entraîner
trainer.save_model('./FlanT5-after-fine-tune-231029')

In [None]:
# 2.3 Evaluate the model qualitatively (human evaluation)

In [None]:
# Calcul de l'évaluation ROUGE
original_model_results=rouge.compute(predictions=original_model_summaries,
                                    reference=human_baseline_summaries[0:len(original_model_summaries)],
                                    use_aggregator=True,
                                    use_stemmer=True)
instruct_model_results=rouge.compute(predictions=instruct_model_summaries,
                                    reference=human_baseline_summaries[0:len(instruct_model_summaries)],
                                    use_aggregator=True,
                                    use_stemmer=True)
                                     
print(f'ORIGINAL MODEL:\n{original_model_results})
print(f'INSTRUCT MODEL:\n{instruct_model_results})

In [14]:
# On prend le modèle qu'on vient d'entraîner en full fine-tuning
instruct_model=AutoModelForSeq2SeqLM.from_pretrained('./FlanT5-after-fine-tune-231029',torch_dtype=torch.bfloat16)

In [15]:
# On compare sa performance avec le modèle original

index=200
dialogue=dataset['test'][index]['dialogue']
summary=dataset['test'][index]['summary']

prompt=f"""
Summarize the following conversation.

{dialogue}

Summary:
    """
    
input_ids=tokenizer(prompt,return_tensors='pt').input_ids

original_model_outputs = original_model.generate(input_ids=input_ids,generation_config=GenerationConfig(max_new_tokens=200,num_beams=1))
original_model_text_output=tokenizer.decode(original_model_outputs[0],skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids,generation_config=GenerationConfig(max_new_tokens=200,num_beams=1))
instruct_model_text_output=tokenizer.decode(instruct_model_outputs[0],skip_special_tokens=True)
    
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')


---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
Get your computer backed up.
---------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
How would you like to upgrade your computer?


In [None]:
# 2.4 Evaluate the Model quantitatively (ROUGE Metric)

In [16]:
rouge=evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [19]:
# On prépare l'évaluation sur 10 dialogues

dialogues=dataset['test'][0:10]['dialogue']
human_baseline_summaries=dataset['test'][0:10]['summary']

original_model_summaries=[]
instruct_model_summaries=[]

for _,dialogue in enumerate(dialogues):
    prompt=f"""
Summarize the following dialogue.

{dialogue}

Summary: """
    input_ids=tokenizer(prompt,return_tensors='pt').input_ids
    
    original_model_outputs = original_model.generate(input_ids=input_ids,generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output=tokenizer.decode(original_model_outputs[0],skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)
    instruct_model_outputs = instruct_model.generate(input_ids=input_ids,generation_config=GenerationConfig(max_new_tokens=200))
    instruct_model_text_output=tokenizer.decode(instruct_model_outputs[0],skip_special_tokens=True)
    instruct_model_summaries.append(instruct_model_text_output)
    
zipped_summaries=list(zip(human_baseline_summaries,original_model_summaries,instruct_model_summaries)) 
df=pd.DataFrame(zipped_summaries,columns=['human_baseline_summaries','original_model_summaries','instruct_model_summaries'])
df
    

Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,Is it OK to send an intra-office memorandum to...,...
1,In order to prevent employees from wasting tim...,"#Person1#: Please, please send the memo to all...",...
2,Ms. Dawson takes a dictation for #Person1# abo...,You're not going to be a desk sucks.,...
3,#Person2# arrives late because of traffic jam....,I'm a car driver.,The traffic jam was so bad that I couldn't get...
4,#Person2# decides to follow #Person1#'s sugges...,The traffic jam was a long time ago.,The traffic jam was so bad that I couldn't get...
5,#Person2# complains to #Person1# about the tra...,Taking the subway would be a lot less stressful.,The traffic jam was so bad that I couldn't get...
6,#Person1# tells Kate that Masha and Hero get d...,People aren't able to understand what happened.,Masha and Hero are getting divorced.
7,#Person1# tells Kate that Masha and Hero are g...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced.
8,#Person1# and Kate talk about the divorce betw...,#Person1#: Getting married.,Masha and Hero are getting divorced.
9,#Person1# and Brian are at the birthday party ...,"Brian, thanks for the birthday party.","Brian, how are you?"


In [28]:
print(original_model_summaries)
print(human_baseline_summaries[0:len(original_model_summaries)])
print(len(original_model_summaries))
print(len(human_baseline_summaries[0:len(original_model_summaries)]))

['Is it OK to send an intra-office memorandum to all employees?', '#Person1#: Please, please send the memo to all employees before 4 pm.', "You're not going to be a desk sucks.", "I'm a car driver.", 'The traffic jam was a long time ago.', 'Taking the subway would be a lot less stressful.', "People aren't able to understand what happened.", 'Masha and Hero are getting divorced.', '#Person1#: Getting married.', 'Brian, thanks for the birthday party.']
['Ms. Dawson helps #Person1# to write a memo to inform every employee that they have to change the communication method and should not use Instant Messaging anymore.', 'In order to prevent employees from wasting time on Instant Message programs, #Person1# decides to terminate the use of those programs and asks Ms. Dawson to send out a memo to all employees by the afternoon.', 'Ms. Dawson takes a dictation for #Person1# about prohibiting the use of Instant Message programs in the office. They argue about its reasonability but #Person1# stil

In [33]:
rouge.compute(predictions=['Is it OK to send an intra-office memorandum to all employees?'],
                                    reference=['Ms. Dawson helps #Person1# to write a memo to inform every employee that they have to change the communication method and should not use Instant Messaging anymore.'],
                                    use_aggregator=True,
                                    use_stemmer=True)

TypeError: 'NoneType' object is not subscriptable

In [30]:
# Calcul de l'évaluation ROUGE
original_model_results=rouge.compute(predictions=original_model_summaries,
                                    reference=human_baseline_summaries[0:len(original_model_summaries)],
                                    use_aggregator=True,
                                    use_stemmer=True)
instruct_model_results=rouge.compute(predictions=instruct_model_summaries,
                                    reference=human_baseline_summaries[0:len(instruct_model_summaries)],
                                    use_aggregator=True,
                                    use_stemmer=True)

TypeError: 'NoneType' object is not subscriptable

In [20]:
# Amélioration absolue en points de %
print("Absolute percentage improvement of INSTRUCT MODEL over ORIGINAL MODEL")
improvement=(np.array(list(instruct_model_results.values()))-np.array(list(original_model_results.values())))
for key,value in zip(instruct_model_results.keys(),improvement):
    print(f'{key}: {value*100:.2f}%')    

Absolute percentage improvement of INSTRUCT MODEL over ORIGINAL MODEL


NameError: name 'instruct_model_results' is not defined

In [None]:
# 3 Perform Parameter-Efficient Fine-Tuning

In [None]:
# 3.1 Set up the PEFT/LoRA model for fine-tuning

In [None]:
from peft import LoraConfig, get_peft_model, TaskType
# Rang r=32, relativement grand
# Ci-dessous, TaskType.SEQ_2_SEQ_LM correspond à Flan-T5
lora_config=LoraConfig(r=32,lora_alpha=32,
                       target_modules=['q','v'],
                       lora_dropout=0.05,
                       bias="none",
                       task_type=TaskType.SEQ_2_SEQ_LM)

In [None]:
peft_model=get_peft_model(original_model,lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

In [None]:
# 3.2 Train PEFT Adapter 

In [None]:
# On utilise à nouveau le module Training de HuggingFace
# Le learning rate est supérieur à celui utilisé pour le full fine-tuning

output_dir=f'./peft-dialogue-summary-training-{str(int(time.time()))}'

peft_training_args=TrainingArguments(output_dir=output_dir,
                                auto_find_batch_size=True,
                                learning_rate=1e-3,
                                num_train_epochs=1,
                                logging_steps=1,
                                max_steps=1)

peft_trainer=Trainer(model=peft_model,
               args=peft_training_args,
               train_dataset=tokenized_datasets['train'])

In [None]:
# On entraîne le nouveau modèle
peft_trainer.train()

In [None]:
# On sauvegarde le nouveau modèle
peft_model_path="./peft-dialogue-summary-checkpoint"
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

In [None]:
# Quand on charge un Peft adapter, celui-ci a très peu de paramètres
# On le "fusionne" avec un grand modèle pre-trained, ici Flan-T5
# is_trainable=False pour indiquer qu'on veut utiliser le modèle PEFT adapté pour inférence uniquement, pas pour un
# entraînement complémentaire. Ainsi, Pytorch ne charge pas les paramètres d'optimisation, ce qui libère de l'espace
from peft import PeftModel, PeftConfig
peft_model_base=AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-base',torch_dtype=torch.bfloat16)
tokenizer=AutoTokenizer.from_pretrained('google/flan-t5-base')

peft_model=PeftModel.from_pretrained(peft_model_base,
                                    peft_model_path,
                                    torch_dtype=torch.bfloat16,
                                    is_trainable=False)

In [None]:
print(print_number_of_trainable_model_parameters(peft_model))

In [None]:
# 3.3 Evaluate the model qualitatively (human evaluation)

In [None]:
# On compare sa performance avec les modèles précédents

index=200
dialogue=dataset['test'][index]['dialogue']
summary=dataset['test'][index]['summary']

prompt=f"""
Summarize the following conversation.

{dialogue}

Summary:
    """
    
input_ids=tokenizer(prompt,return_tensors='pt').input_ids

original_model_outputs = original_model.generate(input_ids=input_ids,generation_config=GenerationConfig(max_new_tokens=200,num_beams=1))
original_model_text_output=tokenizer.decode(original_model_outputs[0],skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids,generation_config=GenerationConfig(max_new_tokens=200,num_beams=1))
instruct_model_text_output=tokenizer.decode(instruct_model_outputs[0],skip_special_tokens=True)

peft_model_outputs = peft_model.generate(input_ids=input_ids,generation_config=GenerationConfig(max_new_tokens=200,num_beams=1))
peft_model_text_output=tokenizer.decode(peft_model_outputs[0],skip_special_tokens=True)
    
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')
print(dash_line)
print(f'PEFT MODEL:\n{peft_model_text_output}')

In [None]:
# 3.4 Evaluate the Model quantitatively (ROUGE Metric)

In [None]:
# On prépare l'évaluation sur 10 dialogues

dialogues=datasets['test'][0:10]['dialogue']
human_baseline_summaries=datasets['test'][0:10]['summary']

original_model_summaries=[]
instruct_model_summaries=[]
peft_model_summaries=[]

for idx,dialogue in enumerate(dialogues):
    prompt=f"""
Summarize the following dialogue.

{dialogue}

Summary: """
    input_ids=tokenizer(prompt,return_tensors='pt').input_ids
    human_baseline_text_output=human_baseline_summaries[idx]
    
    original_model_outputs = original_model.generate(input_ids=input_ids,generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output=tokenizer.decode(original_model_outputs[0],skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)
    
    instruct_model_outputs = instruct_model.generate(input_ids=input_ids,generation_config=GenerationConfig(max_new_tokens=200))
    instruct_model_text_output=tokenizer.decode(instruct_model_outputs[0],skip_special_tokens=True)
    instruct_model_summaries.append(instruct_model_text_output)
    
    peft_model_outputs = peft_model.generate(input_ids=input_ids,generation_config=GenerationConfig(max_new_tokens=200))
    peft_model_text_output=tokenizer.decode(peft_model_outputs[0],skip_special_tokens=True)
    peft_model_summaries.append(peft_model_text_output)
    
zipped_summaries=list(zip(human_baseline_summaries,original_model_summaries,instruct_model_summaries,peft_model_summaries)) 
df=pd.Dataframe(zipped_summaries,columns=['human_baseline_summaries','original_model_summaries','instruct_model_summaries','peft_model_summaries'])
df
    

In [None]:
# Calcul de l'évaluation ROUGE
original_model_results=rouge.compute(predictions=original_model_summaries,
                                    reference=human_baseline_summaries[0:len(original_model_summaries)],
                                    use_aggregator=True,
                                    use_stemmer=True)
instruct_model_results=rouge.compute(predictions=instruct_model_summaries,
                                    reference=human_baseline_summaries[0:len(instruct_model_summaries)],
                                    use_aggregator=True,
                                    use_stemmer=True)
                                     
peft_model_results=rouge.compute(predictions=peft_model_summaries,
                                    reference=human_baseline_summaries[0:len(peft_model_summaries)],
                                    use_aggregator=True,
                                    use_stemmer=True)

print(f'ORIGINAL MODEL:\n{original_model_results})
print(f'INSTRUCT MODEL:\n{instruct_model_results})
print(f'PEFT MODEL:\n{peft_model_results})

In [None]:
# Amélioration absolue en points de % par rapport à ORIGINAL MODEL
print("Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL")
improvement=(np.array(list(peft_model_results.values()))-np.array(list(original_model_results.values())))
for key,value in zip(peft_model_results.keys(),improvement):
    print(f'{key}: {value*100:.2f}%')   

In [None]:
# Amélioration absolue en points de % par rapport à INSTRUCT MODEL
print("Absolute percentage improvement of PEFT MODEL over INSTRUCT MODEL")
improvement=(np.array(list(peft_model_results.values()))-np.array(list(instruct_model_results.values())))
for key,value in zip(peft_model_results.keys(),improvement):
    print(f'{key}: {value*100:.2f}%')   