In [3]:
# !pip install datasets
# !pip install transformers
# !pip install evaluate
# !pip install accelerate -U
# !pip install transformers[torch]
# !pip install peft

import torch

device ='cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [4]:
#!python -c "from datasets import load_dataset; print(load_dataset('squad', split='train')[0])"

In [5]:
# !pip install ipywidgets
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer,GenerationConfig
import torch
import evaluate

import pandas as pd
import numpy as np

In [6]:
huggingface_dataset_name = "EdinburghNLP/xsum" #"knkarthick/dialogsum"

# «knkarthick/dialogsum» — это датасет для summarization диалога от Hugging Face. 
# Он состоит из 13 460 диалогов с соответствующими вручную размеченными резюме и темами. 

# Поля данных:

#     dialogue: текст диалога; 
#     summary: написанное человеком резюме диалога; 
#     topic: написанная человеком тема или однострочник диалога; 
#     id: уникальный идентификатор файла примера. 

# Датасет полезен для обучения и оценки моделей специально для задач summarization диалога. 

# 

In [7]:
dataset =load_dataset(huggingface_dataset_name)
# print(load_dataset('knkarthick/dialogsum', split='train')[0])
print(load_dataset('EdinburghNLP/xsum', split='train')[0])



In [8]:
model_name = "google/flan-t5-small" #"google-t5/t5-small" #"google/switch-base-8"
#"google/flan-t5-xl" #"google/flan-t5-large" #"google/flan-t5-base"
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


# FLAN-T5 base — это языковая модель, разработанная Google. Она настроена более чем на 1000 
# дополнительных задач, охватывающих несколько языков. 

# Модель достигает высокой точности в некоторых задачах и особенно эффективна в таких областях, 
# как рассуждение и ответ на вопросы. Она обучена на огромном датасете текстов из интернета, 
# книг и другого материала. 

# Некоторые особенности FLAN-T5 base:

#     Быстрая обработка. Модель может быстро и эффективно решать широкий спектр задач. 
#     Высокая точность. Модель показывает высокую точность в различных задачах, например, 75,2% по 
#     пятишаговому MMLU.
#     Эффективность. Модель можно запускать на различных устройствах, включая CPU и GPU 
#     (с разными точностями, например FP16 и INT8). 

# Однако у модели есть и ограничения: она не тестировалась в реальных приложениях и может 
# генерировать неподходящий контент или повторять предубеждения исходных данных.

In [9]:
input_text = "translate Dutch to English: Hoe gaat het?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = base_model.generate(input_ids)
print(tokenizer.decode(outputs[0]))

<pad> What do you do?</s>




In [10]:
# input_text = "2 minus 6 equals?"
input_text = """Answer the following question by reasoning step by step. 
                The cafeteria had 23 apples. If they used 20 for lunch, and bought 6 more, 
                how many apple do they have?"""

input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = base_model.generate(input_ids)
print(tokenizer.decode(outputs[0]))

<pad> The cafeteria had 23 - 20 = 20 apples. The cafeteria had 20


In [11]:
input_text = "Who are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = base_model.generate(input_ids)
print(tokenizer.decode(outputs[0]))

<pad> a sexy sexy sexy sexy 


In [12]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(base_model))

trainable model parameters: 76961152
all model parameters: 76961152
percentage of trainable model parameters: 100.00%


In [13]:
i= 13 #42
# dialogue = dataset['test'][i]['dialogue']
dialogue = dataset['test'][i]['document']
summary = dataset['test'][i]['summary']


prompt = f"Summarize the following dialogue  {dialogue}  Summary:"


input_ids = tokenizer(prompt, return_tensors="pt").input_ids
output = tokenizer.decode(base_model.generate(input_ids, max_new_tokens=200)[0],skip_special_tokens=True)

In [14]:
print(f"Input Prompt : {prompt}")
print("--------------------------------------------------------------------")
print("Human evaluated summary ---->")
print(summary)
print("---------------------------------------------------------------------")
print("Baseline model generated summary : ---->")
print(output)

Input Prompt : Summarize the following dialogue  The National League sold the Republic of Ireland midfielder to the Cherries for £175,000 in 2012 and had a 15% sell-on clause included in the deal.
O'Kane moved for an undisclosed fee, but Nicholson says any money will go to help the cash-strapped club.
"I don't think I'll be getting anything," Nicholson told BBC Devon.
"There's more important things."
The Gulls are still looking for new owners having been taken over by a consortium of local business people last summer.
They were forced to close down the club's academy and drastically reduce the playing budget after millionaire former owner Thea Bristow left the club.  Summary:
--------------------------------------------------------------------
Human evaluated summary ---->
Torquay United boss Kevin Nicholson says none of the money from Eunan O'Kane's move to Leeds from Bournemouth will go to the playing squad.
---------------------------------------------------------------------
Baseli

In [15]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    # prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["document"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids

    return example

# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary',])
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'document', 'summary',])

Map:   0%|          | 0/204045 [00:00<?, ? examples/s]

Map:   0%|          | 0/11332 [00:00<?, ? examples/s]

Map:   0%|          | 0/11334 [00:00<?, ? examples/s]

In [16]:
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

Filter:   0%|          | 0/204045 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11332 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11334 [00:00<?, ? examples/s]

In [17]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (2041, 2)
Validation: (114, 2)
Test: (114, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 2041
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 114
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 114
    })
})


In [18]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(r=32,lora_alpha = 32, target_modules=["q","v"],
                         lora_dropout = 0.5, bias ="none", task_type  =TaskType.SEQ_2_SEQ_LM)

In [19]:
peft_model_train = get_peft_model(base_model, lora_config)
print(print_number_of_trainable_model_parameters(peft_model_train))

trainable model parameters: 1376256
all model parameters: 78337408
percentage of trainable model parameters: 1.76%


In [20]:
output_dir = f"./peft-dialogue-summary-training"

peft_training_args = TrainingArguments(
     output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=5,

)


# peft_trainer = Trainer(
#     model=peft_model_train,
#     args=peft_training_args,
#     train_dataset=tokenized_datasets["train"],
# )


# Выполняем обучение
peft_trainer = Trainer(
	model = peft_model_train,
	args = peft_training_args,
	train_dataset = tokenized_datasets["train"],
	eval_dataset = tokenized_datasets["validation"],
    )

In [21]:
peft_trainer.train()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,3.8514
1000,1.9131


TrainOutput(global_step=1280, training_loss=2.6624920845031737, metrics={'train_runtime': 345.7813, 'train_samples_per_second': 29.513, 'train_steps_per_second': 3.702, 'total_flos': 1940157895802880.0, 'train_loss': 2.6624920845031737, 'epoch': 5.0})

In [22]:
peft_trainer.evaluate(tokenized_datasets['test'])

{'eval_loss': 1.7207143306732178,
 'eval_runtime': 1.6001,
 'eval_samples_per_second': 71.247,
 'eval_steps_per_second': 9.375,
 'epoch': 5.0}

In [23]:
peft_model_path="./peft-dialogue-summary-checkpoint-local"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

('./peft-dialogue-summary-checkpoint-local/tokenizer_config.json',
 './peft-dialogue-summary-checkpoint-local/special_tokens_map.json',
 './peft-dialogue-summary-checkpoint-local/tokenizer.json')

In [24]:
from peft import PeftModel, PeftConfig

peft_model_base = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

peft_model = PeftModel.from_pretrained(peft_model_base,
                                       './peft-dialogue-summary-checkpoint-local',
                                                                             is_trainable=False)

In [25]:
peft_model_outputs = peft_model.generate(input_ids=input_ids, 
                                         generation_config=GenerationConfig(
                                             max_new_tokens=660, 
                                             num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], 
                                          skip_special_tokens=True)

In [26]:
print(f"Input Prompt : {prompt}")
print("--------------------------------------------------------------------")
print("Human evaluated summary ---->")
print(summary)
print("---------------------------------------------------------------------")
print("Baseline model generated summary : ---->")
print(output)
print("---------------------------------------------------------------------")
print("Peft model generated summary : ---->")
print(peft_model_text_output)


Input Prompt : Summarize the following dialogue  The National League sold the Republic of Ireland midfielder to the Cherries for £175,000 in 2012 and had a 15% sell-on clause included in the deal.
O'Kane moved for an undisclosed fee, but Nicholson says any money will go to help the cash-strapped club.
"I don't think I'll be getting anything," Nicholson told BBC Devon.
"There's more important things."
The Gulls are still looking for new owners having been taken over by a consortium of local business people last summer.
They were forced to close down the club's academy and drastically reduce the playing budget after millionaire former owner Thea Bristow left the club.  Summary:
--------------------------------------------------------------------
Human evaluated summary ---->
Torquay United boss Kevin Nicholson says none of the money from Eunan O'Kane's move to Leeds from Bournemouth will go to the playing squad.
---------------------------------------------------------------------
Baseli

In [27]:
prompt = """Large Language Models (LLMs) have revolutionized the natural 
            language processing by excelling in tasks such as text generation, translation, 
            summarization and question answering. Despite their impressive capabilities, 
            these models may not always be suitable for specific tasks or domains due 
            to compatibility issues. To overcome this fine tuning is performed. Fine 
            tuning allows the users to customize pre-trained language models for 
            specialized tasks. This involves refining the model on a limited dataset 
            of task-specific information, enhancing its performance in that particular 
            task while retaining its overall language proficiency."""

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

peft_model_outputs = peft_model.generate(input_ids=input_ids, 
                                         generation_config=GenerationConfig(
                                             max_new_tokens=1000, 
                                             num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], 
                                          skip_special_tokens=True)

print("Peft model generated summary : ---->")
print(peft_model_text_output)

Peft model generated summary : ---->
Large language models are revolutionizing the natural language processing process.


In [28]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"Device used : {device}")
base_model.to(device)
input_ids = input_ids.to(device)
output = tokenizer.decode(base_model.generate(input_ids, max_new_tokens=200)[0],skip_special_tokens=True)
print("Baseline model generated summary : ---->")
print(output)

Device used : cuda:0
Baseline model generated summary : ---->
Large language models are revolutionizing the natural language processing process.
