In [1]:
%%time
!pip install --upgrade --quiet transformers datasets accelerate evaluate bitsandbytes loralib peft wandb
!pip install --quiet rouge-score tensorboard py7zr

CPU times: user 1.33 s, sys: 271 ms, total: 1.6 s
Wall time: 1min 15s


In [2]:
import pandas as pd
import numpy as np 
import torch
from tqdm import tqdm
from random import randrange
import plotly.express as px
import gc
import os
import wandb
from kaggle_secrets import UserSecretsClient

from datasets import load_dataset,concatenate_datasets
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM,DataCollatorForSeq2Seq,
                          BitsAndBytesConfig,Seq2SeqTrainingArguments,Seq2SeqTrainer)
from peft import LoraConfig, get_peft_model,prepare_model_for_kbit_training , TaskType
import evaluate

from accelerate import Accelerator
accelerator = Accelerator()
user_secrets = UserSecretsClient()


2024-07-25 15:23:21.226823: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-25 15:23:21.226968: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-25 15:23:21.373392: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Helper

In [3]:
class cVariables: 
    
    __shared_instance = None
    @staticmethod
    def get_instance():
        if cVariables.__shared_instance == None: cVariables()
        return __shared_instance
    def __init__(self):
        if cVariables.__shared_instance != None : raise Exception("This class is a singleton class !")
        else:  cVariables.__shared_instance = self
        #----------------------
        self.ATTEMPT_NO = 0 # i reduce sample of data to able to train model because size in gpu
        # this parameter  is responsible for dividing data into and out
        # get_SizeSampleTrain and get_SizeSampleEval to return start and end of sample of data
        #----------------------

    def get_SizeSampleTrain(self):
        assert self.ATTEMPT_NO < 7 , "ATTEMPT_NO should be less than 7"
        TRAIN_SIZE=14732//6
        TRAIN_LIST = [i*TRAIN_SIZE for i in range(0,8)]
        return  TRAIN_LIST[self.ATTEMPT_NO] , TRAIN_LIST[self.ATTEMPT_NO+1]
    def get_SizeSampleEval(self):
        assert self.ATTEMPT_NO < 7 , "ATTEMPT_NO should be less than 7"
        if self.ATTEMPT_NO == 6 :
            return TRAIN_LIST[-1] , 14732
        EVAL_SIZE=818//6
        EVAL_LIST = [i*EVAL_SIZE for i in range(0,7)]
        return EVAL_LIST[self.ATTEMPT_NO] , EVAL_LIST[self.ATTEMPT_NO+1]

    Paths={
    'data' : 'samsum',
    'model': 'google/flan-t5-large',       
    'new_checkpoint': f'FlanT5Summarization-samsum',
    'wandb_proj': 'Summarization by Finetuning FlanT5-LoRA',
    'wandb_run':f'flant5Summarization',
    }
    Hayperparameters={
     'max_source_length':512,
     'max_target_length':128,
     'batch_size_train':128,
     'batch_size_eval':64,
     'epochs':3,
     'lr':3e-5,
     'l2':0.01,
    }
    Tokens={'huggingface' :user_secrets.get_secret("huggingface"),
            'wandb': user_secrets.get_secret("wandb")}
var = cVariables()

In [4]:
def clear_gpu():
    print(gc.collect()) 
    torch.cuda.empty_cache()
    print(gc.collect())

In [5]:
rouge = evaluate.load("rouge")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # Ensure the predictions and labels are in the correct format
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    pred_ids = np.argmax(pred_ids, axis=-1) if pred_ids.ndim == 3 else pred_ids

    # Convert tensors to lists
    pred_ids = pred_ids.tolist()
    labels_ids = labels_ids.tolist()

    # Decode generated summaries and labels (converting token IDs back to text)
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids = [[token for token in label if token != -100] for label in labels_ids]
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # Compute ROUGE scores
    rouge_output = rouge.compute(predictions=pred_str, references=label_str)

    return {
        "rouge1": rouge_output["rouge1"],
        "rouge2": rouge_output["rouge2"],
        "rougeL": rouge_output["rougeL"],
        "rougeLsum": rouge_output["rougeLsum"],
    }


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [6]:
tokenizer = AutoTokenizer.from_pretrained(var.Paths['model'],token=var.Tokens['huggingface'])

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [7]:
def process_dataset(data):
    inputs = ["summarize: " + item for item in data["dialogue"]]

    model_inputs = tokenizer(inputs,add_special_tokens=True,
                max_length=var.Hayperparameters['max_source_length'],
                padding='max_length',
                truncation=True,
                return_tensors='pt')
    model_target = tokenizer(inputs,add_special_tokens=True,
                max_length=var.Hayperparameters['max_target_length'],
                padding='max_length',
                truncation=True,
                return_tensors='pt')
    model_target["input_ids"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in model_target] for model_target in model_target["input_ids"]]
    
    model_inputs["labels"] = model_target["input_ids"]
    return model_inputs

In [8]:
os.environ["WANDB_API_KEY"] = var.Tokens['wandb']
os.environ["WANDB_DEBUG"] = "true"
os.environ["WANDB_PROJECT"]=var.Paths['wandb_proj']
os.environ["WANDB_NAME"] = var.Paths['new_checkpoint']

wandb.init()

[34m[1mwandb[0m: Currently logged in as: [33mziayd-usf[0m ([33mdaigt_team[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.17.5
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240725_152338-bzfwtjcj[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mFlanT5Summarization-samsum[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/daigt_team/Summarization%20by%20Finetuning%20FlanT5-LoRA[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/daigt_team/Summarization%20by%20Finetuning%20FlanT5-LoRA/runs/bzfwtjcj[0m


<hr>

# Load Data

In [9]:
dataset = load_dataset(var.Paths['data'],trust_remote_code=True)
dataset 

Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [10]:
dataset['train'][:3]

{'id': ['13818513', '13728867', '13681000'],
 'dialogue': ["Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
  'Olivia: Who are you voting for in this election? \r\nOliver: Liberals as always.\r\nOlivia: Me too!!\r\nOliver: Great',
  "Tim: Hi, what's up?\r\nKim: Bad mood tbh, I was going to do lots of stuff but ended up procrastinating\r\nTim: What did you plan on doing?\r\nKim: Oh you know, uni stuff and unfucking my room\r\nKim: Maybe tomorrow I'll move my ass and do everything\r\nKim: We were going to defrost a fridge so instead of shopping I'll eat some defrosted veggies\r\nTim: For doing stuff I recommend Pomodoro technique where u use breaks for doing chores\r\nTim: It really helps\r\nKim: thanks, maybe I'll do that\r\nTim: I also like using post-its in kaban style"],
 'summary': ['Amanda baked cookies and will bring Jerry some tomorrow.',
  'Olivia and Olivier are voting for liberals in this election. ',
  'Kim may try the pomo

In [11]:
df_len = pd.DataFrame({
    'source': [len(x) for x in concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["dialogue"], truncation=True), batched=True, remove_columns=["dialogue", "summary"])["input_ids"]],
    'target': [len(x) for x in concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["summary"], truncation=True), batched=True, remove_columns=["dialogue", "summary"])["input_ids"]]
})
fig = px.box(df_len, x="source",title='Source assays')
fig.show()

Map:   0%|          | 0/15551 [00:00<?, ? examples/s]

Map:   0%|          | 0/15551 [00:00<?, ? examples/s]

In [12]:
fig = px.box(df_len, x="target",title='Summary essays')
fig.show()

In [13]:
tokenized_dataset = dataset.map(process_dataset, batched=True, remove_columns=["dialogue", "summary", "id"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")


Cannot locate reference to <class '__main__.cVariables'>.


Cannot pickle <class '__main__.cVariables'>: __main__.cVariables has recursive self-references that trigger a RecursionError.



Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [14]:
# Slice the mapped datasets to get the smaller samples
start_train,end_train = var.get_SizeSampleTrain()
start_eval,end_eval = var.get_SizeSampleEval()

train_dataset = tokenized_dataset['train'].select(range(start_train,end_train))
validation_dataset = tokenized_dataset['validation'].select(range(start_eval,end_eval))

In [15]:
start_eval,end_eval ,start_train,end_train ,len(train_dataset),len(validation_dataset)

(0, 136, 0, 2455, 2455, 136)

<hr>

# Model

In [16]:
clear_gpu()

132
0


In [17]:
quantization_config = BitsAndBytesConfig(load_in_4bit=True,)

model=AutoModelForSeq2SeqLM.from_pretrained(var.Paths['model'],
                                            quantization_config=quantization_config,
                                            device_map="auto",
                                            token=var.Tokens['huggingface'])
clear_gpu()

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

30
0


In [18]:
# Define LoRA Config
lora_config = LoraConfig(
 r=16,
 lora_alpha=32,
 target_modules=["q", "v"],

 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.SEQ_2_SEQ_LM
)

# prepare int-8 model for training
model = prepare_model_for_kbit_training(model)

In [19]:
# # add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 4,718,592 || all params: 787,868,672 || trainable%: 0.5989


In [20]:
# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [21]:
training_args = Seq2SeqTrainingArguments(
    output_dir=var.Paths['new_checkpoint'],
    num_train_epochs=var.Hayperparameters['epochs'],
    
    evaluation_strategy = 'steps',
    save_strategy = 'steps',
    load_best_model_at_end = True,
    logging_steps =5,
    eval_steps = 5,
    save_total_limit =2,
    predict_with_generate=True , # For generating summaries during evaluation

    
    lr_scheduler_type = "cosine",
    learning_rate = var.Hayperparameters['lr'],
    optim="adamw_torch",
    
    auto_find_batch_size=True,
    per_device_train_batch_size = var.Hayperparameters['batch_size_train'],
    per_device_eval_batch_size = var.Hayperparameters['batch_size_eval'],
    weight_decay = var.Hayperparameters['l2'],
    warmup_ratio=0.1,
    gradient_accumulation_steps=4,
    
    push_to_hub=True,
    hub_private_repo=True,
    hub_token=var.Tokens['huggingface'],
    run_name=var.Paths['new_checkpoint'],

    report_to=['wandb'],
)
clear_gpu()


`evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead



0
0


In [22]:
clear_gpu()
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
#     train_dataset=train_dataset,
#     eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
)

model.config.use_cache = False
clear_gpu()

0
0
60
0


In [23]:
trainer.train()
clear_gpu()


torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.



Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
5,1.1072,0.916521,0.270539,0.113489,0.222552,0.222935
10,1.1039,0.908004,0.27093,0.113845,0.223021,0.223401
15,1.0848,0.891714,0.270585,0.1137,0.222846,0.223123
20,1.0706,0.86542,0.270907,0.114247,0.223158,0.223424
25,1.0461,0.833621,0.270611,0.113966,0.22283,0.223215
30,1.0187,0.796038,0.271802,0.114463,0.224032,0.224251
35,0.9774,0.753186,0.272318,0.115228,0.225014,0.225326
40,0.9326,0.706437,0.272633,0.115292,0.225301,0.225668
45,0.8834,0.657038,0.272793,0.116003,0.22588,0.22613
50,0.833,0.608029,0.273373,0.116084,0.226158,0.226338



Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.



36
0


In [24]:
trainer.save_model(output_dir="t5T1")


In [25]:
wandb.finish()

[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:               eval/loss ███▇▇▇▆▆▅▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:             eval/rouge1 ▁▁▁▁▁▂▂▃▃▃▃▃▄▄▄▄▃▄▄▄▅▆▇▇▇▇█▇██████
[34m[1mwandb[0m:             eval/rouge2 ▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▅▆▆▆▆▇▇██████
[34m[1mwandb[0m:             eval/rougeL ▁▁▁▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▆▇▇▇▇█▇██████
[34m[1mwandb[0m:          eval/rougeLsum ▁▁▁▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▆▇▇▇▇█▇██████
[34m[1mwandb[0m:            eval/runtime █▅▆▅▅▆▂▁▆▇▇▆▆▆▅▅▆▅▄▄▄▂▅▅▆▆▅▄▆▃▃▅▃▅
[34m[1mwandb[0m: eval/samples_per_second ▁▄▃▄▄▃▇█▃▂▂▃▃▃▄▄▃▄▅▄▅▇▄▄▃▃▄▅▃▆▆▄▆▄
[34m[1mwandb[0m:   eval/steps_per_second ▁▅▅▅▅▅▅█▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅
[34m[1mwandb[0m:             train/epoch ▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
[34m[1mwandb[0m:       train/global_step ▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
[34m[1mwandb[0m:         train/grad_norm 

<hr>