# Fine tune with transformers

In [1]:
import os
from pathlib import Path
# When the kernel starts inside a directory named 'research', move up one level;
# the printed message reports which of the two cases applied.
if os.path.basename(os.getcwd()) != 'research':
    print("Root directory was active")
else:
    os.chdir('..')
    print("Root directory is active")

# Importing necessary libraries

Root directory is active


In [2]:
import os,sys

from src.textsummarization.utils import read_yaml
from src.textsummarization.logger import Logger
from src.textsummarization.exception import TSException

# Project logger instance; later cells report progress through logger.info(...).
logger = Logger()

In [3]:
# Read the project configuration file through the project's read_yaml helper.
config = read_yaml(
    'config.yml',
    return_configbox=True,
)

[ 2024-12-27 22:52:58,765 ] 18 TextSummarizer - INFO - config.yml has been loaded successfully.


In [4]:
# Keep a handle on the chat-summarization section of the configuration;
# the remaining cells read their settings from it.
chat_sum_config = config.chat_summarization

In [5]:
# Detect Google Colab by attempting the Colab-only import. Only a failed import
# means "running locally"; other errors are no longer mistaken for a local run.
try:
  from google.colab import drive
except ImportError:
  IN_COLAB = False
  logger.info("Running locally")
else:
  IN_COLAB = True
  # drive.mount() requires the mount point as its first argument; calling it
  # without one raises TypeError.
  # NOTE(review): assumes chat_sum_config.google_drive_folder holds the mount
  # point (e.g. '/content/drive') - confirm against config.yml.
  folder_to_mount = chat_sum_config.google_drive_folder
  try:
    drive.mount(folder_to_mount)
  except Exception as mount_error:
    # Keep the notebook running, but make the failure visible in the log.
    logger.info(f"Google Drive mount failed: {mount_error}")
  logger.info("Running in Google Colab")


[ 2024-12-27 22:52:58,789 ] 9 TextSummarizer - INFO - Running locally


In [6]:
import transformers
from transformers import pipeline
from datasets import load_dataset
import torch
import evaluate
import numpy as np
import nltk
from box import ConfigBox
# Fetch the NLTK 'punkt' package (a no-op when it is already up to date).
# NOTE(review): no later cell shown here calls nltk directly - confirm it is still needed.
nltk.download('punkt')


  from .autonotebook import tqdm as notebook_tqdm


[ 2024-12-27 22:53:18,288 ] 54 datasets - INFO - PyTorch version 2.5.1 available.


[nltk_data] Downloading package punkt to C:\Users\ict-
[nltk_data]     tyson\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Loading dataset error

Sometimes, when loading the dataset in a GPU environment, an error is raised saying that the *samsum* dataset cannot be found. The workaround is to load the dataset while in CPU mode and then save it locally or on your drive. After that, switch back to GPU and load the dataset from the local file using *load_from_disk()*.

In [7]:
# Load the dataset and the evaluation metric named in the config, and read the
# checkpoint identifier used below for both the tokenizer and the model.
data = load_dataset(chat_sum_config.dataset)
metric = evaluate.load(chat_sum_config.metric)
model_checkpoints = chat_sum_config.model_checkpoints

## Data tokenization

**max_input** and **max_target** can vary depending on the available computing power.

In [8]:
# Maximum token lengths for model inputs (dialogues) and targets (summaries),
# followed by the tokenizer that belongs to the chosen checkpoint.
max_input = chat_sum_config.max_input
max_target = chat_sum_config.max_target
tokenizer = transformers.AutoTokenizer.from_pretrained(model_checkpoints)

In [9]:
# Display the first training record to inspect its fields (id, dialogue, summary).
data['train'][0]

{'id': '13818513',
 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.'}

In [10]:
def preprocess_data(data_to_process):
  """Tokenize a batch of dialogues together with their reference summaries.

  Args:
    data_to_process: Batch mapping with 'dialogue' and 'summary' keys, each
      holding a sequence of strings (the form passed by `map(batched=True)`).

  Returns:
    The tokenizer output for the dialogues, padded/truncated to `max_input`,
    with an added 'labels' entry holding the summary token ids
    padded/truncated to `max_target`. Padding positions in the labels are
    set to -100 so that they are ignored by the loss.
  """
  # Tokenize the dialogue text.
  dialogues = list(data_to_process['dialogue'])
  model_inputs = tokenizer(dialogues, max_length=max_input, padding='max_length', truncation=True)

  # Tokenize the summaries as targets. `text_target=` replaces the deprecated
  # `tokenizer.as_target_tokenizer()` context manager.
  targets = tokenizer(
      text_target=list(data_to_process['summary']),
      max_length=max_target,
      padding='max_length',
      truncation=True,
  )

  # Labels are padded to a fixed length here, so the pad tokens must be masked
  # explicitly; otherwise they would contribute to the training loss.
  pad_id = tokenizer.pad_token_id
  model_inputs['labels'] = [
      [token_id if token_id != pad_id else -100 for token_id in label_ids]
      for label_ids in targets['input_ids']
  ]
  return model_inputs

In [11]:
# Apply preprocess_data batch-wise and drop the raw columns from the result.
tokenize_data = data.map(
    preprocess_data,
    batched=True,
    remove_columns=['id', 'dialogue', 'summary'],
)

In [12]:
# Draw fixed-size subsets from each split after shuffling with seed=42.
train_sample = (
    tokenize_data['train']
    .shuffle(seed=42)
    .select(range(5000))
)
validation_sample = (
    tokenize_data['validation']
    .shuffle(seed=42)
    .select(range(250))
)
test_sample = (
    tokenize_data['test']
    .shuffle(seed=42)
    .select(range(100))
)

In [13]:
# Replace each split with its sampled subset.
# NOTE: this overwrites the entries of tokenize_data in place; re-run the
# tokenization cell to get the original splits back.
tokenize_data['train'] = train_sample
tokenize_data['validation'] = validation_sample
tokenize_data['test'] = test_sample

## Training process

In [14]:
# Load the sequence-to-sequence model from the same checkpoint as the tokenizer.
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_checkpoints)

In [15]:
# Batch size from the config.
# NOTE(review): the training arguments further down hard-code the per-device
# batch sizes to 1 and do not read this value - confirm which one is intended.
batch_size = chat_sum_config.batch_size

In [16]:
# Data collator that assembles batches for the trainer; it is constructed with
# the tokenizer and the model loaded above.
collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)

In [17]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [18]:
# Training configuration. Learning rate and weight decay come from the config
# and are cast to float; the remaining values are fixed here.
# NOTE(review): per-device batch sizes are hard-coded to 1 and the `batch_size`
# value read from the config is not used - confirm this is intended.
args = transformers.Seq2SeqTrainingArguments(
    output_dir='artifacts/chat_summarization/conversation-summ',
    eval_strategy='epoch',
    learning_rate=float(chat_sum_config.learning_rate),
    weight_decay=float(chat_sum_config.weight_decay),
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    eval_accumulation_steps=1,
    num_train_epochs=3,
    save_total_limit=2,
    predict_with_generate=True,
    report_to='none',
    # fp16 is switched on for any non-CPU device, which here means CUDA.
    fp16=device.type != 'cpu',
)

In [19]:
# Wire the model, training arguments, sampled splits, collator and tokenizer
# into a Seq2SeqTrainer.
trainer = transformers.Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=collator,
    train_dataset=tokenize_data['train'],
    eval_dataset=tokenize_data['validation'],
    processing_class=tokenizer,
)

In [20]:
# Run fine-tuning with the trainer configured above (long-running cell).
trainer.train()

  0%|          | 25/7500 [07:35<37:34:25, 18.10s/it]

KeyboardInterrupt: 

In [33]:
# Persist the model held by the trainer and the tokenizer under
# artifacts/chat_summarization, then log that both were saved.
trainer.save_model('artifacts/chat_summarization/chat_summarization_pretrained_model')
tokenizer.save_pretrained("artifacts/chat_summarization/chat_summarization_tokenizer")
logger.info("Fine-tuning chat summarization model and tokenizer is saved in artifacts/chat_summarization directory")

## Testing the fine tuned model

In [34]:
# Sample two-person chat used as the input for the summarization check below.
conversation = """
Rann: Hey Harry, how have you been? Long time no see!
Harry: Hey! What a surprise!
Harry: Yes, you are right, we haven’t seen each other in a long time. How have you been?
Rann: There is an important campaign next week which is keeping me busy otherwise rest is going good in my life.
Rann: How about you?
Harry: Oh! I just finished a meeting with a very important client of mine and now I finally have some free time. I feel relieved that I’m done with it.
Rann: Good for you then. Hey! Let’s make a plan and catch up with each other after next week.
Rann: What do you say?
Harry: Sure, why not? Give me a call when you are done with your project.
Rann: Sure, then.
Rann: Bye, take care.
Harry: Bye buddy.
"""

In [35]:
# Tokenize the sample conversation with the same length settings as in training.
# NOTE(review): the result is not referenced by the later cells shown in this
# notebook; the pipeline below is given the raw text instead.
model_inputs = tokenizer(conversation,  max_length=max_input, padding='max_length', truncation=True)

In [52]:
# Build the summarization pipeline from the directory that trainer.save_model(...)
# wrote to, so the fine-tuned weights are found on a top-to-bottom run.
pipe = pipeline(
    "summarization",
    model="artifacts/chat_summarization/chat_summarization_pretrained_model",
    tokenizer=tokenizer,
)

Device set to use cuda:0


In [None]:
# Generation settings from the config, copied into a plain dict,
# e.g. {"length_penalty": 0.8, "num_beams": 8, "max_length": 512}
gen_kwargs = dict(chat_sum_config.predict_params)

In [59]:
# Summarize the sample conversation with the configured generation settings;
# the returned list is displayed as the cell output.
pipe(conversation, **gen_kwargs)

Your max_length is set to 512, but your input_length is only 212. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=106)


[{'summary_text': "Harry and Rann haven't seen each other for a long time. Rann is busy with an important campaign next week. Harry has just finished a meeting with a client and has some free time. Harry will call Rann after he finishes his project."}]