In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_root = '/kaggle/input'
for folder, _subdirs, file_names in os.walk(input_root):
    file_paths = (os.path.join(folder, name) for name in file_names)
    for file_path in file_paths:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/zaruri-data/validation.json
/kaggle/input/zaruri-data/train.json


In [2]:
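# Uncomment if the environment does not already provide these packages
# (use %pip so the install targets the running kernel's environment):
# %pip install -q transformers datasets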


In [3]:
from datasets import load_dataset

# Map each split name to the JSON file holding its question/answer records.
data_files = dict(
    train="/kaggle/input/zaruri-data/train.json",
    validation="/kaggle/input/zaruri-data/validation.json",
)

# Load both files with the generic "json" builder; the result is keyed by split.
dataset = load_dataset("json", data_files=data_files)

# Expect two splits (train / validation), each with multiple rows.
print(dataset)


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 520
    })
    validation: Dataset({
        features: ['question', 'answer'],
        num_rows: 67
    })
})


In [4]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Checkpoint to start fine-tuning from ("google/flan-t5-small" is an alternative).
model_name = "t5-small"

# Tokenizer and model are loaded from the same checkpoint so their vocabularies match.
tokenizer, model = (
    T5Tokenizer.from_pretrained(model_name),
    T5ForConditionalGeneration.from_pretrained(model_name),
)


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [5]:
def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs for seq2seq training.

    Args:
        examples: A batch mapping with "question" and "answer" keys, each
            holding a list of strings (the shape `Dataset.map(batched=True)`
            passes in).

    Returns:
        The tokenizer output for the prefixed questions, with the tokenized
        answers stored under "labels".
    """
    # Every question gets the same task prefix that is used at inference time.
    inputs = ["question: " + q for q in examples["question"]]
    targets = list(examples["answer"])

    # `text_target` replaces the deprecated `as_target_tokenizer()` context
    # manager: the answers are tokenized in the same call and their ids are
    # returned under the "labels" key.
    model_inputs = tokenizer(inputs, text_target=targets, truncation=True)
    return model_inputs


In [6]:
# Tokenize the dataset in batches with the preprocessing function.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Pull out the two splits under the names used for training and evaluation.
train_dataset, eval_dataset = (
    tokenized_dataset[split_name] for split_name in ("train", "validation")
)

Map:   0%|          | 0/520 [00:00<?, ? examples/s]



Map:   0%|          | 0/67 [00:00<?, ? examples/s]

In [7]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles seq2seq batches; it is given the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [8]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # --- Output location and checkpoint retention ---
    output_dir="my_t5_chatbot",
    # Cap the number of checkpoints retained on disk.
    save_total_limit=1,

    # --- Evaluation / checkpoint cadence ---
    # Evaluate and save once per epoch so every checkpoint has a matching
    # validation score to compare against.
    eval_strategy="epoch",
    save_strategy="epoch",

    # --- Best-model selection ---
    # Reload the best checkpoint when training finishes. "Best" is judged by
    # validation loss (a custom metric from compute_metrics() could be used
    # instead), and for a loss a lower value is better.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # --- Optimisation hyper-parameters ---
    learning_rate=1e-4,
    weight_decay=0.01,
    num_train_epochs=25,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,

    # --- Logging ---
    logging_steps=50,
    report_to="none",  # or "tensorboard", etc.
)


In [9]:
from transformers import Trainer

# Wire the model, its training configuration, the batch collator and the two
# tokenized splits into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [10]:
import shutil

# Remove the output folder left by a previous run before training again.
# ignore_errors=True makes this a silent no-op when the folder does not exist,
# so the cell also works on a fresh session (Restart & Run All).
shutil.rmtree("/kaggle/working/my_t5_chatbot", ignore_errors=True)

In [11]:
# Run fine-tuning with the Trainer configured above in this notebook.
trainer.train()
# Write the trained model into the "my_t5_chatbot" folder.
trainer.save_model("my_t5_chatbot")
# Store the tokenizer files in the same folder. As the last expression of the
# cell, the returned tuple of written file paths is displayed as the output.
tokenizer.save_pretrained("my_t5_chatbot")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,3.6211,2.494319
2,3.0587,2.206805
3,2.8644,2.051885
4,2.4921,1.946608
5,2.4379,1.872376
6,2.2746,1.804032
7,2.1188,1.753674
8,2.1086,1.71979
9,1.9652,1.698892
10,1.9203,1.659948


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


('my_t5_chatbot/tokenizer_config.json',
 'my_t5_chatbot/special_tokens_map.json',
 'my_t5_chatbot/spiece.model',
 'my_t5_chatbot/added_tokens.json')

In [12]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Reload the fine-tuned tokenizer and model from the folder they were saved to.
model_path = "my_t5_chatbot"
tokenizer, model = (
    T5Tokenizer.from_pretrained(model_path),
    T5ForConditionalGeneration.from_pretrained(model_path),
)

def generate_answer(question, max_length=250, num_beams=10, do_sample=False):
    """
    Generate an answer to a question with the fine-tuned T5 model.

    Args:
    - question (str): User's question.
    - max_length (int): Max length for generated answers.
    - num_beams (int): Number of beams for beam search.
    - do_sample (bool): When True, sample (temperature / top-k / top-p) instead
      of using purely deterministic beam search. Defaults to False.

    Returns:
    - str: The chatbot's answer.
    """
    # Use the same "question: " prefix the model saw during fine-tuning.
    input_text = f"question: {question}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Decoding options that apply to beam search.
    generation_kwargs = {
        "max_length": max_length,
        "num_beams": num_beams,
        "early_stopping": True,
        "repetition_penalty": 2.0,     # Stronger penalty to avoid repetitive phrases
        "length_penalty": 1.2,         # Encourage slightly longer responses
        "no_repeat_ngram_size": 3,     # Prevent repeating n-grams
    }

    # temperature / top_k / top_p are sampling options: without do_sample=True
    # they are ignored by generate() and only produce warnings, so they are
    # passed only when sampling was explicitly requested.
    if do_sample:
        generation_kwargs.update(
            do_sample=True,
            temperature=0.9,  # Add randomness for more varied answers
            top_k=50,         # Consider top 50 tokens at each step
            top_p=0.9,        # Nucleus sampling
        )

    outputs = model.generate(input_ids, **generation_kwargs)

    # Decode and return the first generated sequence.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Demo: ask one question and show the model's reply.
user_question = "What is Retire SMart Plus?"
print("User Question:", user_question)
chatbot_answer = generate_answer(user_question)
print("Chatbot Answer:", chatbot_answer)


User Question: What is Retire SMart Plus?




Chatbot Answer: Retire SMart Plus is a non-linked, non-participating retirement plan designed to help you build your retirement corpus.


In [13]:
# Second demo question, printed in the same format as the first.
user_question = "What are the different types of insurance policies offered by SBI Life?"
print("User Question:", user_question)
chatbot_answer = generate_answer(user_question)
print("Chatbot Answer:", chatbot_answer)

User Question: What are the different types of insurance policies offered by SBI Life?
Chatbot Answer: SBI Life offers a wide range of insurance policies, starting from the 1st policy year, to the end of the policy term.


In [14]:
import shutil

# Bundle the saved model folder into a zip archive next to it; the archive
# path returned by make_archive is displayed as the cell's output.
shutil.make_archive(
    base_name='/kaggle/working/my_t5_chatbot',
    format='zip',
    root_dir='/kaggle/working/my_t5_chatbot',
)



'/kaggle/working/my_t5_chatbot.zip'