# This is the notebook that was used to fine-tune t5-small model with "combined_queries.csv" -dataset
The dataset included 60000 examples of "Bad queries" and "Optimized queries"

In [2]:
# Assuming the data is uploaded to following PATH in kaggle
FILEPATH = "/kaggle/input/query-data-11/combined_queries.csv"

# Create dataset
from datasets import load_dataset
dataset = load_dataset("csv", data_files=FILEPATH, encoding="latin1")
dataset = dataset["train"].train_test_split(test_size=0.2)

Generating train split: 0 examples [00:00, ? examples/s]

!pip install -q transformers datasets peft accelerate bitsandbytes evaluate rouge-score

In [16]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['Original Query', 'Optimized Query'],
        num_rows: 48000
    })
    test: Dataset({
        features: ['Original Query', 'Optimized Query'],
        num_rows: 12000
    })
})


In [18]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer

model_name = "t5-small"  # or distilT5
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def tokenize_function(examples):
    # Tokenize the original query (input to the model)
    model_inputs = tokenizer(examples["Original Query"], padding="max_length", truncation=True, max_length=512)
    
    # Tokenize the optimized query (target/output for the model)
    labels = tokenizer(examples["Optimized Query"], padding="max_length", truncation=True, max_length=512)
    
    # Shift labels for decoder input (decoder input is typically shifted by one token)
    decoder_input_ids = [tokenizer.pad_token_id] + labels["input_ids"][:-1]  # Add padding token at the start and shift
    
    # Make sure that the returned values are lists (not just single tokens or strings)
    model_inputs["decoder_input_ids"] = decoder_input_ids
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs


tokenized_dataset = dataset.map(tokenize_function)

Map:   0%|          | 0/48000 [00:00<?, ? examples/s]

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

In [26]:
# LORA configuration
"""lora_config = LoraConfig(
    r=8,                   # Rank of the low-rank adapters
    lora_alpha=16,         # Scaling factor for low-rank adaptation
    target_modules=["q", "k", "v"],  # Common targets for transformers
    lora_dropout=0.05,     # Dropout for regularization
    bias="none",           # No bias in the low-rank adapters
    task_type="SEQ_2_SEQ_LM"
)"""

lora_config = LoraConfig(
    r=8,                    # Rank of the low-rank adapters
    lora_alpha=16,          # Scaling factor for low-rank adaptation
    target_modules=[
        "q", "k", "v",                    # Self-attention matrices
        "EncDecAttention.q", "EncDecAttention.k", "EncDecAttention.v",  # Cross-attention matrices in the decoder
        "wi", "wo"                        # Feedforward layers' weight matrices (optional)
    ],
    lora_dropout=0.05,      # Dropout for regularization
    bias="none",            # No bias in the low-rank adapters
    task_type="SEQ_2_SEQ_LM"
)

# Apply LORA to the model
model = get_peft_model(model, lora_config)

In [27]:
import numpy as np
import evaluate

# Load the metrics
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

def compute_metrics(p):
   
    # Extract logits and labels from the tuple
    predictions, labels = p.predictions, p.label_ids

    print(f"Predictions type: {type(predictions)}")
    print(f"Labels type: {type(labels)}")
    
    # Check if predictions are logits (3D shape: [batch_size, seq_length, vocab_size])
    if isinstance(predictions, tuple):
        predictions = predictions[0]  # If predictions is a tuple, take the logits (first element)
    
    if predictions.ndim == 3:
        # Apply argmax to get the token IDs (from logits to token IDs)
        predictions = np.argmax(predictions, axis=-1)  # Get the token IDs

    # Flatten the predictions and labels if they are nested lists
    if isinstance(predictions[0], list):
        predictions = [item for sublist in predictions for item in sublist]  # Flatten predictions
    if isinstance(labels[0], list):
        labels = [item for sublist in labels for item in sublist]  # Flatten labels

    # Decode predictions and labels using tokenizer
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute BLEU score
    bleu_score = bleu_metric.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])

    # Compute ROUGE score
    rouge_score = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        "bleu": bleu_score["bleu"],
        "rouge1": rouge_score["rouge1"],
        "rouge2": rouge_score["rouge2"],
        "rougeL": rouge_score["rougeL"],
    }


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [28]:
import torch

if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)

model.to("cuda")  # Move the model to GPU

DataParallel(
  (module): PeftModelForSeq2SeqLM(
    (base_model): LoraModel(
      (model): T5ForConditionalGeneration(
        (shared): Embedding(32128, 512)
        (encoder): T5Stack(
          (embed_tokens): Embedding(32128, 512)
          (block): ModuleList(
            (0): T5Block(
              (layer): ModuleList(
                (0): T5LayerSelfAttention(
                  (SelfAttention): T5Attention(
                    (q): lora.Linear(
                      (base_layer): Linear(in_features=512, out_features=512, bias=False)
                      (lora_dropout): ModuleDict(
                        (default): Dropout(p=0.05, inplace=False)
                      )
                      (lora_A): ModuleDict(
                        (default): Linear(in_features=512, out_features=8, bias=False)
                      )
                      (lora_B): ModuleDict(
                        (default): Linear(in_features=8, out_features=512, bias=False)
                      )
  

In [29]:
training_args = TrainingArguments(
    output_dir="./output",
    logging_dir="./logs",  # Optional: This will store logs for future use with TensorBoard
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=2e-4,
    fp16=True,
    evaluation_strategy="no",
    save_steps=500,
    logging_steps=100,
    save_total_limit=2,
    report_to="none",  # Disable WandB
    logging_first_step=True,  # Log at the first step to ensure visibility
    ddp_find_unused_parameters=False if torch.cuda.device_count() > 1 else None,
    remove_unused_columns=False
)



In [30]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    args=training_args,
    compute_metrics=compute_metrics
)

In [31]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
1,12.5989
100,4.5787
200,0.1275
300,0.0829
400,0.0625
500,0.0445
600,0.0317
700,0.0269
800,0.0235
900,0.0217




TrainOutput(global_step=7500, training_loss=0.08061803852717082, metrics={'train_runtime': 7057.1573, 'train_samples_per_second': 34.008, 'train_steps_per_second': 1.063, 'total_flos': 0.0, 'train_loss': 0.08061803852717082, 'epoch': 5.0})

In [79]:
trainer.model.module.save_pretrained("./output_model_3")

In [35]:
from peft import PeftModel
tokenizer = T5Tokenizer.from_pretrained(model_name)
model_name = "t5-small"  # or distilT5
base_model = T5ForConditionalGeneration.from_pretrained(model_name)  # Load base model
modell = PeftModel.from_pretrained(base_model,'./output_model_3', lora_config=lora_config)

In [33]:
model.module.save_pretrained("./output_model_3")

In [None]:
import sys
sys.modules.clear()  # Clear all imports

In [34]:
import shutil
shutil.make_archive("/kaggle/working/output_model_3", 'zip', "/kaggle/working/output_model_3")

'/kaggle/working/output_model_3.zip'

In [108]:
modell = modell.merge_and_unload()

In [117]:
from transformers import T5ForConditionalGeneration

# Load the T5 model
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)

print(model)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [47]:
import torch
# Example input query
input_text = "optimize how much does golden retriever weight in kilograms"

# Tokenize the input
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

# Perform inference with the model (access the base model's generate method)
with torch.no_grad():
    # Using model.base_model to access the original generate method
    outputs = modell.generate(input_ids=input_ids, max_length=128)

# Decode the generated output
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Output:", output_text)

Generated Output: golden retriever weight in kilograms


In [114]:
print(outputs)

tensor([[    0, 13436,   125,    19,    48,     1]])
