In [4]:
!pip install -q transformers datasets peft accelerate bitsandbytes evaluate rouge-score

In [2]:
# Assuming the data is uploaded to following PATH in kaggle
FILEPATH = "/kaggle/input/query-data/data.csv"

# Create dataset
from datasets import load_dataset
dataset = load_dataset("csv", data_files=FILEPATH, encoding="latin1")
dataset = dataset["train"].train_test_split(test_size=0.2)

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer

model_name = "t5-small"  # or distilT5
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def tokenize_function(examples):
    model_inputs = tokenizer(examples["original_query"], padding="max_length", truncation=True, max_length=512)
    
    # Tokenizing the optimized query for the decoder side
    labels = tokenizer(examples["optimized_query"], padding="max_length", truncation=True, max_length=512)
    
    # For decoder input, the decoder_input_ids are typically the same as labels
    model_inputs["decoder_input_ids"] = labels["input_ids"]  # Decoder input ids
    
    # Labels are used as the targets (to compute loss)
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs


tokenized_dataset = dataset.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

NameError: name 'dataset' is not defined

In [None]:
# LORA configuration
lora_config = LoraConfig(
    r=8,                   # Rank of the low-rank adapters
    lora_alpha=16,         # Scaling factor for low-rank adaptation
    target_modules=["q", "k", "v"],  # Common targets for transformers
    lora_dropout=0.05,     # Dropout for regularization
    bias="none",           # No bias in the low-rank adapters
    task_type="SEQ_2_SEQ_LM"
)

# Apply LORA to the model
model = get_peft_model(model, lora_config)

In [6]:
import numpy as np
import evaluate

# Load the metrics
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

def compute_metrics(p):
   
    # Extract logits and labels from the tuple
    predictions, labels = p.predictions, p.label_ids

    print(f"Predictions type: {type(predictions)}")
    print(f"Labels type: {type(labels)}")
    
    # Check if predictions are logits (3D shape: [batch_size, seq_length, vocab_size])
    if isinstance(predictions, tuple):
        predictions = predictions[0]  # If predictions is a tuple, take the logits (first element)
    
    if predictions.ndim == 3:
        # Apply argmax to get the token IDs (from logits to token IDs)
        predictions = np.argmax(predictions, axis=-1)  # Get the token IDs

    # Flatten the predictions and labels if they are nested lists
    if isinstance(predictions[0], list):
        predictions = [item for sublist in predictions for item in sublist]  # Flatten predictions
    if isinstance(labels[0], list):
        labels = [item for sublist in labels for item in sublist]  # Flatten labels

    # Decode predictions and labels using tokenizer
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute BLEU score
    bleu_score = bleu_metric.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])

    # Compute ROUGE score
    rouge_score = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        "bleu": bleu_score["bleu"],
        "rouge1": rouge_score["rouge1"],
        "rouge2": rouge_score["rouge2"],
        "rougeL": rouge_score["rougeL"],
    }


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [7]:
training_args = TrainingArguments(
    output_dir="./output",
    logging_dir="./logs",  # Optional: This will store logs for future use with TensorBoard
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    evaluation_strategy="no",
    save_steps=500,
    logging_steps=100,
    save_total_limit=2,
    report_to="none",  # Disable WandB
    logging_first_step=True,  # Log at the first step to ensure visibility
)



In [8]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    args=training_args,
    compute_metrics=compute_metrics
)

In [14]:
trainer.train()



Step,Training Loss
1,0.018
100,0.0198
200,0.0164
300,0.0145
400,0.0129
500,0.0113
600,0.011
700,0.0104
800,0.0098
900,0.0094




TrainOutput(global_step=1200, training_loss=0.011854723157982031, metrics={'train_runtime': 1674.4745, 'train_samples_per_second': 22.933, 'train_steps_per_second': 0.717, 'total_flos': 5249309029171200.0, 'train_loss': 0.011854723157982031, 'epoch': 3.0})

In [27]:
model.save_pretrained("./output_model")

In [2]:
!pip install peft --upgrade




In [None]:
import sys
sys.modules.clear()  # Clear all imports

In [2]:
import shutil
shutil.make_archive("/kaggle/working/output_model", 'zip', "/kaggle/working/output_model")

'/kaggle/working/output_model.zip'

In [None]:
import peft
print(peft.__version__)

from peft import get_peft_model, T5AdapterModel

# Check the contents of the peft module
print(dir(peft))

In [26]:
trainer.args.per_device_eval_batch_size = 16
trainer.args.device=device
metrics = trainer.evaluate()
print(metrics)

AttributeError: can't set attribute 'device'

In [15]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from peft import get_peft_model  # LoRA-specific function

# Load the model and tokenizer
model_name = "t5-small"  # Or the specific model name you fine-tuned
model = T5ForConditionalGeneration.from_pretrained('./output/checkpoint-1200')  # Load from the fine-tuned checkpoint
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Apply LoRA (Adapter) to the model
model = get_peft_model(model, peft_config=lora_config)  # This integrates the LoRA adapters into the base model


In [29]:
import torch
# Example input query
input_text = "Optimize the query: 'average earnings of college graduate'"

# Tokenize the input
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

# Perform inference with the model (access the base model's generate method)
with torch.no_grad():
    # Using model.base_model to access the original generate method
    outputs = model.generate(input_ids=input_ids, max_length=50, num_beams=5, early_stopping=True)

# Decode the generated output
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Output:", output_text)

Generated Output: Optimize the query
