<a href="https://colab.research.google.com/github/suryaa2910/prefix-language-model-NLP/blob/main/prefix%20language%20model%20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Zocket_Suryaa_Task 2:**

**Problem Statement:**
Develop a prefix language model using Hugging Face and PyTorch. You can pick any dataset for a creative text generation task and you should report the perplexity metric. Hint: A subtle data preprocessing trick is required when setting the inputs and labels for implementing prefix LM.

**Model Selected:** t5-large

**Dataset Selected:** CNN/DailyMail (`cnn_dailymail`, version 3.0.0)

Installation of Required Packages

In [None]:
!pip install transformers==4.36.2
!pip install accelerate==0.25.0
!pip install datasets==2.15.0
!pip install peft==0.7.1
!pip install bitsandbytes==0.41.3
!pip install trl==0.7.7
!pip install tqdm==4.66.1
!pip install flash-attn==2.4.2

Collecting transformers==4.36.2
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m77.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.38.2
    Uninstalling transformers-4.38.2:
      Successfully uninstalled transformers-4.38.2
Successfully installed transformers-4.36.2
Collecting accelerate==0.25.0
  Downloading accelerate-0.25.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling coll

Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import os
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments,BartTokenizer, BartForConditionalGeneration

2024-03-25 04:19:26.266518: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-25 04:19:26.266616: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-25 04:19:26.460208: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Authentication and Configuration
> Hugging Face Hub and Weights & Biases (wandb) integration

In [None]:
# Both credentials come from the Kaggle secrets store, so no token is written in the notebook.
user_secrets = UserSecretsClient()

# Authenticate with the Hugging Face Hub before anything is downloaded.
login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN_2"))

# Weights & Biases credentials and run metadata, plus MODEL_NAME, which the
# `accelerate estimate-memory` shell command later in the notebook reads.
os.environ.update(
    {
        "WANDB_API_KEY": user_secrets.get_secret("WANDB_API_KEY_2"),
        "WANDB_PROJECT": "Prefix language modelling",
        "WANDB_NOTES": "Prefix language modelling using LORA",
        "WANDB_NAME": "Prefix tuning",
        "MODEL_NAME": "t5-large",
    }
)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Accelerate Memory Estimation
> This command estimates the memory requirements for the specified model using the Accelerate library.

In [None]:
# Ask Accelerate to estimate the memory needed for the model named in the MODEL_NAME
# environment variable, resolving the model through the transformers library.
!accelerate estimate-memory ${MODEL_NAME} --library_name transformers

Loading pretrained config for `t5-large` from `transformers`...
┌────────────────────────────────────────────────────┐
│        Memory Usage for loading `t5-large`         │
├───────┬─────────────┬──────────┬───────────────────┤
│ dtype │Largest Layer│Total Size│Training using Adam│
├───────┼─────────────┼──────────┼───────────────────┤
│float32│   125.5 MB  │ 2.75 GB  │      10.99 GB     │
│float16│   62.75 MB  │ 1.37 GB  │       5.5 GB      │
│  int8 │   31.38 MB  │ 703.5 MB │      2.75 GB      │
│  int4 │   15.69 MB  │351.75 MB │      1.37 GB      │
└───────┴─────────────┴──────────┴───────────────────┘


Model Quantization Configuration
> Configures the model quantization settings: whether to load in 4-bit, the quantization type, and the data types.

In [None]:
from transformers import BitsAndBytesConfig
from accelerate import Accelerator
import torch

# Master switch for 4-bit loading. When it is off, all three settings below are None.
load_in_4bit = True

# NOTE(review): none of these three values is passed to a `from_pretrained` call in the
# cells shown in this notebook — confirm whether quantized loading was intended.
quantization_config = (
    BitsAndBytesConfig(
        load_in_4bit=load_in_4bit,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,  # fp16 compute dtype
    )
    if load_in_4bit
    else None
)
# "auto" leaves device placement to the loading library.
device_map = "auto" if load_in_4bit else None
# fp16 for the non-quantized tensors.
torch_dtype = torch.float16 if load_in_4bit else None

Loading Dataset

In [None]:
from datasets import load_dataset
# Fetch only the training split of the CNN/DailyMail dataset, configuration "3.0.0".
dataset = load_dataset(
    path="cnn_dailymail",
    name="3.0.0",
    split="train",
)

In [None]:
# Display the loaded split's columns and row count.
dataset

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 287113
})

In [None]:
# Shuffle reproducibly (seed 42), then keep the first 85,000 rows. The indices are passed
# as a `range` rather than being materialised as an 85,000-element list first.
dataset = dataset.shuffle(seed=42).select(range(85000))

In [None]:
# Hold out 10% of the sampled rows as the "test" split; the seed fixes the partition.
dataset = dataset.train_test_split(
    test_size=0.1,
    seed=42,
)

In [None]:
# Display the train/test splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 76500
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 8500
    })
})

In [None]:
# Process-level switches.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        # NOTE(review): torch is already imported in an earlier cell; confirm that this
        # restriction is still honoured when it is set this late.
        "CUDA_VISIBLE_DEVICES": "2",
    }
)

# Target device and checkpoint identifiers (model and tokenizer share one checkpoint).
device = "cuda"
model_name_or_path = tokenizer_name_or_path = "t5-large"

# Dataset columns: source text and reference summary.
text_column, label_column = "article", "highlights"

# Tokenization length and training hyperparameters.
max_length = 256
lr = 1e-5
num_epochs = 1
batch_size = 8

Tokenization and Preprocessing

In [None]:
# Load the tokenizer from the dedicated `tokenizer_name_or_path` setting so that the
# tokenizer checkpoint can be changed in the configuration cell without touching this line.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)


def preprocess_function(examples):
    """Tokenize a batch of source texts and their target summaries.

    Reads the module-level ``tokenizer``, ``text_column``, ``label_column`` and
    ``max_length``.

    Args:
        examples: A batch indexable by column name; the ``text_column`` and
            ``label_column`` entries are handed to the tokenizer.

    Returns:
        The tokenizer output for the source texts with an added ``"labels"`` entry
        holding the target token ids, where padding positions are replaced by -100.
    """
    # Sources and targets are encoded with identical settings: pad/truncate to max_length.
    encode_kwargs = dict(
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    model_inputs = tokenizer(examples[text_column], **encode_kwargs)
    target_ids = tokenizer(examples[label_column], **encode_kwargs)["input_ids"]

    # Overwrite pad tokens with -100, the value the loss is expected to skip (TODO confirm).
    target_ids[target_ids == tokenizer.pad_token_id] = -100

    model_inputs["labels"] = target_ids
    return model_inputs

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
# Tokenize every split in batches with a single process. The original columns are
# dropped, and the cache is bypassed so the tokenization is always recomputed.
processed_datasets = dataset.map(
    function=preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    num_proc=1,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/76500 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/8500 [00:00<?, ? examples/s]

In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import default_data_collator

In [None]:
# Tokenized splits used for training and evaluation.
eval_dataset = processed_datasets["test"]
train_dataset = processed_datasets["train"]

# Evaluation batches keep dataset order; training batches are shuffled.
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=batch_size,
    collate_fn=default_data_collator,
    pin_memory=True,
)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=default_data_collator,
    pin_memory=True,
)

Prefix Language Model Configuration
> Since this is a text summarization task, we use **AutoModelForSeq2SeqLM**.

In [None]:
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, PrefixTuningConfig, TaskType
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, default_data_collator, get_linear_schedule_with_warmup

In [None]:
# Prefix-tuning configuration for a seq2seq model: 20 virtual tokens, training mode.
peft_config = PrefixTuningConfig(
    num_virtual_tokens=20,
    inference_mode=False,
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# Load the pretrained checkpoint and wrap it with the PEFT adapter in one expression.
model = get_peft_model(
    AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path),
    peft_config,
)

# Report the trainable parameter count against the total.
model.print_trainable_parameters()

trainable params: 983,040 || all params: 738,651,136 || trainable%: 0.13308583065659835


In [None]:
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
# AdamW over the model's parameters, with a linear schedule and no warm-up, sized as one
# scheduler step per training batch per epoch.
# NOTE(review): the Seq2SeqTrainer call in this notebook is not given these objects —
# confirm whether they are meant to be used.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_epochs * len(train_dataloader),
)

Trainer Initialization and Training

In [None]:
# Training configuration.
#
# Logging, evaluation and checkpointing are tied to epoch boundaries rather than to
# `len(train_dataloader)` steps. That count is a single-device batch count; when the
# Trainer runs on several devices each optimizer step consumes more samples, so a
# threshold of `len(train_dataloader)` steps may never be reached and nothing would be
# logged, evaluated or saved during training. Evaluation and save strategies are kept
# identical, which `load_best_model_at_end=True` relies on.
#
# `num_train_epochs` follows the notebook-level `num_epochs` setting instead of a
# separate hard-coded literal.
training_args = Seq2SeqTrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    learning_rate=lr,
    logging_dir="./logs",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=5,
    load_best_model_at_end=True,
    remove_unused_columns=False,
    push_to_hub=False,
)

# Build the trainer from the PEFT-wrapped model, the training arguments and both
# tokenized splits.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=default_data_collator,
)

# Run training.
trainer.train()

# Evaluate and show the resulting metrics.
results = trainer.evaluate()
print(results)

[34m[1mwandb[0m: Currently logged in as: [33msuryaa2910[0m ([33msuryaa2910_[0m). Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss


{'eval_loss': 2.9073822498321533, 'eval_runtime': 734.357, 'eval_samples_per_second': 11.575, 'eval_steps_per_second': 0.724, 'epoch': 1.0}


Perplexity Calculation

In [None]:
import numpy as np
def perplexity(eval_output):
    """Convert an evaluation loss into a perplexity.

    Args:
        eval_output: The loss value to exponentiate (presumably a mean
            cross-entropy in nats — confirm against the trainer's loss).

    Returns:
        ``np.exp(eval_output)``.
    """
    ppl = np.exp(eval_output)
    return ppl

In [None]:
# Perplexity derived from the `eval_loss` entry of the trainer's evaluation results.
perplexity(results['eval_loss'])

18.30880789568708