In [11]:
!pip install datasets transformers peft evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [12]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig

import evaluate
import torch
import numpy as np

dataset = load_dataset("glue", "sst2")

print(dataset)


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})


In [13]:
model_checkpoint = "roberta-base"


id_label = {0: "Negative", 1: "Positive"}
label_id = {"Negative": 0, "Positive": 1}


roberta_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id_label, label2id=label_id
)

print(roberta_model)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [14]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# Add padding token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Resize token embeddings to accommodate the new padding token
roberta_model.resize_token_embeddings(len(tokenizer))

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Embedding(50265, 768, padding_idx=1)

In [15]:
def tokenize_function(examples):
    # Extract text
    text = examples["sentence"]

    # Set truncation side
    tokenizer.truncation_side = "left"

    # Tokenize and truncate text
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [16]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [17]:
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [18]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)  # Get predicted labels
    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}


In [25]:
import torch

text_list = [
    "a feel-good picture in the best sense of the term.",
    "resourceful and ingenious entertainment.",
    "it 's just incredibly dull.",
    "the movie 's biggest offense is its complete and utter lack of tension.",
    "impresses you with its open-endedness and surprises.",
    "unless you are in dire need of a diesel fix, there is no real reason to see it."
]

print("Untrained model predictions:")
print("---------------------------")

for text in text_list:
    # Tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")

    # Compute logits
    logits = roberta_model(inputs).logits

    # Convert logits to label
    predictions = torch.argmax(logits).item()

    # Access label using predictions directly (removed .tolist())
    print(text +"-" +id_label[predictions])

Untrained model predictions:
---------------------------
a feel-good picture in the best sense of the term.-Negative
resourceful and ingenious entertainment.-Negative
it 's just incredibly dull.-Negative
the movie 's biggest offense is its complete and utter lack of tension.-Negative
impresses you with its open-endedness and surprises.-Negative
unless you are in dire need of a diesel fix, there is no real reason to see it.-Negative


In [26]:
peft_config = LoraConfig(
    task_type="SEQ_CLS",  # Sequence classification task
    r=4,  # Rank of the LoRA decomposition
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.01,  # Dropout for LoRA layers
    target_modules=['query']  # Target module to apply LoRA
)

In [27]:
print(peft_config)


LoraConfig(task_type='SEQ_CLS', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=4, target_modules={'query'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)


In [28]:
model = get_peft_model(roberta_model, peft_config)

# Print the number of trainable parameters
model.print_trainable_parameters()


trainable params: 665,858 || all params: 125,313,028 || trainable%: 0.5314


In [39]:
# Define training hyperparameters
lr = 1e-3  # Learning rate
batch_size = 16  # Batch size
num_epochs = 3  # Number of training epochs

In [40]:
training_args = TrainingArguments(
    output_dir=model_checkpoint + "-lora-text-classification",  # Output directory
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # Evaluate after each epoch
    save_strategy="epoch",  # Save model after each epoch
    load_best_model_at_end=True  # Load best model at the end of training
)



In [41]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Create Trainer object
trainer = Trainer(
    model=roberta_model,  # Use the LoRA-adapted model
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2489,0.244475,{'accuracy': 0.9139908256880734}
2,0.2326,0.254118,{'accuracy': 0.9128440366972477}
3,0.1881,0.243105,{'accuracy': 0.930045871559633}


Trainer is attempting to log a value of "{'accuracy': 0.9139908256880734}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9128440366972477}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.930045871559633}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


TrainOutput(global_step=12630, training_loss=0.2323711423956961, metrics={'train_runtime': 934.5155, 'train_samples_per_second': 216.205, 'train_steps_per_second': 13.515, 'total_flos': 3746066265750552.0, 'train_loss': 0.2323711423956961, 'epoch': 3.0})

In [42]:
import torch

# Move model to CPU
model.to("cpu")

print("Trained model predictions:")
print("---------------------------")

for text in text_list:
    # Tokenize input and move to CPU
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu")

    # Compute logits
    logits = model(inputs).logits

    # Get predicted label
    predictions = torch.argmax(logits, dim=1).item()

    # Print the prediction
    print(f"{text} -> {id_label[predictions]}")


Trained model predictions:
---------------------------
a feel-good picture in the best sense of the term. -> Positive
resourceful and ingenious entertainment. -> Positive
it 's just incredibly dull. -> Negative
the movie 's biggest offense is its complete and utter lack of tension. -> Negative
impresses you with its open-endedness and surprises. -> Positive
unless you are in dire need of a diesel fix, there is no real reason to see it. -> Negative
