In [24]:
import torch
import random
import evaluate

from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader
from peft import LoraConfig, get_peft_model, AutoPeftModelForSequenceClassification
from transformers import RobertaModel, RobertaTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer

In [14]:
peft_model_name = 'roberta-base-peft'
modified_base = 'roberta-base-modified'
base_model = 'roberta-base'

In [3]:
dataset = load_dataset('ag_news')
tokenizer = RobertaTokenizer.from_pretrained(base_model)



In [4]:
len(dataset['train'])

120000

In [5]:
dataset['train'][0]

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'label': 2}

In [6]:

def preprocess(examples):
    return tokenizer(examples['text'], truncation=True, padding=True)
    
    
tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
train_dataset=tokenized_dataset['train']

# divide the total test data into two equal sets using shard
test_dataset=tokenized_dataset['test'].shard(num_shards=2, index=0)
eval_dataset=tokenized_dataset['test'].shard(num_shards=2, index=1)

In [7]:
print(dataset['train'].features['label']) 
# look at the print to find about the right key.
class_labels = dataset['train'].features['label'].names
num_labels = len(class_labels)
print(f"number of labels: {num_labels}")
print(f"the labels: {class_labels}")

ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'], id=None)
number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


In [8]:
# Create an (id:label) mapping for clasisfing
id2label = {i: label for i, label in enumerate(class_labels)}
print(id2label)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
print(data_collator)

{0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}
DataCollatorWithPadding(tokenizer=RobertaTokenizer(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}, padding=True, m

In [9]:
# Trainign arguments

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='steps',
    learning_rate=5e-5,
    num_train_epochs=1,
    per_device_train_batch_size=16,
)

def get_trainer(model, training_args, train_dataset):
      return  Trainer(
          model=model,
          args=training_args,
          train_dataset=train_dataset,
          eval_dataset=eval_dataset,
          data_collator=data_collator,
      )



In [10]:
# model = AutoModelForSequenceClassification.from_pretrained(base_model, id2label=id2label)
# fine_tune_trainer = get_trainer(model, training_args, train_dataset)

# fine_tune_trainer.train()

In [11]:
# PEFT Training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model_ft = AutoModelForSequenceClassification.from_pretrained(base_model, id2label=id2label)

peft_config = LoraConfig(task_type="SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)
peft_model = get_peft_model(model_ft, peft_config)
peft_model.to(device)
print("PEFT MODEL")
peft_model.print_trainable_parameters()
peft_lora_finetuning_trainer = get_trainer(peft_model, training_args, train_dataset)
peft_lora_finetuning_trainer.train()

cuda


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PEFT MODEL
trainable params: 888,580 || all params: 125,537,288 || trainable%: 0.7078


  7%|▋         | 500/7500 [03:28<50:33,  2.31it/s]

{'loss': 0.6053, 'grad_norm': 6.2370758056640625, 'learning_rate': 4.666666666666667e-05, 'epoch': 0.07}


                                                  
  7%|▋         | 500/7500 [03:56<50:33,  2.31it/s]

{'eval_loss': 0.29597532749176025, 'eval_runtime': 27.773, 'eval_samples_per_second': 136.823, 'eval_steps_per_second': 17.103, 'epoch': 0.07}


 13%|█▎        | 1000/7500 [07:26<45:38,  2.37it/s]  

{'loss': 0.2801, 'grad_norm': 2.6144440174102783, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.13}


                                                   
 13%|█▎        | 1000/7500 [07:54<45:38,  2.37it/s]

{'eval_loss': 0.28967592120170593, 'eval_runtime': 27.7747, 'eval_samples_per_second': 136.815, 'eval_steps_per_second': 17.102, 'epoch': 0.13}


 20%|██        | 1500/7500 [11:24<41:50,  2.39it/s]   

{'loss': 0.2794, 'grad_norm': 13.84005355834961, 'learning_rate': 4e-05, 'epoch': 0.2}


                                                   
 20%|██        | 1500/7500 [11:52<41:50,  2.39it/s]

{'eval_loss': 0.2772471308708191, 'eval_runtime': 27.7861, 'eval_samples_per_second': 136.759, 'eval_steps_per_second': 17.095, 'epoch': 0.2}


 27%|██▋       | 2000/7500 [15:22<37:00,  2.48it/s]   

{'loss': 0.2659, 'grad_norm': 6.099949836730957, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.27}


                                                   
 27%|██▋       | 2000/7500 [15:50<37:00,  2.48it/s]

{'eval_loss': 0.26382359862327576, 'eval_runtime': 27.7644, 'eval_samples_per_second': 136.866, 'eval_steps_per_second': 17.108, 'epoch': 0.27}


 33%|███▎      | 2500/7500 [19:20<35:01,  2.38it/s]   

{'loss': 0.2725, 'grad_norm': 0.5447511076927185, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.33}


                                                   
 33%|███▎      | 2500/7500 [19:48<35:01,  2.38it/s]

{'eval_loss': 0.2512412965297699, 'eval_runtime': 27.7426, 'eval_samples_per_second': 136.973, 'eval_steps_per_second': 17.122, 'epoch': 0.33}


 40%|████      | 3000/7500 [23:18<30:03,  2.50it/s]   

{'loss': 0.2841, 'grad_norm': 4.950230121612549, 'learning_rate': 3e-05, 'epoch': 0.4}


                                                   
 40%|████      | 3000/7500 [23:45<30:03,  2.50it/s]

{'eval_loss': 0.2456066906452179, 'eval_runtime': 27.7543, 'eval_samples_per_second': 136.916, 'eval_steps_per_second': 17.114, 'epoch': 0.4}


 47%|████▋     | 3500/7500 [27:15<28:03,  2.38it/s]   

{'loss': 0.2506, 'grad_norm': 11.517545700073242, 'learning_rate': 2.6666666666666667e-05, 'epoch': 0.47}


                                                   
 47%|████▋     | 3500/7500 [27:43<28:03,  2.38it/s]

{'eval_loss': 0.23858310282230377, 'eval_runtime': 27.7381, 'eval_samples_per_second': 136.996, 'eval_steps_per_second': 17.124, 'epoch': 0.47}


 53%|█████▎    | 4000/7500 [31:14<24:45,  2.36it/s]  

{'loss': 0.2565, 'grad_norm': 11.408463478088379, 'learning_rate': 2.3333333333333336e-05, 'epoch': 0.53}


                                                   
 53%|█████▎    | 4000/7500 [31:42<24:45,  2.36it/s]

{'eval_loss': 0.23821324110031128, 'eval_runtime': 27.7337, 'eval_samples_per_second': 137.017, 'eval_steps_per_second': 17.127, 'epoch': 0.53}


 60%|██████    | 4500/7500 [35:13<21:01,  2.38it/s]  

{'loss': 0.2556, 'grad_norm': 10.754769325256348, 'learning_rate': 2e-05, 'epoch': 0.6}


                                                   
 60%|██████    | 4500/7500 [35:41<21:01,  2.38it/s]

{'eval_loss': 0.23169654607772827, 'eval_runtime': 27.7343, 'eval_samples_per_second': 137.014, 'eval_steps_per_second': 17.127, 'epoch': 0.6}


 67%|██████▋   | 5000/7500 [39:11<17:44,  2.35it/s]  

{'loss': 0.2415, 'grad_norm': 4.573475360870361, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.67}


                                                   
 67%|██████▋   | 5000/7500 [39:39<17:44,  2.35it/s]

{'eval_loss': 0.23048031330108643, 'eval_runtime': 27.7858, 'eval_samples_per_second': 136.76, 'eval_steps_per_second': 17.095, 'epoch': 0.67}


 73%|███████▎  | 5500/7500 [43:10<14:42,  2.27it/s]  

{'loss': 0.2363, 'grad_norm': 3.0776736736297607, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.73}


                                                   
 73%|███████▎  | 5500/7500 [43:38<14:42,  2.27it/s]

{'eval_loss': 0.22698038816452026, 'eval_runtime': 27.7885, 'eval_samples_per_second': 136.747, 'eval_steps_per_second': 17.093, 'epoch': 0.73}


 80%|████████  | 6000/7500 [47:08<10:38,  2.35it/s]  

{'loss': 0.2431, 'grad_norm': 4.689082145690918, 'learning_rate': 1e-05, 'epoch': 0.8}


                                                   
 80%|████████  | 6000/7500 [47:35<10:38,  2.35it/s]

{'eval_loss': 0.22598956525325775, 'eval_runtime': 27.7307, 'eval_samples_per_second': 137.032, 'eval_steps_per_second': 17.129, 'epoch': 0.8}


 87%|████████▋ | 6500/7500 [51:06<07:07,  2.34it/s]  

{'loss': 0.2363, 'grad_norm': 3.0620431900024414, 'learning_rate': 6.666666666666667e-06, 'epoch': 0.87}


                                                   
 87%|████████▋ | 6500/7500 [51:34<07:07,  2.34it/s]

{'eval_loss': 0.22596301138401031, 'eval_runtime': 27.7266, 'eval_samples_per_second': 137.053, 'eval_steps_per_second': 17.132, 'epoch': 0.87}


 93%|█████████▎| 7000/7500 [55:04<03:28,  2.39it/s]  

{'loss': 0.2395, 'grad_norm': 2.480018377304077, 'learning_rate': 3.3333333333333333e-06, 'epoch': 0.93}


                                                   
 93%|█████████▎| 7000/7500 [55:32<03:28,  2.39it/s]

{'eval_loss': 0.22215591371059418, 'eval_runtime': 27.7779, 'eval_samples_per_second': 136.799, 'eval_steps_per_second': 17.1, 'epoch': 0.93}


100%|██████████| 7500/7500 [59:02<00:00,  2.51it/s]  

{'loss': 0.2566, 'grad_norm': 2.1185505390167236, 'learning_rate': 0.0, 'epoch': 1.0}


                                                   
100%|██████████| 7500/7500 [59:30<00:00,  2.51it/s]

{'eval_loss': 0.22215646505355835, 'eval_runtime': 27.793, 'eval_samples_per_second': 136.725, 'eval_steps_per_second': 17.091, 'epoch': 1.0}


100%|██████████| 7500/7500 [59:30<00:00,  2.10it/s]

{'train_runtime': 3570.8821, 'train_samples_per_second': 33.605, 'train_steps_per_second': 2.1, 'train_loss': 0.2802134836832682, 'epoch': 1.0}





TrainOutput(global_step=7500, training_loss=0.2802134836832682, metrics={'train_runtime': 3570.8821, 'train_samples_per_second': 33.605, 'train_steps_per_second': 2.1, 'total_flos': 2.0500492798385664e+16, 'train_loss': 0.2802134836832682, 'epoch': 1.0})

In [15]:
# Save
tokenizer.save_pretrained(modified_base)
peft_model.save_pretrained(peft_model_name)

In [18]:
# Testing(Inference) of the LORA fine tune model.

# peft_model = AutoPeftModelForSequenceClassification.from_pretrained(peft_model, id2label=id2label)
tokenizer = AutoTokenizer.from_pretrained(modified_base)

def classify(text):
    inputs = tokenizer(text,truncation=True, padding=True, return_tensors='pt')
    output = peft_model(**input)
    
    prediction = output.logits.argmax(dim=-1).item()
    
    print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
    

In [19]:
classify( "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")
classify( "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.")

TypeError: PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
                    (lora_embedding_B): ParameterDict()
                    (lora_magnitude_vector): ModuleDict()
                  )
                  (key): Linear(in_features=768, out_features=768, bias=True)
                  (value): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
                    (lora_embedding_B): ParameterDict()
                    (lora_magnitude_vector): ModuleDict()
                  )
                  (dropout): Dropout(p=0.1, inplace=False)
                )
                (output): RobertaSelfOutput(
                  (dense): Linear(in_features=768, out_features=768, bias=True)
                  (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
                  (dropout): Dropout(p=0.1, inplace=False)
                )
              )
              (intermediate): RobertaIntermediate(
                (dense): Linear(in_features=768, out_features=3072, bias=True)
                (intermediate_act_fn): GELUActivation()
              )
              (output): RobertaOutput(
                (dense): Linear(in_features=3072, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
          )
        )
      )
      (classifier): ModulesToSaveWrapper(
        (original_module): RobertaClassificationHead(
          (dense): Linear(in_features=768, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (out_proj): Linear(in_features=768, out_features=4, bias=True)
        )
        (modules_to_save): ModuleDict(
          (default): RobertaClassificationHead(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
            (out_proj): Linear(in_features=768, out_features=4, bias=True)
          )
        )
      )
    )
  )
) argument after ** must be a mapping, not method

In [34]:
metric = evaluate.load('accuracy')

def evaluate_model(model, dataset):
    
    eval_dataloader = DataLoader(dataset.rename_column("label", "labels"), batch_size=8, collate_fn=data_collator).to
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model.to(device)
    model.eval()
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch.to(device)
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=1)
        references = batch['labels']
        metric.add_batch(
            predictions=predictions,
            references=references
        )
    
    eval_metric = metric.compute()
    print(eval_metric)

In [35]:
# Base model evaluation on test data
evaluate_model(AutoModelForSequenceClassification.from_pretrained(base_model, id2label=id2label), test_dataset)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# LORA (peft) model evaluation on test data
evaluate_model(peft_model, test_dataset)

In [None]:
# Fully Fine tuned model evaluation on test data
# evaluate_model(fine_tune_trainer.model, test_dataset)