In [1]:
import torch
import random
import evaluate

from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader
from peft import LoraConfig, get_peft_model, AutoPeftModelForSequenceClassification
from transformers import RobertaModel, RobertaTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [3]:
# Name of the pretrained checkpoint every model in this notebook starts from.
base_model = "roberta-base"
# Directory the fully fine-tuned model and its tokenizer are saved to.
modified_base = "roberta-base-modified"
# Directory the LoRA (PEFT) adapter is loaded from.
peft_model_name = "roberta-base-peft"

In [4]:
# Tokenizer matching the base checkpoint, and the AG News dataset it will encode.
tokenizer = RobertaTokenizer.from_pretrained(base_model)
dataset = load_dataset("ag_news")



In [5]:
# Number of examples in the training split.
len(dataset['train'])

120000

In [6]:
# Peek at one raw training example to see which fields it carries.
dataset['train'][0]

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'label': 2}

In [7]:

def preprocess(examples):
    """Tokenize a batch of AG News examples.

    Args:
        examples: Batch mapping handed over by ``dataset.map(..., batched=True)``;
            only its ``'text'`` entry is read.

    Returns:
        The tokenizer's encoding of the texts, truncated to the tokenizer's
        maximum length and left unpadded.
    """
    # Padding is deliberately not applied here: the Trainer and the evaluation
    # DataLoader in this notebook both batch through ``DataCollatorWithPadding``,
    # which pads each batch to its own longest sequence. Padding at map time
    # would instead pad to the longest text of every map batch and store those
    # pad tokens in the dataset, wasting memory and compute.
    return tokenizer(examples['text'], truncation=True)
    
    
# Tokenize every split in batches and drop the raw text column afterwards.
tokenized_dataset = dataset.map(preprocess, remove_columns=["text"], batched=True)
train_dataset = tokenized_dataset['train']

# Split the test data into two shards: shard 0 is held back for the final
# test evaluation, shard 1 is the evaluation set handed to the Trainer.
test_dataset, eval_dataset = (
    tokenized_dataset['test'].shard(num_shards=2, index=shard_index)
    for shard_index in (0, 1)
)

In [8]:
# Print the label feature first: its repr shows that the class names are stored under `.names`.
print(dataset['train'].features['label'])
class_labels = dataset['train'].features['label'].names
num_labels = len(class_labels)
print(f"number of labels: {num_labels}", f"the labels: {class_labels}", sep="\n")

ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'], id=None)
number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


In [9]:
# Build the {class id: label name} mapping used when classifying.
id2label = dict(enumerate(class_labels))
print(id2label)
# Collator that pads each batch to a common length and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(return_tensors="pt", tokenizer=tokenizer)
print(data_collator)

{0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}
DataCollatorWithPadding(tokenizer=RobertaTokenizer(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}, padding=True, m

In [10]:
# Training arguments shared by every Trainer built in this notebook.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=5e-5,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    # Mixed precision is switched on only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
)

def get_trainer(model, training_args, train_dataset):
    """Build a ``Trainer`` for ``model``.

    Args:
        model: Model to train.
        training_args: ``TrainingArguments`` passed through as ``args``.
        train_dataset: Dataset used for training.

    Returns:
        A ``Trainer``. The evaluation split and the collator are not
        parameters; they are read from the notebook-level ``eval_dataset``
        and ``data_collator``.
    """
    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )
    return Trainer(**trainer_kwargs)



In [11]:
# Full fine-tuning: load the base checkpoint with a classification head and
# train all of its weights on the training split.
# `label2id` is passed together with `id2label`; with `id2label` alone the
# config would keep its default name-to-id mapping, which does not match
# these class names.
model = AutoModelForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label,
    label2id={label: idx for idx, label in id2label.items()},
)
model.to(device)
fine_tune_trainer = get_trainer(model, training_args, train_dataset)

fine_tune_trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  7%|▋         | 500/7500 [02:25<34:56,  3.34it/s]

{'loss': 0.4131, 'grad_norm': 2.9868056774139404, 'learning_rate': 4.668666666666667e-05, 'epoch': 0.07}


                                                  
  7%|▋         | 500/7500 [02:39<34:56,  3.34it/s]

{'eval_loss': 0.34810832142829895, 'eval_runtime': 13.908, 'eval_samples_per_second': 273.224, 'eval_steps_per_second': 34.153, 'epoch': 0.07}


 13%|█▎        | 1000/7500 [05:07<31:49,  3.40it/s] 

{'loss': 0.3207, 'grad_norm': 3.1832401752471924, 'learning_rate': 4.335333333333334e-05, 'epoch': 0.13}


                                                   
 13%|█▎        | 1000/7500 [05:21<31:49,  3.40it/s]

{'eval_loss': 0.3013884425163269, 'eval_runtime': 13.901, 'eval_samples_per_second': 273.361, 'eval_steps_per_second': 34.17, 'epoch': 0.13}


 20%|██        | 1500/7500 [07:48<29:10,  3.43it/s]  

{'loss': 0.3035, 'grad_norm': 3.8448498249053955, 'learning_rate': 4.0033333333333335e-05, 'epoch': 0.2}


                                                   
 20%|██        | 1500/7500 [08:02<29:10,  3.43it/s]

{'eval_loss': 0.3078445494174957, 'eval_runtime': 13.8763, 'eval_samples_per_second': 273.848, 'eval_steps_per_second': 34.231, 'epoch': 0.2}


 27%|██▋       | 2000/7500 [10:29<25:51,  3.54it/s]  

{'loss': 0.2768, 'grad_norm': 10.001203536987305, 'learning_rate': 3.6700000000000004e-05, 'epoch': 0.27}


                                                   
 27%|██▋       | 2000/7500 [10:43<25:51,  3.54it/s]

{'eval_loss': 0.2743837237358093, 'eval_runtime': 13.882, 'eval_samples_per_second': 273.735, 'eval_steps_per_second': 34.217, 'epoch': 0.27}


 33%|███▎      | 2500/7500 [13:10<24:23,  3.42it/s]  

{'loss': 0.2601, 'grad_norm': 0.46604636311531067, 'learning_rate': 3.336666666666667e-05, 'epoch': 0.33}


                                                   
 33%|███▎      | 2500/7500 [13:24<24:23,  3.42it/s]

{'eval_loss': 0.27000290155410767, 'eval_runtime': 13.8943, 'eval_samples_per_second': 273.494, 'eval_steps_per_second': 34.187, 'epoch': 0.33}


 40%|████      | 3000/7500 [15:52<20:53,  3.59it/s]  

{'loss': 0.2797, 'grad_norm': 7.961300373077393, 'learning_rate': 3.0033333333333336e-05, 'epoch': 0.4}


                                                   
 40%|████      | 3000/7500 [16:06<20:53,  3.59it/s]

{'eval_loss': 0.23781833052635193, 'eval_runtime': 13.8576, 'eval_samples_per_second': 274.218, 'eval_steps_per_second': 34.277, 'epoch': 0.4}


 47%|████▋     | 3500/7500 [18:33<19:36,  3.40it/s]  

{'loss': 0.2415, 'grad_norm': 9.936685562133789, 'learning_rate': 2.6700000000000002e-05, 'epoch': 0.47}


                                                   
 47%|████▋     | 3500/7500 [18:47<19:36,  3.40it/s]

{'eval_loss': 0.21502923965454102, 'eval_runtime': 13.8534, 'eval_samples_per_second': 274.301, 'eval_steps_per_second': 34.288, 'epoch': 0.47}


 53%|█████▎    | 4000/7500 [21:14<17:16,  3.38it/s]  

{'loss': 0.2326, 'grad_norm': 18.833702087402344, 'learning_rate': 2.3366666666666668e-05, 'epoch': 0.53}


                                                   
 53%|█████▎    | 4000/7500 [21:28<17:16,  3.38it/s]

{'eval_loss': 0.23042680323123932, 'eval_runtime': 13.881, 'eval_samples_per_second': 273.755, 'eval_steps_per_second': 34.219, 'epoch': 0.53}


 60%|██████    | 4500/7500 [23:56<14:34,  3.43it/s]  

{'loss': 0.2197, 'grad_norm': 6.789280414581299, 'learning_rate': 2.0033333333333334e-05, 'epoch': 0.6}


                                                   
 60%|██████    | 4500/7500 [24:10<14:34,  3.43it/s]

{'eval_loss': 0.21372494101524353, 'eval_runtime': 13.8765, 'eval_samples_per_second': 273.845, 'eval_steps_per_second': 34.231, 'epoch': 0.6}


 67%|██████▋   | 5000/7500 [26:37<12:12,  3.41it/s]  

{'loss': 0.2111, 'grad_norm': 27.331132888793945, 'learning_rate': 1.6700000000000003e-05, 'epoch': 0.67}


                                                   
 67%|██████▋   | 5000/7500 [26:51<12:12,  3.41it/s]

{'eval_loss': 0.20857928693294525, 'eval_runtime': 13.8551, 'eval_samples_per_second': 274.267, 'eval_steps_per_second': 34.283, 'epoch': 0.67}


 73%|███████▎  | 5500/7500 [29:19<10:15,  3.25it/s]  

{'loss': 0.2071, 'grad_norm': 0.43634289503097534, 'learning_rate': 1.3366666666666667e-05, 'epoch': 0.73}


                                                   
 73%|███████▎  | 5500/7500 [29:33<10:15,  3.25it/s]

{'eval_loss': 0.19225776195526123, 'eval_runtime': 13.8861, 'eval_samples_per_second': 273.655, 'eval_steps_per_second': 34.207, 'epoch': 0.73}


 80%|████████  | 6000/7500 [32:00<07:23,  3.38it/s]  

{'loss': 0.2041, 'grad_norm': 4.87384557723999, 'learning_rate': 1.0033333333333333e-05, 'epoch': 0.8}


                                                   
 80%|████████  | 6000/7500 [32:14<07:23,  3.38it/s]

{'eval_loss': 0.19728034734725952, 'eval_runtime': 13.898, 'eval_samples_per_second': 273.42, 'eval_steps_per_second': 34.178, 'epoch': 0.8}


 87%|████████▋ | 6500/7500 [34:42<04:56,  3.37it/s]  

{'loss': 0.2061, 'grad_norm': 4.977787494659424, 'learning_rate': 6.700000000000001e-06, 'epoch': 0.87}


                                                   
 87%|████████▋ | 6500/7500 [34:56<04:56,  3.37it/s]

{'eval_loss': 0.1934729665517807, 'eval_runtime': 13.8586, 'eval_samples_per_second': 274.197, 'eval_steps_per_second': 34.275, 'epoch': 0.87}


 93%|█████████▎| 7000/7500 [37:23<02:24,  3.46it/s]  

{'loss': 0.1836, 'grad_norm': 10.310710906982422, 'learning_rate': 3.3666666666666665e-06, 'epoch': 0.93}


                                                   
 93%|█████████▎| 7000/7500 [37:36<02:24,  3.46it/s]

{'eval_loss': 0.187775656580925, 'eval_runtime': 13.8615, 'eval_samples_per_second': 274.14, 'eval_steps_per_second': 34.267, 'epoch': 0.93}


100%|██████████| 7500/7500 [40:04<00:00,  3.60it/s]

{'loss': 0.2, 'grad_norm': 2.870138168334961, 'learning_rate': 3.3333333333333334e-08, 'epoch': 1.0}


                                                   
100%|██████████| 7500/7500 [40:18<00:00,  3.60it/s]

{'eval_loss': 0.18516644835472107, 'eval_runtime': 13.8914, 'eval_samples_per_second': 273.55, 'eval_steps_per_second': 34.194, 'epoch': 1.0}


100%|██████████| 7500/7500 [40:19<00:00,  3.10it/s]

{'train_runtime': 2419.6386, 'train_samples_per_second': 49.594, 'train_steps_per_second': 3.1, 'train_loss': 0.2506412394205729, 'epoch': 1.0}





TrainOutput(global_step=7500, training_loss=0.2506412394205729, metrics={'train_runtime': 2419.6386, 'train_samples_per_second': 49.594, 'train_steps_per_second': 3.1, 'total_flos': 2.0289992490004224e+16, 'train_loss': 0.2506412394205729, 'epoch': 1.0})

In [12]:
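# PEFT (LoRA) training -- currently disabled (everything below is commented out).
# NOTE: the inference cell further down loads the adapter from `peft_model_name`
# with local_files_only=True, so re-enable this block and the matching
# `peft_model.save_pretrained(...)` line when that directory does not exist yet.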
# model_ft = AutoModelForSequenceClassification.from_pretrained(base_model, id2label=id2label)

# peft_config = LoraConfig(task_type="SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)
# peft_model = get_peft_model(model_ft, peft_config)
# peft_model.to(device)
# print("PEFT MODEL")
# peft_model.print_trainable_parameters()
# peft_lora_finetuning_trainer = get_trainer(peft_model, training_args, train_dataset)
# peft_lora_finetuning_trainer.train()

In [13]:
# Save the tokenizer and the fully fine-tuned model into the `modified_base`
# directory; the tokenizer is reloaded from there in the inference cell.
tokenizer.save_pretrained(modified_base)
fine_tune_trainer.save_model(modified_base)
# Saving the LoRA adapter is disabled together with the LoRA training cell:
# peft_model.save_pretrained(peft_model_name)

In [14]:
# Inference setup for the LoRA (PEFT) fine-tuned model.
# The adapter is read from the local `peft_model_name` directory. In the cells
# shown here the LoRA training and save steps are commented out, so that
# directory has to exist already from an earlier run.
tokenizer = AutoTokenizer.from_pretrained(modified_base, local_files_only=True)
peft_model = AutoPeftModelForSequenceClassification.from_pretrained(
    peft_model_name,
    local_files_only=True,
    id2label=id2label,
)
peft_model.to(device)

def classify(text, deivce = device):
    """Classify one news text with the LoRA model and print the prediction.

    Args:
        text: A single string to classify (the prediction is read with
            ``.item()``, which requires exactly one result).
        deivce: Device the encoded inputs are moved to. The misspelled name is
            kept so existing keyword callers keep working. It should be the
            device ``peft_model`` lives on.

    Returns:
        None. The predicted class id, its label and the text are printed.
    """
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors='pt')
    # Honor the caller-supplied device; previously this argument was ignored
    # and the notebook-level `device` was always used.
    inputs = inputs.to(deivce)
    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        output = peft_model(**inputs)

    prediction = output.logits.argmax(dim=-1).item()

    print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
    

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Try the classifier on two news snippets.
classify( "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")
# The backslash is escaped so the text matches the dataset example verbatim;
# an unescaped "\b" is a backspace character inside a Python string literal.
classify( "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")


 Class: 1, Label: Sports, Text: Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...

 Class: 2, Label: Business, Text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindlinand of ultra-cynics, are seeing green again.


In [16]:
metric = evaluate.load('accuracy')

def evaluate_model(model, dataset):
    """Run ``model`` over ``dataset`` and print its accuracy.

    Args:
        model: Sequence-classification model; it is moved to the available
            device and switched to eval mode.
        dataset: Tokenized dataset with a ``label`` column, which is renamed
            to ``labels`` before batching.

    Returns:
        None. The result of the notebook-level ``metric`` is printed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    labelled = dataset.rename_column("label", "labels")
    eval_dataloader = DataLoader(labelled, batch_size=8, collate_fn=data_collator)

    model.to(device)
    model.eval()
    for batch in tqdm(eval_dataloader):
        batch.to(device)
        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(**batch)
        metric.add_batch(
            predictions=outputs.logits.argmax(dim=1),
            references=batch['labels'],
        )

    print(metric.compute())

In [17]:
# Baseline: evaluate the pretrained base checkpoint on the test split without any fine-tuning.
# Its classification head is newly initialized (see the warning printed below), so this scores an untrained classifier.
evaluate_model(AutoModelForSequenceClassification.from_pretrained(base_model, id2label=id2label), test_dataset)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 475/475 [00:26<00:00, 17.98it/s]

{'accuracy': 0.24973684210526315}





In [18]:
# Evaluate the LoRA (PEFT) model loaded in the inference cell on the same test split.
evaluate_model(peft_model, test_dataset)

100%|██████████| 475/475 [00:27<00:00, 17.04it/s]

{'accuracy': 0.9171052631578948}





In [19]:
# Evaluate the fully fine-tuned model still held by the trainer on the same test split.
evaluate_model(fine_tune_trainer.model, test_dataset)

100%|██████████| 475/475 [00:14<00:00, 33.77it/s]

{'accuracy': 0.9426315789473684}





In [20]:
# Display the module tree of the LoRA-wrapped model.
peft_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleD