In [None]:
%pip install transformers datasets peft evaluate scikit-learn

In [32]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [33]:
model_checkpoint = 'distilbert-base-uncased'

# define label maps
id2label = {0:"Negative", 1:"Positive"}
label2id = {"Negative":0, "Positive":1}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# load dataset
dataset = load_dataset("shawhin/imdb-truncated")

dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [35]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

def tokenize_function(examples):
  text = examples["text"]

  tokenizer.truncation_side = "left"
  tokenized_inputs = tokenizer(
    text,
    return_tensors="np",
    truncation=True,
    max_length=512
  )

  return tokenized_inputs

if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})
  model.resize_token_embeddings(len(tokenizer))

tokenized_dataset = dataset.map(tokenize_function, batched=True)

tokenized_dataset



DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [36]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

accuracy = evaluate.load("accuracy")

def compute_metrics(p):
  predictions, labels = p
  predictions = np.argmax(predictions, axis=1)

  return {"accuracy": accuracy.compute(predictions=predictions,references=labels)}

In [37]:
text_list = ["It was good.", "Not a fan, don't recommend.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass"]


print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
  inputs = tokenizer.encode(text, return_tensors="pt")
  logits = model(inputs).logits
  predictions = torch.argmax(logits)

  print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
It was good. - Positive
Not a fan, don't recommend. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Positive
This one is a pass - Negative


In [38]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                         r=4,
                         lora_alpha=32,
                         lora_dropout=0.01,
                         target_modules = ['q_lin'])



In [39]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [40]:
lr = 1e-3
batch_size = 4
num_epochs = 10

training_args = TrainingArguments(
  output_dir = model_checkpoint + "-lora-text-classification",
  learning_rate=lr,
  per_device_train_batch_size=batch_size,
  per_device_eval_batch_size=batch_size,
  num_train_epochs=num_epochs,
  weight_decay=0.01,
  evaluation_strategy="epoch",
  save_strategy="epoch",
  load_best_model_at_end=True
)



In [41]:
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=tokenized_dataset["train"],
  eval_dataset=tokenized_dataset["validation"],
  tokenizer=tokenizer,
  data_collator=data_collator,
  compute_metrics=compute_metrics
)


  trainer = Trainer(


In [42]:
trainer.train()

 10%|▉         | 249/2500 [00:19<02:51, 13.13it/s]
 10%|█         | 250/2500 [00:27<02:51, 13.13it/s]

{'eval_loss': 0.7153200507164001, 'eval_accuracy': {'accuracy': 0.801}, 'eval_runtime': 8.2898, 'eval_samples_per_second': 120.63, 'eval_steps_per_second': 30.158, 'epoch': 1.0}


 20%|██        | 500/2500 [00:49<02:32, 13.15it/s]

{'loss': 0.4574, 'grad_norm': 14.269070625305176, 'learning_rate': 0.0008, 'epoch': 2.0}



 20%|██        | 500/2500 [00:57<02:32, 13.15it/s]

{'eval_loss': 0.6687059998512268, 'eval_accuracy': {'accuracy': 0.84}, 'eval_runtime': 8.5299, 'eval_samples_per_second': 117.234, 'eval_steps_per_second': 29.309, 'epoch': 2.0}


 30%|███       | 750/2500 [01:19<02:24, 12.09it/s]
 30%|███       | 750/2500 [01:27<02:24, 12.09it/s]

{'eval_loss': 0.6257894039154053, 'eval_accuracy': {'accuracy': 0.889}, 'eval_runtime': 8.6, 'eval_samples_per_second': 116.279, 'eval_steps_per_second': 29.07, 'epoch': 3.0}


 40%|████      | 1000/2500 [01:48<02:11, 11.45it/s]

{'loss': 0.2224, 'grad_norm': 0.13366465270519257, 'learning_rate': 0.0006, 'epoch': 4.0}



 40%|████      | 1000/2500 [01:57<02:11, 11.45it/s]

{'eval_loss': 0.6912614703178406, 'eval_accuracy': {'accuracy': 0.89}, 'eval_runtime': 8.52, 'eval_samples_per_second': 117.37, 'eval_steps_per_second': 29.343, 'epoch': 4.0}


 50%|████▉     | 1249/2500 [02:17<01:48, 11.57it/s]
 50%|█████     | 1250/2500 [02:26<01:48, 11.57it/s]

{'eval_loss': 0.8011311292648315, 'eval_accuracy': {'accuracy': 0.882}, 'eval_runtime': 8.3965, 'eval_samples_per_second': 119.097, 'eval_steps_per_second': 29.774, 'epoch': 5.0}


 60%|██████    | 1500/2500 [02:47<01:20, 12.40it/s]

{'loss': 0.0664, 'grad_norm': 0.04049336165189743, 'learning_rate': 0.0004, 'epoch': 6.0}



 60%|██████    | 1500/2500 [02:56<01:20, 12.40it/s]

{'eval_loss': 0.905749499797821, 'eval_accuracy': {'accuracy': 0.888}, 'eval_runtime': 8.7112, 'eval_samples_per_second': 114.794, 'eval_steps_per_second': 28.699, 'epoch': 6.0}


 70%|██████▉   | 1749/2500 [03:17<01:05, 11.46it/s]
 70%|███████   | 1750/2500 [03:26<01:05, 11.46it/s]

{'eval_loss': 0.9686894416809082, 'eval_accuracy': {'accuracy': 0.888}, 'eval_runtime': 8.7085, 'eval_samples_per_second': 114.83, 'eval_steps_per_second': 28.707, 'epoch': 7.0}


 80%|████████  | 2000/2500 [03:47<00:37, 13.18it/s]

{'loss': 0.0193, 'grad_norm': 0.00019481469644233584, 'learning_rate': 0.0002, 'epoch': 8.0}



 80%|████████  | 2000/2500 [03:56<00:37, 13.18it/s]

{'eval_loss': 1.0860835313796997, 'eval_accuracy': {'accuracy': 0.883}, 'eval_runtime': 8.4551, 'eval_samples_per_second': 118.272, 'eval_steps_per_second': 29.568, 'epoch': 8.0}


 90%|████████▉ | 2249/2500 [04:16<00:23, 10.83it/s]
 90%|█████████ | 2250/2500 [04:25<00:23, 10.83it/s]

{'eval_loss': 1.0705336332321167, 'eval_accuracy': {'accuracy': 0.886}, 'eval_runtime': 8.5277, 'eval_samples_per_second': 117.264, 'eval_steps_per_second': 29.316, 'epoch': 9.0}


100%|██████████| 2500/2500 [04:46<00:00, 12.84it/s]

{'loss': 0.007, 'grad_norm': 0.0009069786756299436, 'learning_rate': 0.0, 'epoch': 10.0}



100%|██████████| 2500/2500 [04:55<00:00, 12.84it/s]

{'eval_loss': 1.0711623430252075, 'eval_accuracy': {'accuracy': 0.889}, 'eval_runtime': 8.4906, 'eval_samples_per_second': 117.777, 'eval_steps_per_second': 29.444, 'epoch': 10.0}


100%|██████████| 2500/2500 [04:55<00:00,  8.45it/s]

{'train_runtime': 295.8187, 'train_samples_per_second': 33.804, 'train_steps_per_second': 8.451, 'train_loss': 0.1544877212524414, 'epoch': 10.0}





TrainOutput(global_step=2500, training_loss=0.1544877212524414, metrics={'train_runtime': 295.8187, 'train_samples_per_second': 33.804, 'train_steps_per_second': 8.451, 'total_flos': 1112883852759936.0, 'train_loss': 0.1544877212524414, 'epoch': 10.0})

In [49]:
print("Trained model predictions:")
print("----------------------------")
for text in text_list:
  inputs = tokenizer.encode(text, return_tensors="pt").to("cuda")
  logits = model(inputs).logits
  predictions = torch.argmax(logits)

  print(text + " - " + id2label[predictions.tolist()])

Trained model predictions:
----------------------------
It was good. - Positive
Not a fan, don't recommend. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Negative
This one is a pass - Negative
