In [1]:
!pip install -qqq evaluate --progress-bar off
!pip install -qqq transformers --progress-bar off
!pip install -qqq peft --progress-bar off

In [2]:
import evaluate
import torch
import numpy as np
from huggingface_hub import notebook_login
from datasets import (
    load_dataset,
    DatasetDict,
    Dataset
    )
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
    )
from peft import (
    PeftModel,
    PeftConfig,
    get_peft_model,
    LoraConfig
)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Set the device
DEVICE = 'mps' if torch.backends.mps.is_available() else 'cpu'
print(f'Using device: {DEVICE}')

Using device: mps


### **Load dataset**

In [5]:
# # generate dataset

# # load imdb data
# imdb_dataset = load_dataset("imdb")

# # define subsample size
# N = 1000

# # generate indexes for random subsample
# rand_idx = np.random.randint(24999, size=N)

# # extract train and test data
# x_train = imdb_dataset['train'][rand_idx]['text']
# y_train = imdb_dataset['train'][rand_idx]['label']

# x_test = imdb_dataset['test'][rand_idx]['text']
# y_test = imdb_dataset['test'][rand_idx]['label']

# # create new dataset
# dataset = DatasetDict({'train':Dataset.from_dict({'label':y_train,'text':x_train}),
#                        'validation':Dataset.from_dict({'label':y_test,'text':x_test})})

In [6]:
# dataset.push_to_hub("wt-golf/imdb-1k")

In [7]:
# load dataset
dataset = load_dataset("wt-golf/imdb-1k")

Downloading readme:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/839k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
# the ratio of training data with label=1
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])

0.52

### **Base model for fine tuning**

In [27]:
base_model = 'distilbert-base-uncased'

# define label maps
id_label = {0: "Negative",
            1: "Positive"}
label_id = {"Negative":0,
            "Positive":1}

# generate classification model from base model
model = AutoModelForSequenceClassification.from_pretrained(
    base_model,
    num_labels=2,
    id2label=id_label,
    label2id=label_id
    )

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# model architecture
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

### **Data preprocessing**

In [29]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    add_prefix_space=True,
    device_map="auto"
    )

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [30]:
# create a tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [31]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [32]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## **Evaluation**

In [33]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

In [34]:
# create an evaluation function
def evaluation_function(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"Accuracy": accuracy.compute(
        predictions=predictions,
        references=labels
        )
    }

## **Predict using the base model**

In [35]:
# define list of examples
text_list = [
    "It was excellent.",
    "I'm not a fan, wouldn't recommend.",
    "Superior to the initial one.",
    "This isn't worth watching even once.",
    "This one can be skipped."
    ]

print("Base model predictions:")
print("----------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    logits = model(inputs).logits
    predictions = torch.argmax(logits)
    print(text + " - " + id_label[predictions.tolist()])

Base model predictions:
----------------------------
It was excellent. - Negative
I'm not a fan, wouldn't recommend. - Negative
Superior to the initial one. - Negative
This isn't worth watching even once. - Negative
This one can be skipped. - Negative


## **Train the base model**

In [36]:
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=4,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules = ['q_lin']
    )

peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules=['q_lin'], lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)

In [37]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 1,221,124 || all params: 67,584,004 || trainable%: 1.8068239934408148


In [38]:
# hyperparameters
LEARNING_RATE = 0.001
BATCH_SIZE = 3
NUMBER_EPOCHS = 5

In [39]:
# define training arguments
training_args = TrainingArguments(
    output_dir= base_model + "-lora-text-classification",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUMBER_EPOCHS,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [41]:
# create the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=evaluation_function
)

In [42]:
# train model
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.36239,{'accuracy': 0.885}
2,0.460900,0.485194,{'accuracy': 0.864}
3,0.288600,0.599084,{'accuracy': 0.882}
4,0.288600,0.657313,{'accuracy': 0.89}
5,0.084400,0.731868,{'accuracy': 0.889}


Trainer is attempting to log a value of "{'accuracy': 0.885}" of type <class 'dict'> for key "eval/Accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.864}" of type <class 'dict'> for key "eval/Accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.882}" of type <class 'dict'> for key "eval/Accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.89}" of type <class 'dict'> for key "eval/Accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.889}" of type <class 'dict'> for key "eval/Accuracy" as a scalar. This in

TrainOutput(global_step=1670, training_loss=0.25699408354159603, metrics={'train_runtime': 941.2753, 'train_samples_per_second': 5.312, 'train_steps_per_second': 1.774, 'total_flos': 524937541650984.0, 'train_loss': 0.25699408354159603, 'epoch': 5.0})

## **Predict using the PEFT model**

In [44]:
model.to('mps')

print("PEFT model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("mps")
    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices
    print(text + " - " + id_label[predictions.tolist()[0]])

PEFT model predictions:
--------------------------
It was excellent. - Positive
I'm not a fan, wouldn't recommend. - Negative
Superior to the initial one. - Positive
This isn't worth watching even once. - Negative
This one can be skipped. - Negative


### **Push the PEFT model to Hugging Face**

In [67]:
from huggingface_hub import login
write_key = 'hf_'
login(write_key)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/work/.cache/huggingface/token
Login successful


In [65]:
hf_username = 'wt-golf'
ft_model = hf_username + "/" + base_model + "-lora-text-classification-imdb-1k"

In [68]:
model.push_to_hub(ft_model)
trainer.push_to_hub(ft_model)

adapter_model.bin:   0%|          | 0.00/2.52M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.bin:   0%|          | 0.00/2.52M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

'https://huggingface.co/wt-golf/distilbert-base-uncased-lora-text-classification/tree/main/'

### **Load the PEFT model from Hugging Face**

In [69]:
load_model = 'wt-golf/distilbert-base-uncased-lora-text-classification-imdb-1k'

# define label maps
id_label = {0: "Negative",
            1: "Positive"}
label_id = {"Negative":0,
            "Positive":1}

In [72]:
config = PeftConfig.from_pretrained(load_model)

inference_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path,
    num_labels=2,
    id2label=id_label,
    label2id=label_id
)

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

loaded_model = PeftModel.from_pretrained(inference_model, load_model)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [73]:
test_list = [
    "It was fantastic.",
    "I'm not a supporter, wouldn't suggest it.",
    "Outperformed the initial version.",
    "This isn't worth a single viewing.",
    "I'd skip this one.",
    "It was superb.",
    "I wouldn't endorse it, not my cup of tea.",
    "A step up from the first one.",
    "You won't find value in watching this even once.",
    "This is one you can pass on."
    ]

In [74]:
loaded_model.to('mps')

print("PEFT model predictions:")
print("--------------------------")
for text in test_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("mps")
    logits = loaded_model(inputs).logits
    predictions = torch.max(logits,1).indices
    print(text + " - " + id_label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
It was fantastic. - Positive
I'm not a supporter, wouldn't suggest it. - Negative
Outperformed the initial version. - Negative
This isn't worth a single viewing. - Negative
I'd skip this one. - Negative
It was superb. - Positive
I wouldn't endorse it, not my cup of tea. - Negative
A step up from the first one. - Positive
You won't find value in watching this even once. - Negative
This is one you can pass on. - Positive
