In [21]:
!pip install torch transformers datasets peft evaluate numpy scikit-learn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting scikit-learn
  Downloading scikit_learn-1.5.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (12 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.0-cp312-cp312-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.1-cp312-cp312-macosx_12_0_arm64.whl (11.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m826.4 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hUsing cached joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading scipy-1.14.0-cp312-cp312-macosx_14_0_arm64.whl (23.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.1/23.1 MB[0m [31m921.3 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
I

In [4]:
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset, DatasetDict, Dataset
import evaluate
import torch
import numpy as np
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
if torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

In [6]:
model_checkpoint = 'distilbert-base-uncased'
id2label = {0: 'Negative', 1: 'Positive'}
label2id = {'Negative': 0, 'Positive': 1}

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
dataset = load_dataset('shawhin/imdb-truncated')

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [9]:
dataset['train'][0]

{'label': 1,
 'text': '. . . or type on a computer keyboard, they\'d probably give this eponymous film a rating of "10." After all, no elephants are shown being killed during the movie; it is not even implied that any are hurt. To the contrary, the master of ELEPHANT WALK, John Wiley (Peter Finch), complains that he cannot shoot any of the pachyderms--no matter how menacing--without a permit from the government (and his tone suggests such permits are not within the realm of probability). Furthermore, the elements conspire--in the form of an unusual drought and a human cholera epidemic--to leave the Wiley plantation house vulnerable to total destruction by the Elephant People (as the natives dub them) to close the story. If you happen to see the current release EARTH, you\'ll detect the Elephant People are faring less well today.'}

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)



In [11]:
# This function converts text to numbers
def tokenize_function(examples):
    text = examples['text']
    
    # We need to make sure the examples are all of same length.
    # We can either do this by truncating long sequences or padding long sequences to fixed length.
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(text, return_tensors='np', truncation=True, max_length=512)
    return tokenized_inputs

# Add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # Update the model to handle the additional token we just added
    model.resize_token_embeddings(len(tokenizer))
    
# Tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map: 100%|██████████| 1000/1000 [00:00<00:00, 6617.48 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 7952.22 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [12]:
# create data collator. 
# Dynamically pads the examples in a given batch to the longest sequence in the batch
# Using the collator to pad the examples in each batch is more efficient than padding all
# the examples across the entire training set as there could be some anomalies with extremely long
# sequence length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [22]:
# This is how we monitor the performance of the model during training

# import accuracy evaluation metric
accuracy = evaluate.load('accuracy')

# packaging evaluation metrics into a function
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    
    # computing accuracy by comparing prediction with label
    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

In [24]:
text_list = ["It was good", 
             "Not a fan, don't recommend", 
             "Better than the first one",
             "This is not worth watching even once",
             "This one is a pass"
            ]

print("Untrained model predictions:")
print("----------------------------")


for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    logits = model(inputs).logits
    predictions = torch.argmax(logits)
    
    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
It was good - Negative
Not a fan, don't recommend - Negative
Better than the first one - Negative
This is not worth watching even once - Negative
This one is a pass - Negative


In [25]:
peft_config = LoraConfig(task_type="SEQ_CLS", # sequence classification
                        r=4,  # intrinsic rank of traininable weight matrix
                        lora_alpha=32, # this is like a learning rate
                        lora_dropout=0.01, # probability of dropout
                        target_modules=['q_lin']) # we apply lora to query layer

In [26]:
# From the original model, we get another model that is ready to be finetuned using LoRA
model = get_peft_model(model, peft_config)
model = model.to(device) # moving to 'mps' for Mac (can alternatively do 'cpu')
model.print_trainable_parameters()

print(model.device)

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307
mps:0


In [27]:
# hyperparameters
lr = 1e-3  # size of optimization step
batch_size = 4 # number of examples processed per optimization step
num_epochs = 10 # number of times model runs through training data

# define training arguments
training_args = TrainingArguments(
    output_dir = model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy="epoch", # compute the evaluation metrics for each epoch
    save_strategy="epoch", # save the model parameters for each epoch
    load_best_model_at_end=True, # at the end of training, return the best version of the model
)



In [28]:
# Define a custom collate function to move tensors to the MPS device
class CustomTrainer(Trainer):
    def _prepare_inputs(self, inputs):
        return {k: v.to(device) for k, v in inputs.items()}

    # def compute_loss(self, model, inputs, return_outputs=False):
    #         inputs = self._prepare_inputs(inputs)
    #         return super().compute_loss(model, inputs, return_outputs)
    
    def compute_loss(self, model, inputs, return_outputs=False):
        # Move inputs to the correct device
        inputs = self._prepare_inputs(inputs)
        # Forward pass
        outputs = model(**inputs)
        # Get the loss
        loss = outputs.get("loss")
        # Ensure the loss is on the correct device
        if loss is not None:
            loss = loss.to(device)
        return (loss, outputs) if return_outputs else loss
    
trainer = CustomTrainer(
    model=model, # our peft model
    args=training_args, # hyperparameter
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples
    compute_metrics=compute_metrics, # evaluate metrics using this function
)

trainer.train()


mps:0


                                                  
 10%|█         | 250/2500 [01:54<08:54,  4.21it/s]

{'eval_loss': 0.5735306143760681, 'eval_accuracy': {'accuracy': 0.843}, 'eval_runtime': 43.5504, 'eval_samples_per_second': 22.962, 'eval_steps_per_second': 5.74, 'epoch': 1.0}


 20%|██        | 500/2500 [03:00<08:30,  3.92it/s]  

{'loss': 0.4415, 'grad_norm': 19.244827270507812, 'learning_rate': 0.0008, 'epoch': 2.0}


                                                  
 20%|██        | 500/2500 [03:31<08:30,  3.92it/s]

{'eval_loss': 0.4678434431552887, 'eval_accuracy': {'accuracy': 0.879}, 'eval_runtime': 30.6719, 'eval_samples_per_second': 32.603, 'eval_steps_per_second': 8.151, 'epoch': 2.0}


                                                    
 30%|███       | 750/2500 [05:06<07:31,  3.88it/s]

{'eval_loss': 0.626606822013855, 'eval_accuracy': {'accuracy': 0.882}, 'eval_runtime': 31.2583, 'eval_samples_per_second': 31.992, 'eval_steps_per_second': 7.998, 'epoch': 3.0}


 40%|████      | 1000/2500 [06:06<05:08,  4.87it/s] 

{'loss': 0.2188, 'grad_norm': 0.3092513084411621, 'learning_rate': 0.0006, 'epoch': 4.0}


                                                   
 40%|████      | 1000/2500 [06:38<05:08,  4.87it/s]

{'eval_loss': 0.7080491185188293, 'eval_accuracy': {'accuracy': 0.883}, 'eval_runtime': 32.1543, 'eval_samples_per_second': 31.1, 'eval_steps_per_second': 7.775, 'epoch': 4.0}


                                                     
 50%|█████     | 1250/2500 [08:06<04:28,  4.65it/s]

{'eval_loss': 0.7667421698570251, 'eval_accuracy': {'accuracy': 0.889}, 'eval_runtime': 32.7305, 'eval_samples_per_second': 30.553, 'eval_steps_per_second': 7.638, 'epoch': 5.0}


 60%|██████    | 1500/2500 [09:00<02:53,  5.76it/s]  

{'loss': 0.0905, 'grad_norm': 0.000760614697355777, 'learning_rate': 0.0004, 'epoch': 6.0}


                                                   
 60%|██████    | 1500/2500 [09:31<02:53,  5.76it/s]

{'eval_loss': 0.8285343647003174, 'eval_accuracy': {'accuracy': 0.889}, 'eval_runtime': 30.7065, 'eval_samples_per_second': 32.566, 'eval_steps_per_second': 8.142, 'epoch': 6.0}


                                                     
 70%|███████   | 1750/2500 [11:02<02:40,  4.68it/s]

{'eval_loss': 1.0264489650726318, 'eval_accuracy': {'accuracy': 0.884}, 'eval_runtime': 37.2415, 'eval_samples_per_second': 26.852, 'eval_steps_per_second': 6.713, 'epoch': 7.0}


 80%|████████  | 2000/2500 [11:56<01:50,  4.53it/s]  

{'loss': 0.0319, 'grad_norm': 4.4457610783865675e-06, 'learning_rate': 0.0002, 'epoch': 8.0}


                                                   
 80%|████████  | 2000/2500 [12:33<01:50,  4.53it/s]

{'eval_loss': 0.9506431818008423, 'eval_accuracy': {'accuracy': 0.885}, 'eval_runtime': 36.1754, 'eval_samples_per_second': 27.643, 'eval_steps_per_second': 6.911, 'epoch': 8.0}


                                                     
 90%|█████████ | 2250/2500 [14:13<00:48,  5.17it/s]

{'eval_loss': 0.9696640968322754, 'eval_accuracy': {'accuracy': 0.891}, 'eval_runtime': 37.7728, 'eval_samples_per_second': 26.474, 'eval_steps_per_second': 6.619, 'epoch': 9.0}


100%|██████████| 2500/2500 [15:13<00:00,  5.66it/s]

{'loss': 0.0044, 'grad_norm': 0.0002933680370915681, 'learning_rate': 0.0, 'epoch': 10.0}


                                                   
100%|██████████| 2500/2500 [15:47<00:00,  5.66it/s]

{'eval_loss': 0.9987988471984863, 'eval_accuracy': {'accuracy': 0.886}, 'eval_runtime': 33.7178, 'eval_samples_per_second': 29.658, 'eval_steps_per_second': 7.414, 'epoch': 10.0}


100%|██████████| 2500/2500 [15:48<00:00,  2.64it/s]

{'train_runtime': 948.1841, 'train_samples_per_second': 10.546, 'train_steps_per_second': 2.637, 'train_loss': 0.15740664529800416, 'epoch': 10.0}





TrainOutput(global_step=2500, training_loss=0.15740664529800416, metrics={'train_runtime': 948.1841, 'train_samples_per_second': 10.546, 'train_steps_per_second': 2.637, 'total_flos': 1112883852759936.0, 'train_loss': 0.15740664529800416, 'epoch': 10.0})

In [30]:
text_list = ["It was good", 
             "Not a fan, don't recommend", 
             "Better than the first one",
             "This is not worth watching even once",
             "This one is a pass"
            ]

print("Trained model predictions:")
print("----------------------------")


for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    logits = model(inputs.to(device)).logits
    predictions = torch.argmax(logits)
    
    print(text + " - " + id2label[predictions.tolist()])

Trained model predictions:
----------------------------
It was good - Positive
Not a fan, don't recommend - Negative
Better than the first one - Positive
This is not worth watching even once - Negative
This one is a pass - Negative
