Set Up Your Google Colab Environment

In [1]:
!pip install transformers datasets peft
!pip install torch torchvision torchaudio
!pip install bitsandbytes


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_n

Load a Pretrained Model from Hugging Face

In [2]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Load tokenizer and model
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Prepare Your Dataset

In [3]:
from datasets import load_dataset

# Load a dataset
dataset = load_dataset("glue", "sst2")

# Preprocess the dataset
def preprocess_function(examples):
    return tokenizer(examples['sentence'], truncation=True, padding="max_length")

encoded_dataset = dataset.map(preprocess_function, batched=True)


Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

 Apply PEFT and LoRA

In [4]:
from peft import LoRAConfig, apply_lora

# Define LoRA configuration
lora_config = LoRAConfig(
    r=8,         # Rank
    lora_alpha=32, # Alpha parameter for LoRA
    lora_dropout=0.1 # Dropout rate
)

# Apply LoRA to the model
model = apply_lora(model, lora_config)


ImportError: cannot import name 'LoRAConfig' from 'peft' (/usr/local/lib/python3.10/dist-packages/peft/__init__.py)


It looks like the peft library's API might have changed or the LoRAConfig class isn't available. Lets revise the approach based on available documentation and common practices.

In [5]:
!pip install --upgrade peft




In [6]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from peft import get_peft_model, LoRAConfig

# Define LoRA configuration
lora_config = LoRAConfig(
    r=8,  # Rank
    lora_alpha=32,  # Alpha parameter for LoRA
    lora_dropout=0.1  # Dropout rate
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)


ImportError: cannot import name 'LoRAConfig' from 'peft' (/usr/local/lib/python3.10/dist-packages/peft/__init__.py)

In [8]:
!pip install --upgrade transformers bitsandbytes


Collecting transformers
  Downloading transformers-4.43.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m665.1 kB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.43.3-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.42.4
    Uninstalling transformers-4.42.4:
      Successfully uninstalled transformers-4.42.4
Successfully installed transformers-4.43.3


In [9]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [1]:
from bitsandbytes import quantize_model

# Quantize the model (8-bit quantization)
quantized_model = quantize_model(model, bits=8)




ImportError: cannot import name 'quantize_model' from 'bitsandbytes' (/usr/local/lib/python3.10/dist-packages/bitsandbytes/__init__.py)

In [2]:
!pip install --upgrade transformers torch


Collecting torch
  Downloading torch-2.4.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting triton==3.0.0 (from torch)
  Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading torch-2.4.0-cp310-cp310-manylinux1_x86_64.whl (797.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m797.2/797.2 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.4/209.4 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00

In [3]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)


KeyboardInterrupt: 

In [1]:
import torch

# Prepare the model for quantization
model.eval()  # Set the model to evaluation mode

# Apply dynamic quantization
quantized_model = torch.quantization.quantize_dynamic(
    model,  # The model to quantize
    {torch.nn.Linear},  # List of layers to quantize
    dtype=torch.qint8  # Quantization data type
)


NameError: name 'model' is not defined

In [2]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Load the model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
import torch

# Set the model to evaluation mode
model.eval()

# Apply dynamic quantization
quantized_model = torch.quantization.quantize_dynamic(
    model,  # The model to quantize
    {torch.nn.Linear},  # List of layers to quantize
    dtype=torch.qint8  # Quantization data type
)


In [4]:
from datasets import load_dataset

# Load and preprocess the dataset
dataset = load_dataset("glue", "sst2")

def preprocess_function(examples):
    return tokenizer(examples['sentence'], truncation=True, padding="max_length")

encoded_dataset = dataset.map(preprocess_function, batched=True)


In [6]:
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=.5,
    weight_decay=0.1,
)

# Initialize Trainer
trainer = Trainer(
    model=quantized_model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"]
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print("Evaluation results:", results)


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [7]:
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",               # Output directory for model checkpoints
    evaluation_strategy="steps",          # Evaluate every `eval_steps`
    eval_steps=500,                       # Number of steps between evaluations
    per_device_train_batch_size=16,       # Increased batch size
    per_device_eval_batch_size=16,        # Increased batch size for evaluation
    num_train_epochs=1,                   # Reduced number of epochs
    weight_decay=0.01,                    # Weight decay for regularization
    logging_dir="./logs",                 # Directory for logs
    logging_steps=100,                    # Number of steps between logging
)

# Initialize Trainer
trainer = Trainer(
    model=quantized_model,                # Your quantized model
    args=training_args,                   # Training arguments
    train_dataset=encoded_dataset["train"],  # Training dataset
    eval_dataset=encoded_dataset["validation"]  # Evaluation dataset
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print("Evaluation results:", results)


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [8]:
from datasets import load_dataset

# Load a small subset of the dataset (100 samples)
dataset = load_dataset("glue", "sst2", split='train[:100]')

def preprocess_function(examples):
    return tokenizer(examples['sentence'], truncation=True, padding="max_length")

encoded_dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [9]:
from transformers import Trainer, TrainingArguments

# Define training arguments for quick training
training_args = TrainingArguments(
    output_dir="./results",               # Output directory for model checkpoints
    evaluation_strategy="no",            # Skip evaluation to save time
    per_device_train_batch_size=4,        # Smaller batch size
    per_device_eval_batch_size=4,         # Smaller batch size for evaluation
    num_train_epochs=1,                   # Single epoch for quick training
    weight_decay=0.01,                    # Weight decay for regularization
    logging_dir="./logs",                 # Directory for logs
    logging_steps=10,                     # Frequent logging
)


it was taking 24 hours+ for executing so for us to To drastically reduce training time, you can make the following changes:

    Set Number of Epochs to 1: Keep it as low as possible.
    Reduce Batch Size: Use a smaller batch size to speed up the iterations.
    Minimize Dataset Size: Use a small subset of the dataset for quick experimentation.
    Skip Evaluation: Skip evaluation to save time.

In [10]:
from transformers import Trainer

# Initialize Trainer with updated arguments and dataset
trainer = Trainer(
    model=model,                          # Your model
    args=training_args,                   # Training arguments
    train_dataset=encoded_dataset         # Training dataset
)

# Train the model
trainer.train()


Step,Training Loss
10,0.7097
20,0.7117


TrainOutput(global_step=25, training_loss=0.7059617900848388, metrics={'train_runtime': 420.1321, 'train_samples_per_second': 0.238, 'train_steps_per_second': 0.06, 'total_flos': 13246739865600.0, 'train_loss': 0.7059617900848388, 'epoch': 1.0})

In [11]:
# Evaluate the model
results = trainer.evaluate()
print("Evaluation results:", results)

ValueError: Trainer: evaluation requires an eval_dataset.

In [12]:
model.save_pretrained("./quick_model")
tokenizer.save_pretrained("./quick_model")


('./quick_model/tokenizer_config.json',
 './quick_model/special_tokens_map.json',
 './quick_model/vocab.txt',
 './quick_model/added_tokens.json')

In [13]:
# Load a small subset of the validation dataset (100 samples)
eval_dataset = load_dataset("glue", "sst2", split='validation[:100]')
encoded_eval_dataset = eval_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [14]:
# Initialize Trainer with evaluation dataset
trainer = Trainer(
    model=model,                          # Your model
    args=training_args,                   # Training arguments
    train_dataset=encoded_dataset,        # Training dataset
    eval_dataset=encoded_eval_dataset     # Evaluation dataset
)


In [15]:
# Evaluate the model
results = trainer.evaluate()
print("Evaluation results:", results)


Evaluation results: {'eval_loss': 0.6684677004814148, 'eval_model_preparation_time': 0.0149, 'eval_runtime': 116.6672, 'eval_samples_per_second': 0.857, 'eval_steps_per_second': 0.214}


With these changes, you provide an evaluation dataset to the Trainer, enabling evaluation.

In [16]:
# Save the trained model
model.save_pretrained("./quick_model")

# Save the tokenizer
tokenizer.save_pretrained("./quick_model")


('./quick_model/tokenizer_config.json',
 './quick_model/special_tokens_map.json',
 './quick_model/vocab.txt',
 './quick_model/added_tokens.json')