In [1]:
!pip install transformers datasets accelerate loralib

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting loralib
  Downloading loralib-0.1.2-py3-none-any.whl.metadata (15 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading loralib-0.1.2-py3-none-any.whl (10 kB)
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments, DistilBertTokenizer
from datasets import load_dataset
from transformers import get_scheduler
import loralib as lora
import time

from datasets import concatenate_datasets

from sklearn.metrics import precision_recall_fscore_support
import numpy as np
import pandas as pd

# Create dataset

In [3]:
# Fetch the IMDB dataset (the splits used below are "train" and "test").
dataset = load_dataset("imdb")

# Tokenizer matching the uncased DistilBERT checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize one batch of examples.

    Args:
        examples: Batch mapping whose ``"text"`` entry holds the raw reviews.

    Returns:
        The tokenizer output for the batch, padded to the tokenizer's maximum
        length and truncated where a review is longer than that.
    """
    review_texts = examples["text"]
    return tokenizer(review_texts, padding="max_length", truncation=True)


# Apply the tokenizer to every split in batches, then pull out the two splits
# used for training and evaluation.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
train_dataset, test_dataset = (
    tokenized_datasets["train"],
    tokenized_datasets["test"],
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:
# Build a balanced 5,000-example training subset: 2,500 reviews drawn from
# rows 0-12,499 (treated as the negative class) and 2,500 from rows
# 12,500-24,999 (treated as the positive class), each shuffled with seed 42.
# NOTE(review): this relies on the split being ordered negatives-first — confirm.
negative_samples_train, positive_samples_train = (
    train_dataset.select(class_rows).shuffle(seed=42).select(range(2500))
    for class_rows in (range(12500), range(12500, 25000))
)

balanced_train_dataset = concatenate_datasets(
    [negative_samples_train, positive_samples_train]
)

In [5]:
# Build a balanced 5,000-example evaluation subset: 2,500 reviews drawn from
# rows 0-12,499 (treated as the negative class) and 2,500 from rows
# 12,500-24,999 (treated as the positive class), each shuffled with seed 42.
# NOTE(review): this relies on the split being ordered negatives-first — confirm.
negative_samples_test, positive_samples_test = (
    test_dataset.select(class_rows).shuffle(seed=42).select(range(2500))
    for class_rows in (range(12500), range(12500, 25000))
)

balanced_test_dataset = concatenate_datasets(
    [negative_samples_test, positive_samples_test]
)

In [6]:
# Display the feature names and row count of the balanced training subset.
balanced_train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 5000
})

In [7]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [8]:
# `torch` is already imported in the notebook's import cell; repeating the
# import is harmless and only keeps this cell runnable on its own.
import torch
# Ask PyTorch to release cached, currently unused CUDA memory before the
# model is loaded in a later cell.
torch.cuda.empty_cache()


In [9]:
def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into binary precision, recall and F1.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        dict with the keys ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_classes = np.argmax(logits, axis=-1)

    scores = precision_recall_fscore_support(
        labels,
        predicted_classes,
        average="binary",
        zero_division=1,
    )
    # `scores` has four entries; the last one (support) is not reported.
    return dict(zip(("precision", "recall", "f1"), scores[:3]))


In [None]:
# NOTE(review): a 40-character hexadecimal token had been pasted into this
# (never executed) cell. It is not valid Python and looks like an API key —
# possibly the Weights & Biases key the training run prompts for; TODO confirm
# and revoke it. Provide credentials through an environment variable, the
# platform's secrets store, or the interactive prompt rather than storing
# them in the notebook.

In [14]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
import torch
from datasets import Dataset
import numpy as np

# 1. Base model and tokenizer
# DistilBERT with a two-label classification head, moved to the selected device.
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 2. LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,  # rank of the low-rank update
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    # Adapted DistilBERT modules: the attention query, value and output
    # projections plus both feed-forward linear layers.
    target_modules=["q_lin", "v_lin", "out_lin", "ffn.lin1", "ffn.lin2"],
    # Keep the classification head with the adapter when it is saved.
    modules_to_save=["classifier"],
)

# 3. Wrap the base model with the LoRA adapter
# NOTE(review): prepare_model_for_kbit_training is aimed at quantized (k-bit)
# models, and this model is loaded without quantization — confirm the call is
# still wanted.
model = prepare_model_for_kbit_training(model)
peft_model = get_peft_model(model, lora_config)
peft_model = peft_model.to(device)

# Report how many parameters will be updated during training.
print("Trainable parameters:")
peft_model.print_trainable_parameters()



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable parameters:
trainable params: 1,181,954 || all params: 68,136,964 || trainable%: 1.7347


In [16]:
def compute_metrics(eval_pred):
    """Compute binary precision, recall and F1 for an evaluation pass.

    NOTE(review): this notebook defines ``compute_metrics`` in an earlier cell
    as well, with the same behavior; whichever cell runs last is the one used.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        dict mapping ``"precision"``, ``"recall"`` and ``"f1"`` to their scores.
    """
    logits, labels = eval_pred
    # Class prediction = position of the highest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)

    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predictions, average="binary", zero_division=1
    )

    return dict(precision=precision, recall=recall, f1=f1)

In [18]:
# 6. Training configuration
training_args = TrainingArguments(
    # Directory that receives the per-epoch checkpoints.
    output_dir="./peft_model_outputs",
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch size on each device for training and for evaluation.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch so that the best checkpoint can
    # be reloaded when training finishes.
    # NOTE(review): newer transformers releases spell the first option
    # `eval_strategy` — check against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# 7. Training function
def train_with_lora():
    """Fine-tune the LoRA-wrapped model, save it, and print final metrics.

    Uses the notebook-level ``peft_model``, ``training_args``,
    ``balanced_train_dataset``, ``balanced_test_dataset`` and
    ``compute_metrics``. The trained model is written to
    ``./peft_model_final``; precision, recall and F1 from a final evaluation
    pass are printed. Nothing is returned.
    """
    trainer = Trainer(
        model=peft_model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=balanced_train_dataset,
        eval_dataset=balanced_test_dataset,
    )

    trainer.train()

    # Persist the fine-tuned model before running the final evaluation pass.
    peft_model.save_pretrained("./peft_model_final")

    # Metric keys returned by the trainer carry an "eval_" prefix.
    eval_result = trainer.evaluate()
    print("\nFinal Evaluation Metrics:")
    print(f"Precision: {eval_result['eval_precision']}")
    print(f"Recall: {eval_result['eval_recall']}")
    print(f"F1 Score: {eval_result['eval_f1']}")





In [19]:
# Run training, save the fine-tuned model, and print the final evaluation metrics.
train_with_lora()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.4546,0.284193,0.890044,0.8904,0.890222
2,0.3151,0.274853,0.885303,0.9108,0.897871
3,0.2902,0.275289,0.899002,0.9008,0.8999



Final Evaluation Metrics:
Precision: 0.885303265940902
Recall: 0.9108
F1 Score: 0.8978706624605678
