# 1. Install Dependencies

In [1]:
# Install required libraries
!pip install datasets transformers evaluate optuna peft
!apt-get install git-lfs

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading 

#2. Preprocess data

In [None]:
# Load data
from datasets import load_dataset
imdb = load_dataset("imdb")
print(imdb)

In [None]:
train_dataset = imdb['train'].shuffle(seed=42)
val_dataset = train_dataset.select([i for i in list(range(3000))])
train_dataset = train_dataset.select([i for i in list(range(3000, 7000))])
test_dataset = imdb['test'].shuffle(seed=42).select([i for i in list(range(3000))])

print(len(train_dataset))
print(len(test_dataset))
print(len(val_dataset))



In [28]:
import torch
import optuna
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import evaluate

torch.manual_seed(42)

# Load DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

#3. Hyperparameter Tuning

In [31]:
def objective(trial: optuna.Trial):
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
    model = get_peft_model(model, config)
    batch_size = trial.suggest_categorical('batch_size', [16, 32])
    learning_rate = trial.suggest_categorical('learning_rate', [5e-5, 3e-5, 2e-5])
    num_epochs = trial.suggest_categorical('num_epochs', [2, 3, 4])

    training_args = TrainingArguments(
        output_dir="./results",
        logging_steps=50,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs
        )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset)

    trainer.train()
    # Evaluate the model on the validation dataset
    eval_result = trainer.evaluate()

    # Return validation loss (you can add other metrics here as needed)
    return eval_result["eval_loss"]


study = optuna.create_study(study_name='hp-search-distilbert', direction='minimize')
study.optimize(func=objective, n_trials=1)

best_lr = float(study.best_params['learning_rate'])
best_batch_size = study.best_params['batch_size']
best_epoch = int(study.best_params['num_epochs'])

print(f"Best Learning Rate: {best_lr}")
print(f"Best Batch Size: {best_batch_size}")
print(f"Best Epochs: {best_epoch}")

[I 2025-04-07 15:12:21,063] A new study created in memory with name: hp-search-distilbert
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
50,0.6872
100,0.6752


[I 2025-04-07 15:16:07,846] Trial 0 finished with value: 0.6641925573348999 and parameters: {'batch_size': 32, 'learning_rate': 2e-05, 'num_epochs': 1}. Best is trial 0 with value: 0.6641925573348999.


Best Learning Rate: 2e-05
Best Batch Size: 32
Best Epochs: 1


#4. Results

In [15]:
# Define evaluation metrics
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
recall_metric = evaluate.load("recall")
precision_metric = evaluate.load("precision")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = f1_metric.compute(predictions=predictions, references=labels)["f1"]
    recall = recall_metric.compute(predictions=predictions, references=labels)["recall"]
    precision = precision_metric.compute(predictions=predictions, references=labels)["precision"]
    return {"accuracy": accuracy, "f1": f1, "recall": recall, "precision": precision}

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


from datasets import concatenate_datasets
combined_train_dataset = concatenate_datasets([train_dataset, val_dataset])

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=best_lr,
    per_device_train_batch_size=best_batch_size,
    per_device_eval_batch_size=best_batch_size,
    num_train_epochs=best_epoch
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=combined_train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
train_results = trainer.train()
print(train_results)

# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


TrainOutput(global_step=48, training_loss=0.5630646546681722, metrics={'train_runtime': 42.7203, 'train_samples_per_second': 17.556, 'train_steps_per_second': 1.124, 'total_flos': 99350548992000.0, 'train_loss': 0.5630646546681722, 'epoch': 3.0})


{'eval_loss': 0.2987764775753021, 'eval_accuracy': 0.944, 'eval_f1': 0.940677966101695, 'eval_recall': 0.9173553719008265, 'eval_precision': 0.9652173913043478, 'eval_runtime': 3.6748, 'eval_samples_per_second': 68.031, 'eval_steps_per_second': 4.354, 'epoch': 3.0}


In [19]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
print(model)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [10]:
import torch
from transformers import Conv1D

def get_specific_layer_names(model):
    # Create a list to store the layer names
    layer_names = []

    # Recursively visit all modules and submodules
    for name, module in model.named_modules():
        # Check if the module is an instance of the specified layers
        if isinstance(module, (torch.nn.Linear, torch.nn.Embedding, torch.nn.Conv2d, Conv1D)):
            # model name parsing

            layer_names.append('.'.join(name.split('.')[4:]).split('.')[0])

    return layer_names

list(set(get_specific_layer_names(model)))

['', 'ffn', 'attention']

In [24]:
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training

target_modules = ['q_lin', 'k_lin', 'v_lin', 'out_lin', 'lin1', 'lin2']
config = LoraConfig(
    r=16, lora_alpha=16, target_modules=target_modules, lora_dropout=0.1, bias="none", task_type="SEQ_CLS"
)

#model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, config)
model.print_trainable_parameters()



trainable params: 1,919,234 || all params: 68,874,244 || trainable%: 2.7866
