# 1. Install Dependencies

In [1]:
# Install required libraries
!pip install datasets transformers sentence-transformers scikit-learn

!apt-get install git-lfs

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting sentence-transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
[K     |████████████████████████████████| 255 kB 18.5 MB/s 
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.1



W: Not using locking for read only lock file /var/lib/dpkg/lock-frontend
W: Not using locking for read only lock file /var/lib/dpkg/lock
E: Unable to locate package git-lfs


# 2. Preprocess data

In [2]:
# Fetch the IMDB dataset through the Hugging Face `datasets` library.
# The printed summary lists the available splits (train / test / unsupervised),
# each exposing a 'text' and a 'label' feature.
from datasets import load_dataset

imdb = load_dataset(path="imdb")
print(imdb)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [3]:
# Subsample sizes and shuffle seed, named so they can be tuned in one place.
SEED = 42
N_TRAIN = 7000
N_TEST = 3000

# Shuffle each split deterministically, then keep the first N rows as the subsample.
# `select` accepts a range directly, so no intermediate index list is needed.
train_dataset = imdb['train'].shuffle(seed=SEED).select(range(N_TRAIN))
test_dataset = imdb['test'].shuffle(seed=SEED).select(range(N_TEST))

# Read whole columns at once instead of materialising a dict for every row;
# list() keeps the results as plain Python lists for the downstream cells.
train_texts = list(train_dataset['text'])
train_labels = list(train_dataset['label'])

test_texts = list(test_dataset['text'])
test_labels = list(test_dataset['label'])

# Sanity check: report the size of each subsample.
print(len(train_dataset))
print(len(test_dataset))




7000
3000


# 3. Linear Probing

In [4]:
import torch
import optuna
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import evaluate
from sentence_transformers import SentenceTransformer

# Fix the PyTorch RNG seed before loading the encoder.
torch.manual_seed(42)

# Pretrained sentence encoder; it is only used to produce embeddings here,
# the classifier on top is trained in a later cell.
model = SentenceTransformer('all-mpnet-base-v2')

# Both splits are embedded with the same settings, so keep them in one place.
encode_options = {"batch_size": 32, "show_progress_bar": True}
train_embeddings = model.encode(train_texts, **encode_options)
test_embeddings = model.encode(test_texts, **encode_options)


Batches: 100%|██████████| 219/219 [00:15<00:00, 13.74it/s]
Batches: 100%|██████████| 94/94 [00:04<00:00, 21.08it/s]


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Linear probe: fit a logistic-regression classifier on the training embeddings.
# `fit` returns the estimator itself, so construction and training can be chained.
clf = LogisticRegression(max_iter=1000).fit(train_embeddings, train_labels)

# Predicted labels for the held-out test embeddings.
preds = clf.predict(test_embeddings)

# Score the predictions against the true test labels with each metric in turn.
acc, f1, recall, precision = (
    metric(test_labels, preds)
    for metric in (accuracy_score, f1_score, recall_score, precision_score)
)
print(f"Test Accuracy: {acc:.4f}, Test F1: {f1:.4f}, Test Recall: {recall:.4f}, Test Precision: {precision:.4f}")

Test Accuracy: 0.8850, Test F1: 0.8859, Test Recall: 0.8993, Test Precision: 0.8729
