In [1]:
# Clone the alignment-handbook fork and install it as a package so that
# `alignment.models.distilbert_cl` can be imported further down.
# NOTE(review): the clone target and `%cd` are relative paths, so re-running this
# cell from the new working directory would clone/cd into a nested copy —
# presumably meant to be run once on a fresh runtime; confirm.
!git clone https://github.com/wsklee/alignment-handbook.git
# Make the repository root the working directory so `pip install .` targets it.
%cd alignment-handbook
# Install the repository (and its declared dependencies) into the current environment.
!python -m pip install .

Cloning into 'alignment-handbook'...
remote: Enumerating objects: 1037, done.[K
remote: Counting objects: 100% (754/754), done.[K
remote: Compressing objects: 100% (414/414), done.[K
remote: Total 1037 (delta 515), reused 453 (delta 306), pack-reused 283 (from 1)[K
Receiving objects: 100% (1037/1037), 282.93 KiB | 1.58 MiB/s, done.
Resolving deltas: 100% (575/575), done.
/content/alignment-handbook
Processing /content/alignment-handbook
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bitsandbytes>=0.43.0 (from alignment-handbook==0.4.0.dev0)
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting evaluate==0.4.0 (from alignment-handbook==0.4.0.dev0)
  Downloading evaluate-0.4.0-py3-none-any.whl.metadata (9.4 kB)
Collecting datasets>=2.18.0 (from alignment-handbook==0.4.0.dev0)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting deepspeed>=0.14.4 (from alignment-handbook==0.4.0.dev0)
  Downloading deepsp

In [2]:
import torch
import random
import numpy as np

# Pin every random number generator used by the notebook to one seed so that
# repeated runs produce comparable results.
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)

# CUDA devices keep their own generators; seed all of them when a GPU is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed_value)

In [3]:
# Load the IMDB dataset via `datasets.load_dataset` (the cell output shows the
# parquet files being downloaded on first use).
from datasets import load_dataset
dataset = load_dataset("imdb")
# Keep handles on the two labelled splits; the "unsupervised" split that is also
# downloaded is not referenced by the cells visible in this notebook.
train_data = dataset["train"]
test_data = dataset["test"]

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:
# Display the test split summary (feature names and number of rows).
test_data

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [5]:
# Review texts of the test split; these are the inputs passed to `predict` later on.
X_test = test_data["text"]

In [6]:
# Ground-truth labels of the test split (0 = negative, 1 = positive), used by `evaluate_binary`.
y_true = test_data["label"]

## Create smaller train dataset for centroid calculation

In [7]:
from datasets import Dataset

# Take 1,000 positive and 1,000 negative training reviews. Each class is filtered
# by label, shuffled with a fixed seed (so the subset is reproducible) and then
# truncated to its first 1,000 rows.
small_positive = train_data.filter(lambda x: x['label'] == 1).shuffle(seed=42).select(range(1000))
small_negative = train_data.filter(lambda x: x['label'] == 0).shuffle(seed=42).select(range(1000))

# Combine into a smaller dataset: all positive rows first, followed by all negative rows.
small_train_data = Dataset.from_dict({
    "text": small_positive["text"] + small_negative["text"],
    "label": small_positive["label"] + small_negative["label"]
})

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [8]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          pipeline,
                          logging)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

## Load Custom class: DistilBertCLModel

In [9]:
from transformers import AutoModel
from alignment.models.distilbert_cl import DistilBertCLModel  # Custom class

# Checkpoint whose embeddings are used to compute the class centroids: this `model`
# object is the one handed to `calculate_centroids` further down.
# NOTE(review): the inference model later in the notebook is loaded from
# "wsklee/distilbert-sentiment-cft-v4", a different checkpoint name — confirm both
# produce embeddings in the same space, otherwise the centroids do not match.
model_name = "wsklee/distilbert-sentiment-imdb-cft" # CFT
model = DistilBertCLModel.from_pretrained(model_name)
# The tokenizer is loaded from the base DistilBERT checkpoint, not from `model_name`.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# Switch to evaluation mode; as the last expression of the cell this also renders
# the module structure as the cell output.
model.eval()

README.md:   0%|          | 0.00/6.52k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/83.9k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/164 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/559 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

DistilBertCLModel(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1)

In [10]:
# Pick the GPU when one is available, otherwise fall back to the CPU, and move the
# model there. `device` is reused later when computing the centroids.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# `model.to` is the last expression, so the module structure is rendered as the cell output.
model.to(device)

DistilBertCLModel(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1)

# Calculation of centroids

If a cached centroid file already exists, load it; otherwise, compute the centroids and save them.

In [11]:
from tqdm import tqdm
import torch

def _mean_embedding(model, sentences, batch_size, device, desc):
    """Average ``model.get_embedding`` over ``sentences``, ``batch_size`` texts at a time.

    Uses the notebook-level ``tokenizer`` to encode each batch. The running sum is
    created from the first batch, so the embedding width is taken from the model
    output instead of being hard-coded. Callers must pass a non-empty list.
    """
    embedding_sum = None
    sentence_count = 0

    for start in tqdm(range(0, len(sentences), batch_size), desc=desc):
        encodings = tokenizer(sentences[start:start + batch_size],
                              truncation=True, padding=True, return_tensors="pt")
        input_ids = encodings['input_ids'].to(device)
        attention_mask = encodings['attention_mask'].to(device)

        embeddings = model.get_embedding(input_ids, attention_mask)

        # Accumulate in float32 (as the original zero-initialised accumulator did).
        batch_sum = embeddings.sum(dim=0, dtype=torch.float32)
        embedding_sum = batch_sum if embedding_sum is None else embedding_sum + batch_sum
        sentence_count += embeddings.size(0)

    return embedding_sum / sentence_count


def calculate_centroids(model, train_data, batch_size=8, device='cuda'):
    """Compute the mean embedding (centroid) of the positive and negative examples.

    Args:
        model: Module exposing ``get_embedding(input_ids, attention_mask)``; it is
            moved to ``device`` and put in evaluation mode.
        train_data: Iterable of examples with a ``'text'`` and a ``'label'`` entry;
            label 1 is treated as positive and label 0 as negative, any other
            label is ignored.
        batch_size: Number of texts encoded and embedded per step (must be >= 1).
        device: Device on which the embeddings are computed.

    Returns:
        Tuple ``(positive_centroid, negative_centroid)`` of 1-D tensors.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if either class has no
            examples (the mean would otherwise silently become NaN).

    Note:
        Encoding relies on the notebook-level ``tokenizer`` variable.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Split the texts by label in a single pass over the data.
    positive_sentences, negative_sentences = [], []
    for ex in train_data:
        if ex['label'] == 1:
            positive_sentences.append(ex['text'])
        elif ex['label'] == 0:
            negative_sentences.append(ex['text'])

    if not positive_sentences or not negative_sentences:
        raise ValueError(
            "train_data must contain at least one positive (label 1) and one "
            f"negative (label 0) example; got {len(positive_sentences)} positive "
            f"and {len(negative_sentences)} negative."
        )

    model.to(device)
    model.eval()

    # No gradients are needed. Per-batch tensors are released when they go out of
    # scope, so the cache is no longer flushed after every step (which only slowed
    # the loop down).
    with torch.no_grad():
        positive_centroid = _mean_embedding(
            model, positive_sentences, batch_size, device, "Processing positive batches")
        negative_centroid = _mean_embedding(
            model, negative_sentences, batch_size, device, "Processing negative batches")

    return positive_centroid, negative_centroid


In [12]:
import os
import torch

# Location of the cached centroids (relative to the current working directory).
# NOTE(review): the cache is not keyed by model checkpoint or training subset, so a
# file left over from a different model would be loaded silently — delete it when
# either of those changes.
centroids_path = 'centroids.pth'

positive_centroid = None
negative_centroid = None

if os.path.exists(centroids_path):
    # Load the existing centroids. `map_location` remaps the stored tensors onto the
    # device in use, so a cache written on a GPU runtime still loads on a CPU-only one.
    centroids = torch.load(centroids_path, map_location=device)
    positive_centroid = centroids['positive_centroid']
    negative_centroid = centroids['negative_centroid']
    print("Loaded existing centroids.")
else:
    # Calculate new centroids if the file does not exist
    positive_centroid, negative_centroid = calculate_centroids(model, small_train_data, batch_size=2, device=device)
    print("Calculated new centroids.")

    # Save the centroids so later runs can skip the computation
    torch.save({
        'positive_centroid': positive_centroid,
        'negative_centroid': negative_centroid
    }, centroids_path)
    print("Saved new centroids.")

# Centroid shapes (one vector per class)
print(f"Positive Centroid Shape: {positive_centroid.shape}")
print(f"Negative Centroid Shape: {negative_centroid.shape}")


Processing positive batches: 100%|██████████| 500/500 [00:11<00:00, 44.50it/s]
Processing negative batches: 100%|██████████| 500/500 [00:10<00:00, 47.80it/s]

Calculated new centroids.
Saved new centroids.
Positive Centroid Shape: torch.Size([768])
Negative Centroid Shape: torch.Size([768])





In [13]:
# Sanity check: print the negative-class centroid values.
print(negative_centroid)

tensor([-2.1365e-01, -2.7614e-01,  5.8479e-01, -2.4386e-01,  1.1495e-02,
        -1.1747e-01, -2.5277e-01,  4.3825e-01, -5.1148e-01, -2.0746e-01,
        -9.6473e-03, -5.1843e-01, -1.8106e-01,  1.0139e+00,  4.5002e-01,
         3.6146e-01,  3.5745e-01, -2.2844e-01,  4.6450e-01, -8.9819e-02,
        -2.3369e-01, -2.2825e-02,  2.7097e-01,  4.3917e-01,  2.1760e-01,
        -6.8147e-02,  1.9066e-01, -3.8505e-02,  9.5055e-02, -1.1345e-01,
         1.1337e-01, -1.6571e-01, -4.6878e-01, -1.4902e-01,  2.1152e-02,
        -3.5876e-01, -5.3641e-02, -2.8062e-01, -4.5896e-01, -4.0232e-01,
        -7.4323e-01, -5.1026e-01,  3.7737e-01,  1.5566e-01, -4.5608e-01,
        -5.2461e-01, -1.3093e+00,  5.0293e-01,  1.8630e-01,  3.5165e-01,
         1.1507e-01, -2.4504e-01, -6.5202e-02, -9.3156e-02,  2.8230e-01,
         3.0194e-01, -3.1814e-01, -7.5634e-01,  2.7253e-02, -4.0137e-01,
         1.8936e-01,  2.2266e-01, -7.4667e-02, -8.1238e-01,  2.6533e-01,
        -2.4065e-01, -1.7160e-01,  7.0814e-01, -8.1

# Create DistilBERTCL-Inference Model
Since the pretrained DistilBERTCL model accepts pairs of sentences as input, we need a DistilBERTCL-Inference model that accepts a single sentence for inference.

In [14]:
import torch
from transformers import DistilBertModel, DistilBertPreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput
from dataclasses import dataclass
from typing import Optional
from alignment.models.distilbert_cl import DistilBertCLModel


class DistilBertCLInferenceModel(DistilBertCLModel):
    """Single-sentence classifier on top of ``DistilBertCLModel`` embeddings.

    Each input is embedded with the inherited ``get_embedding`` and compared, by
    cosine similarity, against a positive and a negative class centroid. The two
    similarities are returned as logits in the order ``[negative, positive]``, so
    index 0 corresponds to the negative class and index 1 to the positive class.

    The centroids start as zero placeholders and must be provided through
    ``set_centroids`` before the predictions are meaningful.
    """

    def __init__(self, config):
        super().__init__(config)
        # Zero placeholders with the embedding width used in this notebook (768).
        # They are created on the CPU so the model can also be built on machines
        # without a GPU; `forward` moves them next to the embeddings.
        self.positive_centroid = torch.zeros(768)
        self.negative_centroid = torch.zeros(768)

    def forward(self, input_ids=None, attention_mask=None, return_dict=True):
        """Embed the input sentence(s) and score them against both centroids.

        Returns the logits tensor of shape ``(batch, 2)`` when ``return_dict`` is
        false, otherwise a ``SequenceClassifierOutput`` wrapping it.
        """
        # Single sentence input here, no pairs.
        sent_embed = self.get_embedding(input_ids, attention_mask)

        # Keep the stored centroids on the same device as the embeddings so that
        # later calls do not have to copy them again.
        device = sent_embed.device
        self.positive_centroid = self.positive_centroid.to(device)
        self.negative_centroid = self.negative_centroid.to(device)

        logits = self.classify_sentiment(sent_embed)

        if not return_dict:
            return logits

        return SequenceClassifierOutput(
            logits=logits
        )

    def classify_sentiment(self, embedding):
        """Return cosine similarities to the centroids as ``[negative, positive]`` logits.

        Args:
            embedding: Tensor of shape ``(batch, hidden)``; a single ``(hidden,)``
                vector is treated as a batch of one.

        Returns:
            Tensor of shape ``(batch, 2)`` on the device of ``embedding``.
        """
        if embedding.dim() == 1:
            embedding = embedding.unsqueeze(0)

        # Reshape the centroids to (1, hidden) so they broadcast against every row,
        # and align device/dtype with the embeddings (a no-op when already aligned).
        positive_centroid = self.positive_centroid.to(
            device=embedding.device, dtype=embedding.dtype).reshape(1, -1)
        negative_centroid = self.negative_centroid.to(
            device=embedding.device, dtype=embedding.dtype).reshape(1, -1)

        # Higher similarity means the sentence is closer to that class centroid.
        positive_similarity = torch.cosine_similarity(embedding, positive_centroid, dim=-1)
        negative_similarity = torch.cosine_similarity(embedding, negative_centroid, dim=-1)

        # Stack per row instead of rebuilding a tensor from Python scalars: this
        # yields (batch, 2) for any batch size, whereas the scalar conversion only
        # worked for a batch of exactly one sentence.
        return torch.stack([negative_similarity, positive_similarity], dim=-1)

    def set_centroids(self, positive_centroid, negative_centroid):
        """Store the class centroids used by ``classify_sentiment``."""
        self.positive_centroid = positive_centroid
        self.negative_centroid = negative_centroid


In [15]:
from transformers import AutoModelForSequenceClassification

# Load the weights for the single-sentence inference wrapper and switch it to
# evaluation mode.
# NOTE(review): the centroids were computed earlier with the checkpoint
# "wsklee/distilbert-sentiment-imdb-cft", while this cell loads
# "wsklee/distilbert-sentiment-cft-v4" — confirm the two share the same embedding
# space, otherwise the centroids do not belong to this model.
model_name = "wsklee/distilbert-sentiment-cft-v4"
inference_model = DistilBertCLInferenceModel.from_pretrained(model_name)
inference_model.eval()

# Set the centroids in the inference model (replaces the zero placeholders created in __init__)
inference_model.set_centroids(positive_centroid, negative_centroid)

config.json:   0%|          | 0.00/559 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

# Sample prediction with custom text

In [16]:
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          pipeline)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Wrap the inference model and tokenizer in a text-classification pipeline.
# Use the first GPU when one is available and fall back to the CPU (-1) otherwise,
# instead of assuming that GPU 0 exists.
pipe = pipeline(task="text-classification", model=inference_model, tokenizer=tokenizer, framework="pt",
                device=0 if torch.cuda.is_available() else -1)

# Sample texts for inference
texts = [
    "The movie was absolutely fantastic!",
    "I didn't enjoy the film; it was quite boring.",
    "An average experience, nothing special or memorable."
]

# Perform inference
# LABEL_0 for negative, LABEL_1 for positive
results = pipe(texts)
for text, result in zip(texts, results):
    print(f"Input: {text}")
    print(f"Output: {result}\n")

The model 'DistilBertCLInferenceModel' is not supported for text-classification. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequenceClassification', 'ErnieForSequenceClassification', 'ErnieMForSequenceClassification', 'EsmForSequenceClassification', 'FalconForSequenceClassification', 'FlaubertForSequenceClassification', 'FNetForSequenceClassification', 'FunnelForSequenceClassification', 'GemmaForSequenceClassification', 'Gemma

Input: The movie was absolutely fantastic!
Output: {'label': 'LABEL_1', 'score': 0.5538995265960693}

Input: I didn't enjoy the film; it was quite boring.
Output: {'label': 'LABEL_0', 'score': 0.5332493782043457}

Input: An average experience, nothing special or memorable.
Output: {'label': 'LABEL_0', 'score': 0.5219409465789795}



# Prediction with test data

In [17]:
from transformers import pipeline
from tqdm import tqdm

def predict(test, model, adapter, batch_size=16, device=None):
    """Classify every text in ``test`` and return the standardized labels.

    Args:
        test: Sliceable sequence of input texts.
        model: Model passed to the ``text-classification`` pipeline.
        adapter: Callable that maps one pipeline result (a dict with a ``'label'``
            entry) to a standardized label string.
        batch_size: Number of texts handed to the pipeline per call (must be >= 1).
            This only chunks the input; it is not forwarded to the pipeline.
        device: Device for the pipeline. Defaults to the first GPU when CUDA is
            available and to the CPU (-1) otherwise.

    Returns:
        List with one standardized label per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Note:
        The pipeline is built with the notebook-level ``tokenizer`` variable.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    if device is None:
        # Do not assume GPU 0 exists; fall back to the CPU when CUDA is unavailable.
        device = 0 if torch.cuda.is_available() else -1

    pipe = pipeline(task="text-classification", model=model, tokenizer=tokenizer, framework="pt", device=device)

    y_pred = []
    for i in tqdm(range(0, len(test), batch_size)):
        batch = test[i:i+batch_size]
        # Truncate over-long texts to the model's maximum input length.
        results = pipe(batch, truncation=True)
        y_pred.extend(adapter(result) for result in results)

    return y_pred


In [18]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from collections import Counter

def evaluate_binary(y_true, y_pred):
    """Print accuracy, a classification report and a confusion matrix.

    Args:
        y_true: Iterable of ground-truth labels; 0 is negative, anything else is
            treated as positive.
        y_pred: Iterable of predicted label strings; ``'positive'`` maps to 1 and
            ``'negative'`` to 0. Any other value is counted as negative (0), and a
            note is printed when that happens.

    Returns:
        None. The metrics are printed.
    """
    # Use binary mapping
    mapping = {'positive': 1, 'negative': 0}
    class_ids = [0, 1]
    class_names = ['negative', 'positive']

    # Convert y_true and y_pred based on binary mapping
    y_true_binary = [0 if label == 0 else 1 for label in y_true]  # IMDB labels are 0 and 1
    y_pred_binary = [mapping.get(label, 0) for label in y_pred]

    # Make the fallback visible instead of silently scoring unknown labels as negative.
    unmapped = sum(1 for label in y_pred if label not in mapping)
    if unmapped:
        print(f"Note: {unmapped} prediction(s) had an unrecognized label and were counted as negative.")

    # Calculate overall accuracy
    accuracy = accuracy_score(y_true=y_true_binary, y_pred=y_pred_binary)
    print(f"Binary Accuracy: {accuracy:.3f}")

    # Passing the label set explicitly keeps the report and the matrix two-class even
    # when one class is absent from the data; without it, `target_names` no longer
    # matches the detected classes and the report raises.
    print("\nBinary Classification Report:")
    print(classification_report(y_true=y_true_binary, y_pred=y_pred_binary,
                                labels=class_ids, target_names=class_names))

    print("\nBinary Confusion Matrix:")
    print(confusion_matrix(y_true=y_true_binary, y_pred=y_pred_binary, labels=class_ids))




In [19]:
# Adapter for ('LABEL_0'/'LABEL_1')
def adapter_model_numeric(output):
    """Translate a result labelled 'LABEL_1'/'LABEL_0' into 'positive'/'negative'.

    ``output`` is a mapping with a ``'label'`` entry; any other label yields 'none'.
    """
    raw_label = output['label']
    for known_label, sentiment in (('LABEL_1', 'positive'), ('LABEL_0', 'negative')):
        if raw_label == known_label:
            return sentiment
    return 'none'

# Adapter for ('POSITIVE'/'NEGATIVE'/'NEUTRAL')
def adapter_model_text(output):
    """Translate a textual result label into 'positive', 'negative' or 'neutral'.

    The label is lower-cased and searched for each sentiment word in that order;
    the first word found wins, and 'none' is returned when none of them occurs.
    """
    lowered = output['label'].lower()
    return next(
        (sentiment for sentiment in ('positive', 'negative', 'neutral') if sentiment in lowered),
        'none',
    )

# Evaluation

In [20]:
# Classify all test reviews with the centroid-based inference model; the numeric
# adapter turns the 'LABEL_0'/'LABEL_1' outputs into 'negative'/'positive'.
y_pred = predict(X_test, inference_model, adapter_model_numeric, batch_size=16)

The model 'DistilBertCLInferenceModel' is not supported for text-classification. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequenceClassification', 'ErnieForSequenceClassification', 'ErnieMForSequenceClassification', 'EsmForSequenceClassification', 'FalconForSequenceClassification', 'FlaubertForSequenceClassification', 'FNetForSequenceClassification', 'FunnelForSequenceClassification', 'GemmaForSequenceClassification', 'Gemma

In [21]:
# Print accuracy, the per-class report and the confusion matrix for the test predictions.
evaluate_binary(y_true, y_pred)

Binary Accuracy: 0.890

Binary Classification Report:
              precision    recall  f1-score   support

    negative       0.97      0.80      0.88     12500
    positive       0.83      0.97      0.90     12500

    accuracy                           0.89     25000
   macro avg       0.90      0.89      0.89     25000
weighted avg       0.90      0.89      0.89     25000


Binary Confusion Matrix:
[[10055  2445]
 [  316 12184]]


In [22]:
# Print the first 100 predictions and true labels for a quick visual check.
# Note that y_pred holds label strings while y_true holds the dataset's 0/1 labels.
print("Sample y_pred:", y_pred[:100])
print("Sample y_true:", y_true[:100])

Sample y_pred: ['negative', 'positive', 'negative', 'negative', 'positive', 'negative', 'positive', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'positive', 'negative', 'negative', 'positive', 'negative', 'positive', 'negative', 'negative', 'negative', 'negative', 'positive', 'positive', 'positive', 'negative', 'positive', 'negative', 'negative', 'positive', 'negative', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'negative', 'positive', 'negative', 'negative', 'negative', 'negative', 'positive', 'negative', 'negative', 'negative', 'negative', 'positive', 'negative', 'negative', 'negative', 'positive', 'negative', 'negative', 'negative', 'negative', 'negative', 'positive', 'negative', 'positive', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'positive', 'negative', 'positive', 'positive', 'negative', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 