In [None]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
import torch
import random
import numpy as np

# Seed every random number generator used here so repeated runs are comparable.
seed_value = 42  # any integer works; change it to get a different random stream
for _seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    _seed_fn(seed_value)

# When a GPU is available, seed CUDA on all devices as well.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed_value)

## Datasets

In [None]:
# Load the IMDB dataset through the Hugging Face `datasets` library.
from datasets import load_dataset
dataset = load_dataset("imdb")  # indexed by split name on the next line
test_data = dataset["test"]  # the test split is what the cells shown below evaluate on

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
test_data

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [None]:
# Review texts of the test split; these are the inputs later passed to predict().
X_test = test_data["text"]

In [None]:
# Ground-truth labels of the test split (0 = negative, 1 = positive, as assumed by evaluate_binary).
y_true = test_data["label"]

## Evaluation functions

In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [None]:
from transformers import pipeline
from tqdm import tqdm

def predict(test, model_name, adapter, batch_size=16, tokenizer_name=None):
    """Classify every text in ``test`` and return adapter-standardized labels.

    Args:
        test: Sliceable sequence of input strings (e.g. a list of reviews).
        model_name: Model identifier or path handed to ``transformers.pipeline``.
        adapter: Callable that maps one pipeline result dict to a
            standardized label; it is applied to every prediction.
        batch_size: Number of texts sent to the pipeline per call.
        tokenizer_name: Optional tokenizer identifier. When falsy, no explicit
            tokenizer is passed and the pipeline picks its own default.

    Returns:
        A list with one standardized label per input text, in input order.
    """
    # tokenizer=None is the pipeline's default, so one pipeline() call covers both cases.
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) if tokenizer_name else None

    # Use the first GPU when one is present; otherwise run on CPU (-1) instead of failing.
    device = 0 if torch.cuda.is_available() else -1

    pipe = pipeline(
        task="text-classification",
        model=model_name,
        tokenizer=tokenizer,
        framework="pt",
        device=device,
    )

    y_pred = []
    for start in tqdm(range(0, len(test), batch_size)):
        # list(...) hands the pipeline a plain list even if `test` is another sliceable type.
        batch = list(test[start:start + batch_size])
        results = pipe(batch, truncation=True)
        y_pred.extend(adapter(result) for result in results)

    return y_pred


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from collections import Counter

def evaluate_binary(y_true, y_pred):
    """Print accuracy, a classification report and a confusion matrix for binary sentiment.

    Args:
        y_true: Iterable of ground-truth labels; 0 is treated as negative and
            any other value as positive.
        y_pred: Iterable of predicted label strings. 'positive' maps to 1 and
            'negative' to 0. Any other label (e.g. 'neutral', 'none') is
            counted as negative, and a warning listing those labels is printed.

    Returns:
        None. All results are written to stdout.
    """
    # Use binary mapping
    mapping = {'positive': 1, 'negative': 0}
    # Fixed class order so the report and matrix always cover both classes,
    # even when one of them is absent from the data.
    class_ids = [0, 1]
    class_names = ['negative', 'positive']

    y_pred = list(y_pred)  # materialize once; it is scanned twice below

    # Convert y_true and y_pred based on binary mapping
    y_true_binary = [0 if label == 0 else 1 for label in y_true]  # IMDB labels are 0 and 1
    y_pred_binary = [mapping.get(label, 0) for label in y_pred]

    # Surface predictions that fell through the mapping instead of hiding them.
    unmapped = Counter(label for label in y_pred if label not in mapping)
    if unmapped:
        print(f"Warning: {sum(unmapped.values())} prediction(s) with unmapped labels "
              f"{dict(unmapped)} were counted as negative.")

    # Calculate overall accuracy
    accuracy = accuracy_score(y_true=y_true_binary, y_pred=y_pred_binary)
    print(f"Binary Accuracy: {accuracy:.3f}")

    # Generate binary classification report
    print("\nBinary Classification Report:")
    print(classification_report(y_true=y_true_binary, y_pred=y_pred_binary,
                                labels=class_ids, target_names=class_names))

    # Generate binary confusion matrix
    print("\nBinary Confusion Matrix:")
    print(confusion_matrix(y_true=y_true_binary, y_pred=y_pred_binary, labels=class_ids))




## Adapter functions

In [None]:
def adapter_model_numeric(output):
    """Translate a 'LABEL_0'/'LABEL_1' pipeline result into a sentiment string.

    Returns 'positive' for 'LABEL_1', 'negative' for 'LABEL_0', and 'none'
    for any other value of ``output['label']``.
    """
    raw_label = output['label']
    for model_label, sentiment in (('LABEL_1', 'positive'), ('LABEL_0', 'negative')):
        if raw_label == model_label:
            return sentiment
    return 'none'

def adapter_model_text(output):
    """Translate a textual pipeline label ('POSITIVE'/'NEGATIVE'/'NEUTRAL') into a sentiment string.

    The label is lower-cased and searched for 'positive', then 'negative',
    then 'neutral'; the first substring found is returned. 'none' is returned
    when none of them occurs.
    """
    lowered = output['label'].lower()
    # Order matters: earlier entries win when several substrings are present.
    for sentiment in ('positive', 'negative', 'neutral'):
        if sentiment in lowered:
            return sentiment
    return 'none'

## Models

In [None]:
# Model identifiers passed to transformers.pipeline by the evaluation cells below.
baseline_model_name = "distilbert-base-uncased"  # baseline DistilBERT model
benchmark_model_name = "distilbert-base-uncased-finetuned-sst-2-english"  # benchmark model
sft_model_name = "wsklee/distilbert-sentiment-imdb-sft"  # DistilBERT-SFT model

### Evaluation: Baseline DistilBERT model

In [None]:
# Run the baseline model over the test texts. adapter_model_numeric is used because this
# pipeline reports its classes as 'LABEL_0'/'LABEL_1'.
# NOTE: the output warns that the classifier weights of this checkpoint are newly
# initialized, so these predictions do not come from a trained classification head.
y_pred = predict(X_test, baseline_model_name, adapter_model_numeric)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  1%|          | 10/1563 [00:02<04:46,  5.42it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 1563/1563 [04:21<00:00,  5.99it/s]


In [None]:
# Call the simplified evaluate function for binary classification
evaluate_binary(y_true, y_pred)

Binary Accuracy: 0.406

Binary Classification Report:
              precision    recall  f1-score   support

    negative       0.39      0.32      0.35     12500
    positive       0.42      0.49      0.45     12500

    accuracy                           0.41     25000
   macro avg       0.40      0.41      0.40     25000
weighted avg       0.40      0.41      0.40     25000


Binary Confusion Matrix:
[[3977 8523]
 [6336 6164]]


In [None]:
# Print samples of predictions and true labels for verification
print("Sample y_pred:", y_pred[:100])
print("Sample y_true:", y_true[:100])

Sample y_pred: ['negative', 'positive', 'positive', 'positive', 'positive', 'positive', 'negative', 'positive', 'negative', 'positive', 'positive', 'negative', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'negative', 'positive', 'positive', 'positive', 'negative', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'negative', 'positive', 'negative', 'negative', 'positive', 'negative', 'positive', 'positive', 'negative', 'negative', 'negative', 'negative', 'positive', 'negative', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'negative', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'negative', 'positive', 'positive', 'positive', 'positive', 'positive', 'negative', 'positive', 'positive', 'negative', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive', 

In [None]:
# Sample inference
from transformers import pipeline

# Load model and tokenizer using the pipeline.
# Use GPU 0 when available; otherwise fall back to CPU (-1) so the cell also runs without a GPU.
classifier = pipeline(
    "text-classification",
    model=baseline_model_name,
    device=0 if torch.cuda.is_available() else -1,
)

# Sample texts for inference
texts = [
    "The movie was absolutely fantastic!",
    "I didn't enjoy the film; it was quite boring.",
    "An average experience, nothing special or memorable."
]

# Perform inference and show each input next to the raw pipeline output
results = classifier(texts)
for text, result in zip(texts, results):
    print(f"Input: {text}")
    print(f"Output: {result}\n")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input: The movie was absolutely fantastic!
Output: {'label': 'LABEL_0', 'score': 0.5017849206924438}

Input: I didn't enjoy the film; it was quite boring.
Output: {'label': 'LABEL_0', 'score': 0.5064137578010559}

Input: An average experience, nothing special or memorable.
Output: {'label': 'LABEL_0', 'score': 0.5088108777999878}



### Evaluation: distilbert-base-uncased-finetuned-sst-2-english

In [None]:
# Run the SST-2 fine-tuned benchmark model over the test texts (this overwrites y_pred).
# adapter_model_text matches 'positive'/'negative' substrings; presumably this checkpoint
# emits 'POSITIVE'/'NEGATIVE' labels. TODO confirm against the model config.
y_pred = predict(X_test, benchmark_model_name, adapter_model_text)

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

100%|██████████| 1563/1563 [04:26<00:00,  5.87it/s]


In [None]:
# Call the simplified evaluate function for binary classification
evaluate_binary(y_true, y_pred)

Binary Accuracy: 0.891

Binary Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.92      0.89     12500
    positive       0.91      0.86      0.89     12500

    accuracy                           0.89     25000
   macro avg       0.89      0.89      0.89     25000
weighted avg       0.89      0.89      0.89     25000


Binary Confusion Matrix:
[[11494  1006]
 [ 1726 10774]]


### Evaluation: DistilBERT-SFT model

In [None]:
# Run the SFT model over the test texts (this overwrites y_pred). This pipeline reports
# its classes as 'LABEL_0'/'LABEL_1', so adapter_model_numeric is used.
y_pred = predict(X_test, sft_model_name, adapter_model_numeric)

config.json:   0%|          | 0.00/626 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

100%|██████████| 1563/1563 [04:38<00:00,  5.62it/s]


In [None]:
# Call the simplified evaluate function for binary classification
evaluate_binary(y_true, y_pred)

Binary Accuracy: 0.915

Binary Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.93      0.92     12500
    positive       0.93      0.90      0.91     12500

    accuracy                           0.92     25000
   macro avg       0.92      0.92      0.92     25000
weighted avg       0.92      0.92      0.92     25000


Binary Confusion Matrix:
[[11656   844]
 [ 1279 11221]]


In [None]:
# Print samples of predictions and true labels for verification
print("Sample y_pred:", y_pred[:100])
print("Sample y_true:", y_true[:100])

Sample y_pred: ['negative', 'negative', 'negative', 'negative', 'positive', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'positive', 'negative', 'positive', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'positive', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'positive', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 

In [None]:
# Sample inference
from transformers import pipeline

# Load model and tokenizer using the pipeline.
# Use GPU 0 when available; otherwise fall back to CPU (-1) so the cell also runs without a GPU.
classifier = pipeline(
    "text-classification",
    model=sft_model_name,
    device=0 if torch.cuda.is_available() else -1,
)

# Sample texts for inference
texts = [
    "The movie was absolutely fantastic!",
    "I didn't enjoy the film; it was quite boring.",
    "An average experience, nothing special or memorable."
]

# Perform inference and show each input next to the raw pipeline output
results = classifier(texts)
for text, result in zip(texts, results):
    print(f"Input: {text}")
    print(f"Output: {result}\n")

Input: The movie was absolutely fantastic!
Output: {'label': 'LABEL_1', 'score': 0.9981886744499207}

Input: I didn't enjoy the film; it was quite boring.
Output: {'label': 'LABEL_0', 'score': 0.997225821018219}

Input: An average experience, nothing special or memorable.
Output: {'label': 'LABEL_0', 'score': 0.9878718852996826}

