# Accelerating LLM and SLM with Fine-Tuning: A Live Coding Tutorial for Evaluating Accuracy, Latency, and Cost To Find The Best Model

### Daniel Taube

In [49]:
import pandas as pd
import random
import numpy as np
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, pipeline
import evaluate
import time
from tqdm.auto import tqdm
import torch

# Read the Data

In [50]:
splits = {'train': 'plain_text/train-00000-of-00001.parquet', 'test': 'plain_text/test-00000-of-00001.parquet', 'unsupervised': 'plain_text/unsupervised-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/" + splits["train"])

# Sample the data

In [53]:
random_int_between_0_and_1 = random.randint(0, 12000)

# pretty print the review
print("A positive review is:")
print(df[df.label == 1].iloc[random_int_between_0_and_1].text)
print("A Negative review is:")
print(df[df.label == 0].iloc[random_int_between_0_and_1].text)


A positive review is:
'The Luzhin Defence' is a movie worthy of anyone's time. it is a brooding, intense film, and kept my attention the entire time. John Turturro is absolutely stunning in his portrayal of a tender, eccentric chess Grandmaster, and Emily Watson is spell-binding as the gentle but rebellious daughter of a highly respected Russian family. The chemistry between Watson and Turturro on screen is obvious from the moment their characters meet in the story. All in all, this movie is one of the best in-depth looks at the life of a chess Grandmaster, and Turturro and Watson add a whole non-mainstream, non-cliche feel to the film. Most people will come out of the theater thinking, and feeling somewhat touched by this brilliant look at the most unlikely of love stories.
A Negative review is:
This flick was a blow to me. I guess little girls should aspire to be nothing more than swimsuit models, home makers or mistresses, since that seems to be all they'll ever be portrayed as anyw

In [54]:
# choose a random 1000 positive and 1000 negative reviews
positive_reviews = df[df.label == 1].sample(1000)
negative_reviews = df[df.label == 0].sample(1000)
# combine the two dataframes
df_small = pd.concat([positive_reviews, negative_reviews])
# shuffle the dataframe
df_small = df_small.sample(frac=1).reset_index(drop=True)
# save the dataframe to a csv file
df_small.to_csv("imdb_train.csv", index=False)

In [55]:
df_test = pd.read_parquet("hf://datasets/stanfordnlp/imdb/" + splits["test"])

In [56]:
# choose a random 1000 positive and 1000 negative reviews
positive_reviews = df_test[df_test.label == 1].sample(1000)
negative_reviews = df_test[df_test.label == 0].sample(1000)
# combine the two dataframes
df_small_test = pd.concat([positive_reviews, negative_reviews])
# shuffle the dataframe
df_small_test = df_small_test.sample(frac=1).reset_index(drop=True)
# save the dataframe to a csv file
df_small_test.to_csv("imdb_test.csv", index=False)

# Training - SLM

In [57]:
# Load the CSV datasets (ensure your train.csv and test.csv are in the working directory)
data_files = {"train": "imdb_train.csv", "test": "imdb_test.csv"}
dataset = load_dataset("csv", data_files=data_files)

# Load a pretrained tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") # ~66M params, SLM

# 4B params, SLM, can run on 1 GPU in inference
# 70-300B params, LLM, Need a cluster of GPUs to run in inference

Generating train split: 2000 examples [00:00, 16785.61 examples/s]
Generating test split: 2000 examples [00:00, 41275.61 examples/s]


In [58]:
# Tokenization function for the 'text' field in your CSV
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Remove the original text column (optional) and set the format to PyTorch tensors
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets.set_format("torch")

# Use a data collator that dynamically pads the inputs
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load a small pretrained model for sequence classification with 2 output labels (0 and 1)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

Map: 100%|██████████| 2000/2000 [00:07<00:00, 257.63 examples/s]
Map: 100%|██████████| 2000/2000 [00:07<00:00, 271.44 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [65]:
text = """Hi, welcome to the world of NLP! This is a test sentence for the DistilBERT model. Let's see how well it performs on this text."""

# toknizer into a tensor pt
# 1. Tokenize the text


inputs_ids = tokenizer(text, truncation=True, padding="max_length", max_length=128, return_tensors='pt')['input_ids']

# use the model to embed the text
outputs = model.distilbert.embeddings(inputs_ids)

In [69]:
# Load an accuracy metric for evaluation
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# Train the model
trainer.train()

In [12]:
# Evaluate the model on the test set
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

Evaluation Results: {'eval_loss': 0.5025317668914795, 'eval_accuracy': 0.8445, 'eval_runtime': 75.2018, 'eval_samples_per_second': 26.595, 'eval_steps_per_second': 3.324, 'epoch': 3.0}


In [None]:
trainer.save_model("distilbert-base-uncased-imdb")

In [72]:
# run the model on a sample text
classifier = pipeline("text-classification", model="distilbert-base-uncased-imdb")
print(classifier("This movie was fantastic! I loved it!"))
print(classifier("This movie was terrible! I hated it!"))

Device set to use cpu


[{'label': 'LABEL_1', 'score': 0.9864317774772644}]
[{'label': 'LABEL_0', 'score': 0.9901705384254456}]


In [73]:
# run the model on a sample text
def run_example_model(model_name, prompt = ''):
    print(f"Running {model_name} on a sample text")
    classifier = pipeline("text-classification", model=model_name)
    print(classifier(prompt + "This movie was fantastic! I loved it!"))
    print(classifier(prompt + "This movie was terrible! I hated it!"))
    print("     ")

run_example_model("distilbert-base-uncased")
run_example_model("distilbert-base-uncased-imdb")

Running distilbert-base-uncased on a sample text


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Device set to use cpu


[{'label': 'LABEL_0', 'score': 0.5127051472663879}]
[{'label': 'LABEL_0', 'score': 0.5029503107070923}]
     
Running distilbert-base-uncased-imdb on a sample text
[{'label': 'LABEL_1', 'score': 0.9864317774772644}]
[{'label': 'LABEL_0', 'score': 0.9901705384254456}]
     


# Train LLM

In [75]:
def fine_tune_slm(model_name, train_path, test_path):
    # Load the CSV datasets (ensure your train.csv and test.csv are in the working directory)
    data_files = {"train": train_path, "test": test_path}
    dataset = load_dataset("csv", data_files=data_files)

    # Load a pretrained tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained(model_name)

    # Tokenization function for the 'text' field in your CSV
    def tokenize_function(example):
        return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

    # Tokenize the dataset
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # Remove the original text column (optional) and set the format to PyTorch tensors
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])
    tokenized_datasets.set_format("torch")

    # Use a data collator that dynamically pads the inputs
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Load a small pretrained model for sequence classification with 2 output labels (0 and 1)
    model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
    )

    # Load an accuracy metric for evaluation
    accuracy_metric = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return accuracy_metric.compute(predictions=predictions, references=labels)

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Evaluate the model on the test set
    eval_results = trainer.evaluate()
    print("Evaluation Results:", eval_results)

    # Save the model
    trainer.save_model(model_name + "-imdb")

In [None]:
# List of larger models with approximate parameter counts:
bigger_models = [
    "bert-large-uncased",          # ~340 million parameters - 5 times larger than DistilBERT
]

for model_name in bigger_models:
    fine_tune_slm(model_name, "imdb_train.csv", "imdb_test.csv")

In [78]:
run_example_model("bert-large-uncased")
run_example_model("bert-large-uncased-imdb")

Running bert-large-uncased on a sample text


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


[{'label': 'LABEL_1', 'score': 0.679204523563385}]
[{'label': 'LABEL_1', 'score': 0.7086663842201233}]
     
Running bert-large-uncased-imdb on a sample text


Device set to use cpu


[{'label': 'LABEL_1', 'score': 0.9978277087211609}]
[{'label': 'LABEL_0', 'score': 0.9881008863449097}]
     


# Add prompt to the untrained model

In [79]:
prompt = """
    Analyze the sentiment of the following movie review.
    
    Please classify the sentiment as:
    - LABEL_1 if the review is positive
    - LABEL_0 if the review is negative
    
    Return only the number (LABEL_1 or LABEL_0) without any additional text.

    Review:
    """


In [81]:
run_example_model("distilbert-base-uncased", prompt = prompt)
run_example_model("distilbert-base-uncased-imdb")
run_example_model("bert-large-uncased", prompt = prompt)
run_example_model("bert-large-uncased-imdb")

Running distilbert-base-uncased on a sample text


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Device set to use cpu


[{'label': 'LABEL_1', 'score': 0.5130700469017029}]
[{'label': 'LABEL_1', 'score': 0.5139321684837341}]
     
Running distilbert-base-uncased-imdb on a sample text
[{'label': 'LABEL_1', 'score': 0.9864317774772644}]
[{'label': 'LABEL_0', 'score': 0.9901705384254456}]
     
Running bert-large-uncased on a sample text


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


[{'label': 'LABEL_0', 'score': 0.851081907749176}]
[{'label': 'LABEL_0', 'score': 0.8455356359481812}]
     


Device set to use cpu


Running bert-large-uncased-imdb on a sample text
[{'label': 'LABEL_1', 'score': 0.9978277087211609}]
[{'label': 'LABEL_0', 'score': 0.9881008863449097}]
     


# Evaluate accuracy and latency

In [84]:
def evaluate_model_accuracy_latency(model_name, use_prompt=False, num_samples=100):
    """
    Evaluate a model's accuracy and latency on the IMDB test set.
    
    Args:
        model_name (str): Name of the model to evaluate
        use_prompt (bool): Whether to add prompt to inputs
        num_samples (int): Number of samples to evaluate (for faster testing)
    """
    # Load test dataset
    df_test = pd.read_csv("imdb_test.csv").sample(num_samples, random_state=42)
    
    # Load model and create classifier
    classifier = pipeline("text-classification", model=model_name)
    tokenizer = classifier.tokenizer
    
    # Prepare for evaluation
    correct_predictions = 0
    total_examples = len(df_test)
    start_time = time.time()
    
    # Calculate max allowed review length to fit within context window if using prompt
    max_model_length = tokenizer.model_max_length
    prompt_length = len(tokenizer.encode(prompt)) - 2  # -2 for special tokens
    max_review_length = max_model_length - prompt_length if use_prompt else max_model_length
    
    # Iterate through test examples
    for _, row in tqdm(df_test.iterrows(), total=total_examples, desc=f"Evaluating {model_name}"):
        text = row["text"]
        true_label = row["label"]
        
        # Truncate text if needed
        if use_prompt:
            # Tokenize text to check length
            tokens = tokenizer.encode(text)[1:-1]  # Remove special tokens
            if len(tokens) > max_review_length:
                tokens = tokens[:max_review_length]
                text = tokenizer.decode(tokens)
            
            input_text = prompt + text
        else:
            input_text = text
        
        # Get prediction
        try:
            prediction = classifier(input_text)[0]
            
            # Extract the predicted label
            if prediction['label'] == 'LABEL_1':
                predicted_label = 1
            else:
                predicted_label = 0
            
            # Check if prediction is correct
            if predicted_label == true_label:
                correct_predictions += 1
        except Exception as e:
            print(f"Error processing example: {e}")
            continue
    
    # Calculate metrics
    end_time = time.time()
    accuracy = correct_predictions / total_examples
    latency = end_time - start_time
    
    # Print results
    print(f"Model: {model_name}")
    print(f"Using prompt: {'Yes' if use_prompt else 'No'}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Total latency: {latency:.2f} seconds")
    print("------------------------------")
    
    return {
        "model": model_name,
        "use_prompt": use_prompt,
        "accuracy": accuracy,
        "total_latency": latency,
    }

def evaluate_all_models():
    """Run evaluation on all models and return results as DataFrame"""
    results = []
    
    # Models to evaluate
    models = [
        {"name": "distilbert-base-uncased", "use_prompt": True},
        {"name": "distilbert-base-uncased-imdb", "use_prompt": False},
        {"name": "bert-large-uncased", "use_prompt": True},
        {"name": "bert-large-uncased-imdb", "use_prompt": False}
    ]
    
    for model_config in models:
        result = evaluate_model_accuracy_latency(
            model_config["name"], 
            use_prompt=model_config["use_prompt"],
        )
        results.append(result)
    
    # Return as DataFrame
    return pd.DataFrame(results)

# Run evaluation with 100 samples
results_df = evaluate_all_models()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Evaluating distilbert-base-uncased:   1%|          | 1/100 [00:00<00:19,  5.01it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (575 > 512). Running this sequence through the model will result in indexing errors
Evaluating distilbert-base-uncased:   3%|▎         | 3/100 [00:00<00:08, 11.10it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1
Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating distilbert-base-uncased:   7%|▋         | 7/100 [00:00<00:07, 11.72it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating distilbert-base-uncased:  14%|█▍        | 14/100 [00:01<00:07, 11.10it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1
Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating distilbert-base-uncased:  18%|█▊        | 18/100 [00:01<00:04, 16.41it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1
Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1
Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating distilbert-base-uncased:  40%|████      | 40/100 [00:03<00:05, 11.58it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1
Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating distilbert-base-uncased:  45%|████▌     | 45/100 [00:03<00:03, 14.20it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating distilbert-base-uncased:  59%|█████▉    | 59/100 [00:05<00:03, 10.38it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating distilbert-base-uncased:  67%|██████▋   | 67/100 [00:06<00:03, 10.72it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1
Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1
Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating distilbert-base-uncased:  89%|████████▉ | 89/100 [00:08<00:00, 14.73it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1
Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating distilbert-base-uncased:  94%|█████████▍| 94/100 [00:08<00:00, 14.43it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating distilbert-base-uncased: 100%|██████████| 100/100 [00:09<00:00, 10.97it/s]
Device set to use cpu


Model: distilbert-base-uncased
Using prompt: Yes
Accuracy: 0.3900
Total latency: 9.12 seconds
------------------------------


Evaluating distilbert-base-uncased-imdb:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (575 > 512). Running this sequence through the model will result in indexing errors
Evaluating distilbert-base-uncased-imdb:   5%|▌         | 5/100 [00:00<00:05, 17.72it/s]

Error processing example: The size of tensor a (575) must match the size of tensor b (512) at non-singleton dimension 1
Error processing example: The size of tensor a (837) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating distilbert-base-uncased-imdb:   9%|▉         | 9/100 [00:00<00:05, 15.85it/s]

Error processing example: The size of tensor a (602) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating distilbert-base-uncased-imdb:  20%|██        | 20/100 [00:01<00:03, 20.19it/s]

Error processing example: The size of tensor a (685) must match the size of tensor b (512) at non-singleton dimension 1
Error processing example: The size of tensor a (706) must match the size of tensor b (512) at non-singleton dimension 1
Error processing example: The size of tensor a (699) must match the size of tensor b (512) at non-singleton dimension 1
Error processing example: The size of tensor a (943) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating distilbert-base-uncased-imdb:  40%|████      | 40/100 [00:02<00:04, 13.91it/s]

Error processing example: The size of tensor a (631) must match the size of tensor b (512) at non-singleton dimension 1
Error processing example: The size of tensor a (1176) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating distilbert-base-uncased-imdb:  45%|████▌     | 45/100 [00:03<00:03, 16.88it/s]

Error processing example: The size of tensor a (595) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating distilbert-base-uncased-imdb:  61%|██████    | 61/100 [00:04<00:02, 14.54it/s]

Error processing example: The size of tensor a (544) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating distilbert-base-uncased-imdb:  69%|██████▉   | 69/100 [00:05<00:02, 11.96it/s]

Error processing example: The size of tensor a (740) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating distilbert-base-uncased-imdb:  90%|█████████ | 90/100 [00:06<00:00, 15.29it/s]

Error processing example: The size of tensor a (831) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating distilbert-base-uncased-imdb:  95%|█████████▌| 95/100 [00:06<00:00, 16.69it/s]

Error processing example: The size of tensor a (570) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating distilbert-base-uncased-imdb: 100%|██████████| 100/100 [00:07<00:00, 13.82it/s]


Model: distilbert-base-uncased-imdb
Using prompt: No
Accuracy: 0.7500
Total latency: 7.24 seconds
------------------------------


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Evaluating bert-large-uncased:   1%|          | 1/100 [00:00<01:15,  1.31it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (575 > 512). Running this sequence through the model will result in indexing errors


Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased:   3%|▎         | 3/100 [00:01<00:33,  2.85it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased:   6%|▌         | 6/100 [00:02<00:50,  1.86it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased:  12%|█▏        | 12/100 [00:05<00:44,  2.00it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased:  14%|█▍        | 14/100 [00:06<00:39,  2.16it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased:  16%|█▌        | 16/100 [00:07<00:31,  2.65it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1
Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1
Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased:  38%|███▊      | 38/100 [00:19<00:33,  1.85it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased:  40%|████      | 40/100 [00:20<00:30,  1.97it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased:  42%|████▏     | 42/100 [00:21<00:27,  2.10it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased:  58%|█████▊    | 58/100 [00:31<00:28,  1.50it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased:  65%|██████▌   | 65/100 [00:34<00:19,  1.77it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased:  67%|██████▋   | 67/100 [00:35<00:14,  2.32it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1
Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased:  85%|████████▌ | 85/100 [00:45<00:09,  1.58it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1
Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased:  92%|█████████▏| 92/100 [00:48<00:03,  2.20it/s]

Error processing example: The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased: 100%|██████████| 100/100 [00:52<00:00,  1.91it/s]


Model: bert-large-uncased
Using prompt: Yes
Accuracy: 0.4300
Total latency: 52.33 seconds
------------------------------


Device set to use cpu
Evaluating bert-large-uncased-imdb:   1%|          | 1/100 [00:00<01:19,  1.24it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (575 > 512). Running this sequence through the model will result in indexing errors


Error processing example: The size of tensor a (575) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased-imdb:   3%|▎         | 3/100 [00:01<00:31,  3.05it/s]

Error processing example: The size of tensor a (837) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased-imdb:   6%|▌         | 6/100 [00:03<00:52,  1.78it/s]

Error processing example: The size of tensor a (602) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased-imdb:  14%|█▍        | 14/100 [00:07<00:57,  1.51it/s]

Error processing example: The size of tensor a (685) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased-imdb:  16%|█▌        | 16/100 [00:07<00:37,  2.27it/s]

Error processing example: The size of tensor a (706) must match the size of tensor b (512) at non-singleton dimension 1
Error processing example: The size of tensor a (699) must match the size of tensor b (512) at non-singleton dimension 1
Error processing example: The size of tensor a (943) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased-imdb:  38%|███▊      | 38/100 [00:17<00:26,  2.34it/s]

Error processing example: The size of tensor a (631) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased-imdb:  40%|████      | 40/100 [00:18<00:24,  2.47it/s]

Error processing example: The size of tensor a (1176) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased-imdb:  42%|████▏     | 42/100 [00:19<00:22,  2.61it/s]

Error processing example: The size of tensor a (595) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased-imdb:  58%|█████▊    | 58/100 [00:27<00:26,  1.56it/s]

Error processing example: The size of tensor a (544) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased-imdb:  67%|██████▋   | 67/100 [00:32<00:24,  1.37it/s]

Error processing example: The size of tensor a (740) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased-imdb:  86%|████████▌ | 86/100 [00:43<00:09,  1.44it/s]

Error processing example: The size of tensor a (831) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased-imdb:  92%|█████████▏| 92/100 [00:45<00:03,  2.54it/s]

Error processing example: The size of tensor a (570) must match the size of tensor b (512) at non-singleton dimension 1


Evaluating bert-large-uncased-imdb: 100%|██████████| 100/100 [00:48<00:00,  2.05it/s]

Model: bert-large-uncased-imdb
Using prompt: No
Accuracy: 0.8000
Total latency: 48.78 seconds
------------------------------





In [86]:
display(results_df)

Unnamed: 0,model,use_prompt,accuracy,total_latency
0,distilbert-base-uncased,True,0.39,9.116962
1,distilbert-base-uncased-imdb,False,0.75,7.243595
2,bert-large-uncased,True,0.43,52.326432
3,bert-large-uncased-imdb,False,0.8,48.77884


In [None]:
# the slm is 6.7x faster than the llm
# the LLM is 5% more accurate than the SLM overall

# What about the cost?
# How much do you use your model?

6.6410256410256405