In [None]:
!pip install datasets transformers evaluate optuna
!apt-get install git-lfs

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading datasets-3.5.0-py3-non

In [None]:
from datasets import load_dataset
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
import evaluate
from tqdm import tqdm

In [None]:
# Load IMDB and keep a shuffled copy of its test split (fixed seed so the
# order is the same on every run)
imdb = load_dataset("imdb")
test_split = imdb["test"]
test_dataset = test_split.shuffle(seed=42)
print("Test dataset size: {}".format(len(test_dataset)))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Test dataset size: 25000


In [None]:
# Train the model
# NOTE(review): `trainer`, `tokenized_val`, `tokenized_test` and `tokenizer` are
# not defined in the cells shown before this one — presumably they are created
# elsewhere in the notebook; confirm this cell works after Restart & Run All.
print("Starting training...")
train_results = trainer.train()
print(train_results)

# Evaluate on validation set
print("Evaluating on validation set...")
val_results = trainer.evaluate(eval_dataset=tokenized_val)
print(val_results)

# Evaluate on test set
print("Evaluating on test set...")
test_results = trainer.evaluate(eval_dataset=tokenized_test)
print(test_results)

# Save the final model
# NOTE(review): the directory name mentions llama_3.2 while the surrounding
# cells work with GPT-2 — confirm the name is intended.
model_path = "./llama_3.2_imdb_sentiment"
# The model and the tokenizer are written to the same directory.
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)
print(f"Model saved to {model_path}")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Using device: cuda


In [None]:
# Prompt template for sentiment analysis
def create_prompt(review):
    """Wrap ``review`` in the sentiment-classification prompt.

    The returned text is the review prefixed with "Review: ", followed by a
    newline and ". Sentiment (positive or negative): ", so that the model's
    continuation is expected to be the label.
    """
    template = "Review: {}\n. Sentiment (positive or negative): "
    return template.format(review)

# Function to get sentiment prediction from GPT-2
def predict_sentiment(review, max_attempts=3):
    """Ask the language model whether ``review`` is positive or negative.

    The review is truncated to 800 tokens, wrapped with ``create_prompt`` and
    passed to ``model.generate`` for up to ``max_attempts`` generations of at
    most 5 new tokens each.

    The first attempt uses greedy decoding, so it is deterministic. Repeating
    a greedy generation can never give a different answer, so any further
    attempt samples instead; seed torch beforehand if the retries must be
    reproducible.

    Args:
        review: Raw review text.
        max_attempts: Maximum number of generations to try.

    Returns:
        A ``(label, attempts)`` tuple. ``label`` is ``"positive"``,
        ``"negative"`` or ``"unknown"`` (no attempt produced either word);
        ``attempts`` is the number of generations actually run (0 when
        ``max_attempts`` is 0).
    """
    # Truncate the review to prevent exceeding the model's context length.
    # GPT-2 has a max length of 1024 tokens, but we need to leave room for the
    # prompt and the generated response.
    max_review_length = 800

    review_tokens = tokenizer.encode(review)
    if len(review_tokens) > max_review_length:
        review = tokenizer.decode(review_tokens[:max_review_length])

    prompt = create_prompt(review)

    # The prompt is identical for every attempt, so tokenize it only once.
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    prompt_length = inputs["input_ids"].shape[1]

    attempts_made = 0  # stays 0 (instead of raising NameError) if max_attempts <= 0
    for attempt in range(max_attempts):
        attempts_made = attempt + 1

        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=5,
                pad_token_id=tokenizer.eos_token_id,
                # Greedy on the first attempt; sample on retries so that a
                # retry can actually yield a different continuation.
                do_sample=attempt > 0,
            )

        # Decode only the newly generated tokens so that words appearing in
        # the review itself can never be mistaken for the model's answer.
        generated_ids = output[0][prompt_length:]
        prediction = tokenizer.decode(generated_ids, skip_special_tokens=True).strip().lower()

        if "positive" in prediction:
            return "positive", attempts_made
        if "negative" in prediction:
            return "negative", attempts_made

    return "unknown", attempts_made

# Evaluation metrics, loaded by name from the `evaluate` library
accuracy_metric, f1_metric = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "f1")
)


In [None]:
# Test GPT-2 on the dataset: one prediction per test review, then a summary.
predictions = []
ground_truth = []
attempt_counts = []

for item in tqdm(test_dataset, desc="Evaluating GPT-2"):
    review = item["text"]
    true_label = "positive" if item["label"] == 1 else "negative"

    # Get GPT-2's prediction
    prediction, attempts = predict_sentiment(review)

    predictions.append(prediction)
    ground_truth.append(true_label)
    attempt_counts.append(attempts)

    # Print some examples as we go
    if len(predictions) % 1000 == 0:
        print(f"\nReview: {review[:100]}...")
        print(f"True sentiment: {true_label}")
        print(f"Predicted sentiment: {prediction}")
        print(f"Required {attempts} attempt(s)")

# Keep only the samples for which the model produced a usable label
# (single pass over the paired lists).
valid_pairs = [(p, g) for p, g in zip(predictions, ground_truth) if p != "unknown"]
valid_predictions = [p for p, _ in valid_pairs]
valid_ground_truth = [g for _, g in valid_pairs]

# Report results
total_samples = len(predictions)
valid_samples = len(valid_predictions)
unknown_samples = total_samples - valid_samples

# Guard the percentages so an empty dataset does not raise ZeroDivisionError.
valid_pct = valid_samples / total_samples * 100 if total_samples else 0.0
unknown_pct = unknown_samples / total_samples * 100 if total_samples else 0.0

print("\n===== GPT-2 Zero-Shot Sentiment Analysis Results =====")
print(f"Model: {model_name}")
print(f"Total samples: {total_samples}")
print(f"Valid predictions: {valid_samples} ({valid_pct:.2f}%)")
print(f"Unknown predictions: {unknown_samples} ({unknown_pct:.2f}%)")

if total_samples:
    print(f"Average attempts per sample: {sum(attempt_counts) / total_samples:.2f}")

# Calculate metrics. The metrics are given integer class ids
# (negative -> 0, positive -> 1, matching the dataset's label convention used
# in the loop above), and only samples with a usable prediction are scored.
if valid_samples:
    label_to_id = {"negative": 0, "positive": 1}
    predicted_ids = [label_to_id[p] for p in valid_predictions]
    reference_ids = [label_to_id[g] for g in valid_ground_truth]

    accuracy = accuracy_metric.compute(predictions=predicted_ids, references=reference_ids)["accuracy"]
    f1 = f1_metric.compute(predictions=predicted_ids, references=reference_ids)["f1"]
    # Strict view: an "unknown" answer counts as a wrong answer.
    correct = sum(p == g for p, g in valid_pairs)

    print(f"Accuracy (valid predictions only): {accuracy:.4f}")
    print(f"F1 (valid predictions only): {f1:.4f}")
    print(f"Accuracy (unknown counted as incorrect): {correct / total_samples:.4f}")
else:
    print("No valid predictions: accuracy and F1 are undefined.")

Evaluating GPT-2:   4%|▍         | 1001/25000 [03:00<1:11:21,  5.61it/s]


Review: This is the best version of Gypsy that has been filmed.Bette Midler is simply superb as Mama Rose.Sh...
True sentiment: positive
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:   8%|▊         | 2001/25000 [05:59<1:11:58,  5.33it/s]


Review: Age of Steel follows up the previous episode, Rise of the Cybermen, which was excellent in some resp...
True sentiment: positive
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  12%|█▏        | 3001/25000 [08:58<1:06:33,  5.51it/s]


Review: Plunkett and MaCleane are two highwaymen that rob from the rich in order to give to ... well, the ri...
True sentiment: negative
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  16%|█▌        | 4001/25000 [11:58<1:05:29,  5.34it/s]


Review: The final entry in the On The Buses trilogy sees the usual wasters go about their business in Wales....
True sentiment: negative
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  20%|██        | 5001/25000 [14:59<58:18,  5.72it/s]


Review: Duncan Roy's writing and direction is really, and regularly, below par. Actually it sort of stinks. ...
True sentiment: negative
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  24%|██▍       | 6001/25000 [17:59<56:16,  5.63it/s]


Review: We've all see the countless previews and trailers. If you enjoyed Knoxville getting flipped by the B...
True sentiment: positive
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  28%|██▊       | 7001/25000 [20:58<54:27,  5.51it/s]


Review: Let me get this out of the way before I trash this film: I love Park Chan-Wook's work as a director....
True sentiment: negative
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  32%|███▏      | 8001/25000 [23:58<48:30,  5.84it/s]


Review: I have vague memories of this movie being funny.<br /><br />Having seen it again either I have chang...
True sentiment: negative
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  36%|███▌      | 9001/25000 [26:57<48:57,  5.45it/s]


Review: This movie looked fun on the cover and I honestly thought 'how bad can this be?' Little did I know. ...
True sentiment: negative
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  40%|████      | 10001/25000 [29:57<45:24,  5.51it/s]


Review: "Cherry" tells of a naive, unmarried virgin who decides to have a baby but isn't quite sure how to g...
True sentiment: positive
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  44%|████▍     | 11001/25000 [32:57<42:05,  5.54it/s]


Review: While killing time on a Saturday morning, "Looking For Lola" came on HBO. I decided to give it a sho...
True sentiment: negative
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  48%|████▊     | 12001/25000 [35:58<40:47,  5.31it/s]


Review: Why did I have to go out and buy (yes buy!) JACK FROST 2: REVENGE OF THE MUTANT KILLER SNOWMAN??? Ma...
True sentiment: negative
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  52%|█████▏    | 13001/25000 [38:59<36:58,  5.41it/s]


Review: The main attraction of Anywhere but Here is the superb performance of Natalie Portman. She gave her ...
True sentiment: positive
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  56%|█████▌    | 14001/25000 [42:00<32:31,  5.64it/s]


Review: I picked this DVD up for 3.99 at rogers video in order to get enough points to get a better movie fo...
True sentiment: negative
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  60%|██████    | 15000/25000 [45:01<28:31,  5.84it/s]


Review: For a good take on the Roman Empire watch the excellent BBC produced miniseries "I, Claudius". This ...
True sentiment: negative
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  64%|██████▍   | 16001/25000 [48:01<26:02,  5.76it/s]


Review: You gotta be a fan of the little man but I found Burlesque on Carmen dull, unimaginative and totally...
True sentiment: negative
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  68%|██████▊   | 17001/25000 [51:00<23:19,  5.72it/s]


Review: This film is about a deadly poison that is contained in small glass globes that is used to kill. Thi...
True sentiment: positive
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  72%|███████▏  | 18001/25000 [54:00<19:46,  5.90it/s]


Review: This movie will not be considered for an academy award, but if you enjoy a movie that doesn't take i...
True sentiment: positive
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  76%|███████▌  | 19001/25000 [56:59<17:06,  5.84it/s]


Review: carrot top in a full length movie, enough said. only reason this doesn't get a one is through my per...
True sentiment: negative
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  80%|████████  | 20001/25000 [59:58<14:30,  5.74it/s]


Review: What a terrible sequel. The reason I give this film two stars instead of zero because it's a movie t...
True sentiment: negative
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  84%|████████▍ | 21001/25000 [1:02:59<12:03,  5.53it/s]


Review: What You Need In the run up to 'What You Need', every episode since 'The Lonely' had been a winner t...
True sentiment: negative
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  88%|████████▊ | 22000/25000 [1:05:59<09:02,  5.53it/s]


Review: I have reasons to love the great users of a camera; fluid direction of action lends itself to fast-p...
True sentiment: positive
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  92%|█████████▏| 23001/25000 [1:09:00<06:13,  5.36it/s]


Review: MANNA FROM HEAVEN is a terrific film that is both predictable and unpredictable at the same time. Yo...
True sentiment: positive
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2:  96%|█████████▌| 24001/25000 [1:11:59<03:02,  5.47it/s]


Review: Don't watch this film while, or soon after, eating.<br /><br />Having said that, Begotten will stick...
True sentiment: positive
Predicted sentiment: unknown
Required 3 attempt(s)


Evaluating GPT-2: 100%|██████████| 25000/25000 [1:15:00<00:00,  5.56it/s]


Review: I was expecting "Born to Kill" to be an exciting, high-tension film noir. Instead, it's got two good...
True sentiment: negative
Predicted sentiment: unknown
Required 3 attempt(s)

===== GPT-2 Zero-Shot Sentiment Analysis Results =====
Model: gpt2
Total samples: 25000
Valid predictions: 54 (0.22%)
Unknown predictions: 24946 (99.78%)





It appears that almost all of the predictions are unknown. This could be a result of GPT-2 being too small, and thus not good enough at following instructions to produce an answer. Let's try an alternative method instead: fine-tuning GPT-2 with a classification head.