In [1]:
# Install required libraries
!pip install datasets evaluate

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m491.2/491.2 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.0/84.0 kB[0m 

In [9]:
from datasets import load_dataset
import torch
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from tqdm import tqdm
import random

In [3]:
# Authenticate with the Hugging Face Hub through the interactive notebook widget.
# NOTE(review): presumably needed so the Llama checkpoint loaded later can be
# downloaded with this account's credentials — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
# Set random seed for reproducibility (one constant so every RNG and shuffle agrees)
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Load the IMDB dataset and shuffle both splits deterministically
imdb = load_dataset("imdb")
train_dataset = imdb['train'].shuffle(seed=SEED)
test_dataset = imdb['test'].shuffle(seed=SEED)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# For hyperparameter tuning, hold out the first 10% of the shuffled training
# split as a validation set and keep the remainder for training.
val_split_ratio = 0.1
val_size = int(len(train_dataset) * val_split_ratio)

# `select` takes any iterable of row indices, so the ranges are passed directly
# instead of being copied into intermediate lists first.
val_dataset = train_dataset.select(range(val_size))
train_dataset = train_dataset.select(range(val_size, len(train_dataset)))

# Sanity check: the two pieces must account for every original training row.
assert len(train_dataset) + len(val_dataset) == len(imdb['train'])

print(f"Train dataset size after split: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")


Train dataset size: 25000
Test dataset size: 25000
Train dataset size after split: 22500
Validation dataset size: 2500


In [11]:
# Checkpoint shared by the tokenizer and the classification model
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# When the tokenizer ships without a pad token, reuse the EOS token for padding
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Register the pad token as a special token and report how many tokens that added
num_added_tokens = tokenizer.add_special_tokens({'pad_token': tokenizer.pad_token})
print(f"Added {num_added_tokens} tokens to the tokenizer")

# Build the two-label sequence classifier, telling it which token id is padding
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, pad_token_id=tokenizer.pad_token_id
)

# Keep the embedding matrix in step with the tokenizer's vocabulary size
model.resize_token_embeddings(len(tokenizer))

# Create a small subset of examples to use as demonstration shots
def get_shot_examples(dataset, num_shots=5):
    """Pick a near-balanced set of labelled examples to use as demonstrations.

    The first ``num_shots // 2 + num_shots % 2`` examples with ``label == 1``
    and the first ``num_shots // 2`` examples with ``label == 0`` are taken in
    dataset order, then shuffled together with the global ``random`` module.

    The dataset is walked once and the walk stops as soon as both quotas are
    filled, so one-pass iterables (e.g. generators) are supported and large
    datasets are not scanned to the end.

    Args:
        dataset: Iterable of mappings that each have a ``'label'`` key.
        num_shots: Total number of examples wanted; must be non-negative.

    Returns:
        A shuffled list of the selected examples. It is shorter than
        ``num_shots`` if the dataset does not hold enough of either label.

    Raises:
        ValueError: If ``num_shots`` is negative.
    """
    if num_shots < 0:
        raise ValueError(f"num_shots must be non-negative, got {num_shots}")

    # With an odd total, the extra example goes to the positive class.
    num_pos = num_shots // 2 + num_shots % 2
    num_neg = num_shots // 2

    pos_examples, neg_examples = [], []
    if num_shots:
        for ex in dataset:
            if ex['label'] == 1:
                if len(pos_examples) < num_pos:
                    pos_examples.append(ex)
            elif ex['label'] == 0:
                if len(neg_examples) < num_neg:
                    neg_examples.append(ex)
            # Stop scanning once both quotas are met.
            if len(pos_examples) == num_pos and len(neg_examples) == num_neg:
                break

    # Shuffle to avoid bias from ordering
    examples = pos_examples + neg_examples
    random.shuffle(examples)

    return examples

# Choose the demonstration reviews once, from the training split, using the
# default shot count; this module-level list is what format_5shot_input reads
# when it builds a prompt.
shot_examples = get_shot_examples(train_dataset)
print(f"Selected {len(shot_examples)} shot examples")

# Format the input using the 5 example shots plus the target example
def format_5shot_input(example, shots=None, max_shot_chars=150):
    """Build a few-shot sentiment prompt that ends with the review to classify.

    The prompt is an instruction line, one numbered ``Review``/``Sentiment``
    block per demonstration, and a final numbered block holding the target
    review with the sentiment left blank.

    Args:
        example: Mapping with a ``'text'`` key holding the review to classify.
        shots: Demonstrations to embed, each a mapping with ``'text'`` and
            ``'label'`` keys (label ``1`` is rendered as "positive", anything
            else as "negative"). Defaults to the module-level
            ``shot_examples`` list.
        max_shot_chars: Demonstration reviews longer than this many characters
            are cut to that length and suffixed with ``"..."``. The target
            review is never shortened here.

    Returns:
        The complete prompt as a single string.
    """
    if shots is None:
        # Resolve the default at call time so the function follows whatever
        # the notebook currently has in `shot_examples`.
        shots = shot_examples

    # Start with a general instruction
    parts = ["Classify the sentiment of movie reviews as positive or negative.\n\n"]

    # Add the demonstrations with their labels
    for number, shot in enumerate(shots, start=1):
        sentiment = "positive" if shot["label"] == 1 else "negative"
        # Truncate long reviews to keep sequence length manageable
        review = shot["text"]
        if len(review) > max_shot_chars:
            review = review[:max_shot_chars] + "..."
        parts.append(f"Example {number}:\nReview: {review}\nSentiment: {sentiment}\n\n")

    # Add the target example to classify (without label)
    parts.append(f"Example {len(shots)+1}:\nReview: {example['text']}\nSentiment: ")

    return "".join(parts)

# Define tokenization function
def tokenize_function(examples):
    """Turn a batch of raw reviews into fixed-length tokenized few-shot prompts.

    Args:
        examples: Batch mapping whose ``"text"`` entry is a sequence of review
            strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the list of
        prompts, padded and truncated to 512 tokens each.
    """
    # Wrap every review in the few-shot prompt before tokenizing
    prompts = []
    for text in examples["text"]:
        prompts.append(format_5shot_input({"text": text}))

    # NOTE(review): the target review sits at the end of each prompt, so which
    # part survives truncation depends on the tokenizer's truncation side — confirm.
    encoded = tokenizer(prompts, padding="max_length", truncation=True, max_length=512)
    return encoded


Added 0 tokens to the tokenizer


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Selected 5 shot examples


In [12]:
# Tokenize the train, validation and test splits (in that order) with the
# few-shot prompt tokenizer
tokenized_train, tokenized_val, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Expose only the model inputs and the label, as torch tensors, on every split
for _tokenized_split in (tokenized_train, tokenized_val, tokenized_test):
    _tokenized_split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/22500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [13]:
# Define evaluation metrics (same as original)
# Metric modules already loaded through `evaluate.load`, keyed by metric name.
# Keeping them here means each evaluation pass reuses the loaded modules
# instead of loading all four again.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use only."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for a batch of predictions.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            row is the argmax of ``logits`` along its last axis.

    Returns:
        Dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``. F1, precision and recall are requested with
        ``average="binary"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Calculate metrics (each metric module is loaded once and then reused)
    accuracy_score = _get_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]
    f1_score = _get_metric("f1").compute(
        predictions=predictions, references=labels, average="binary"
    )["f1"]
    precision_score = _get_metric("precision").compute(
        predictions=predictions, references=labels, average="binary"
    )["precision"]
    recall_score = _get_metric("recall").compute(
        predictions=predictions, references=labels, average="binary"
    )["recall"]

    return {
        "accuracy": accuracy_score,
        "f1": f1_score,
        "precision": precision_score,
        "recall": recall_score
    }

In [14]:
# Training configuration for the long few-shot prompts: a per-device batch of 2
# with 8 accumulation steps gives 16 sequences per optimizer step on each device.
# NOTE(review): recent transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version before renaming.
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./llama_3.2_5shot_classification_imdb_results",
    logging_dir="./llama_3.2_5shot_classification_imdb_logs",
    report_to="tensorboard",
    logging_steps=200,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    # Evaluate and checkpoint every epoch, then restore the most accurate checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

# Wire the model, the tokenized splits and the metric function into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)




In [15]:
# Train the model with the Trainer configured above and print the summary
# object that `train()` returns.
print("Starting training...")
train_results = trainer.train()
print(train_results)

# Evaluate on validation set (the same split the Trainer scores after each epoch)
print("Evaluating on validation set...")
val_results = trainer.evaluate(eval_dataset=tokenized_val)
print(val_results)

# Evaluate on test set, which is not passed to the Trainer for training or
# per-epoch evaluation
print("Evaluating on test set...")
test_results = trainer.evaluate(eval_dataset=tokenized_test)
print(test_results)

# Save the final model together with its tokenizer in one directory.
# NOTE(review): with load_best_model_at_end=True this is presumably the best
# checkpoint by validation accuracy rather than the last epoch — confirm.
model_path = "./llama_3.2_5shot_classification_imdb_sentiment"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)
print(f"Model saved to {model_path}")

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2374,0.284813,0.9064,0.91203,0.864576,0.964996
2,0.1011,0.354807,0.9296,0.931677,0.90978,0.954654
3,0.0397,0.450251,0.926,0.928433,0.903614,0.954654
4,0.0,0.692238,0.9284,0.928257,0.93538,0.921241


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

TrainOutput(global_step=7030, training_loss=0.09306093011445529, metrics={'train_runtime': 23861.6018, 'train_samples_per_second': 4.715, 'train_steps_per_second': 0.295, 'total_flos': 3.361174246989496e+17, 'train_loss': 0.09306093011445529, 'epoch': 4.9969777777777775})
Evaluating on validation set...


{'eval_loss': 0.35480743646621704, 'eval_accuracy': 0.9296, 'eval_f1': 0.9316770186335404, 'eval_precision': 0.9097801364670205, 'eval_recall': 0.954653937947494, 'eval_runtime': 172.987, 'eval_samples_per_second': 14.452, 'eval_steps_per_second': 7.226, 'epoch': 4.9969777777777775}
Evaluating on test set...
{'eval_loss': 0.36330848932266235, 'eval_accuracy': 0.92512, 'eval_f1': 0.9269947741985805, 'eval_precision': 0.904352457768985, 'eval_recall': 0.9508, 'eval_runtime': 1728.181, 'eval_samples_per_second': 14.466, 'eval_steps_per_second': 7.233, 'epoch': 4.9969777777777775}
Model saved to ./llama_3.2_5shot_classification_imdb_sentiment
