In [4]:
# Install required libraries
!pip install datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [5]:
from datasets import load_dataset
import torch
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from tqdm import tqdm

In [9]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Load the IMDB dataset
imdb = load_dataset("imdb")
train_dataset = imdb['train'].shuffle(seed=42)
test_dataset = imdb['test'].shuffle(seed=42)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# For hyperparameter tuning, create a validation split
val_split_ratio = 0.1
val_size = int(len(train_dataset) * val_split_ratio)

val_dataset = train_dataset.select([i for i in list(range(val_size))])
train_dataset = train_dataset.select([i for i in list(range(val_size, len(train_dataset)))])

print(f"Train dataset size after split: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Train dataset size: 25000
Test dataset size: 25000
Train dataset size after split: 22500
Validation dataset size: 2500


In [10]:
# Load LLaMA 3.2 tokenizer
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Update tokenizer with the pad token
num_added_tokens = tokenizer.add_special_tokens({'pad_token': tokenizer.pad_token})
print(f"Added {num_added_tokens} tokens to the tokenizer")

# Load LLaMA for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    pad_token_id=tokenizer.pad_token_id
)

# Resize embeddings to account for added tokens
model.resize_token_embeddings(len(tokenizer))

# Define tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=256  # Shorter sequences for faster training
    )

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Added 0 tokens to the tokenizer


config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Tokenize datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Format datasets
tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
tokenized_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
tokenized_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Define evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Load metrics
    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")
    precision = evaluate.load("precision")
    recall = evaluate.load("recall")

    # Calculate metrics
    accuracy_score = accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1_score = f1.compute(predictions=predictions, references=labels, average="binary")["f1"]
    precision_score = precision.compute(predictions=predictions, references=labels, average="binary")["precision"]
    recall_score = recall.compute(predictions=predictions, references=labels, average="binary")["recall"]

    return {
        "accuracy": accuracy_score,
        "f1": f1_score,
        "precision": precision_score,
        "recall": recall_score
    }

# Define training arguments
training_args = TrainingArguments(
    output_dir="./llama_3.2_imdb_results",
    learning_rate=5e-5,
    per_device_train_batch_size=4,  # Reduced batch size for better memory management
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./llama_3.2_imdb_logs",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    report_to="tensorboard",
    gradient_accumulation_steps=4  # Accumulate gradients to compensate for smaller batch size
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

Map:   0%|          | 0/22500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]



In [12]:
# Train the model
print("Starting training...")
train_results = trainer.train()
print(train_results)

# Evaluate on validation set
print("Evaluating on validation set...")
val_results = trainer.evaluate(eval_dataset=tokenized_val)
print(val_results)

# Evaluate on test set
print("Evaluating on test set...")
test_results = trainer.evaluate(eval_dataset=tokenized_test)
print(test_results)

# Save the final model
model_path = "./llama_3.2_imdb_sentiment"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)
print(f"Model saved to {model_path}")

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4239,0.479526,0.816,0.788214,0.935519,0.680986
2,0.1151,0.563086,0.874,0.87495,0.873217,0.876691


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

TrainOutput(global_step=4218, training_loss=0.3039873361926669, metrics={'train_runtime': 7195.4981, 'train_samples_per_second': 9.381, 'train_steps_per_second': 0.586, 'total_flos': 1.0084240225153843e+17, 'train_loss': 0.3039873361926669, 'epoch': 2.9984})
Evaluating on validation set...


{'eval_loss': 0.5630858540534973, 'eval_accuracy': 0.874, 'eval_f1': 0.8749503771337832, 'eval_precision': 0.873217115689382, 'eval_recall': 0.8766905330151153, 'eval_runtime': 88.591, 'eval_samples_per_second': 28.22, 'eval_steps_per_second': 7.055, 'epoch': 2.9984}
Evaluating on test set...
{'eval_loss': 0.6959826350212097, 'eval_accuracy': 0.83836, 'eval_f1': 0.8346630661593224, 'eval_precision': 0.854199815760824, 'eval_recall': 0.816, 'eval_runtime': 842.0176, 'eval_samples_per_second': 29.691, 'eval_steps_per_second': 7.423, 'epoch': 2.9984}
Model saved to ./llama_3.2_imdb_sentiment


In [13]:
# Redefine training arguments for more epochs
training_args = TrainingArguments(
    output_dir="./llama_3.2_imdb_results",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./llama_3.2_imdb_logs",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    report_to="tensorboard",
    gradient_accumulation_steps=4
)

# Re-Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)



In [14]:
# Train the model
print("Starting training...")
train_results = trainer.train()
print(train_results)

# Evaluate on validation set
print("Evaluating on validation set...")
val_results = trainer.evaluate(eval_dataset=tokenized_val)
print(val_results)

# Evaluate on test set
print("Evaluating on test set...")
test_results = trainer.evaluate(eval_dataset=tokenized_test)
print(test_results)

# Save the final model
model_path = "./llama_3.2_imdb_sentiment_5_epochs"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)
print(f"Model saved to {model_path}")

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1315,0.502395,0.8636,0.870194,0.834307,0.909308
2,0.0712,0.824752,0.8564,0.861764,0.835075,0.890215
3,0.0239,1.11199,0.8552,0.857143,0.850431,0.863962
4,0.0007,1.465261,0.8476,0.848268,0.849282,0.847255


TrainOutput(global_step=7030, training_loss=0.04935677032855225, metrics={'train_runtime': 12161.834, 'train_samples_per_second': 9.25, 'train_steps_per_second': 0.578, 'total_flos': 1.680587123494748e+17, 'train_loss': 0.04935677032855225, 'epoch': 4.9969777777777775})
Evaluating on validation set...


{'eval_loss': 0.5023949146270752, 'eval_accuracy': 0.8636, 'eval_f1': 0.8701941377997716, 'eval_precision': 0.8343065693430657, 'eval_recall': 0.9093078758949881, 'eval_runtime': 89.3344, 'eval_samples_per_second': 27.985, 'eval_steps_per_second': 6.996, 'epoch': 4.9969777777777775}
Evaluating on test set...
{'eval_loss': 0.6119452118873596, 'eval_accuracy': 0.83208, 'eval_f1': 0.837513546988698, 'eval_precision': 0.8112627474505099, 'eval_recall': 0.86552, 'eval_runtime': 845.0495, 'eval_samples_per_second': 29.584, 'eval_steps_per_second': 7.396, 'epoch': 4.9969777777777775}
Model saved to ./llama_3.2_imdb_sentiment_5_epochs
