In [1]:
!pip install transformers[sentencepiece]
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
# Import Libraries

from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, get_scheduler
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch
from tqdm.auto import tqdm
from accelerate import Accelerator
import evaluate

2026-02-10 13:45:11.153142: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770731111.357812      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770731111.417500      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770731111.920040      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770731111.920081      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770731111.920084      55 computation_placer.cc:177] computation placer alr

In [15]:
# Data Preprocessing and Cleaning

spam_dataset = load_dataset("sms_spam")
print(spam_dataset)
print(spam_dataset['train'].features)

# The dataset only provides a 'train' split, so the validation and test sets
# have to be carved out of it: 30% is held out first, then that hold-out is
# cut in half. Both cuts use a fixed seed so the partition is repeatable.
train_val = spam_dataset['train'].train_test_split(test_size=0.3, seed=42)
val_test = train_val['test'].train_test_split(test_size=0.5, seed=42)

train_dataset, val_dataset, test_dataset = (
    train_val['train'],
    val_test['train'],
    val_test['test'],
)

# Report the size of every split, one per line.
print(
    f"Train: {len(train_dataset)}",
    f"Val: {len(val_dataset)}",
    f"Test: {len(test_dataset)}",
    sep="\n",
)

DatasetDict({
    train: Dataset({
        features: ['sms', 'label'],
        num_rows: 5574
    })
})
{'sms': Value('string'), 'label': ClassLabel(names=['ham', 'spam'])}
Train: 3901
Val: 836
Test: 837


In [18]:
# Define Preprocessing function

def preprocessing_function(example):
    """Tokenize the 'sms' field of `example` with the notebook-level `tokenizer`.

    Truncation is switched on; no padding argument is passed here.

    NOTE(review): `tokenizer` is not created in any cell visible in this
    notebook -- presumably it comes from a cell that is not shown. Confirm the
    notebook still works after Restart Kernel -> Run All.
    """
    sms_text = example['sms']
    encoded = tokenizer(sms_text, truncation=True)
    return encoded

In [19]:
# Map the function to all datasets

# Tokenize every split in batched mode. The splits are processed in the order
# train, test, eval, which is also the order of the names being assigned.
tokenized_dataset_train, tokenized_dataset_test, tokenized_dataset_eval = (
    split.map(preprocessing_function, batched=True)
    for split in (train_dataset, test_dataset, val_dataset)
)

Map:   0%|          | 0/3901 [00:00<?, ? examples/s]

Map:   0%|          | 0/837 [00:00<?, ? examples/s]

Map:   0%|          | 0/836 [00:00<?, ? examples/s]

In [20]:
# Define DataCollator for dynamic padding

# Collator built around the same tokenizer that produced the encodings; it is
# later handed to the DataLoaders as their `collate_fn`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Show which columns tokenization added to the training split.
print(tokenized_dataset_train)

Dataset({
    features: ['sms', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3901
})


In [24]:
# Complete Postprocessing

def _to_model_inputs(dataset):
    """Return `dataset` with only tensor-ready columns, formatted as torch tensors.

    Steps:
      * drop the raw 'sms' text column,
      * rename 'label' to 'labels' (the key each batch carries when it is
        unpacked with `model(**batch)`),
      * switch the output format to 'torch'.

    The first two steps are skipped when they have already been applied, so
    re-running this cell on already-processed splits does not raise.
    """
    if 'sms' in dataset.column_names:
        dataset = dataset.remove_columns(['sms'])
    if 'label' in dataset.column_names:
        dataset = dataset.rename_column('label', 'labels')
    # set_format changes the dataset object it is called on.
    dataset.set_format('torch')
    return dataset


tokenized_dataset_train = _to_model_inputs(tokenized_dataset_train)
tokenized_dataset_test = _to_model_inputs(tokenized_dataset_test)
tokenized_dataset_eval = _to_model_inputs(tokenized_dataset_eval)

print(tokenized_dataset_train.column_names)

['labels', 'input_ids', 'token_type_ids', 'attention_mask']


In [25]:
# Use DataLoaders

BATCH_SIZE = 8

# Only the training loader shuffles; all three share the same collator and
# batch size.
train_dataloader = DataLoader(
    tokenized_dataset_train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_dataset_eval,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)
test_dataloader = DataLoader(
    tokenized_dataset_test,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)

# Pull a single training batch and display the shape of each tensor in it.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 60]),
 'token_type_ids': torch.Size([8, 60]),
 'attention_mask': torch.Size([8, 60])}

In [27]:
# Examine output for 1 batch

# Sanity check: run one forward pass on the batch pulled from train_dataloader.
# The batch carries a 'labels' entry, and the output exposes both a loss and
# the logits, which are printed below.
# NOTE(review): `model` is not created in any cell visible in this notebook --
# presumably it comes from a cell that is not shown. Confirm the notebook still
# works after Restart Kernel -> Run All.
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(1.0956, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [28]:
# Define Optimizer

# Hyperparameters are named so they can be tuned in one place.
LEARNING_RATE = 4e-5
WEIGHT_DECAY = 0.01

optimizer = AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

In [29]:
# Define Scheduler

num_epochs = 3
# The training loop steps the scheduler once per batch, so the schedule has to
# span every batch of every epoch.
num_training_steps = len(train_dataloader) * num_epochs

# "linear" schedule with no warmup steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

print(num_training_steps)

1464


In [30]:
# Check GPU

# `is_available` has to be *called*: the bare function object is always
# truthy, which would select 'cuda' even on a machine without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [32]:
# Make a progress bar to track training

# The bar is advanced by hand from the training loop (one `update(1)` per
# optimizer step), so it only needs to know the total number of steps.
progress_bar = tqdm(total=num_training_steps)

  0%|          | 0/1464 [00:00<?, ?it/s]

In [33]:
# Define Accelerator

accelerator = Accelerator()

# The prepared objects are unpacked in the same order they were passed in.
# `model` and `optimizer` are rebound to their prepared versions; the loaders
# get new names (`train_dl`, `eval_dl`).
prepared = accelerator.prepare(train_dataloader, eval_dataloader, model, optimizer)
train_dl, eval_dl, model, optimizer = prepared

In [34]:
# Training the model

model.train()

for epoch in range(num_epochs):
    for batch in train_dl:
        # Forward pass; the batch includes labels, so the output carries a loss.
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass is routed through the accelerator rather than
        # calling loss.backward() directly.
        accelerator.backward(loss)

        # Apply the update, advance the LR schedule, then clear the gradients
        # before the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

In [38]:
# Evaluation

metric1 = evaluate.load("accuracy")
metric2 = evaluate.load("f1")
model.eval()

# Reuse the loader that was already passed through accelerator.prepare()
# together with the model, instead of preparing the raw loader a second time.
# The name is still rebound so later code that reads `eval_dataloader` gets a
# prepared loader, and re-running this cell no longer re-prepares anything.
eval_dataloader = eval_dl
for batch in eval_dataloader:
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    # Gather predictions and labels once per batch and feed both metrics from
    # the same tensors. gather_for_metrics (rather than gather) drops the
    # samples that were duplicated to fill the last batch in multi-process
    # runs, so they are not counted twice.
    all_predictions, all_references = accelerator.gather_for_metrics(
        (predictions, batch["labels"])
    )
    for metric in (metric1, metric2):
        metric.add_batch(predictions=all_predictions, references=all_references)

print(metric1.compute())
print(metric2.compute())



{'accuracy': 0.9940191387559809}
{'f1': 0.9777777777777777}
