# Setup

In [None]:
!pip3 install torchmetrics transformers accelerate datasets deepspeed wandb


In [None]:
!nvidia-smi

In [None]:
import os
import torch
from torchmetrics import Accuracy, Precision, Recall, F1Score, AUROC
from transformers import Trainer, TrainingArguments, EvalPrediction
import wandb
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset


In [None]:
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '9994' # modify if RuntimeError: Address already in use
os.environ['RANK'] = "0"
os.environ['LOCAL_RANK'] = "0"
os.environ['WORLD_SIZE'] = "1"

# Function to generate train & eval set

In [None]:
def generate_datasets(tokenizer, dataset):
  # create tokenizer function
  def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)
  # create datasets
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
  train_dataset = tokenized_datasets["train"]
  eval_dataset = tokenized_datasets["test"]
  return train_dataset, eval_dataset

# Create Deep Speed config

In [None]:
%%bash
cat <<'EOT' > ds_config.json
{
  "zero_optimization": {
    "stage": 2,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "allgather_partitions": true,
    "allgather_bucket_size": 2e8,
    "reduce_scatter": true,
    "reduce_bucket_size": 2e8,
    "overlap_comm": true,
    "contiguous_gradients": true
  },

  "zero_allow_untested_optimizer": true,

  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": "auto",
      "betas": "auto",
      "eps": "auto",
      "weight_decay": "auto"
    }
  },

  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": "auto",
      "warmup_max_lr": "auto",
      "warmup_num_steps": "auto"
    }
  },

  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "steps_per_print": 10,
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
}
EOT

# Trainer

In [None]:

os.environ["WANDB_ENTITY"] = "sc4001" # name of W&B team
os.environ["WANDB_PROJECT"] = "text-sentiment-analysis" # name of W&B project

wandb.login()

# default optimizer: AdamW
training_args = TrainingArguments(
    output_dir='./results', # output directory of results
    num_train_epochs=3, # number of train epochs
    report_to='wandb', # enable logging to W&B
    evaluation_strategy='steps', # check evaluation metrics at each epoch
    logging_steps = 10, # we will log every 10 steps
    eval_steps = 50, # we will perform evaluation every 50 steps
    save_steps = 50, # we will save the model every 50 steps
    load_best_model_at_end = True, # we will load the best model at the end of training
    metric_for_best_model = 'accuracy', # metric to see which model is better
    deepspeed='ds_config.json', # deep speed integration

    #### effective batch_size = per_device_train_batch_size x gradient_accumulation_steps ####
    #### We set effective batch_size to 32 ####
    per_device_train_batch_size=8, # batch size per device
    gradient_accumulation_steps=4, # gradient accumulation
    per_device_eval_batch_size=8, # eval batch size per device
)


def compute_metrics(pred: EvalPrediction):
    """
    Compute metrics using torchmetrics for a given set of predictions and labels.

    Args:
    pred (EvalPrediction): An object containing model predictions and labels.

    Returns:
    dict: A dictionary containing metric results.
    """
    # Extract labels and predictions
    labels = pred.label_ids
    preds = pred.predictions

    num_classes = preds.shape[1]

    # Convert to torch tensors
    labels = torch.tensor(labels)
    preds = torch.tensor(preds)

    # Initialize metrics
    accuracy = Accuracy(task="multiclass", num_classes=num_classes)
    precision = Precision(task="multiclass", num_classes=num_classes)
    recall = Recall(task="multiclass", num_classes=num_classes)
    f1 = F1Score(task="multiclass", num_classes=num_classes)
    auroc = AUROC(task="multiclass", num_classes=num_classes)

    # Calculate metrics (automatically does argmax)
    accuracy_score = accuracy(preds, labels)
    precision_score = precision(preds, labels)
    recall_score = recall(preds, labels)
    f1_score = f1(preds, labels)
    auroc_score = auroc(preds, labels)


    # Convert to CPU for serialization
    return {
        "accuracy": accuracy_score.cpu().item(),
        "precision": precision_score.cpu().item(),
        "recall": recall_score.cpu().item(),
        "f1": f1_score.cpu().item(),
        "auroc": auroc_score.cpu().item(),
    }

class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, compute_metrics=compute_metrics, args=training_args, **kwargs)

    def compute_loss(self, model, inputs, return_outputs=False):
        """
        Override the default compute_loss.
        Use Cross Entropy Loss for multiclass classification (>= 2).
        """
        labels = inputs.pop("labels")

        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # compute cross entropy loss
        loss_func = torch.nn.CrossEntropyLoss()
        loss = loss_func(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

# Load Datasets

In [8]:
imdbDataset = load_dataset("imdb")
sstDataset = load_dataset("sst2")
# https://huggingface.co/datasets/reza-alipour/Yelp_Sentiment
yelpDataset = load_dataset("reza-alipour/Yelp_Sentiment")


Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.85k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/704 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/12.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.75M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.45M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/444101 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/63483 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/126670 [00:00<?, ? examples/s]

# IMDB Dataset

In [10]:
# load models

# roberta https://huggingface.co/roberta-base
robertaTokenizer = AutoTokenizer.from_pretrained("roberta-base")
robertaModel = AutoModelForSequenceClassification.from_pretrained("roberta-base")

# gpt2 https://huggingface.co/gpt2
gptTokenizer = AutoTokenizer.from_pretrained("gpt2")
gptModel = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=2)
if gptTokenizer.pad_token is None:
    gptTokenizer.add_special_tokens({'pad_token': '[PAD]'})
    gptModel.resize_token_embeddings(len(gptTokenizer))

# T5 https://huggingface.co/t5-base
t5Tokenizer = AutoTokenizer.from_pretrained("t5-base")
t5Model = AutoModelForSequenceClassification.from_pretrained("t5-base")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at t5-base and are newly initialized: ['classification_head.out_proj.bias', 'classification_head.dense.weight', 'classification_head.out_proj.weight', 'classification_head.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
r_train_dataset, r_eval_dataset = generate_datasets(robertaTokenizer, imdbDataset)
g_train_dataset, g_eval_dataset = generate_datasets(gptTokenizer, imdbDataset)
t_train_dataset, t_eval_dataset = generate_datasets(t5Tokenizer, imdbDataset)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Train Models

In [None]:
# roberta
r_trainer = CustomTrainer(
    model=robertaModel,
    train_dataset=r_train_dataset,
    eval_dataset=r_eval_dataset,
)

r_trainer.train()

In [None]:
# gpt
g_trainer = CustomTrainer(
    model=gptModel,
    train_dataset=g_train_dataset,
    eval_dataset=g_eval_dataset,
)

g_trainer.train()

In [None]:
# t5
t_trainer = CustomTrainer(
    model=t5Model,
    train_dataset=t_train_dataset,
    eval_dataset=t_eval_dataset,
)

t_trainer.train()