In [1]:
from huggingface_hub import login

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
import os

# Never hardcode access tokens in a notebook: anything saved here is leaked
# to everyone who can read the file. The token is read from the HF_TOKEN
# environment variable; when it is unset, `login` receives None and falls
# back to its interactive prompt.
# NOTE(review): the token that used to be inlined in this cell must be treated
# as compromised and revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

# Dataset preparation

In [4]:
from datasets import load_dataset

In [5]:
# Load the Yelp full-review dataset.
dataset = load_dataset("yelp_review_full")

In [6]:
# Show the overall split layout, then the train split on its own.
print(dataset, dataset['train'], sep="\n")

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})
Dataset({
    features: ['label', 'text'],
    num_rows: 650000
})


In [7]:
# PEFT training does not exist with 4.33.2
!pip install transformers==4.33.3 datasets==3.0.1 evaluate peft==0.5.0



In [8]:
# Peek at the raw text of the first two training examples.
dataset['train'][:2]['text']

["dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.",
 "Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patient

# Get the tokenizer

In [9]:
import torch
from transformers import AutoTokenizer

In [10]:
_MODEL_ID = "bert-base-cased"  # Find a list of model ids at https://huggingface.co/models
# More details about this model can be found at https://huggingface.co/bert-base-cased
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=_MODEL_ID)



In [11]:
def tokenize_sample(sample):
    """Tokenize the ``"text"`` entry of a sample (or a batch of samples).

    Sequences are padded to the tokenizer's maximum length and longer inputs
    are truncated.

    Args:
        sample: Mapping with a ``"text"`` entry.

    Returns:
        The output of calling the module-level ``tokenizer`` on the text.
    """
    texts = sample["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

In [12]:
# Tokenize the dataset; batched=True hands `tokenize_sample` whole batches
# of examples instead of one example at a time.
tokenized_dataset = dataset.map(tokenize_sample, batched=True)

In [13]:
# Rename "label" to "labels" and drop the raw "text" column: the model does
# not accept text input and a list of strings cannot be copied to the GPU.
tokenized_dataset = (
    tokenized_dataset
    .rename_column("label", "labels")
    .remove_columns(["text"])
)
# Return torch.Tensor objects instead of Python lists.
tokenized_dataset.set_format(type="torch")

In [14]:
# Work on small 1000-example subsets, shuffled with a fixed seed.
small_train_dataset = tokenized_dataset["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_dataset["test"].shuffle(seed=42).select(range(1000))
# NOTE(review): the next line builds the full 650000 x 512 train tensor only
# to print its shape; consider dropping it when memory or time is tight.
print(tokenized_dataset['train']['input_ids'].shape)
print(small_train_dataset['input_ids'].shape)

torch.Size([650000, 512])
torch.Size([1000, 512])


In [15]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer



[2024-10-20 20:01:47,433] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [16]:
# Load the pretrained checkpoint with a 5-label classification head.
seq_class_model = AutoModelForSequenceClassification.from_pretrained(
    _MODEL_ID,
    num_labels=5,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# TrainingArguments reference:
# https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments
# Checkpoints go to "seq_class_res"; evaluation runs at the end of every epoch.
training_args = TrainingArguments(output_dir="seq_class_res", evaluation_strategy="epoch")

In [18]:
import evaluate
import numpy as np
from typing import Tuple

In [19]:
# Accuracy metric used by `_compute_acc_metric`.
acc_metric = evaluate.load("accuracy")

In [20]:
def _compute_acc_metric(eval_pred: Tuple[np.ndarray, np.ndarray]):
    """Computes the accuracy metric.

    Args:
        eval_pred: A pair ``(logits, labels)``: predicted logits of shape
            (B, N_classes) and labels of shape (B,).

    Returns:
        The value returned by ``acc_metric.compute`` when the arg-max class of
        each row of logits is compared against the reference labels.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(logits, axis=-1)
    return acc_metric.compute(predictions=predicted_classes, references=labels)
    
    

# Training with the Hugging Face Trainer

In [21]:
# Trainer for the full model, evaluated with the accuracy metric on the
# small subsets.
trainer = Trainer(
    args=training_args,
    model=seq_class_model,
    compute_metrics=_compute_acc_metric,
    eval_dataset=small_eval_dataset,
    train_dataset=small_train_dataset,
)

# Training using native PyTorch

In [22]:
from torch.utils.data import DataLoader

In [23]:
# Batches of 8; only the training loader is shuffled.
train_dataloader = DataLoader(dataset=small_train_dataset, batch_size=8, shuffle=True)
eval_dataloader = DataLoader(dataset=small_eval_dataset, batch_size=8)

In [24]:
from torch.optim import AdamW

# AdamW over every parameter of the classification model.
optimizer = AdamW(params=seq_class_model.parameters(), lr=5e-5)

In [25]:
from transformers import get_scheduler
# One scheduler step per batch: total steps = batches per epoch * epochs.
num_epochs = 5
num_training_steps = len(train_dataloader) * num_epochs
print(f"num_training_steps = {num_training_steps}")
# Linear schedule without warmup, spanning the whole run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

num_training_steps = 625


In [26]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [27]:
# Move the model to the selected device; the return value is assigned to `_`
# so the cell does not display the model repr.
_ = seq_class_model.to(device)

In [28]:
from tqdm.auto import tqdm

In [29]:
# Switch the model to training mode; `_` again suppresses the cell output.
_ = seq_class_model.train()

# Run the inference

# Add a PEFT adapter and train the model

In [30]:
# Second copy of the classifier, loaded with 8-bit weights, to carry the
# PEFT adapter.
# NOTE(review): 8-bit loading presumably needs extra packages that are not in
# the pip cell of this notebook -- confirm the environment provides them.
seq_class_model_2 = AutoModelForSequenceClassification.from_pretrained(
    _MODEL_ID,
    load_in_8bit=True,
    num_labels=5,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
from peft import LoraConfig

In [32]:
# LoRA settings for the adapter attached to `seq_class_model_2`.
lora_config = LoraConfig(
    r=64,  # rank of the low-rank update matrices
    lora_alpha=16,  # scaling factor of the LoRA update
    lora_dropout=0.1,
    bias="none",
    # The base model is an AutoModelForSequenceClassification, so the task is
    # sequence classification, not causal language modelling.
    task_type="SEQ_CLS",
    # NOTE(review): verify that the newly initialised classifier head stays
    # trainable once the adapter is attached (e.g. via `modules_to_save`);
    # confirm how the pinned peft/transformers versions handle it.
)

In [33]:
# Attach the adapter described by `lora_config` to the 8-bit model.
seq_class_model_2.add_adapter(lora_config)

In [34]:
# Trainer for the adapter-equipped model. It uses the same `training_args`
# (and therefore the same output directory), datasets and metric as `trainer`.
peft_trainer = Trainer(
    args=training_args,
    model=seq_class_model_2,
    compute_metrics=_compute_acc_metric,
    eval_dataset=small_eval_dataset,
    train_dataset=small_train_dataset,
)

In [35]:
# Run training; the returned TrainOutput is displayed as the cell result.
peft_trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.633789,0.212
2,No log,1.632812,0.216
3,No log,1.631836,0.216


TrainOutput(global_step=189, training_loss=1.6510209986772486, metrics={'train_runtime': 82.0843, 'train_samples_per_second': 36.548, 'train_steps_per_second': 2.303, 'total_flos': 811097699328000.0, 'train_loss': 1.6510209986772486, 'epoch': 3.0})