# **Fine-Tuning Transformers with PyTorch and Hugging Face**

In [None]:
!pip uninstall -y torch torchvision torchaudio
!pip cache purge

In [None]:
!pip install torch==2.3.1 torchvision==0.18.1 torchaudio

In [None]:
!pip install torchtext==0.17.2
!pip install portalocker==2.8.2
!pip install torchdata==0.7.1
!pip install pandas
!pip install matplotlib==3.9.0 scikit-learn==1.5.0
!pip install numpy==1.26.0
!pip install --user transformers==4.42.1
!pip install --user datasets # 2.20.0
!pip install portalocker>=2.0.0
!pip install --user torchmetrics==1.4.0.post0
!pip install numpy==1.26.4
!pip install peft==0.11.1
!pip install evaluate==0.4.2
!pip install -q bitsandbytes==0.43.1
!pip install --user accelerate==0.31.0
!pip install --user trl==0.9.4
!pip install --user protobuf==3.20.*
!pip install matplotlib
!pip install fsspec==2023.1.0

In [None]:
import torch
import torchvision
# Print the versions that were actually imported, to confirm the pinned
# install took effect.
print(torch.__version__)
print(torchvision.__version__)
# `nms` is not used in the visible cells; presumably this import is a smoke
# test that torchvision's compiled ops load against the installed torch build
# - TODO confirm.
from torchvision.ops import nms

2.3.1+cu121
0.18.1+cu121


In [None]:
from torchmetrics import Accuracy
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import AutoConfig,AutoModelForCausalLM,AutoModelForSequenceClassification,BertConfig,BertForMaskedLM,TrainingArguments, Trainer, TrainingArguments
from transformers import AutoTokenizer,BertTokenizerFast,TextDataset,DataCollatorForLanguageModeling
from transformers import pipeline
from datasets import load_dataset
from trl import SFTConfig,SFTTrainer, DataCollatorForCompletionOnlyLM


import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import math
import time
import matplotlib.pyplot as plt

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


### Dataset Preparations

In [None]:
# Load the "yelp_review_full" dataset; the displayed result shows a train and
# a test split, each with 'label' and 'text' features.
dataset = load_dataset("yelp_review_full")
# Display the split layout together with one sample training record.
dataset, dataset["train"][100]

Downloading readme: 0.00B [00:00, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/23.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/299M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

(DatasetDict({
     test: Dataset({
         features: ['label', 'text'],
         num_rows: 50000
     })
     train: Dataset({
         features: ['label', 'text'],
         num_rows: 650000
     })
 }),
 {'label': 0,
  'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. S

In [None]:
# Tokenizer matching the "bert-base-cased" checkpoint fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Called through ``dataset.map(..., batched=True)``, so ``examples["text"]``
    is a batch of strings. Sequences are padded to the maximum length
    (``padding="max_length"``) and truncated (``truncation=True``).

    Note: this reads the module-level ``tokenizer`` at call time. A later cell
    rebinds ``tokenizer`` to the OPT tokenizer, so calling this function after
    that cell would use the other tokenizer.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Apply the tokenizer to every split; the original columns are kept and the
# tokenizer outputs are added alongside them (see the keys displayed below).
tokenized_datasets = dataset.map(tokenize_function, batched=True)

tokenized_datasets['train'][0].keys()

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/650000 [00:00<?, ? examples/s]

dict_keys(['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
# Drop the raw text, which is no longer needed once it has been tokenized.
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
# Rename "label" to "labels"; train_model / evaluate_model pass each batch as
# keyword arguments to the model and read batch["labels"].
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# Return torch tensors when indexing.
tokenized_datasets.set_format("torch")
# NOTE(review): this cell rebinds tokenized_datasets in place, so re-running
# it without re-running the tokenization cell presumably fails because the
# "text" / "label" columns are already gone - confirm.
tokenized_datasets['train'][0].keys()

dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
# Training batches are shuffled; evaluation batches keep dataset order.
# Both loaders use a batch size of 2.
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=2)
eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=2)

### Train the Model

In [None]:
# BERT with a 5-way sequence classification head. As the load message below
# reports, the classifier weights are newly initialized and need training.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Optimizer over all model parameters.
# NOTE(review): lr=5e-4 is kept as-is; confirm it is the intended value for
# fine-tuning this checkpoint.
optimizer = AdamW(model.parameters(), lr=5e-4)
num_epochs = 10
# One scheduler step is taken per training batch (see train_model).
num_training_steps = num_epochs * len(train_dataloader)
# Linear decay of the LR multiplier from 1 to 0 over num_training_steps.
# The multiplier is clamped at 0 so that stepping the scheduler beyond
# num_training_steps (e.g. running training twice in one session) cannot
# produce a negative learning rate. The denominator is floored at 1 so an
# empty dataloader does not raise ZeroDivisionError when LambdaLR evaluates
# the lambda.
lr_scheduler = LambdaLR(
    optimizer,
    lr_lambda=lambda current_step: max(0.0, 1 - current_step / max(1, num_training_steps)),
)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Move the model to the selected device; as the last expression of the cell,
# the returned module is displayed.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
def train_model(model,tr_dataloader):
    """Train ``model`` on ``tr_dataloader`` and plot the mean loss per epoch.

    Relies on the notebook-level globals ``num_epochs``, ``device``,
    ``optimizer`` and ``lr_scheduler``.

    Args:
        model: Model that is called with each batch as keyword arguments and
            whose output exposes a ``.loss`` attribute.
        tr_dataloader: Sized iterable of dict batches whose values support
            ``.to(device)``.

    Returns:
        list[float]: Mean training loss of each epoch, in order.

    Raises:
        ValueError: If ``tr_dataloader`` yields no batches.
    """
    batches_per_epoch = len(tr_dataloader)
    if batches_per_epoch == 0:
        # Without this guard the per-epoch average would divide by zero.
        raise ValueError("tr_dataloader is empty; there is nothing to train on")

    model.train()
    tr_losses = []
    # Size the bar from the loader that was actually passed in (not from the
    # global num_training_steps), and close it when training ends.
    with tqdm(total=num_epochs * batches_per_epoch) as progress_bar:
        for _ in range(num_epochs):
            total_loss = 0.0
            for batch in tr_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                loss = outputs.loss
                # Backward pass (compute gradients)
                loss.backward()
                total_loss += loss.item()
                # Update the model parameters
                optimizer.step()
                # Update the learning rate scheduler (one step per batch)
                lr_scheduler.step()
                # Clear the gradients before the next batch
                optimizer.zero_grad()
                progress_bar.update(1)
            tr_losses.append(total_loss / batches_per_epoch)

    fig, ax = plt.subplots()
    ax.plot(tr_losses)
    ax.set_title("Training loss")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Loss")
    plt.show()
    return tr_losses

### Evaluate

In [None]:
def evaluate_model(model, evl_dataloader):
    """Compute and print the classification accuracy of ``model``.

    Relies on the notebook-level global ``device``. The metric is configured
    for 5 classes, matching the classifier created in this notebook.

    Args:
        model: Model that is called with each batch as keyword arguments and
            whose output exposes ``.logits``.
        evl_dataloader: Iterable of dict batches containing a ``"labels"``
            entry; values must support ``.to(device)``.

    Returns:
        float: Accuracy accumulated over all batches (also printed).
    """
    metric = Accuracy(task="multiclass", num_classes=5).to(device)
    model.eval()
    with torch.no_grad():
        for batch in evl_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            predictions = torch.argmax(logits, dim=-1)
            # update() only accumulates state; calling the metric object
            # would additionally compute a per-batch value that is discarded.
            metric.update(predictions, batch["labels"])
    accuracy = metric.compute().item()
    print("Accuracy:", accuracy)
    return accuracy

In [None]:
# Training is left disabled here; the next section loads already fine-tuned
# weights instead. Uncomment to train from scratch.
# train_model(model=model,tr_dataloader=train_dataloader)

# NOTE(review): the loading cell below uses
# model.load_state_dict(torch.load(...)), which expects a state dict. Save the
# state dict (not the whole pickled model) so the two cells stay compatible:
# torch.save(model.state_dict(), 'my_model.pt')

### Loading the Saved Model

In [None]:
!wget 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/wFhKpkBMSgjmZKRSyayvsQ/bert-classification-model.pt'
model.load_state_dict(torch.load('bert-classification-model.pt',map_location=torch.device('cpu')))

--2025-07-08 14:53:27--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/wFhKpkBMSgjmZKRSyayvsQ/bert-classification-model.pt
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 198.23.119.245
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|198.23.119.245|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 433341834 (413M) [binary/octet-stream]
Saving to: ‘bert-classification-model.pt’


2025-07-08 14:53:37 (44.8 MB/s) - ‘bert-classification-model.pt’ saved [433341834/433341834]



<All keys matched successfully>

In [None]:
# Measure accuracy of the loaded weights on the test-split dataloader.
evaluate_model(model, eval_dataloader)

## Training a Conversational Model using SFTTrainer

In [None]:
# Load the train split of the conversational dataset.
# Note: this rebinds `dataset`, which previously held the Yelp reviews.
dataset = load_dataset("timdettmers/openassistant-guanaco", split="train")
# Each record has a single 'text' field with "### Human:" / "### Assistant:"
# turns, as the sample below shows.
dataset[0]

Downloading readme:   0%|          | 0.00/395 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/20.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

{'text': '### Human: Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.### Assistant: "Monopsony" refers to a market structure where there is only one buyer for a particular good or service. In economics, this term is particularly relevant in the labor market, where a monopsony employer has significant power over the wages and working conditions of their employees. The presence of a monopsony can result in lower wages and reduced employment opportunities for workers, as the employer has little incentive to increase wages or provide better working conditions.\n\nRecent research has identified potential monopsonies in industries such as retail and fast food, where a few large companies control a significant portion of the market (Bivens & Mishel, 2013). In these industries, workers often face low wages, limited benefits, and reduced bargaining po

In [None]:
# Causal language model and its tokenizer for the conversational task.
# Note: both names are rebound here; `model` and `tokenizer` previously
# referred to the BERT classifier and the BERT tokenizer.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

In [None]:
# Turn markers, matching the "### Human:" / "### Assistant:" prefixes seen in
# the dataset sample above; they are passed to the data collator below.
instruction_template = "### Human:"
response_template = "### Assistant:"

In [None]:
# Collator configured with the two turn markers and the OPT tokenizer.
# Presumably it restricts the training loss to the assistant completions -
# TODO confirm against the TRL documentation. mlm=False: no masked-LM objective.
collator = DataCollatorForCompletionOnlyLM(instruction_template=instruction_template, response_template=response_template, tokenizer=tokenizer, mlm=False)

In [None]:
# Supervised fine-tuning configuration.
training_args = SFTConfig(
    output_dir="/tmp",
    num_train_epochs=10,
    #learning_rate=2e-5,
    save_strategy="epoch",
    # Mixed-precision fp16 training requires a CUDA device. Enable it only
    # when one is present so this cell also runs on CPU-only kernels,
    # consistent with the device fallback used earlier in the notebook.
    fp16=torch.cuda.is_available(),
    per_device_train_batch_size=2,  # Reduce batch size
    per_device_eval_batch_size=2,  # Reduce batch size
    #gradient_accumulation_steps=4,  # Accumulate gradients
    max_seq_length=1024,
    # NOTE(review): do_eval is set but no eval_dataset is passed to the
    # trainer below - confirm whether evaluation is intended.
    do_eval=True)

trainer = SFTTrainer(
    model,
    args=training_args,
    train_dataset=dataset,
    dataset_text_field="text",
    # Pass the tokenizer explicitly so the trainer tokenizes with the same
    # instance the collator was built with.
    tokenizer=tokenizer,
    data_collator=collator,)

Map:   0%|          | 0/9846 [00:00<?, ? examples/s]

In [None]:
# Baseline: generate with the model before the fine-tuned weights are loaded,
# limited to 70 new tokens.
pipe = pipeline("text-generation", model=model,tokenizer=tokenizer,max_new_tokens=70)
# The pipeline returns a list of results; print the generated text of the first.
print(pipe('''Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.''')[0]["generated_text"])

Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.

The term "monopsony" is used in the context of the "mono" (mono-economy) model. The term "mono-economy" is used in the context of the "mono-economy" model. The term "mono-economy" is used in the context of


The base model is evidently not aware of what "monopsony" means in the context of economics.

In [None]:
# Fine-tuning is left disabled; the next cell loads already fine-tuned
# weights from 'Assistant_model.pt' instead. Uncomment to train.
#trainer.train()

In [None]:
!wget 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/Assistant_model.pt'
model.load_state_dict(torch.load('Assistant_model.pt',map_location=torch.device('cpu')))

--2025-07-08 15:55:56--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/Assistant_model.pt
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 198.23.119.245
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|198.23.119.245|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1324934570 (1.2G) [application/octet-stream]
Saving to: ‘Assistant_model.pt’


2025-07-08 15:56:35 (32.7 MB/s) - ‘Assistant_model.pt’ saved [1324934570/1324934570]



<All keys matched successfully>

In [None]:
## Write your code here
# Rebuild the pipeline around `model`, which now holds the loaded assistant
# weights, and run the same prompt as the baseline for comparison.
pipe = pipeline("text-generation", model=model,tokenizer=tokenizer,max_new_tokens=70)
print(pipe('''Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.''')[0]["generated_text"])

Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.

The term "monopsony" in economics refers to the practice of controlling the working class by imposing a price on them. This can be seen as a form of economic control, but it can also be seen as a form of political control, as the price of a product can be influenced by the interests of the state.


