<a href="https://colab.research.google.com/github/tecexbarto/EmotiBot/blob/main/finetuning_bart_large.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMPORTS

In [None]:
#install the latest version of the Hugging Face dataset library
!pip install datasets --upgrade

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [None]:
#install three necessary libraries: transformers, datasets and torch (PyTorch, necessary for model training and tensor operations)
!pip install transformers datasets torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import datasets
import fsspec
import gcsfs
#report the installed version of each library, one per line
for library in (datasets, fsspec):
    print(library.__version__)

3.3.2
2024.10.0


In [None]:
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from datasets import load_dataset
from torch.utils.data import DataLoader
import random
import numpy as np

In [None]:
#use one seed for every random number generator involved: python, numpy, torch (CPU and all GPUs)
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

#cuDNN settings for reproducibility: no auto-tuned kernel selection, deterministic algorithms only
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# LOAD THE DATASET

In [None]:
#download the counseling conversations dataset from the Hugging Face Hub
dataset = load_dataset(path="Amod/mental_health_counseling_conversations")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

combined_dataset.json:   0%|          | 0.00/4.79M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3512 [00:00<?, ? examples/s]

In [None]:
#give the two text columns clearer names: Context -> Question, Response -> Answer
dataset = (
    dataset
    .rename_column("Context", "Question")
    .rename_column("Response", "Answer")
)

# DOWNLOAD THE MODEL

In [None]:
#checkpoint to fine-tune; the tokenizer and the model weights are both loaded from it
model_name = "facebook/bart-large"
tokenizer = BartTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = BartForConditionalGeneration.from_pretrained(pretrained_model_name_or_path=model_name)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

# FINE TUNING

In [None]:
#tokenization and preprocessing of the dataset
def tokenize_data(example, max_length=512, tok=None):
    """Tokenize questions as model inputs and answers as training labels.

    Works on a single example (string fields) or on a batch (lists of
    strings), as passed by `dataset.map(..., batched=True)`.

    Args:
        example: mapping with the text fields "Question" and "Answer".
        max_length: length every sequence is truncated/padded to.
        tok: tokenizer to use; defaults to the notebook-level `tokenizer`.

    Returns:
        The tokenized question ("input_ids", "attention_mask") with an added
        "labels" entry holding the answer token ids. Padding positions in the
        labels are set to -100 so that the loss ignores them.
    """
    tok = tokenizer if tok is None else tok

    inputs = tok(example["Question"], max_length=max_length, truncation=True, padding="max_length")  #we tokenize the question
    outputs = tok(example["Answer"], max_length=max_length, truncation=True, padding="max_length")   #we tokenize the response

    def mask_padding(token_ids, attention_mask):
        #-100 is the index the model's loss ignores; without it the loss is computed on padding too
        return [token if keep else -100 for token, keep in zip(token_ids, attention_mask)]

    if isinstance(example["Answer"], str):
        #single example: flat lists of ids
        inputs["labels"] = mask_padding(outputs["input_ids"], outputs["attention_mask"])
    else:
        #batch: one list of ids per example
        inputs["labels"] = [
            mask_padding(token_ids, attention_mask)
            for token_ids, attention_mask in zip(outputs["input_ids"], outputs["attention_mask"])
        ]
    return inputs

In [None]:
#run the tokenization function over the complete dataset, in batches
tokenized_dataset = dataset.map(tokenize_data, batched=True)

#hold out 20% of the tokenized examples for validation (80% training); the seed is fixed for the split
train_valid_split = tokenized_dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset, valid_dataset = train_valid_split["train"], train_valid_split["test"]

#return only the columns the model consumes, as torch tensors
for split in (train_dataset, valid_dataset):
    split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

#report the size of each split
for split_name, split in (("Train", train_dataset), ("Validation", valid_dataset)):
    print(f"{split_name}: {len(split)} samples")

model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

Map:   0%|          | 0/3512 [00:00<?, ? examples/s]

Train: 2809 samples
Validation: 703 samples


In [None]:
#custom collate function for the DataLoader
def collate_fn(batch):
    """Combine a list of examples into one batch.

    Args:
        batch: non-empty list of dicts that share the same keys, each value a tensor.

    Returns:
        A dict with the keys of the first example; each value is the
        `torch.stack` of that key's tensors across all examples.
    """
    stacked = {}
    for key in batch[0]:
        per_example = [example[key] for example in batch]
        stacked[key] = torch.stack(per_example)
    return stacked

In [None]:
#Configure DataLoaders: both use the same batch size and collate function
loader_options = {"batch_size": 8, "collate_fn": collate_fn}

#only the training data is shuffled; validation keeps its order
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_options)
valid_dataloader = DataLoader(valid_dataset, shuffle=False, **loader_options)

In [None]:
#configuration of the model and device: use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

#gradient scaler for FP16 training, to save memory and improve performance
#torch.amp.GradScaler replaces the deprecated torch.cuda.amp.GradScaler; it is switched off when running on CPU
scaler = torch.amp.GradScaler("cuda", enabled=device.type == "cuda")

#define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

  scaler = torch.cuda.amp.GradScaler()  # Escalador para FP16


In [None]:
#define number of epochs and early stopping
epochs = 40
early_stopping_patience = 2
best_valid_loss = float('inf')
epochs_without_improvement = 0
gradient_accumulation_steps = 2

#FP16 autocast is only enabled on a CUDA device
use_amp = device.type == "cuda"
#becomes True once a checkpoint has been written during this run
best_checkpoint_saved = False

for epoch in range(epochs):
    model.train()  #training phase
    epoch_loss = 0
    optimizer.zero_grad()  #start every epoch without leftover gradients

    for step, batch in enumerate(train_dataloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        #torch.autocast replaces the deprecated torch.cuda.amp.autocast
        with torch.autocast(device_type="cuda", enabled=use_amp):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        #unscaled loss of this batch, used for all reporting
        batch_loss = outputs.loss.item()
        #divide only the loss that is back-propagated, so the accumulated gradient is an average
        loss = outputs.loss / gradient_accumulation_steps

        scaler.scale(loss).backward()

        #update the weights every `gradient_accumulation_steps` batches and on the last batch of the epoch
        if (step + 1) % gradient_accumulation_steps == 0 or (step + 1) == len(train_dataloader):
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        #accumulate the unscaled loss; adding the divided loss would under-report the epoch average
        epoch_loss += batch_loss

        if (step + 1) % 10 == 0:
            print(f"Epoch {epoch + 1}, Step {step + 1}, Loss: {batch_loss}")

    avg_train_loss = epoch_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1} completed. Average Training Loss: {avg_train_loss}")

    #validation phase
    model.eval()
    valid_loss = 0

    with torch.no_grad():
        for batch in valid_dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            with torch.autocast(device_type="cuda", enabled=use_amp):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            valid_loss += outputs.loss.item()

    avg_valid_loss = valid_loss / len(valid_dataloader)
    print(f"Epoch {epoch + 1} completed. Average Validation Loss: {avg_valid_loss}")

    #early stopping to verify whether the loss of validation has improved
    if avg_valid_loss < best_valid_loss:
        best_valid_loss = avg_valid_loss
        epochs_without_improvement = 0
        torch.save(model.state_dict(), "best_model.pth")
        best_checkpoint_saved = True
        print("Best model saved!")
    else:
        epochs_without_improvement += 1

    #if it does not improve at `early_stopping_patience` epochs, stop training
    if epochs_without_improvement >= early_stopping_patience:
        print(f"Early stopping at epoch {epoch + 1}")
        break

#the loop can end on weights from an epoch that did not improve the validation loss;
#restore the best checkpoint so that the model used afterwards is the best one
if best_checkpoint_saved:
    model.load_state_dict(torch.load("best_model.pth", map_location=device))

  with torch.cuda.amp.autocast():  # Activar FP16


Epoch 1, Step 10, Loss: 16.786670684814453
Epoch 1, Step 20, Loss: 10.512179374694824
Epoch 1, Step 30, Loss: 10.306687355041504
Epoch 1, Step 40, Loss: 10.57268238067627
Epoch 1, Step 50, Loss: 8.247928619384766
Epoch 1, Step 60, Loss: 8.876813888549805
Epoch 1, Step 70, Loss: 7.3818793296813965
Epoch 1, Step 80, Loss: 7.813458442687988
Epoch 1, Step 90, Loss: 6.540438652038574
Epoch 1, Step 100, Loss: 6.250674724578857
Epoch 1, Step 110, Loss: 5.034867286682129
Epoch 1, Step 120, Loss: 4.980526924133301
Epoch 1, Step 130, Loss: 4.4768452644348145
Epoch 1, Step 140, Loss: 3.173612594604492
Epoch 1, Step 150, Loss: 3.474034070968628
Epoch 1, Step 160, Loss: 2.2179691791534424
Epoch 1, Step 170, Loss: 2.0587215423583984
Epoch 1, Step 180, Loss: 1.8316856622695923
Epoch 1, Step 190, Loss: 1.8888311386108398
Epoch 1, Step 200, Loss: 1.9609050750732422
Epoch 1, Step 210, Loss: 2.2397727966308594
Epoch 1, Step 220, Loss: 0.9143024682998657
Epoch 1, Step 230, Loss: 1.2860716581344604
Epoch 1

  with torch.cuda.amp.autocast():


Epoch 1 completed. Average Validation Loss: 1.2605583119121464
Best model saved!
Epoch 2, Step 10, Loss: 1.6012061834335327
Epoch 2, Step 20, Loss: 1.2206735610961914
Epoch 2, Step 30, Loss: 1.6722817420959473
Epoch 2, Step 40, Loss: 1.437752604484558
Epoch 2, Step 50, Loss: 1.0632672309875488
Epoch 2, Step 60, Loss: 2.151751756668091
Epoch 2, Step 70, Loss: 1.9813530445098877
Epoch 2, Step 80, Loss: 1.0969117879867554
Epoch 2, Step 90, Loss: 1.2604095935821533
Epoch 2, Step 100, Loss: 1.4748340845108032
Epoch 2, Step 110, Loss: 1.1554142236709595
Epoch 2, Step 120, Loss: 1.5153465270996094
Epoch 2, Step 130, Loss: 1.361903429031372
Epoch 2, Step 140, Loss: 1.511640191078186
Epoch 2, Step 150, Loss: 1.1754885911941528
Epoch 2, Step 160, Loss: 0.9642397165298462
Epoch 2, Step 170, Loss: 0.8112015724182129
Epoch 2, Step 180, Loss: 1.1286334991455078
Epoch 2, Step 190, Loss: 1.6135023832321167
Epoch 2, Step 200, Loss: 0.8392849564552307
Epoch 2, Step 210, Loss: 1.4532430171966553
Epoch 2,

In [None]:
#save model, configuration and tokenizer
model.save_pretrained("best_model")
tokenizer.save_pretrained("best_model")

#compress the model to download
!zip -r best_model.zip best_model
from google.colab import files
files.download("best_model.zip")



  adding: best_model/ (stored 0%)
  adding: best_model/model.safetensors (deflated 7%)
  adding: best_model/tokenizer_config.json (deflated 76%)
  adding: best_model/vocab.json (deflated 68%)
  adding: best_model/generation_config.json (deflated 47%)
  adding: best_model/merges.txt (deflated 53%)
  adding: best_model/config.json (deflated 63%)
  adding: best_model/special_tokens_map.json (deflated 85%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# TEST THE MODEL

In [None]:
#switch the model to evaluation mode; the trailing ';' keeps the cell from printing the whole module structure
model.eval();

BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

In [None]:
from transformers import BartTokenizer

#load the tokenizer of the checkpoint that was fine-tuned (facebook/bart-large), not the one of facebook/bart-base
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")

#therapeutic test question
ask_question = "I always feel angry."

#tokenize the question as PyTorch tensors and move them to the same device as the model
inputs = tokenizer(ask_question, return_tensors="pt").to(device)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

In [None]:
#generate the model response with the following parameters
output_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=200,
    num_beams=5,
    early_stopping=True
)

In [None]:
#convert the first generated sequence back to text, leaving out the special tokens
first_sequence = output_ids[0]
response = tokenizer.decode(first_sequence, skip_special_tokens=True)
print("Respuesta del modelo:", response)

Respuesta del modelo: Anger is not necessarily a bad thing. If you are angry and you can talk about your feelings, that would be very helpful. Anger usually comes along with something else, like feeling sad, worried, overwhelmed, confused, and many others. Consider looking at what you notice in addition to anger and you may have a different starting point.
