# Load Library

In [1]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [2]:
!pip install huggingface_hub
!pip install transformers datasets torch
!pip install --upgrade torch
!pip install --upgrade pip
!pip install --disable-pip-version-check \
    torch \
    torchdata \
    transformers[torch] \
    evaluate \
    rouge_score \
    loralib \
    datasets \

!pip install 'accelerate>=0.26.0' --quiet

Collecting huggingface_hub
  Downloading huggingface_hub-0.27.0-py3-none-any.whl (450 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m450.5/450.5 KB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tqdm>=4.42.1
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 KB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tqdm, huggingface_hub
Successfully installed huggingface_hub-0.27.0 tqdm-4.67.1
[0mCollecting transformers
  Downloading transformers-4.47.1-py3-none-any.whl (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hCollecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 KB[0m [31m82.9 MB/s[0m eta [36m0:00:00[0m
Collecting regex!=2019.12.17


In [3]:
import torch
torch.cuda.is_available()

True

In [4]:
nvidiagpu = !nvidia-smi
nvidiagpu

['Tue Dec 24 09:33:42 2024       ',
 '+---------------------------------------------------------------------------------------+',
 '| NVIDIA-SMI 535.154.05             Driver Version: 535.154.05   CUDA Version: 12.2     |',
 '|-----------------------------------------+----------------------+----------------------+',
 '| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |',
 '| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |',
 '|                                         |                      |               MIG M. |',
 '|   0  NVIDIA GeForce RTX 3090        Off | 00000000:0B:00.0 Off |                  N/A |',
 '|  0%   37C    P0             109W / 420W |      3MiB / 24576MiB |     14%      Default |',
 '|                                         |                      |                  N/A |',
 '+-----------------------------------------+----------------------+----------------------+',
 '                      

# Load Data & EDA

In [5]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import pandas as pd
import numpy as np

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# torch.autograd.set_detect_anomaly(True)
f"Using device: {device}"

'Using device: cuda'

In [7]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
from datasets import load_dataset

dataset = load_dataset("super_glue", "rte", trust_remote_code=True)
dataset


README.md:   0%|          | 0.00/18.2k [00:00<?, ?B/s]

super_glue.py:   0%|          | 0.00/30.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/277 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label'],
        num_rows: 277
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label'],
        num_rows: 3000
    })
})

In [9]:
import hashlib
import datetime
import random

LABEL_MAP = {0: "Entailment", 1: "Neutral", 2: "Contradiction"}
TEMPLATE_VARIANTS = [
    "Given the premise and hypothesis below, identify whether the hypothesis logically follows from the premise.",
    "Determine the logical relationship between the following premise and hypothesis.",
    "Does the hypothesis follow, contradict, or remain neutral to the premise provided below?",
    "Classify the relationship between the provided premise and hypothesis as entailment, contradiction, or neutral.",
    "Based on the premise, decide if the hypothesis is entailed, neutral, or contradicting it.",
    "Analyze the premise and hypothesis to classify their logical connection.",
    "Evaluate whether the hypothesis is supported, unrelated, or contradicted by the premise.",
    "Read the premise and hypothesis carefully and classify their relationship."
]

def generate_unique_id(premise, hypothesis):
    return hashlib.md5(f"{premise}{hypothesis}".encode()).hexdigest()

def generate_metadata(sample, unique_id):
    return {
        "idx": sample.get("idx", None),
        "source": "SuperGLUE RTE",
        "timestamp": datetime.datetime.now().isoformat(),
        "unique_id": unique_id,
        "lengths": {
            "premise": len(sample["premise"].split()),
            "hypothesis": len(sample["hypothesis"].split())
        },
    }

def process_superglue_rte(sample):
    label = LABEL_MAP.get(sample["label"], str(sample["label"]).capitalize())
    # Replace '-1' with 'Neutral' in the dataset
    if label == "-1":
        label = "Neutral"
        
    unique_id = generate_unique_id(sample["premise"], sample["hypothesis"])
    metadata = generate_metadata(sample, unique_id)
    instruction = random.choice(TEMPLATE_VARIANTS)
    return {
        "instruction": instruction,
        "input": {
            "premise": sample["premise"],
            "hypothesis": sample["hypothesis"]
        },
        "output": label,
        "metadata": metadata
    }

def process_superglue_dataset(task_name, dataset):
    if task_name != "rte":
        raise ValueError(f"Task '{task_name}' is not supported.")
    return [process_superglue_rte(sample) for sample in dataset]

trainData = process_superglue_dataset('rte', dataset['train'])
testData = process_superglue_dataset('rte', dataset['test'])
valData = process_superglue_dataset('rte', dataset['validation'])


In [10]:
trainData[0]


{'instruction': 'Analyze the premise and hypothesis to classify their logical connection.',
 'input': {'premise': 'No Weapons of Mass Destruction Found in Iraq Yet.',
  'hypothesis': 'Weapons of Mass Destruction Found in Iraq.'},
 'output': 'Neutral',
 'metadata': {'idx': 0,
  'source': 'SuperGLUE RTE',
  'timestamp': '2024-12-24T09:36:17.856533',
  'unique_id': 'd66c49c494a8aa9999ea35c06205542c',
  'lengths': {'premise': 9, 'hypothesis': 7}}}

# Data preprocessing

# Tokenize

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [12]:
from transformers import AutoModelForSeq2SeqLM
import torch
import torch.nn as nn

class LVModel(nn.Module):
    def __init__(self, base_model):
        super(LVModel, self).__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(p=0.3)
        self.layer_norm = nn.LayerNorm(self.base_model.config.d_model)
        self.classifier = nn.Linear(self.base_model.config.d_model, 3)
        
        # Ensure weight sharing is maintained
        self.base_model.shared = self.base_model.encoder.embed_tokens
        self.base_model.decoder.embed_tokens = self.base_model.encoder.embed_tokens

    def forward(self, input_ids, attention_mask=None, decoder_input_ids=None, labels=None):
        outputs = self.base_model(
            input_ids=input_ids, 
            attention_mask=attention_mask, 
            decoder_input_ids=decoder_input_ids, 
            labels=labels
        )
        logits = self.dropout(outputs.logits)
        
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
            return {"loss": loss, "logits": logits}
        else:
            return {"logits": logits}

    def save_pretrained(self, path):
        # Save the model configuration
        self.base_model.config.save_pretrained(path)
        
        # Save the model weights
        state_dict = self.state_dict()
        
        # Remove duplicate weights
        if 'base_model.decoder.embed_tokens.weight' in state_dict:
            del state_dict['base_model.decoder.embed_tokens.weight']
        if 'base_model.shared.weight' in state_dict:
            del state_dict['base_model.shared.weight']
            
        torch.save(state_dict, f"{path}/FlanT5.bin")

# Load base model
base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
model = LVModel(base_model=base_model)
model.to(device)


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

LVModel(
  (base_model): T5ForConditionalGeneration(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=384, bias=False)
                (k): Linear(in_features=512, out_features=384, bias=False)
                (v): Linear(in_features=512, out_features=384, bias=False)
                (o): Linear(in_features=384, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 6)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseGatedActDense(
                (wi_0): Linear(in_features=512, out_features=1024, bias=False)
                (wi_1): Linear(i

In [13]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()

    return \
        f"Trainable model parameters: {trainable_model_params}\n" +\
        f"All model parameters: {all_model_params}\n" +\
        f"Percentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(model))

Trainable model parameters: 76963715
All model parameters: 76963715
Percentage of trainable model parameters: 100.00%


In [14]:
from transformers import T5Tokenizer
import torch

# Hàm chuẩn bị dữ liệu
def prepare_data_for_training(data, tokenizer, max_length=128):
    """
    Hàm chuẩn bị dữ liệu đầu vào cho mô hình học sâu.
    
    Args:
    - data (list): Danh sách các ví dụ dữ liệu chứa tiền đề, giả thuyết và nhãn.
    - tokenizer (T5Tokenizer): Tokenizer dùng để token hóa văn bản.
    - max_length (int): Độ dài tối đa của chuỗi token.

    Returns:
    - inputs_tensor (torch.Tensor): Tensor đầu vào (input_ids) cho mô hình.
    - labels_tensor (torch.Tensor): Tensor nhãn cho mô hình.
    """
    inputs = []
    labels = []

    # Mã hóa dữ liệu
    for example in data:
        premise = example['input']['premise']
        hypothesis = example['input']['hypothesis']
        
        # Hợp nhất tiền đề và giả thuyết với dấu phân cách "<sep>"
        input_text = f"{premise} <sep> {hypothesis}"
        
        # Token hóa và chuẩn hóa độ dài chuỗi
        encoding = tokenizer(input_text, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")
        
        # Thêm input_ids vào danh sách inputs
        inputs.append(encoding['input_ids'])
        
        # Mã hóa nhãn thành số
        label = example['output']
        label_map = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
        labels.append(label_map.get(label, -1))  # Nếu không tìm thấy nhãn, gán -1

    # Chuyển inputs và labels thành tensor
    inputs_tensor = torch.cat(inputs, dim=0)
    labels_tensor = torch.tensor(labels)

    return inputs_tensor, labels_tensor
def prepare_data_for_training(data, tokenizer, max_length=128):
    inputs = tokenizer([item['input']['premise'] + " " + item['input']['hypothesis'] for item in data], 
                       padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    labels = tokenizer([item['output'] for item in data], 
                       padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    return inputs['input_ids'], labels['input_ids']
# Gọi hàm chuẩn bị dữ liệu
trainDataset = prepare_data_for_training(trainData, tokenizer)
testDataset = prepare_data_for_training(testData, tokenizer)
valDataset =  prepare_data_for_training(valData, tokenizer)


In [19]:
trainDataset[0]

tensor([[  465, 30785,     7,  ...,     0,     0,     0],
        [   71,   286,    13,  ...,     0,     0,     0],
        [ 1347,  6873,    77,  ...,     0,     0,     0],
        ...,
        [15971,    31,     7,  ...,     0,     0,     0],
        [12805, 28666,  2501,  ...,  3677, 11095,     1],
        [ 9299,    19,  9909,  ...,     0,     0,     0]])

In [18]:
def tokenize_dataset(data, tokenizer, max_length=128):
    """
    Tokenizes the dataset for training.
    
    Args:
        data (list): List of examples containing premise and hypothesis
        tokenizer: The tokenizer to use
        max_length (int): Maximum sequence length
    """
    tokenized_data = []
    
    for example in data:
        premise = example['input']['premise']
        hypothesis = example['input']['hypothesis']
        label = example['output'].lower()
        
        # Combine premise and hypothesis
        input_text = f"{premise} </s> {hypothesis}"
        
        # Tokenize input
        inputs = tokenizer(
            input_text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        # Tokenize output/label
        labels = tokenizer(
            label,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        tokenized_data.append({
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': labels['input_ids'].squeeze()
        })
    
    return tokenized_data

In [19]:
tokenized_train = tokenize_dataset(trainData, tokenizer)
tokenized_test = tokenize_dataset(testData, tokenizer)
tokenized_val = tokenize_dataset(valData, tokenizer)

In [22]:
from torch.optim import AdamW
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import DataLoader



In [23]:
from transformers import Trainer, TrainingArguments
import torch
import plotly.graph_objects as go
import json

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # evaluate at the end of each epoch
    save_strategy="epoch",           # save model at the end of each epoch
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=tokenized_train,       # training dataset
    eval_dataset=tokenized_val           # evaluation dataset
)

# Train the model and save the metrics
train_result = trainer.train()
metrics = train_result.metrics

# Evaluate the model
eval_result = trainer.evaluate(eval_dataset=tokenized_test)
metrics.update(eval_result)

# Save metrics to a file
with open("metrics.json", "w") as f:
    json.dump(metrics, f)

# Save the model's state dictionary
torch.save(model.state_dict(), './results/model_state_dict.pt')

# Visualize the metrics with Plotly
fig = go.Figure()

# Add training loss trace
fig.add_trace(go.Scatter(
    x=list(range(len(metrics['train_loss']))),
    y=metrics['train_loss'],
    mode='lines+markers',
    name='Training Loss'
))

# Add evaluation loss trace
fig.add_trace(go.Scatter(
    x=list(range(len(metrics['eval_loss']))),
    y=metrics['eval_loss'],
    mode='lines+markers',
    name='Evaluation Loss'
))

# Add evaluation accuracy trace if available
if 'eval_accuracy' in metrics:
    fig.add_trace(go.Scatter(
        x=list(range(len(metrics['eval_accuracy']))),
        y=metrics['eval_accuracy'],
        mode='lines+markers',
        name='Evaluation Accuracy'
    ))

fig.update_layout(
    title='Training and Evaluation Metrics',
    xaxis_title='Epoch',
    yaxis_title='Loss/Accuracy',
    template='plotly_dark'
)

fig.show()



ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`

In [None]:

# Visualize the metrics with Plotly
fig = go.Figure()

# Add training loss trace
fig.add_trace(go.Scatter(
    x=list(range(len(metrics['train_loss']))),
    y=metrics['train_loss'],
    mode='lines+markers',
    name='Training Loss'
))

# Add evaluation loss trace
fig.add_trace(go.Scatter(
    x=list(range(len(metrics['eval_loss']))),
    y=metrics['eval_loss'],
    mode='lines+markers',
    name='Evaluation Loss'
))

# Add evaluation accuracy trace if available
if 'eval_accuracy' in metrics:
    fig.add_trace(go.Scatter(
        x=list(range(len(metrics['eval_accuracy']))),
        y=metrics['eval_accuracy'],
        mode='lines+markers',
        name='Evaluation Accuracy'
    ))

fig.update_layout(
    title='Training and Evaluation Metrics',
    xaxis_title='Epoch',
    yaxis_title='Loss/Accuracy',
    template='plotly_dark'
)

fig.show()