# Load Library

In [1]:
!pip install huggingface_hub
!pip install transformers datasets torch
!pip install --upgrade torch
!pip install --upgrade pip
!pip install --disable-pip-version-check \
    torch \
    torchdata \
    transformers[torch] \
    evaluate \
    rouge_score \
    loralib \
    datasets \


!pip install 'accelerate>=0.26.0' --quiet
!pip install pyngrok

[0m

In [2]:
import torch
torch.cuda.is_available()

True

In [3]:
nvidiagpu = !nvidia-smi
nvidiagpu

['Sat Dec 21 20:53:55 2024       ',
 '+---------------------------------------------------------------------------------------+',
 '| NVIDIA-SMI 535.154.05             Driver Version: 535.154.05   CUDA Version: 12.2     |',
 '|-----------------------------------------+----------------------+----------------------+',
 '| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |',
 '| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |',
 '|                                         |                      |               MIG M. |',
 '|   0  NVIDIA GeForce RTX 3090        Off | 00000000:0B:00.0 Off |                  N/A |',
 '|  0%   33C    P0             107W / 420W |      3MiB / 24576MiB |      4%      Default |',
 '|                                         |                      |                  N/A |',
 '+-----------------------------------------+----------------------+----------------------+',
 '                      

# Load Data & EDA

In [4]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import pandas as pd
import numpy as np

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# torch.autograd.set_detect_anomaly(True)
f"Using device: {device}"

'Using device: cuda'

In [6]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
from datasets import load_dataset

dataset = load_dataset("super_glue", "rte", trust_remote_code=True)
dataset


DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label'],
        num_rows: 277
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'idx', 'label'],
        num_rows: 3000
    })
})

In [8]:
import hashlib
import datetime
import random

LABEL_MAP = {0: "Entailment", 1: "Neutral", 2: "Contradiction"}
TEMPLATE_VARIANTS = [
    "Given the premise and hypothesis below, identify whether the hypothesis logically follows from the premise.",
    "Determine the logical relationship between the following premise and hypothesis.",
    "Does the hypothesis follow, contradict, or remain neutral to the premise provided below?",
    "Classify the relationship between the provided premise and hypothesis as entailment, contradiction, or neutral.",
    "Based on the premise, decide if the hypothesis is entailed, neutral, or contradicting it.",
    "Analyze the premise and hypothesis to classify their logical connection.",
    "Evaluate whether the hypothesis is supported, unrelated, or contradicted by the premise.",
    "Read the premise and hypothesis carefully and classify their relationship."
]

def generate_unique_id(premise, hypothesis):
    return hashlib.md5(f"{premise}{hypothesis}".encode()).hexdigest()

def generate_metadata(sample, unique_id):
    return {
        "idx": sample.get("idx", None),
        "source": "SuperGLUE RTE",
        "timestamp": datetime.datetime.now().isoformat(),
        "unique_id": unique_id,
        "lengths": {
            "premise": len(sample["premise"].split()),
            "hypothesis": len(sample["hypothesis"].split())
        },
    }

def process_superglue_rte(sample):
    label = LABEL_MAP.get(sample["label"], str(sample["label"]).capitalize())
    # Replace '-1' with 'Neutral' in the dataset
    if label == "-1":
        label = "Neutral"
        
    unique_id = generate_unique_id(sample["premise"], sample["hypothesis"])
    metadata = generate_metadata(sample, unique_id)
    instruction = random.choice(TEMPLATE_VARIANTS)
    return {
        "instruction": instruction,
        "input": {
            "premise": sample["premise"],
            "hypothesis": sample["hypothesis"]
        },
        "output": label,
        "metadata": metadata
    }

def process_superglue_dataset(task_name, dataset):
    if task_name != "rte":
        raise ValueError(f"Task '{task_name}' is not supported.")
    return [process_superglue_rte(sample) for sample in dataset]

trainData = process_superglue_dataset('rte', dataset['train'])
testData = process_superglue_dataset('rte', dataset['test'])
valData = process_superglue_dataset('rte', dataset['validation'])


In [9]:
trainData[0]


{'instruction': 'Does the hypothesis follow, contradict, or remain neutral to the premise provided below?',
 'input': {'premise': 'No Weapons of Mass Destruction Found in Iraq Yet.',
  'hypothesis': 'Weapons of Mass Destruction Found in Iraq.'},
 'output': 'Neutral',
 'metadata': {'idx': 0,
  'source': 'SuperGLUE RTE',
  'timestamp': '2024-12-21T20:54:00.911423',
  'unique_id': 'd66c49c494a8aa9999ea35c06205542c',
  'lengths': {'premise': 9, 'hypothesis': 7}}}

# Data preprocessing

# Tokenize

In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

In [11]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import torch.nn as nn

class LVModel(nn.Module):
    def __init__(self, base_model):
        super(LVModel, self).__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.dropout(outputs.logits)

        if labels is not None:
            # Ensure logits and labels are reshaped correctly
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
            return {"loss": loss, "logits": logits}
        else:
            return {"logits": logits}
    def config(self):
        return self.base_model.config

model = LVModel(base_model=AutoModelForCausalLM.from_pretrained("openai-community/gpt2"))
model.to(device)

LVModel(
  (base_model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2SdpaAttention(
            (c_attn): Conv1D(nf=2304, nx=768)
            (c_proj): Conv1D(nf=768, nx=768)
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D(nf=3072, nx=768)
            (c_proj): Conv1D(nf=768, nx=3072)
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=768, out_feat

In [12]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()

    return \
        f"Trainable model parameters: {trainable_model_params}\n" +\
        f"All model parameters: {all_model_params}\n" +\
        f"Percentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(model))

Trainable model parameters: 124439808
All model parameters: 124439808
Percentage of trainable model parameters: 100.00%


In [13]:
testData[0]


{'instruction': 'Classify the relationship between the provided premise and hypothesis as entailment, contradiction, or neutral.',
 'input': {'premise': "Mangla was summoned after Madhumita's sister Nidhi Shukla, who was the first witness in the case.",
  'hypothesis': 'Shukla is related to Mangla.'},
 'output': 'Neutral',
 'metadata': {'idx': 0,
  'source': 'SuperGLUE RTE',
  'timestamp': '2024-12-21T20:54:01.093236',
  'unique_id': 'ee3f963a3c86ad712b8076efa56327c2',
  'lengths': {'premise': 16, 'hypothesis': 5}}}

In [14]:
def tokenize_dataset(data, model_name="gpt2", max_length=512):
    """
    Tokenize dataset for GPT-2, skipping invalid samples.
    
    Args:
        data (list): List of dataset samples.
        model_name (str): Hugging Face model name for tokenizer.
        max_length (int): Maximum sequence length for tokenization.
    
    Returns:
        list: Tokenized dataset with valid samples.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    LABEL_MAP_REVERSE = {"Entailment": 0, "Neutral": 1, "Contradiction": 2}
    tokenized_data = []

    for i, sample in enumerate(data):
        try:
            # Skip invalid samples
            if sample["output"] not in LABEL_MAP_REVERSE:
                print(f"Skipping invalid sample at index {i}: {sample['output']}")
                continue

            # Combine instruction, premise, and hypothesis
            text = f"{sample['instruction']} Premise: {sample['input']['premise']} Hypothesis: {sample['input']['hypothesis']}"

            # Tokenize the text
            tokens = tokenizer(
                text,
                truncation=True,
                padding="max_length",
                max_length=max_length,
                return_tensors="pt"
            )

            # Add tokenized data
            tokenized_data.append({
                "input_ids": tokens["input_ids"][0],
                "attention_mask": tokens["attention_mask"][0],
                "labels": LABEL_MAP_REVERSE[sample["output"]]
            })
        except Exception as e:
            print(f"Error processing sample at index {i}: {e}")

    return tokenized_data

In [15]:
tokenized_train = tokenize_dataset(trainData)
tokenized_test = tokenize_dataset(testData)
tokenized_val = tokenize_dataset(valData)



In [16]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [17]:
!pip install plotly

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

In [18]:
tokenized_train[0].keys()


dict_keys(['input_ids', 'attention_mask', 'labels'])

In [19]:
class customDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

tokenizer.pad_token = tokenizer.eos_token
train_dataset = customDataset(tokenized_train)
test_dataset = customDataset(tokenized_test)
val_dataset = customDataset(tokenized_val)

train_dataset[0]


{'input_ids': tensor([13921,   262, 14078,  1061,    11, 18372,    11,   393,  3520,  8500,
           284,   262, 18659,  2810,  2174,    30,  6929,   786,    25,  1400,
         18944,   286,  5674, 25034,  4062,   287,  3908,  6430,    13, 21209,
           313,  8497,    25, 18944,   286,  5674, 25034,  4062,   287,  3908,
            13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50

In [20]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=512,  shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=512, shuffle=True)




In [21]:
from torch.utils.data import DataLoader
from transformers import GPT2ForSequenceClassification, AdamW, get_scheduler
import torch
from tqdm.auto import tqdm
from torchmetrics.classification import MulticlassAccuracy

# Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 32
learning_rate = 5e-6
num_epochs = 100

num_classes = 3

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Model
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=num_classes)
model.config.pad_token_id = model.config.eos_token_id
model.dropout = torch.nn.Dropout(p=0.3)


model.to(device)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate,weight_decay=0.01)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * num_epochs)
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=10)

# Loss and Metrics
criterion = torch.nn.CrossEntropyLoss()
train_accuracy = MulticlassAccuracy(num_classes=num_classes).to(device)
val_accuracy = MulticlassAccuracy(num_classes=num_classes).to(device)

# Training Loop
def train_one_epoch(model, loader, optimizer, scheduler, criterion, metric):
    model.train()
    total_loss = 0
    metric.reset()

    loop = tqdm(loader, desc="Training", leave=False)
    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()
        scheduler.step()

        # Metrics
        total_loss += loss.item()
        preds = torch.argmax(logits, dim=-1)
        metric.update(preds, labels)

        loop.set_postfix(loss=loss.item())
    
    avg_loss = total_loss / len(loader)
    accuracy = metric.compute()
    return avg_loss, accuracy

# Validation Loop
def validate_one_epoch(model, loader, criterion, metric):
    model.eval()
    total_loss = 0
    metric.reset()

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            loss = criterion(logits, labels)

            # Metrics
            total_loss += loss.item()
            preds = torch.argmax(logits, dim=-1)
            metric.update(preds, labels)

    avg_loss = total_loss / len(loader)
    accuracy = metric.compute()
    return avg_loss, accuracy

# Training Function
def train_model(model, train_loader, val_loader, optimizer, scheduler, criterion, num_epochs):
    # Initialize a dictionary to store results
    history = {"epoch": [], "train_loss": [], "train_accuracy": [], "val_loss": [], "val_accuracy": []}
    best_val_loss = float('inf')
    patience_counter = 0
    
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Train one epoch
        train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, scheduler, criterion, train_accuracy)
        print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")

        # Validate one epoch
        val_loss, val_acc = validate_one_epoch(model, val_loader, criterion, val_accuracy)
        print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")

        # Early Stopping Check
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= early_stopping_callback.early_stopping_patience:
                print("Early stopping triggered")
                break

        # Log metrics for visualization
        history["epoch"].append(epoch + 1)
        history["train_loss"].append(train_loss)
        history["train_accuracy"].append(train_acc.item())
        history["val_loss"].append(val_loss)
        history["val_accuracy"].append(val_acc.item())

    return history
# Run Training
# Train the model
history = train_model(model, train_loader, val_loader, optimizer, scheduler, criterion, num_epochs)



Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/100




Training:   0%|          | 0/78 [00:00<?, ?it/s]

Train Loss: 2.1180, Train Accuracy: 0.2146
Validation Loss: 0.7505, Validation Accuracy: 0.3274
Epoch 2/100


Training:   0%|          | 0/78 [00:00<?, ?it/s]

Train Loss: 0.7393, Train Accuracy: 0.3352
Validation Loss: 0.7031, Validation Accuracy: 0.5033
Epoch 3/100


Training:   0%|          | 0/78 [00:00<?, ?it/s]

Train Loss: 0.7136, Train Accuracy: 0.3556
Validation Loss: 0.6962, Validation Accuracy: 0.4919
Epoch 4/100


Training:   0%|          | 0/78 [00:00<?, ?it/s]

Train Loss: 0.6999, Train Accuracy: 0.3600
Validation Loss: 0.6929, Validation Accuracy: 0.5095
Epoch 5/100


Training:   0%|          | 0/78 [00:00<?, ?it/s]

Train Loss: 0.6900, Train Accuracy: 0.3722
Validation Loss: 0.6783, Validation Accuracy: 0.5414
Epoch 6/100


Training:   0%|          | 0/78 [00:00<?, ?it/s]

Train Loss: 0.6720, Train Accuracy: 0.5851
Validation Loss: 0.6759, Validation Accuracy: 0.5705
Epoch 7/100


Training:   0%|          | 0/78 [00:00<?, ?it/s]

Train Loss: 0.6607, Train Accuracy: 0.4023
Validation Loss: 0.6548, Validation Accuracy: 0.6008
Epoch 8/100


Training:   0%|          | 0/78 [00:00<?, ?it/s]

Train Loss: 0.6375, Train Accuracy: 0.6448
Validation Loss: 0.6876, Validation Accuracy: 0.5987
Epoch 9/100


Training:   0%|          | 0/78 [00:00<?, ?it/s]

Train Loss: 0.6275, Train Accuracy: 0.6524
Validation Loss: 0.7021, Validation Accuracy: 0.5929
Epoch 10/100


Training:   0%|          | 0/78 [00:00<?, ?it/s]

Train Loss: 0.6116, Train Accuracy: 0.6528
Validation Loss: 0.6740, Validation Accuracy: 0.6227
Epoch 11/100


Training:   0%|          | 0/78 [00:00<?, ?it/s]

Train Loss: 0.5967, Train Accuracy: 0.6809
Validation Loss: 0.6636, Validation Accuracy: 0.6297
Epoch 12/100


Training:   0%|          | 0/78 [00:00<?, ?it/s]

Train Loss: 0.5928, Train Accuracy: 0.6781
Validation Loss: 0.6891, Validation Accuracy: 0.6078
Epoch 13/100


Training:   0%|          | 0/78 [00:00<?, ?it/s]

Train Loss: 0.5724, Train Accuracy: 0.7074
Validation Loss: 0.6835, Validation Accuracy: 0.6196
Epoch 14/100


Training:   0%|          | 0/78 [00:00<?, ?it/s]

Train Loss: 0.5442, Train Accuracy: 0.7216
Validation Loss: 0.6730, Validation Accuracy: 0.6396
Epoch 15/100


Training:   0%|          | 0/78 [00:00<?, ?it/s]

Train Loss: 0.5361, Train Accuracy: 0.7392
Validation Loss: 0.6910, Validation Accuracy: 0.6246
Epoch 16/100


Training:   0%|          | 0/78 [00:00<?, ?it/s]

Train Loss: 0.5231, Train Accuracy: 0.7460
Validation Loss: 0.6926, Validation Accuracy: 0.6292
Epoch 17/100


Training:   0%|          | 0/78 [00:00<?, ?it/s]

Train Loss: 0.5075, Train Accuracy: 0.7456
Validation Loss: 0.7171, Validation Accuracy: 0.6135
Early stopping triggered


In [25]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Create subplots: 1 row, 2 columns
fig = make_subplots(rows=1, cols=2, subplot_titles=("Loss", "Accuracy"))

# Plot loss
fig.add_trace(
    go.Scatter(x=history["epoch"], y=history["train_loss"], mode='lines+markers', name="Train Loss"),
    row=1, col=1
)
fig.add_trace(
    go.Scatter(x=history["epoch"], y=history["val_loss"], mode='lines+markers', name="Validation Loss"),
    row=1, col=1
)

# Plot accuracy
fig.add_trace(
    go.Scatter(x=history["epoch"], y=history["train_accuracy"], mode='lines+markers', name="Train Accuracy"),
    row=1, col=2
)
fig.add_trace(
    go.Scatter(x=history["epoch"], y=history["val_accuracy"], mode='lines+markers', name="Validation Accuracy"),
    row=1, col=2
)

# Update layout
fig.update_layout(
    title="Training and Validation Metrics",
    xaxis_title="Epoch",
    yaxis_title="Value",
    template="plotly_dark"
)

fig.show()

In [23]:
!nvidia-smi


Sat Dec 21 21:09:35 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.154.05             Driver Version: 535.154.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:0B:00.0 Off |                  N/A |
| 77%   68C    P2             182W / 420W |  21660MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [24]:
# save model
import joblib
joblib.dump(model, "lv_model.joblib")




['lv_model.joblib']