In [2]:
import random, numpy as np, torch
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModel, DataCollatorWithPadding)
from torch.utils.data import DataLoader
import torch.nn as nn
import matplotlib.pyplot as plt
import json
from tqdm import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau

### Reproducibility

In [3]:
seed = 42

# Seed each random number generator used in this notebook with the same value.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Ask cuDNN to stick to deterministic algorithms.
torch.backends.cudnn.deterministic = True

### Load dataset

In [None]:
# Load HelpSteer2 and keep a handle on each of the two splits used below.
ds = load_dataset("nvidia/HelpSteer2")
train_data = ds["train"]
val_data = ds["validation"]

# Names of the rated-attribute columns; the rest of the notebook treats these
# as the prediction targets.
label_cols = [
    "helpfulness",
    "correctness",
    "coherence",
    "complexity",
    "verbosity",
]

### Tokeniser & encoding

In [5]:
# Checkpoint name shared by the tokenizer here and by the model built later on.
model_name = "bert-base-uncased"
tok = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

In [6]:
def tok_pair(batch):
    """Tokenise the ``prompt`` and ``response`` entries of ``batch`` as a pair.

    Both entries are handed to the global tokenizer ``tok`` together, with
    truncation enabled and a maximum length of 512. No padding is requested
    here.

    Args:
        batch: Mapping with ``"prompt"`` and ``"response"`` entries.

    Returns:
        Whatever ``tok`` returns for the pair.
    """
    prompts = batch["prompt"]
    responses = batch["response"]
    return tok(prompts, responses, truncation=True, max_length=512)

In [10]:
# Always tokenise from the untouched splits held in `ds`. Mapping
# `train_data`/`val_data` onto themselves made this cell fail when re-run,
# because the first run had already removed the "prompt"/"response" columns.
train_data = ds["train"].map(tok_pair, batched=True, remove_columns=["prompt", "response"])
val_data = ds["validation"].map(tok_pair, batched=True, remove_columns=["prompt", "response"])

# Expose the tokenizer outputs and the label columns as torch tensors;
# `output_all_columns=True` keeps any remaining columns available as well.
_tensor_columns = ["input_ids", "attention_mask", "token_type_ids"] + label_cols
for _split in (train_data, val_data):
    _split.set_format(type="torch", columns=_tensor_columns, output_all_columns=True)

Map:   0%|          | 0/1038 [00:00<?, ? examples/s]

### Custom collate with dynamic padding 

In [None]:
# Collator that pads the tokenizer outputs of a batch and returns PyTorch tensors.
padder = DataCollatorWithPadding(tokenizer=tok, return_tensors="pt")

def collate_fn(batch):
    """Collate a list of examples into one padded batch with label tensors.

    Every entry whose key is not in the global ``label_cols`` is passed to the
    global ``padder``. Each label column is then added to the padded batch as
    a ``torch.long`` tensor with one value per example.

    Args:
        batch: List of per-example mappings containing the model inputs and
            all columns named in ``label_cols``.

    Returns:
        The object returned by ``padder``, extended with one entry per label.
    """
    model_inputs = []
    for example in batch:
        model_inputs.append(
            {name: value for name, value in example.items() if name not in label_cols}
        )

    collated = padder(model_inputs)

    for label in label_cols:
        label_values = [example[label] for example in batch]
        collated[label] = torch.tensor(label_values, dtype=torch.long)
    return collated

batch_size = 8

# Both loaders share the batch size and the collate function; only the
# training loader shuffles its examples.
_loader_kwargs = dict(batch_size=batch_size, collate_fn=collate_fn)
train_loader = DataLoader(train_data, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_data, shuffle=False, **_loader_kwargs)

### Multi-head model

In [None]:
class MultiHeadReward(nn.Module):
    """Pretrained encoder topped with several independent classification heads.

    The encoder's hidden state at sequence position 0 (the ``[CLS]`` token for
    BERT-style encoders) is layer-normalised, passed through dropout, a shared
    ``Linear -> ReLU`` bottleneck of half the hidden size, a second dropout,
    and finally through ``num_heads`` separate linear heads.

    Args:
        enc_name: Identifier passed to ``AutoModel.from_pretrained``.
        num_heads: Number of independent heads. Defaults to 5, the value that
            was previously hard-coded.
        num_classes: Number of logits produced by each head. Defaults to 5,
            the value that was previously hard-coded.
    """

    def __init__(self, enc_name, num_heads=5, num_classes=5):
        super().__init__()
        # Attribute names below double as state_dict keys, so they are kept
        # unchanged to stay compatible with existing checkpoints.
        self.enc = AutoModel.from_pretrained(enc_name)
        h = self.enc.config.hidden_size

        self.layer_norm = nn.LayerNorm(h)
        self.dropout1 = nn.Dropout(0.3)  # lighter dropout on the encoder output
        self.dropout2 = nn.Dropout(0.5)  # heavier dropout right before the heads

        self.intermediate = nn.Linear(h, h // 2)
        self.heads = nn.ModuleList(
            [nn.Linear(h // 2, num_classes) for _ in range(num_heads)]
        )

        # Xavier-uniform weights and zero biases for the added linear layers.
        # Heads are initialised before the bottleneck, exactly as before, so a
        # seeded run consumes random numbers in the same order.
        for head in self.heads:
            nn.init.xavier_uniform_(head.weight)
            nn.init.zeros_(head.bias)
        nn.init.xavier_uniform_(self.intermediate.weight)
        nn.init.zeros_(self.intermediate.bias)

    def forward(self, **enc_inputs):
        """Run the encoder and every head on one batch.

        Args:
            **enc_inputs: Keyword arguments forwarded unchanged to the encoder.

        Returns:
            A list with one tensor of raw logits per head (``num_heads``
            entries); each tensor has ``num_classes`` values along its last
            dimension.
        """
        out = self.enc(**enc_inputs).last_hidden_state[:, 0]  # [CLS] position
        out = self.layer_norm(out)
        out = self.dropout1(out)
        out = torch.relu(self.intermediate(out))
        out = self.dropout2(out)
        return [head(out) for head in self.heads]

In [12]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


### Test results

In [106]:
# NOTE(review): this cell carries execution count 106 but sits above the cell
# that creates `model`, so it raises NameError on a fresh Restart & Run All.
# NOTE(review): this pickles the whole module object, whereas the loading cell
# in this notebook restores weights through `load_state_dict` -- confirm which
# checkpoint format downstream code expects before changing it.
torch.save(obj=model, f='models/metric_model_6.pth')

In [13]:
model = MultiHeadReward(model_name).to(device)
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# backslash-separated path only worked on Windows.
# `map_location=device` lets a checkpoint saved from a GPU be restored on a
# CPU-only machine, matching the CPU fallback used when `device` is chosen.
model.load_state_dict(
    torch.load("models/improved2/best_model.pth", map_location=device)
)

<All keys matched successfully>

In [14]:
def predict(prompt, response):
    """Predict one class index per label for a single prompt/response pair.

    The pair is tokenised with the global ``tok`` (truncated to 512 tokens),
    moved to the global ``device`` and passed through the global ``model`` in
    eval mode with gradients disabled. For every head the index of the
    largest softmax probability is taken.

    Args:
        prompt: Prompt text.
        response: Response text to be scored against the prompt.

    Returns:
        Dict mapping each name in ``label_cols`` to the predicted class index.
    """
    model.eval()
    encoded = tok(prompt, response, return_tensors="pt", truncation=True, max_length=512)
    encoded = encoded.to(device)

    with torch.no_grad():
        head_logits = model(**encoded)

    scores = {}
    for label, logits in zip(label_cols, head_logits):
        probabilities = torch.softmax(logits, dim=-1)
        scores[label] = probabilities.argmax().item()
    return scores
  
def print_load_data(dataset, idx):
    """Print one dataset row next to the model's prediction for it.

    The row is fetched once and reused: it is dumped as JSON under "Actual"
    and its ``prompt``/``response`` fields are passed to ``predict`` for the
    "Pred" section.

    Args:
        dataset: Indexable collection whose ``dataset[idx]`` is a
            JSON-serialisable mapping with ``"prompt"`` and ``"response"``.
        idx: Integer position of the row to show.
    """
    row = dataset[idx]
    # Compute the prediction outside the f-string: reusing the enclosing quote
    # character inside a replacement field is a SyntaxError before Python 3.12.
    prediction = predict(row["prompt"], row["response"])
    print(f'Actual:\n{json.dumps(row, indent=2)}\n')
    print(f'Pred:\n{prediction}')
    
def calculate_score_accuracy(dataset, label_cols):
    """Compute per-label exact-match accuracy of ``predict`` over a dataset.

    Args:
        dataset: Object supporting ``len()`` and column lookup by name
            (``dataset["prompt"]``, ``dataset["response"]`` and every name in
            ``label_cols``), where each column is indexable by row position.
        label_cols: Names of the label columns to evaluate.

    Returns:
        Dict mapping each label name to the fraction of rows whose predicted
        value equals the stored one, rounded to 4 decimals. Every label maps
        to 0 when the dataset is empty.
    """
    # Column lookups do not depend on the row, so do them once instead of on
    # every iteration; depending on the dataset type a lookup may read the
    # whole column each time.
    prompts = dataset['prompt']
    responses = dataset['response']
    targets = {col: dataset[col] for col in label_cols}

    n_rows = len(dataset)
    correct = {col: 0 for col in label_cols}

    for idx in tqdm(range(n_rows), desc="Calculating Accuracy"):
        preds = predict(prompts[idx], responses[idx])
        for col in label_cols:
            if preds[col] == targets[col][idx]:
                correct[col] += 1

    # Every label is scored on every row, so one shared denominator suffices.
    return {
        col: round(correct[col] / n_rows, 4) if n_rows > 0 else 0
        for col in label_cols
    }


In [15]:
# Show the first validation row together with the model's prediction for it.
print_load_data(dataset=ds['validation'], idx=0)

Actual:
{
  "prompt": "explain master slave replication nsql",
  "response": "In the context of NoSQL databases, master-slave replication refers to a configuration where a single master node writes data, and one or more slave nodes read data from the master and replicate it to provide read scalability. The master node is responsible for accepting write requests and updating its own data, while the slave nodes are responsible for replicating the data from the master and serving read requests.\n\nIn this configuration, the master node is the only node that can make changes to the data, while the slave nodes can only read the data and replicate it. This ensures that the data is consistent across all nodes and that there is no data loss in case of a failure of a single node.\n\nMaster-slave replication in NoSQL databases can be implemented using various technologies, such as Apache Cassandra, MongoDB, and Apache HBase. In each of these databases, the master-slave replication configuration 

In [16]:
# Per-label exact-match accuracy of the loaded model on the validation split.
print(calculate_score_accuracy(dataset=ds['validation'], label_cols=label_cols))

Calculating Accuracy: 100%|██████████| 1038/1038 [00:25<00:00, 41.43it/s]

{'helpfulness': 0.3815, 'correctness': 0.4778, 'coherence': 0.7254, 'complexity': 0.6127, 'verbosity': 0.6744}



