In [1]:
import numpy as np
import llm_blender
# Create the LLM-Blender wrapper, then attach the PairRM ranker with device="cuda".
blender = llm_blender.Blender()
blender.loadranker(
    "llm-blender/PairRM",
    device="cuda",
)

  from .autonotebook import tqdm as notebook_tqdm


Successfully loaded ranker from  /home/ra43rid/.cache/huggingface/hub/llm-blender/PairRM


In [2]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, concatenate_datasets

# Load the AG News dataset ("ag_news") and shuffle it with a fixed seed, then
# merge its train and test splits into one dataset so the folds below can be
# drawn from all available examples.
ds = load_dataset("ag_news").shuffle(seed=42)
full_dataset = concatenate_datasets([ds["train"], ds["test"]])


In [3]:
# Display the merged dataset's summary (feature names and row count).
full_dataset


Dataset({
    features: ['text', 'label'],
    num_rows: 127600
})

In [5]:
# Alias: the cross-validation code below refers to the merged dataset as `dataset`.
dataset = full_dataset

In [7]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
# Class names used to build the candidate sentences; the entry at position i
# is the candidate whose score is looked up for label id i.
classes = [
    "World",
    "Sports",
    "Business",
    "Science and Technology",
]

# Label column of the dataset, used to stratify the folds.
y = dataset["label"]

# Stratified K-fold splitter: shuffled, with a fixed seed for repeatability.
k_folds = 5  # number of folds; change as needed
skf = StratifiedKFold(
    n_splits=k_folds,
    shuffle=True,
    random_state=42,
)

def compute_cal_scores(ds):
    """Score every class sentence against every example in ``ds`` with the ranker.

    The ``text`` field of each example is used as the ranker input, and every
    example is paired with the same candidate list: one
    "The category is <class>" sentence per entry of the module-level ``classes``.

    Returns the result of ``blender.rank(..., return_scores=True)``; callers in
    this notebook index it as a 2-D array with one row per example and one
    column per class.
    """
    texts = []
    for example in ds:
        texts.append(f"{example['text']}")

    # One shared candidate list, repeated (by reference) once per input.
    class_sentences = [f"The category is {name}" for name in classes]
    per_example_candidates = [class_sentences] * len(texts)

    return blender.rank(
        texts,
        per_example_candidates,
        return_scores=True,
        batch_size=2**8,
    )


# Significance levels at which the prediction sets are evaluated.
alphas = [0.02, 0.05, 0.1, 0.2]

# Perform cross-validation.
# StratifiedKFold.split yields (larger part, held-out fold) index pairs. Here the
# larger part is used as the evaluation ("test") data and the single held-out
# fold as the calibration data.
for fold, (test_idx, cal_idx) in enumerate(skf.split(range(len(dataset)), y)):
    print(f"Fold: {fold}\n")
    test_dataset = dataset.select(test_idx)  # evaluation subset (k-1 folds)
    cal_dataset = dataset.select(cal_idx)    # calibration subset (1 fold)

    labels_cal = [x["label"] for x in cal_dataset]
    labels_test = [x["label"] for x in test_dataset]

    cal_scores = compute_cal_scores(cal_dataset)
    # Keep only the score assigned to each calibration example's true label.
    cal_scores = cal_scores[np.arange(cal_scores.shape[0]), labels_cal]
    pred_scores = compute_cal_scores(test_dataset)

    # Top-1 predictions and accuracy do not depend on alpha: compute once per fold.
    predictions = np.argmax(pred_scores, axis=1)
    accuracy = accuracy_score(labels_test, predictions)

    n = len(cal_scores)  # calibration-set size (NOT the test-set size)
    for alpha in alphas:
        print("\n\n")
        print(f"alpha =\t\t\t {alpha}")
        # NOTE(review): classes are kept when their score is strictly above the
        # threshold, so scores act as conformity scores (higher = kept). For the
        # usual finite-sample guarantee that would suggest floor((n+1)*alpha)
        # and an inclusive comparison; left unchanged here - confirm.
        threshold = np.quantile(cal_scores.flatten(), np.ceil((n+1)*(alpha))/n, method="inverted_cdf")
        pred_sets = [np.where(row > threshold)[0].tolist() for row in pred_scores]
        # Coverage must be measured over every test example. The test split is
        # larger than the calibration split, so iterating over range(n) would
        # silently evaluate only the first n test examples.
        coverage = np.mean([label in pred_set for label, pred_set in zip(labels_test, pred_sets)])
        set_sizes = [len(s) for s in pred_sets]
        avg_set_size = np.mean(set_sizes)
        median_set_size = np.median(set_sizes)
        print(f"coverage =\t\t {coverage}")
        print(f"mean set size =\t\t {avg_set_size}")
        print(f"median set size =\t {median_set_size}")
        print(f"accuracy =\t {accuracy}")



Fold: 0



Ranking candidates:  31%|███       | 31/100 [06:21<14:09, 12.31s/it]


KeyboardInterrupt: 

In [None]:
# Leftover inspection cell: evaluating the bare name just displays the Blender object's repr.
blender

<llm_blender.blender.blender.Blender at 0x7fe6d81f8160>