In [16]:
import numpy as np
import llm_blender
blender = llm_blender.Blender()
blender.loadranker("llm-blender/PairRM", device="cuda") # load PairRM



Successfully loaded ranker from  /home/ra43rid/.cache/huggingface/hub/llm-blender/PairRM


In [17]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, concatenate_datasets

# Load SNLI dataset
ds = load_dataset("ag_news").shuffle(seed=42)
full_dataset = concatenate_datasets([ds["train"], ds["test"]])


In [18]:
full_dataset


Dataset({
    features: ['text', 'label'],
    num_rows: 127600
})

In [19]:
dataset = full_dataset.select(range(500))

In [20]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
classes = ["World", "Sports", "Business", "Science and Technology"]

# Extract labels
y = dataset['label']  # Keeping it in Hugging Face format

# Define K-Fold Cross-Validation
k_folds = 5  # Change as needed
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

def compute_cal_scores(ds):
    inputs = [f"{x['text']}" for x in ds]
    candidates_texts = [[f"The category is {x}" for x in classes]]*len(inputs)
    return blender.rank(inputs, candidates_texts, return_scores=True, batch_size=1)


# Perform cross-validation
for fold, (test_idx, cal_idx) in enumerate(skf.split(range(len(dataset)), y)):
    print(f"Fold: {fold}\n")
    test_dataset = dataset.select(test_idx)  # Get training subset
    cal_dataset = dataset.select(cal_idx)    # Get test subset

    labels_cal = [x["label"] for x in cal_dataset]
    labels_test = [x["label"] for x in test_dataset]

    cal_scores = compute_cal_scores(cal_dataset)
    # take scores of true labels
    cal_scores = cal_scores[np.arange(cal_scores.shape[0]), labels_cal]
    pred_scores = compute_cal_scores(test_dataset)


    alphas = [0.02, 0.05, 0.1, 0.2]
    for alpha in alphas:
        print("\n\n")
        print(f"alpha =\t\t\t {alpha}")
        n = len(cal_scores)
        threshold = np.quantile(cal_scores.flatten(), np.ceil((n+1)*(alpha))/n, method="inverted_cdf")
        pred_sets = [np.where(row > threshold)[0].tolist() for row in pred_scores]
        predictions = np.argmax(pred_scores, axis=1)
        coverage = np.mean([labels_test[i] in pred_sets[i] for i in range(n)])
        avg_set_size = np.mean([len(s) for s in pred_sets])
        median_set_size = np.median([len(s) for s in pred_sets])
        accuracy = accuracy_score(labels_test, predictions)
        print(f"coverage =\t\t {coverage}")
        print(f"mean set size =\t\t {avg_set_size}")
        print(f"median set size =\t {median_set_size}")
        print(f"accuracy =\t {accuracy}")



Fold: 0



Ranking candidates: 100%|██████████| 100/100 [00:35<00:00,  2.85it/s]
Ranking candidates: 100%|██████████| 400/400 [02:20<00:00,  2.84it/s]





alpha =			 0.02
coverage =		 0.99
mean set size =		 2.9675
median set size =	 3.0
accuracy =	 0.7875



alpha =			 0.05
coverage =		 0.94
mean set size =		 1.84
median set size =	 2.0
accuracy =	 0.7875



alpha =			 0.1
coverage =		 0.89
mean set size =		 1.425
median set size =	 1.0
accuracy =	 0.7875



alpha =			 0.2
coverage =		 0.74
mean set size =		 0.99
median set size =	 1.0
accuracy =	 0.7875
Fold: 1



Ranking candidates: 100%|██████████| 100/100 [00:36<00:00,  2.75it/s]
Ranking candidates: 100%|██████████| 400/400 [02:23<00:00,  2.78it/s]





alpha =			 0.02
coverage =		 0.97
mean set size =		 2.2125
median set size =	 2.0
accuracy =	 0.7825



alpha =			 0.05
coverage =		 0.95
mean set size =		 1.76
median set size =	 2.0
accuracy =	 0.7825



alpha =			 0.1
coverage =		 0.85
mean set size =		 1.28
median set size =	 1.0
accuracy =	 0.7825



alpha =			 0.2
coverage =		 0.8
mean set size =		 0.9625
median set size =	 1.0
accuracy =	 0.7825
Fold: 2



Ranking candidates: 100%|██████████| 100/100 [00:35<00:00,  2.81it/s]
Ranking candidates: 100%|██████████| 400/400 [02:22<00:00,  2.82it/s]





alpha =			 0.02
coverage =		 1.0
mean set size =		 3.6125
median set size =	 4.0
accuracy =	 0.7925



alpha =			 0.05
coverage =		 0.92
mean set size =		 1.8
median set size =	 2.0
accuracy =	 0.7925



alpha =			 0.1
coverage =		 0.88
mean set size =		 1.56
median set size =	 2.0
accuracy =	 0.7925



alpha =			 0.2
coverage =		 0.79
mean set size =		 1.0175
median set size =	 1.0
accuracy =	 0.7925
Fold: 3



Ranking candidates: 100%|██████████| 100/100 [00:35<00:00,  2.84it/s]
Ranking candidates: 100%|██████████| 400/400 [02:21<00:00,  2.83it/s]





alpha =			 0.02
coverage =		 0.97
mean set size =		 2.29
median set size =	 2.0
accuracy =	 0.7975



alpha =			 0.05
coverage =		 0.96
mean set size =		 1.84
median set size =	 2.0
accuracy =	 0.7975



alpha =			 0.1
coverage =		 0.93
mean set size =		 1.6275
median set size =	 2.0
accuracy =	 0.7975



alpha =			 0.2
coverage =		 0.85
mean set size =		 1.17
median set size =	 1.0
accuracy =	 0.7975
Fold: 4



Ranking candidates: 100%|██████████| 100/100 [00:35<00:00,  2.85it/s]
Ranking candidates: 100%|██████████| 400/400 [02:22<00:00,  2.81it/s]




alpha =			 0.02
coverage =		 0.96
mean set size =		 2.27
median set size =	 2.0
accuracy =	 0.78



alpha =			 0.05
coverage =		 0.91
mean set size =		 1.665
median set size =	 2.0
accuracy =	 0.78



alpha =			 0.1
coverage =		 0.86
mean set size =		 1.3625
median set size =	 1.0
accuracy =	 0.78



alpha =			 0.2
coverage =		 0.8
mean set size =		 1.0025
median set size =	 1.0
accuracy =	 0.78





In [None]:
blender

<llm_blender.blender.blender.Blender at 0x7fe6d81f8160>