# SummEval example

In this notebook, we use one summarization example from the SummEval to demostrate how to use the PairS.

In [1]:
import sys
sys.path.append('../scripts')

import os
from dotenv import load_dotenv

In [2]:
def reset_openai_api_key():
    os.environ.pop("OPENAI_API_KEY", None)
    load_dotenv()

reset_openai_api_key()

In [18]:
from utils import shuffle_lists, calculate_correlation, load_newsroom, load_summEval, calculate_uncertainty, load_sf_data, CompareResultObject, insert_index_to_anchors


summ_eval_path = '../data/SummEval/model_annotations.aligned.paired.jsonl'
input_doc, output_doc, scores_doc = load_summEval(summ_eval_path, flat_output=False)
scores_doc = scores_doc['coherence']

doc_id = 42
input, output, scores = input_doc[doc_id], output_doc[doc_id], scores_doc[doc_id]
print('Number of summary candidates:', len(output))

Number of summary candidates: 16


## PairS-greedy

In [16]:
from transformers import LlamaForCausalLM, AutoTokenizer
import torch
device = 'cuda'

model = 'GPT-3.5-turbo'
tokenizer = AutoTokenizer.from_pretrained(model)   # base_model

model = LlamaForCausalLM.from_pretrained(model,
    torch_dtype=torch.bfloat16,
    device_map=device,
    token=os.getenv("OPENAI_API_KEY")
)


OSError: GPT-3.5-turbo is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [4]:
from tqdm import tqdm
import numpy as np

# Set the meta-parameters
params = {
    'dataset': 'SummEval',
    'engine': "gpt-3.5-turbo",
    'aspect': 'coherence',
    'eval_method': 'pairwise comparison',
    'confidence_beam': False,  # False for PairS-greedy search
    # 'beam_size': 2000,
    # 'prob_gap': 0.1,
    'api_call': 0,
    'with_input': True,
    'compare_log': {},
    'calibration': False,
}


In [5]:
from sorting import merge_sort_indices, merge_sort
import random

random.seed(42)

# Set the progress bar
if params['confidence_beam']:
    params['progress_bar'] = tqdm(total=int(len(input)**2), desc='Processing')
else:
    params['progress_bar'] = tqdm(total=int(len(input) * np.log2(len(input))), desc='Processing')

# Shuffle the input, output, and scores
input, output, scores = shuffle_lists(input, output, scores)

# Perform the PairS-greedy ranking
# Please note: All prompts are saved in /scripts/prompts.py
ranking_indices = merge_sort_indices(input, output, params)

params['progress_bar'].close()

  from .autonotebook import tqdm as notebook_tqdm
  entropy = -np.sum(probablities * np.log(probablities))
Processing:  70%|███████   | 45/64 [00:28<00:12,  1.56it/s]


In [7]:
# Calculate the correlation
spearman_corr, kendall_tau = calculate_correlation(np.array(scores)[ranking_indices], list(range(len(scores))))

spearman_corr, kendall_tau

(0.5504669157093779, 0.4526019054848144)

## PairS-beam

In [8]:
from tqdm import tqdm
import numpy as np

# Set the meta-parameters
params = {
    'dataset': 'SummEval',
    'engine': "gpt-3.5-turbo",
    'aspect': 'coherence',
    'eval_method': 'pairwise comparison',
    'confidence_beam': True,  # True for PairS-beam search
    'beam_size': 2000,
    'api_call': 0,
    'prob_gap': 0.1,
    'with_input': True,
    'compare_log': {},
    'calibration': False,
}


In [10]:
from sorting import merge_sort_indices, merge_sort
import random

random.seed(42)

# Set the progress bar
if params['confidence_beam']:
    params['progress_bar'] = tqdm(total=int(len(input)**2), desc='Processing')
else:
    params['progress_bar'] = tqdm(total=int(len(input) * np.log2(len(input))), desc='Processing')

# Shuffle the input, output, and scores
input, output, scores = shuffle_lists(input, output, scores)

# Perform the PairS-beam ranking
# Please note: All prompts are saved in /scripts/prompts.py
ranking_indices = merge_sort_indices(input, output, params)

params['progress_bar'].close()

Processing:  23%|██▎       | 58/256 [00:37<02:06,  1.57it/s]


In [12]:
# Calculate the correlation
spearman_corr, kendall_tau = calculate_correlation(np.array(scores)[ranking_indices], list(range(len(scores))))

spearman_corr, kendall_tau

(0.14689009341031914, 0.12185435916898849)