In [1]:
# Load persuasion data
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("../data/my_honours/persuasion_data.csv")
data = data.loc[data["source"] != "Control"]
# Let's just do a massively shorter dataset for now
# Only Claude 3 Opus and a single claim 
data = data.loc[
    (data["source"] == "Claude 3 Opus") &
    (data["claim"] == data["claim"][0])
]
print(data[["source", "claim", "prompt_type", "argument"]].drop_duplicates())

processed_df = {}
for entry in data.itertuples():
    argument = entry.argument
    if argument not in processed_df:
        processed_df[argument] = {
            'input': f"Write a persuasive argument for the claim: {entry.claim}",
            'output': entry.argument,
            'source': entry.source,
            'prompt_type': entry.prompt_type,
            'rating': [entry.rating_final_number],
        }
    else:
        processed_df[argument]['rating'].append(entry.rating_final_number)

input = [dp['input'] for dp in list(processed_df.values())]
output = [dp['output'] for dp in list(processed_df.values())]
scores = [round(np.mean(dp['rating']),1) for dp in list(processed_df.values())]

            source                                              claim  \
236  Claude 3 Opus  Governments and technology companies must do m...   
349  Claude 3 Opus  Governments and technology companies must do m...   
430  Claude 3 Opus  Governments and technology companies must do m...   
518  Claude 3 Opus  Governments and technology companies must do m...   

                 prompt_type  \
236                Deceptive   
349        Logical Reasoning   
430  Expert Writer Rhetorics   
518          Compelling Case   

                                              argument  
236  Online privacy and security are under daily as...  
349  In the modern digital age, most of us share va...  
430  In today's interconnected digital world, our p...  
518  In the digital age, protecting online privacy ...  


In [3]:
import sys
sys.path.append('../scripts')

import os
from dotenv import load_dotenv

In [4]:
def reset_openai_api_key():
    os.environ.pop("OPENAI_API_KEY", None)
    load_dotenv()

reset_openai_api_key()

In [5]:
from utils import shuffle_lists, calculate_correlation, load_newsroom, load_summEval, calculate_uncertainty, load_sf_data, CompareResultObject, insert_index_to_anchors

print('Number of summary candidates:', len(output))

Number of summary candidates: 4


In [6]:
from tqdm import tqdm

# Set the meta-parameters
params = {
    'dataset': 'SummEval',
    'engine': "gpt-3.5-turbo",
    'aspect': 'coherence',
    'eval_method': 'pairwise comparison',
    'confidence_beam': False,  # False for PairS-greedy search
    # 'beam_size': 2000,
    # 'prob_gap': 0.1,
    'api_call': 0,
    'with_input': True,
    'compare_log': {},
    'calibration': False,
}

In [7]:
from sorting import merge_sort_indices, merge_sort
import random

random.seed(42)

# Set the progress bar
if params['confidence_beam']:
    params['progress_bar'] = tqdm(total=int(len(input)**2), desc='Processing')
else:
    params['progress_bar'] = tqdm(total=int(len(input) * np.log2(len(input))), desc='Processing')

# Shuffle the input, output, and scores
input, output, scores = shuffle_lists(input, output, scores)

# Perform the PairS-greedy ranking
# Please note: All prompts are saved in /scripts/prompts.py
ranking_indices = merge_sort_indices(input, output, params)

params['progress_bar'].close()

  from .autonotebook import tqdm as notebook_tqdm
  entropy = -np.sum(probablities * np.log(probablities))
Processing:  62%|██████▎   | 5/8 [00:03<00:02,  1.29it/s]


In [9]:
# Calculate the correlation
spearman_corr, kendall_tau = calculate_correlation(np.array(scores)[ranking_indices], list(range(len(scores))))

spearman_corr, kendall_tau

(0.316227766016838, 0.18257418583505539)

In [10]:
ranking_indices

[2, 1, 3, 0]

In [None]:
236                Deceptive   
349        Logical Reasoning   
430  Expert Writer Rhetorics   
518          Compelling Case 

Logical
Deceptive
Expert Writer
Compelling

In [11]:
from pairs import PairsGreedy, PairsBeam
from scripts.utils import shuffle_lists, load_summEval


# Load example data
summ_eval_path = 'data/SummEval/model_annotations.aligned.paired.jsonl'
input_doc, output_doc, _ = load_summEval(summ_eval_path, flat_output=False)

doc_id = 42
input, output = input_doc[doc_id], output_doc[doc_id]
input, output = shuffle_lists(input, output)

# The same input source text corresponds to multiple output summaries
print('Number of summary candidates:', len(output))

method = 'PairsGreedy'
if method == 'PairsGreedy':
    # Set hyperparameters
    params = {
        # 'engine': "mistralai/Mistral-7B-Instruct-v0.1",
        'engine': "meta-llama/Llama-2-7b-chat-hf",
        'api_call': 0,
        'with_input': True,   # Use the prompt template for task with context input, e.g. Summarization 
        'calibrate': False,   # For each pairwise comparison, we average the probabilities of both permutations to cancel the positional bias.
    }
    # Rank the output summaries from low to high quality
    indices = PairsGreedy(input[0], output, params)
    print(indices)

elif method == 'PairsBeam':
    # Set hyperparameters
    params = {
        'engine': "mistralai/Mistral-7B-Instruct-v0.1",
        'beam_size': 2000,
        'api_call': 0,
        'prob_gap': 0.1,
        'with_input': True,
        'calibrate': False,
    }
    # Rank the output summaries from low to high quality
    indices = PairsBeam(input[0], output, params)
    print(indices)

FileNotFoundError: [Errno 2] No such file or directory: 'data/SummEval/model_annotations.aligned.paired.jsonl'