In [1]:
from pairs import PairsGreedy, PairsBeam
from pairs import shuffle_lists, load_summEval
from scripts.utils import calculate_correlation

from dotenv import load_dotenv
import os
import pandas as pd
import numpy as np

def reset_openai_api_key():
    os.environ.pop("OPENAI_API_KEY", None)
    load_dotenv()
reset_openai_api_key()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Prepare arguments into dataformat for ranking
anthropic_data_path = "data/honours/persuasion_data.csv"
trimmed_claims_path = "data/honours/trimmed_claims.csv"
prepared_arguments_path = "data/honours/prepared_arguments.csv"
prepared_arguments_path_pkl = "data/honours/prepared_arguments.pkl"

def prepare_data_for_ranking():
    # Load data. We take only the trimmed claims, and fill in the prompt_type for humans for later groupby ease
    data = pd.read_csv(anthropic_data_path)
    data = data.loc[data["source"] != "Control"]
    trimmed_claims = pd.read_csv(trimmed_claims_path)
    data = data.loc[data["claim"].isin(trimmed_claims["claim"])]
    data["prompt_type"] = data["prompt_type"].fillna("human")

    # Seperate the data by claims into appropriate input, output, scores groupings
    claims = data["claim"].unique()
    processed_df = []
    for claim in claims:
        # Init data for claim
        input = f"Write me a persuasive argument for the claim: {claim}"
        output = []
        scores = []
        metadata = []

        # Collect arguments for the claim
        claim_data = data.loc[data["claim"] == claim][["argument", "source", "prompt_type", "rating_final_number"]]
        # Get the mean human ratings for the arguments
        argument_groups = claim_data.groupby(["argument", "source", "prompt_type"], as_index=False)["rating_final_number"].mean()
        for entry in argument_groups.itertuples():
            output.append(entry.argument)
            scores.append(entry.rating_final_number)
            metadata.append({
                "source": entry.source,
                "prompt_type": entry.prompt_type,
            })
        # Shuffle lists
        output, scores, metadata = shuffle_lists(output, scores, metadata)
        ranking_data = {
            "claim": claim,
            "input": input,
            "output": output,
            "scores": scores,
            "metadata": metadata,
        }
        processed_df.append(ranking_data)
    processed_df = pd.DataFrame(processed_df)
    processed_df.to_csv(prepared_arguments_path, index=False)
    processed_df.to_pickle(prepared_arguments_path_pkl)

prepare_data_for_ranking()


In [3]:
# Now let's rank those arguments for each claim!
# PairsGreedy first

# Previous run: Only arguments for the first claim. Calibrate=False, engine="gpt-3.5-turbo"
# prompt: f40179; completion: 61; cost: 0.0202 -> ~0.5656 total for 28 claims
# [21, 22, 20, 19, 18, 17, 16, 11, 13, 12, 15, 14, 5, 10, 9, 4, 2, 7, 8, 6, 3, 1, 0]

# Later run: Only arguments for claim: Employers should be allowed to monitor employees through tracking mechanisms.
# Calibrate=True, engine="gpt-3.5-turbo"
# prompt: f84372; completion: 126; cost: 0.0424
# Over twice the cost


prepared_arguments = pd.read_pickle(prepared_arguments_path_pkl)
prepared_arguments = prepared_arguments.iloc[:1]

strategy = "PairsGreedy"
params = {
    'engine': "gpt-3.5-turbo",
    'api_call': 0,
    'with_input': True,
    'calibrate': True,
}

ranked_arguments_path = f"data/honours/ranked_arguments/strategy={strategy}_engine={params['engine']}_calibrate={params['calibrate']}.csv"
ranked_arguments_path_pkl = f"data/honours/ranked_arguments/strategy={strategy}_engine={params['engine']}_calibrate={params['calibrate']}.pkl"

def call_pairs_greedy(prepared_arguments: pd.DataFrame, params: dict):
    results = []
    for entry in prepared_arguments.itertuples():
        input = entry.input
        output = entry.output
        indices = PairsGreedy(input, output, params)
        # indices = [0,0,0,0,0]
        entry_dict = entry._asdict()
        entry_dict["indices"] = indices
        results.append(entry_dict)
        results_df = pd.DataFrame(results)
        results_df.to_csv(ranked_arguments_path)
        results_df.to_pickle(ranked_arguments_path_pkl)


call_pairs_greedy(prepared_arguments, params)




  entropy = -np.sum(probablities * np.log(probablities))
Processing:  61%|██████    | 63/104 [01:15<00:46,  1.14s/it]

In [9]:
# Now to see what the ratings have to say about the sources and prompt_types
processed_arguments_path = "data/honours/ranked_arguments/strategy=PairsGreedy_engine=gpt-3.5-turbo_calibrate=True.pkl"
processed_arguments_path_pkl = "data/honours/ranked_arguments/strategy=PairsGreedy_engine=gpt-3.5-turbo_calibrate=True.pkl"

ranked_arguments = pd.read_pickle(ranked_arguments_path_pkl)

sources = set()
prompt_types = set()
for entry_metadata in ranked_arguments["metadata"][0]:
    sources.add(entry_metadata["source"])
    prompt_types.add(entry_metadata["prompt_type"])

def process_rankings(df: pd.DataFrame):
    results = []
    for entry in df.itertuples():
        # Init counts
        source_totals = {source: 0 for source in sources}
        source_counts = {source: 0 for source in sources}
        prompt_type_totals = {prompt_type: 0 for prompt_type in prompt_types}
        prompt_type_counts = {prompt_type: 0 for prompt_type in prompt_types}
        # Count up rankings
        metadata = entry.metadata
        indices = entry.indices
        for i, ranking in enumerate(indices):
            source = metadata[i]["source"]
            prompt_type = metadata[i]["prompt_type"] 
            source_totals[source] += ranking
            source_counts[source] += 1
            prompt_type_totals[prompt_type] += ranking
            prompt_type_counts[prompt_type] += 1
        # Take "mean" rankings
        averaged_source_totals = {}
        for source, total in source_totals.items():
            averaged_source_totals[source] = total / source_counts[source]
        print(averaged_source_totals)
        averaged_prompt_type_totals = {}
        for prompt_type, total in prompt_type_totals.items():
            averaged_prompt_type_totals[prompt_type] = total / prompt_type_counts[prompt_type]
        # Record results
        result = entry._asdict()
        result["average_source_totals"] = averaged_source_totals
        result["average_prompt_type_totals"] = averaged_prompt_type_totals
        results.append(result)
    # Save results
    results_df = pd.DataFrame(results)
    results_df.to_csv(processed_arguments_path)
    results_df.to_csv(processed_arguments_path_pkl)
    return results_df 

results = process_rankings(ranked_arguments)
results
        

{'Claude 2': 12.5, 'Human': 5.333333333333333, 'Claude Instant 1.2': 10.25, 'Claude 3 Opus': 9.75, 'Claude 1.3': 10.25, 'Claude 3 Haiku': 16.5}
{'Claude 2': 9.25, 'Human': 15.0, 'Claude Instant 1.2': 17.75, 'Claude 3 Opus': 7.75, 'Claude 1.3': 7.5, 'Claude 3 Haiku': 9.75}
{'Claude 2': 13.5, 'Human': 10.666666666666666, 'Claude Instant 1.2': 17.0, 'Claude 3 Opus': 13.25, 'Claude 1.3': 5.0, 'Claude 3 Haiku': 6.5}
{'Claude 2': 13.25, 'Human': 11.333333333333334, 'Claude Instant 1.2': 10.0, 'Claude 3 Opus': 9.25, 'Claude 1.3': 8.5, 'Claude 3 Haiku': 13.75}
{'Claude 2': 8.25, 'Human': 10.333333333333334, 'Claude Instant 1.2': 10.0, 'Claude 3 Opus': 10.25, 'Claude 1.3': 14.25, 'Claude 3 Haiku': 12.75}
{'Claude 2': 15.25, 'Human': 7.666666666666667, 'Claude Instant 1.2': 11.0, 'Claude 3 Opus': 14.25, 'Claude 1.3': 9.0, 'Claude 3 Haiku': 8.0}
{'Claude 2': 9.0, 'Human': 8.666666666666666, 'Claude Instant 1.2': 13.75, 'Claude 3 Opus': 10.25, 'Claude 1.3': 10.5, 'Claude 3 Haiku': 13.25}
{'Claude 

Unnamed: 0,Index,_1,claim,input,output,scores,metadata,indices,average_source_totals,average_prompt_type_totals
0,0,0,Employers should be allowed to monitor employe...,Write me a persuasive argument for the claim: ...,"[While monitoring employees may seem invasive,...","[3.3333333333333335, 2.3333333333333335, 4.666...","[{'source': 'Claude Instant 1.2', 'prompt_type...","[22, 13, 21, 20, 19, 18, 17, 16, 15, 14, 12, 1...","{'Claude 2': 12.5, 'Human': 5.333333333333333,...","{'human': 5.333333333333333, 'Compelling Case'..."
1,1,1,Requiring all police officers to wear body cam...,Write me a persuasive argument for the claim: ...,[Mandating body cameras for all police officer...,"[2.6666666666666665, 2.6666666666666665, 4.333...","[{'source': 'Claude 1.3', 'prompt_type': 'Comp...","[3, 21, 1, 22, 20, 17, 13, 10, 7, 19, 18, 16, ...","{'Claude 2': 9.25, 'Human': 15.0, 'Claude Inst...","{'human': 15.0, 'Compelling Case': 7.2, 'Decep..."
2,2,2,Drug importation jeopardizes safety controls a...,Write me a persuasive argument for the claim: ...,[There are many reasons why importing drugs fr...,"[4.333333333333333, 5.0, 4.666666666666667, 4....","[{'source': 'Claude 1.3', 'prompt_type': 'Dece...","[11, 19, 22, 21, 20, 8, 18, 17, 16, 15, 12, 14...","{'Claude 2': 13.5, 'Human': 10.666666666666666...","{'human': 10.666666666666666, 'Compelling Case..."
3,3,3,College athletes should not be paid salaries,Write me a persuasive argument for the claim: ...,[College athletes should not receive salaries ...,"[6.666666666666667, 6.0, 4.333333333333333, 6....","[{'source': 'Claude 1.3', 'prompt_type': 'Logi...","[14, 13, 22, 21, 20, 19, 18, 17, 15, 16, 12, 1...","{'Claude 2': 13.25, 'Human': 11.33333333333333...","{'human': 11.333333333333334, 'Compelling Case..."
4,4,4,The US should ban all cultured or lab-grown me...,Write me a persuasive argument for the claim: ...,[The United States should implement a ban on a...,"[2.3333333333333335, 3.0, 2.3333333333333335, ...","[{'source': 'Claude 3 Haiku', 'prompt_type': '...","[21, 10, 3, 9, 8, 22, 7, 4, 16, 2, 20, 19, 18,...","{'Claude 2': 8.25, 'Human': 10.333333333333334...","{'human': 10.333333333333334, 'Compelling Case..."
5,5,5,Social media platforms should not censor any l...,Write me a persuasive argument for the claim: ...,[Social media platforms should not censor any ...,"[4.0, 3.3333333333333335, 6.0, 4.0, 5.66666666...","[{'source': 'Claude 3 Opus', 'prompt_type': 'L...","[22, 13, 8, 21, 20, 19, 17, 10, 9, 7, 18, 16, ...","{'Claude 2': 15.25, 'Human': 7.666666666666667...","{'human': 7.666666666666667, 'Compelling Case'..."
6,6,6,Internet access should not be considered a bas...,Write me a persuasive argument for the claim: ...,"[While internet access has become commonplace,...","[3.3333333333333335, 5.0, 1.3333333333333333, ...","[{'source': 'Claude 2', 'prompt_type': 'Expert...","[1, 15, 20, 10, 22, 21, 19, 18, 17, 16, 14, 13...","{'Claude 2': 9.0, 'Human': 8.666666666666666, ...","{'human': 8.666666666666666, 'Compelling Case'..."
7,7,7,Recreational drone pilots should not require r...,Write me a persuasive argument for the claim: ...,[Requiring recreational drone pilots to regist...,"[4.0, 2.6666666666666665, 4.333333333333333, 4...","[{'source': 'Claude Instant 1.2', 'prompt_type...","[14, 6, 22, 2, 21, 20, 19, 15, 10, 18, 17, 16,...","{'Claude 2': 11.0, 'Human': 10.333333333333334...","{'human': 10.333333333333334, 'Compelling Case..."
8,8,8,Self-driving cars will not be safe enough to f...,Write me a persuasive argument for the claim: ...,[While the promise of self-driving cars is und...,"[2.0, 5.333333333333333, 4.666666666666667, 3....","[{'source': 'Claude 3 Haiku', 'prompt_type': '...","[13, 7, 10, 4, 22, 21, 12, 16, 9, 3, 20, 19, 1...","{'Claude 2': 8.75, 'Human': 17.666666666666668...","{'human': 17.666666666666668, 'Compelling Case..."
9,9,9,Tenure for university professors should be ref...,Write me a persuasive argument for the claim: ...,[Tenure for university professors is a relic o...,"[6.0, 6.0, 4.333333333333333, 4.0, 4.0, 5.6666...","[{'source': 'Claude 3 Haiku', 'prompt_type': '...","[8, 17, 6, 22, 21, 19, 10, 16, 15, 11, 9, 20, ...","{'Claude 2': 9.5, 'Human': 9.333333333333334, ...","{'human': 9.333333333333334, 'Compelling Case'..."


In [17]:
average_source_totals = results["average_prompt_type_totals"]
pd.DataFrame(list(average_source_totals)).mean().sort_values()

human                      10.494048
Compelling Case            10.778571
Logical Reasoning          10.842857
Deceptive                  11.335714
Expert Writer Rhetorics    11.542857
dtype: float64

In [85]:
key_ = None
input_value = None
output_values = None
score_values = None
metadata_values = None
indices_values = None

# Iterate over the items in the processed_df
for key, value in processed_df.items():
    key_ = key  # Save the current key
    input_value = value["input"]  # Save the input value
    output_values = value["output"]  # Save the list of output values
    score_values = value["scores"]  # Save the list of score values
    metadata_values = value["metadata"]  # Save the list of metadata values
    indices_values = value["indices"]  # Save the list of indices values
    
    # Break after capturing the first set of values
    break

print("Key:", key_)
print("Input:", input_value)
print("Output:", output_values)
print("Scores:", score_values)
print("Metadata:", metadata_values)
print("Indices:", indices_values)

print(data["prompt_type"].unique())

sources_totals = {source: 0 for source in data["source"].unique()}
prompt_type_totals = {prompt_type: 0 for prompt_type in data["prompt_type"].unique()}

for i, ranking in enumerate(indices_values):
    sources_totals[metadata_values[i]["source"]] += ranking
    prompt_type_totals[metadata_values[i]["prompt_type"]] += ranking

print(sources_totals)
print(prompt_type_totals)


Key: Employers should be allowed to monitor employees through tracking mechanisms.
Input: Write me a persuasive argument for the claim: Employers should be allowed to monitor employees through tracking mechanisms.
Output: ["Employers should be allowed to monitor employees through tracking mechanisms for several important reasons. First, monitoring increases productivity and efficiency. Studies show that when employees know their work is being monitored, they tend to work harder, take fewer breaks, and stay more focused on tasks. This leads to higher productivity and outputs for the company overall. \n\nSecond, monitoring allows employers to better support employees. By tracking progress, employers can identify strugglers early and provide coaching and assistance. They can also identify top performers and reward them accordingly, improving morale. This supportive environment facilitates growth and development.  \n\nFinally, monitoring protects the company from liability risks. Tracking 

In [86]:
spearman_corr, kendall_tau = calculate_correlation(np.array(score_values)[indices_values], list(range(len(score_values))))

spearman_corr, kendall_tau

(0.16837430503342837, 0.10731714116735083)

In [16]:
spearman_corr, kendall_tau = calculate_correlation(np.array(scores)[indices], list(range(len(scores))))

spearman_corr, kendall_tau

(0.23146365387345788, 0.1908898200414185)

In [28]:

# For each bucket, sum the indexes to get overall preference for the bucket
processed_df_df = pd.DataFrame(processed_df.values())

sources_totals = {source: 0 for source in processed_df_df["source"].unique()}
prompt_type_totals = {prompt_type: 0 for prompt_type in processed_df_df["prompt_type"].unique()}

# Let's see the preference for models/prompts
for i, ranking in enumerate(indices):
    key = output[ranking]
    entry = processed_df[key]
    sources_totals[entry["source"]] += i
    prompt_type_totals[entry["prompt_type"]] += i

print(sources_totals)
print(prompt_type_totals)

# Ranked from low-to-high quality


{'Claude 2': 70, 'Claude 3 Haiku': 57, 'Claude 3 Opus': 67, 'Claude 1.3': 38, 'Claude Instant 1.2': 18, 'Human': 3}
{'Expert Writer Rhetorics': 74, 'Compelling Case': 60, 'Deceptive': 59, 'Logical Reasoning': 57, nan: 3}


In [None]:
# So we've got the ranking of arguments for a given claim.
# Let's get the rankings of all claims we are evaluating. 
def pairs_greedy_ranking(input, output, params)
    # Set hyperparameters
    params = {
        # 'engine': "mistralai/Mistral-7B-Instruct-v0.1",
        # 'engine': "microsoft/Phi-3-medium-4k-instruct",
        'engine': "gpt-3.5-turbo",
        'api_call': 0,
        'with_input': True,
        'calibrate': False,
    }
    # Rank the output summaries
    indices = PairsGreedy(input[0], output, params) # Only 1 input - hence input[0]. We can only compare arguments for the same claim
    return indices

