# InstruSum Data Analysis

## Load datasets

In [None]:
%pip install datasets
%pip install tabulate
%pip install scipy

In [15]:
from datasets import load_dataset
from tabulate import tabulate

### Load data examples

In [None]:
# Dataset identifier passed as the first argument to every `load_dataset` call in this notebook.
dataset_name = "Salesforce/InstruSum"

In [None]:
# Load the "dataset" configuration and keep its "data" split.
dataset = load_dataset(dataset_name, "dataset")["data"]

Check one data example

In [3]:
# Display the first record as a plain dict.
dataset[0]

{'hybrid_summary': "After being contacted by BBC Money Box, Lloyds started a new investigation and concluded that its initial response was wrong. They agreed to refund all of Margaret's money, plus interest and £600 by way of compensation. Vodafone also said they would provide the details of an individual who may have used Margaret's account to the police. Sue is grateful for the refund but hopes that someone will be held accountable for the fraud.",
 'requirement': 'Summarize the conclusion of the fraud case.',
 'llm_summary': "After being contacted by BBC Money Box, Lloyds started a new investigation and concluded that its initial response was wrong. They agreed to refund all of Margaret's money, plus interest and £600 by way of compensation. Vodafone also said they would provide the details of an individual who may have used Margaret's account to the police. Sue is grateful for the refund but hopes that someone will be held accountable for the fraud."}

### Load human evaluation data

In [None]:
# Load the "human_eval" configuration and keep its "data" split.
human_eval = load_dataset(dataset_name, "human_eval")["data"]

Explore the data structure

In [5]:
# Top-level fields of one human-evaluation record.
human_eval[0].keys()

dict_keys(['annotations', 'article', 'requirement'])

In [6]:
# `annotations` is keyed by the name of the system whose summary was rated.
human_eval[0]["annotations"].keys()

dict_keys(['gpt-3.5-turbo-0301', 'gpt-4-0314', 'hybrid', 'text-davinci-002', 'text-davinci-003'])

In [7]:
# Each system entry holds the rated summary text and a `score` dict with one value per aspect.
human_eval[0]["annotations"]["gpt-4-0314"]

{'score': {'factual': 1.0,
  'irrelevant': 3.0,
  'missing': 3.6666666666666665,
  'overall': 3.0},
 'summary': "Lloyds Bank has apologized and refunded over £14,000, plus interest and £600 in compensation, to a woman whose late mother's account was targeted by direct debit fraud. The bank initially claimed the direct debits were legitimately set up and refused to refund the money. After being contacted by BBC Money Box, Lloyds conducted a new investigation and admitted its initial response was wrong. Vodafone, which received most of the stolen funds, is also reviewing the case and will provide details of a possible suspect to the police."}

Compute each system's average human score on every evaluation aspect

In [18]:
# Systems whose summaries were rated by humans, and the aspects they were rated on.
models = ["text-davinci-002", "text-davinci-003", "gpt-3.5-turbo-0301", "gpt-4-0314", "hybrid"]
aspects = ["overall", "missing", "irrelevant", "factual"]

# Read the dataset once; only the per-system annotations are needed here.
human_annotations = [row["annotations"] for row in human_eval]

# scores[model][aspect] is the mean rating over every annotated example.
scores = {
    model: {
        aspect: sum(entry[model]["score"][aspect] for entry in human_annotations) / len(human_annotations)
        for aspect in aspects
    }
    for model in models
}

# One table row per system, one column per aspect.
header = ["Model", *aspects]
body = [[model, *(scores[model][aspect] for aspect in aspects)] for model in models]
table = [header, *body]
print(tabulate(table, headers="firstrow", floatfmt=".3f"))

Model                 overall    missing    irrelevant    factual
------------------  ---------  ---------  ------------  ---------
text-davinci-002        2.344      2.595         3.443      0.640
text-davinci-003        3.239      3.702         3.708      0.710
gpt-3.5-turbo-0301      2.897      3.473         2.958      0.800
gpt-4-0314              3.970      4.067         4.205      0.860
hybrid                  3.873      3.948         4.359      0.860


### Load LLM-based evaluation data

In [None]:
# Load the "llm_eval" configuration and keep its "data" split.
llm_eval = load_dataset(dataset_name, "llm_eval")["data"]

Explore the data structure

In [9]:
# Top-level fields of one LLM-evaluation record.
llm_eval[0].keys()

dict_keys(['system_outputs', 'article', 'requirement', 'llm_scores'])

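There are 3 evaluation aspects in the LLM-based evaluation (`irrelevant`, `missing`, and `overall`); unlike the human evaluation, there is no `factual` score.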

In [10]:
# `llm_scores` is keyed first by evaluation aspect.
llm_eval[0]["llm_scores"].keys()

dict_keys(['irrelevant', 'missing', 'overall'])

There are 11 evaluator LLMs in total.

In [11]:
# Within an aspect, the scores are keyed by the name of an LLM.
llm_eval[0]["llm_scores"]["overall"].keys()

dict_keys(['gpt-3.5-turbo-0301', 'gpt-3.5-turbo-0613', 'gpt-3.5-turbo-instruct', 'gpt-4-0314', 'gpt-4-1106-preview', 'llama-2-13b-chat', 'llama-2-70b-chat', 'llama-2-7b-chat', 'mistral-instruct', 'text-davinci-002', 'text-davinci-003'])

Each LLM is used with three evaluation protocols: `llmcompare`, `llmeval`, and `llmrank`.

In [19]:
# Protocols available for one LLM on the "overall" aspect.
llm_eval[0]["llm_scores"]["overall"]["gpt-3.5-turbo-0301"].keys()

dict_keys(['llmcompare', 'llmeval', 'llmrank'])

Let's check the LLMCompare scores that `gpt-3.5-turbo-0301` assigns to each system on the first example.

In [20]:
# Under one protocol, the scores are keyed by the system being judged.
llm_eval[0]["llm_scores"]["overall"]["gpt-3.5-turbo-0301"]["llmcompare"]

{'gpt-3.5-turbo-0301': 1.75,
 'gpt-4-0314': 1.75,
 'hybrid': 0.25,
 'text-davinci-002': 0.75,
 'text-davinci-003': 0.5}

Compute the system-level correlation (Kendall's tau) between the human scores and each LLM-based evaluation method on the overall quality aspect

In [21]:
from scipy.stats import kendalltau


def mean_system_scores(rows, systems, get_score):
    """Average one score per system over every row.

    Args:
        rows: Iterable of dataset records.
        systems: Names of the summarization systems to aggregate over.
        get_score: Callable ``(row, system) -> number`` that extracts a single score.

    Returns:
        Dict mapping each system name to its mean score, in ``systems`` order.

    Raises:
        ValueError: If ``rows`` yields no records, since the mean is undefined.
    """
    collected = {s: [] for s in systems}
    for row in rows:
        for s in systems:
            collected[s].append(get_score(row, s))
    if any(not values for values in collected.values()):
        raise ValueError("cannot average system scores over an empty dataset")
    return {s: sum(values) / len(values) for s, values in collected.items()}


# LLMs acting as evaluators. NOTE: this rebinds `models`, which the human-score
# cell above uses for the summarization systems; here those are called `systems`.
models = [
    "text-davinci-002",
    "text-davinci-003",
    "gpt-3.5-turbo-0301",
    "gpt-3.5-turbo-0613",
    "gpt-3.5-turbo-instruct",
    "gpt-4-0314",
    "gpt-4-1106-preview",
    "llama-2-7b-chat",
    "llama-2-13b-chat",
    "llama-2-70b-chat",
    "mistral-instruct",
]
# Summarization systems being judged.
systems = ["text-davinci-002", "text-davinci-003", "gpt-3.5-turbo-0301", "gpt-4-0314", "hybrid"]
# Evaluation protocols applied with every evaluator LLM.
methods = ["llmrank", "llmcompare", "llmeval"]

# llm_eval_results[evaluator][method][system] is the mean "overall" score.
llm_eval_results = {model: dict() for model in models}
for model in models:
    for method in methods:
        # The lambda is consumed immediately, so it sees this iteration's model/method.
        llm_eval_results[model][method] = mean_system_scores(
            llm_eval,
            systems,
            lambda row, s: row["llm_scores"]["overall"][model][method][s],
        )

# Mean human "overall" score per system, as a list aligned with `systems`.
human_means = mean_system_scores(
    human_eval,
    systems,
    lambda row, s: row["annotations"][s]["score"]["overall"],
)
human_scores = [human_means[s] for s in systems]

# Kendall's tau between the human and LLM system-level scores for every
# (evaluator, method) pair.
correlations = {model: dict() for model in models}
for model in models:
    for method in methods:
        llm_scores = [llm_eval_results[model][method][s] for s in systems]
        corr, _ = kendalltau(human_scores, llm_scores)
        correlations[model][method] = corr

table = [["Model"] + methods]
for model in models:
    table.append([model] + [correlations[model][method] for method in methods])
print(tabulate(table, headers="firstrow", floatfmt=".3f"))

Model                     llmrank    llmcompare    llmeval
----------------------  ---------  ------------  ---------
text-davinci-002           -0.200         0.400      0.738
text-davinci-003            0.400         0.400      0.949
gpt-3.5-turbo-0301          0.738         0.400      0.600
gpt-3.5-turbo-0613          0.600         0.527      0.527
gpt-3.5-turbo-instruct      0.400         0.600      0.738
gpt-4-0314                  0.800         1.000      1.000
gpt-4-1106-preview          0.400         0.800      0.800
llama-2-7b-chat             0.200         0.527      0.527
llama-2-13b-chat            0.105         0.400      1.000
llama-2-70b-chat           -0.316         0.400      0.949
mistral-instruct           -0.400         0.105      0.447


### Load system outputs

In [13]:
# Load the "system_outputs" configuration and keep its "data" split.
system_outputs = load_dataset(dataset_name, "system_outputs")["data"]

Check one data example

In [14]:
# Display the first record; its `system_outputs` field maps a system name to that system's summary text.
system_outputs[0]

{'system_outputs': {'gpt-3.5-turbo-0301': 'Lloyds Bank has refunded more than £14,000 stolen from the account of a dementia sufferer after direct debits were fraudulently set up. The bank initially refused to refund the money, claiming the direct debits had been "legitimately" set up. The victim\'s daughter, Sue, contacted the police and Action Fraud, but was told it was unlikely any further action would be taken. Lloyds has now apologised and refunded the money, plus interest and £600 compensation. Vodafone, which received most of the stolen money, said it was looking again at the case and would provide details of an individual who may have used the account to the police.',
  'gpt-3.5-turbo-0613': "In this fraud case, Sue discovered that over £14,000 had been stolen from her late mother's bank account through direct debit fraud. Despite reporting the incident to her mother's bank, Lloyds, and contacting the police, Sue received little help or support. However, after the BBC Money Box 