In [1]:
from openai_utils import call_openai_chat_completion
from jinja2 import Environment
from textwrap import dedent
import json
import ast
from tqdm import tqdm
from datasets import load_dataset


# News

In [35]:
# Read the annotated news examples from the local JSON-lines file and keep
# the 'train' split that the loader returns them under.
news_splits = load_dataset('json', data_files="news_annotations.jsonl")
news_data = news_splits['train']

Found cached dataset json (/home/yinhong/.cache/huggingface/datasets/json/default-488211ad4c96df17/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [37]:

# Jinja2 template asking the model to rate the coherence of three versions of
# one news article and to answer with a JSON object holding three scores.
evaluation_prompt_text = dedent("""\
    Please pretend you are a human reader. Read the three versions of the news article below and rate their coherence scores following the guideline.
    Coherence guidelines:
    1. Evaluate how well the sentences transition from one to another. A fluent text should have seamless connections between sentences.
    2. Evaluate how well the sentences are organized and the ideas are conveyed. A coherent text should have a clear and precise structure.
    General guidelines:
    1. Rate the coherence of the text from 1 to 10, where 1 is the lowest and 10 is the highest.
    2. Utilize the entire rating scale, from the lowest to the highest score, to provide nuanced feedback.
    3. Please return in JSON format. For example, {"score1": 1, "score2": 2, "score3": 3}.

    News headline:
    {{ headline }}

    Version 1:
    {{ version1 }}

    Version 2:
    {{ version2 }}

    Version 3:
    {{ version3 }}

    Rating:\
""")

environment = Environment()
evaluation_prompt = environment.from_string(evaluation_prompt_text)

# One entry per example, aligned with the rows of `news_data`: the three scores
# re-ordered through `random_indice`, or None when anything failed for that row.
gpt_scores = []
for idx in tqdm(range(len(news_data))):
    try:
        # Read the row once instead of indexing a separate column per field.
        row = news_data[idx]
        prompt = evaluation_prompt.render(
            headline=row['input'],
            version1=row['output_shuffled_0'],
            version2=row['output_shuffled_1'],
            version3=row['output_shuffled_2'],
        )
        # `random_indice` is stored as the string form of a Python literal.
        random_indice = ast.literal_eval(row['random_indice'])
        llm_output = call_openai_chat_completion(prompt)
        # NOTE(review): assumes the helper returns a dict whose values come in
        # score1, score2, score3 order -- confirm against openai_utils.
        raw_scores = list(llm_output.values())
        corrected_score = [raw_scores[i] for i in random_indice]
        gpt_scores.append(corrected_score)
    except Exception as e:
        # Best effort: keep going, but say which example failed and why. Over-long
        # prompts are one cause; malformed replies end up here as well.
        print(f'Example {idx} failed ({type(e).__name__}): {e}')
        gpt_scores.append(None)


 13%|█▎        | 14/110 [00:17<01:43,  1.08s/it]

Too long?  Error code: 400 - {'error': {'message': "This model's maximum context length is 4097 tokens. However, you requested 4171 tokens (3915 in the messages, 256 in the completion). Please reduce the length of the messages or completion.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}


 36%|███▋      | 40/110 [00:53<01:38,  1.41s/it]

Too long?  Error code: 400 - {'error': {'message': "This model's maximum context length is 4097 tokens. However, your messages resulted in 4156 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}


 44%|████▎     | 48/110 [01:03<01:06,  1.08s/it]

Too long?  Error code: 400 - {'error': {'message': "This model's maximum context length is 4097 tokens. However, your messages resulted in 4296 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}


 70%|███████   | 77/110 [01:39<00:30,  1.09it/s]

Too long?  Error code: 400 - {'error': {'message': "This model's maximum context length is 4097 tokens. However, your messages resulted in 4311 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}


100%|██████████| 110/110 [02:21<00:00,  1.29s/it]


In [39]:

# The scores are written to "news_annotations.jsonl", so a dataset loaded from
# that file after an earlier run already carries a "gpt_scores" column. Drop the
# stale column first; add_column rejects duplicate column names.
if "gpt_scores" in news_data.column_names:
    news_data = news_data.remove_columns("gpt_scores")
news_data = news_data.add_column("gpt_scores", gpt_scores)
news_data.to_json("news_annotations.jsonl", orient="records", lines=True, index=False)


In [23]:
# Keep only examples that actually received scores: failures are recorded as
# None (and possibly as an empty list), and indexing either would raise below.
gs = [s for s in gpt_scores if s]

# Share of scored examples whose score at position 1 is lower than the one at
# position 2; NaN when no example was scored at all.
sum(s[1] < s[2] for s in gs) / len(gs) if gs else float("nan")

0.6095238095238096

# LFQA

In [41]:

# Read the annotated LFQA examples from the local JSON-lines file and keep
# the 'train' split that the loader returns them under.
lfqa_splits = load_dataset('json', data_files="lfqa_annotations.jsonl")
lfqa_data = lfqa_splits['train']

Downloading and preparing dataset json/default to /home/yinhong/.cache/huggingface/datasets/json/default-9ebebbe1ac6918db/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/yinhong/.cache/huggingface/datasets/json/default-9ebebbe1ac6918db/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [42]:

# Jinja2 template asking the model to rate the coherence of three versions of
# an answer to one question and to reply with a JSON object of three scores.
evaluation_prompt_text = dedent("""\
    Please pretend you are a human reader. Read the three versions of the answers below for a given question and rate their coherence scores following the guideline.
    Coherence guidelines:
    1. Evaluate how well the sentences transition from one to another. A fluent text should have seamless connections between sentences.
    2. Evaluate how well the sentences are organized and the ideas are conveyed. A coherent text should have a clear and precise structure.
    General guidelines:
    1. Rate the coherence of the text from 1 to 10, where 1 is the lowest and 10 is the highest.
    2. Utilize the entire rating scale, from the lowest to the highest score, to provide nuanced feedback.
    3. Please return in JSON format. For example, {"score1": 1, "score2": 2, "score3": 3}.

    Question:
    {{ headline }}

    Answer version 1:
    {{ version1 }}

    Answer version 2:
    {{ version2 }}

    Answer version 3:
    {{ version3 }}

    Rating:\
""")

environment = Environment()
evaluation_prompt = environment.from_string(evaluation_prompt_text)

# One entry per example, aligned with the rows of `lfqa_data`: the three scores
# re-ordered through `random_indice`, or None when anything failed for that row.
gpt_scores = []
for idx in tqdm(range(len(lfqa_data))):
    try:
        # Read the row once instead of indexing a separate column per field.
        row = lfqa_data[idx]
        prompt = evaluation_prompt.render(
            headline=row['question'],
            version1=row['answer_shuffled_0'],
            version2=row['answer_shuffled_1'],
            version3=row['answer_shuffled_2'],
        )
        # `random_indice` is stored as the string form of a Python literal.
        random_indice = ast.literal_eval(row['random_indice'])
        llm_output = call_openai_chat_completion(prompt)
        # NOTE(review): assumes the helper returns a dict whose values come in
        # score1, score2, score3 order -- confirm against openai_utils.
        raw_scores = list(llm_output.values())
        corrected_score = [raw_scores[i] for i in random_indice]
        gpt_scores.append(corrected_score)
    except Exception as e:
        # Best effort: keep going, but say which example failed and why. Over-long
        # prompts are one cause; malformed replies end up here as well.
        print(f'Example {idx} failed ({type(e).__name__}): {e}')
        gpt_scores.append(None)

100%|██████████| 110/110 [07:04<00:00,  3.86s/it] 


In [43]:
# Save the data.
# The scores are written to "lfqa_annotations.jsonl", so a dataset loaded from
# that file after an earlier run already carries a "gpt_scores" column. Drop the
# stale column first; add_column rejects duplicate column names.
if "gpt_scores" in lfqa_data.column_names:
    lfqa_data = lfqa_data.remove_columns("gpt_scores")
lfqa_data = lfqa_data.add_column("gpt_scores", gpt_scores)
lfqa_data.to_json("lfqa_annotations.jsonl", orient="records", lines=True, index=False)


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

775177

In [20]:
# Keep only examples that actually received scores: failures are recorded as
# None (and possibly as an empty list), and indexing either would raise below.
gs = [s for s in gpt_scores if s]

# Share of scored examples whose score at position 1 is lower than the one at
# position 2; NaN when no example was scored at all.
sum(s[1] < s[2] for s in gs) / len(gs) if gs else float("nan")

0.5818181818181818

# Recipe


In [44]:

# Read the annotated recipe examples from the local JSON-lines file and keep
# the 'train' split that the loader returns them under.
recipe_splits = load_dataset('json', data_files="recipe_annotations.jsonl")
recipe_data = recipe_splits['train']

Downloading and preparing dataset json/default to /home/yinhong/.cache/huggingface/datasets/json/default-4e8a2b4466867fad/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/yinhong/.cache/huggingface/datasets/json/default-4e8a2b4466867fad/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [45]:

# Jinja2 template asking the model to rate the coherence of three versions of
# a recipe for one dish and to reply with a JSON object of three scores.
evaluation_prompt_text = dedent("""\
    Please pretend you are a human reader. Read the three versions of the recipes below for a given dish title and rate their coherence scores following the guideline.
    Coherence guidelines:
    1. Evaluate how well the sentences transition from one to another. A fluent text should have seamless connections between sentences.
    2. Evaluate how well the sentences are organized and the ideas are conveyed. A coherent text should have a clear and precise structure.
    General guidelines:
    1. Rate the coherence of the text from 1 to 10, where 1 is the lowest and 10 is the highest.
    2. Utilize the entire rating scale, from the lowest to the highest score, to provide nuanced feedback.
    3. Please return in JSON format. For example, {"score1": 1, "score2": 2, "score3": 3}.

    Dish title:
    {{ title }}

    Recipe version 1:
    {{ version1 }}

    Recipe version 2:
    {{ version2 }}

    Recipe version 3:
    {{ version3 }}

    Rating:\
""")

environment = Environment()
evaluation_prompt = environment.from_string(evaluation_prompt_text)

# One entry per example, aligned with the rows of `recipe_data`: the three
# scores re-ordered through `random_indice`, or None when anything failed.
gpt_scores = []
for idx in tqdm(range(len(recipe_data))):
    try:
        # Read the row once instead of indexing a separate column per field.
        row = recipe_data[idx]
        prompt = evaluation_prompt.render(
            title=row['title'],
            version1=row['answer_shuffled_0'],
            version2=row['answer_shuffled_1'],
            version3=row['answer_shuffled_2'],
        )
        # The permutation must come from this recipe row, not from another
        # dataset; it is stored as the string form of a Python literal.
        random_indice = ast.literal_eval(row['random_indice'])
        llm_output = call_openai_chat_completion(prompt)
        # NOTE(review): assumes the helper returns a dict whose values come in
        # score1, score2, score3 order -- confirm against openai_utils.
        raw_scores = list(llm_output.values())
        corrected_score = [raw_scores[i] for i in random_indice]
        gpt_scores.append(corrected_score)
    except Exception as e:
        # Best effort: keep going, but say which example failed and why. Over-long
        # prompts are one cause; malformed replies end up here as well.
        print(f'Example {idx} failed ({type(e).__name__}): {e}')
        gpt_scores.append(None)

100%|██████████| 110/110 [02:30<00:00,  1.37s/it]


In [46]:
# Save the data.
# The scores are written to "recipe_annotations.jsonl", so a dataset loaded from
# that file after an earlier run already carries a "gpt_scores" column. Drop the
# stale column first; add_column rejects duplicate column names.
if "gpt_scores" in recipe_data.column_names:
    recipe_data = recipe_data.remove_columns("gpt_scores")
recipe_data = recipe_data.add_column("gpt_scores", gpt_scores)
recipe_data.to_json("recipe_annotations.jsonl", orient="records", lines=True, index=False)

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

484996

In [29]:
# Keep only examples that actually received scores: failures are recorded as
# None (and possibly as an empty list), and indexing either would raise below.
gs = [s for s in gpt_scores if s]

# Share of scored examples whose score at position 1 is lower than the one at
# position 2; NaN when no example was scored at all.
sum(s[1] < s[2] for s in gs) / len(gs) if gs else float("nan")

0.5363636363636364