In [10]:
import os
import shutil
import random

import guidance
from tqdm import tqdm
import itertools
from bazaar.lem_utils import get_guidance_cache_directory
import json
import backoff
from collections import defaultdict
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from bazaar.lem_utils import OAI_EXCEPTIONS
from typing import List, SupportsFloat, Optional
from bazaar.lem_utils import clean_program_string, select_quotes_with_debate, ask_for_guidance
from bazaar.schema import Quote 
from bazaar.py_utils import dataclass_from_dict
from bazaar.schema import Query
from bazaar.schema import Block
import copy
os.environ["OPENAI_API_KEY"] = "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"

In [193]:
summary = json.load(open("/Users/martinweiss/PycharmProjects/tn-learn/info-bazaar/experiments/fup-specific-gpt-4-4.605-retrieve/Logs/bazaar_summary.json", "r"))
dataset = json.load(open("/Users/martinweiss/PycharmProjects/tn-learn/info-bazaar/data/final_dataset_with_metadata.json", "r"))

In [194]:
class IssuedBy:
    def __init__(self, unique_id):
        self.unique_id = unique_id

In [195]:

@backoff.on_exception(backoff.expo, OAI_EXCEPTIONS, max_tries=5)
def select_quotes_with_debate(
    quotes: List["Quote"],
    budget: Optional[SupportsFloat] = None,
    fraction_of_max_budget: Optional[float] = None,
    model_name: Optional[str] = None,
    use_block_content_metadata: bool = False,
    use_block_metadata_only: bool = False,
) -> List["Quote"]:
    if len(quotes) == 0:
        return []
    assert all(
        [quotes[0].query.compare_content(quote.query) for quote in quotes[1:]]
    ), "All quotes must have the same query."
    # Get the budget
    if budget is None:
        budget = quotes[0].query.max_budget
    else:
        budget = float(budget)
    if fraction_of_max_budget is not None:
        budget = round(fraction_of_max_budget * quotes[0].query.max_budget, 1)

    # We need to scale the prices. For this, we can assume that the scaled budget
    # will always be $100. The prices must be scaled accordingly.
    scale_factor = 100 / budget

    # Get the question
    question = quotes[0].query.text

    # Build the content extractor
    def content_extractor(block: "Block") -> str:
        if use_block_content_metadata:
            return block.content_with_metadata
        elif use_block_metadata_only:
            return block.metadata
        else:
            return block.content

    # Get the options
    options = [
        {
            "answer_block": " [...] ".join(
                [content_extractor(block) for block in quote.answer_blocks]
            ),
            "price": max(int(round(quote.price * scale_factor)), 1),
        }
        for quote in quotes
    ]

    program_string = """
    {{#system~}}
    Bobby William and Michael Burry are employed by a company that specializes in acquiring information. They are trying to answer a question by purchasing information from an information market. In this market, vendors sell pieces of information at a price. 

    Bobby wants to do a really good job at answering the question. This entails knowing as much as possible.

    Michael, on the other hand, is financially responsible. Michael wants to make sure ensures that they don't waste money buying unnecessary information. For instance, if two pieces of information offer the same insight, then Michael would go for the cheaper one.  
    {{~/system}}

    {{#user~}}
    The question is "{{question}}?"

    Here are your options.
    ---{{#each options}}
    Option {{add @index 1}}: {{this.answer_block}}
    {{/each}}---

    {{#each options~}}
    Option {{add @index 1}} costs ${{this.price}}
    {{/each}}
    Together, Bobby and Michael must decide which options to buy and which ones to not buy with their budget of ${{balance}}. Simulate a constructive argument between Bobby and Michael, where they debate about the usefulness of the information provided in each option towards answering the question, and whether their price is worth paying. 

    Note that Bobby and Michael may choose to buy any number of options, or none at all. At the end of the argument, they must arrive at a verdict. This verdict must be printed as: 

    VERDICT:

    {{#each options~}}
    Option {{add @index 1}}: <Buy or Pass>
    {{/each}}
    {{~/user}}

    {{#assistant~}}
    {{gen "answer" temperature=0.0 max_tokens=2048}}
    {{~/assistant}}
    """
    program_string = clean_program_string(program_string)

    # Run the program
    program_output = ask_for_guidance(
        program_string=program_string,
        llm=get_llm(model_name=model_name),
        silent=True,
        inputs=dict(question=question, options=options, balance=100,),
        output_keys=["answer"],
    )
    answer = program_output["answer"]

    # Now parse the answer
    def extract_verdicts(s: str) -> List[bool]:
        # Split the text into sections based on "VERDICT:"
        sections = re.split(r"\bVERDICT\b\s*:\s*", s, flags=re.IGNORECASE)
        if len(sections) < 2:
            return []

        # Dictionary to store the verdicts of each option
        option_verdicts = {}
        for section in sections[1:]:
            # Extract options and their verdicts in a case-insensitive manner
            options = re.findall(
                r"Option (\d+): (Buy|Pass)", section, flags=re.IGNORECASE
            )

            for option_num, verdict in options:
                option_num = int(option_num)
                is_buy = verdict.lower() == "buy"

                # Check if this option was seen before
                if option_num in option_verdicts:
                    # If the verdict is inconsistent, raise an exception
                    if option_verdicts[option_num] != is_buy:
                        raise ValueError(
                            f"Inconsistent verdict for Option {option_num}."
                        )
                else:
                    option_verdicts[option_num] = is_buy

        # Convert the verdicts dictionary to a sorted list based on option numbers
        return [option_verdicts[num] for num in sorted(option_verdicts.keys())]

    # Parse the verdicts, select the quotes and return
    verdicts = extract_verdicts(answer)
    selected_quotes = [quote for quote, verdict in zip(quotes, verdicts) if verdict]
    return selected_quotes


"How does reinforcement learning apply to InstructGPT's training procedure?"

In [222]:
all_candidates = []
for buyer_idx in range(2, 9):
    candidate_quotes = []
    buyer = summary['buyer_agents'][buyer_idx]
    for quote in buyer['accepted_quotes']:
        if quote['query']['text'] != buyer['principal']['query']['text']:
            continue
        quote = copy.deepcopy(quote)
        quote['query']['required_by_time'] = None
        quote['query']['issued_by'] = IssuedBy(quote['query']['issued_by'])
        quote['issued_by'] = IssuedBy(quote['issued_by'])
        quote['query'] = dataclass_from_dict(Query, quote['query'])
        block_dict = quote['answer_blocks'][0]
        block_id = block_dict['block_id']
        document_id, section_title, token_start, token_end = block_id.split("/")
        document_title = dataset[document_id]['metadata']['title']
        publication_date = dataset[document_id]['metadata']['publication_date']
        block_dict['document_id'] = document_id
        block_dict['section_title'] = section_title
        block_dict['token_start'] = token_start
        block_dict['token_end'] = token_end
        block_dict['document_title'] = document_title
        block_dict['publication_date'] = publication_date
        quote['answer_blocks'][0] = dataclass_from_dict(Block, block_dict)
        quote = dataclass_from_dict(Quote, quote)
        candidate_quotes.append(quote)
    
    for quote in buyer['rejected_quotes']:
        if quote['query']['text'] != buyer['principal']['query']['text']:
            continue

        if quote['quote_progression'] == 3:
            quote = copy.deepcopy(quote)
            quote['query']['required_by_time'] = None
            quote['query']['issued_by'] = IssuedBy(quote['query']['issued_by'])
            quote['issued_by'] = IssuedBy(quote['issued_by'])
            quote['query'] = dataclass_from_dict(Query, quote['query'])
            block_dict = quote['answer_blocks'][0]
            block_id = block_dict['block_id']
            document_id, section_title, token_start, token_end = block_id.split("/")
            document_title = dataset[document_id]['metadata']['title']
            publication_date = dataset[document_id]['metadata']['publication_date']
            block_dict['document_id'] = document_id
            block_dict['section_title'] = section_title
            block_dict['token_start'] = token_start
            block_dict['token_end'] = token_end
            block_dict['document_title'] = document_title
            block_dict['publication_date'] = publication_date
            quote['answer_blocks'][0] = dataclass_from_dict(Block, block_dict)
            quote = dataclass_from_dict(Quote, quote)
            candidate_quotes.append(quote)
    # break
    random.shuffle(candidate_quotes)
    print([x.query.text for x in candidate_quotes])
    if len(candidate_quotes) >= 3:
        all_candidates.append(candidate_quotes[:3])



["How does reinforcement learning apply to InstructGPT's training procedure?", "How does reinforcement learning apply to InstructGPT's training procedure?", "How does reinforcement learning apply to InstructGPT's training procedure?"]
['What is the significance of the date November 30th, 2022, in the development of language models?', 'What is the significance of the date November 30th, 2022, in the development of language models?', 'What is the significance of the date November 30th, 2022, in the development of language models?']
['What is the difference between continuous prompts and discrete prompts?', 'What is the difference between continuous prompts and discrete prompts?', 'What is the difference between continuous prompts and discrete prompts?']
["Can in-context learning occur when there's a distribution mismatch between prompts and pretraining data?", "Can in-context learning occur when there's a distribution mismatch between prompts and pretraining data?", "Can in-context learn

In [226]:
import re

In [223]:
all_candidates = all_candidates[2:5]

In [231]:
permutation_results = defaultdict(list)
baseline_results = defaultdict(list)
models = ["gpt-3.5-turbo", "gpt-4"]

for model_name in models:
    for candidate_quotes in tqdm(all_candidates):
        # all_permutations = list(itertools.permutations(candidate_quotes))
        all_baselines = list(itertools.repeat(candidate_quotes, 6))
        for quote_permutation in tqdm(all_permutations):
            result = select_quotes_with_debate(quotes=quote_permutation, budget=quote_permutation[0].query.max_budget, model_name=model_name, use_block_content_metadata=False, use_block_metadata_only=False)
            guidance.llms.Transformers.cache.clear()
            guidance.llms.OpenAI.cache.clear()
            permutation_results[model_name].append((quote_permutation, result))
            
        for quotes in tqdm(all_baselines):
            result = select_quotes_with_debate(quotes=quotes, budget=quotes[0].query.max_budget, model_name=model_name, use_block_content_metadata=False, use_block_metadata_only=False)
            guidance.llms.Transformers.cache.clear()
            guidance.llms.OpenAI.cache.clear()
            baseline_results[model_name].append((quotes, result))


  0%|                                                                                                                                                                                     | 0/3 [00:00<?, ?it/s]
  0%|                                                                                                                                                                                     | 0/6 [00:00<?, ?it/s][A
 17%|████████████████████████████▊                                                                                                                                                | 1/6 [00:19<01:39, 19.84s/it][A
 33%|█████████████████████████████████████████████████████████▋                                                                                                                   | 2/6 [00:37<01:14, 18.71s/it][A
 50%|██████████████████████████████████████████████████████████████████████████████████████▌                                                               

In [235]:
# baseline_results

In [233]:
quote_ids

['2302.11521/Related Work/182/358',
 '2301.03797/Baselines encoder-decoder models/41/412',
 '2101.03961/Introduction/0/157']

In [229]:
for quotes, results in baseline_results:
    result_ids = [r.answer_blocks[0].block_id for r in results]
    print(result_ids)

ValueError: too many values to unpack (expected 2)

In [253]:
for model_name in models:

    perm_positional_count = np.zeros(3, dtype=np.uint8)
    perm_positional_acceptance_count = np.zeros(3, dtype=np.uint8)
    baseline_positional_count = np.zeros(3, dtype=np.uint8)
    baseline_positional_acceptance_count = np.zeros(3, dtype=np.uint8)
    
    for quotes, results in permutation_results[model_name]:
        result_ids = [r.answer_blocks[0].block_id for r in results]
        for quote_idx, quote in enumerate(quotes):
            quote_id = quote.answer_blocks[0].block_id
            perm_positional_count[quote_idx] += 1
            if quote_id in result_ids:
                perm_positional_acceptance_count[quote_idx] += 1
    
    
    for quotes, results in baseline_results[model_name]:
        result_ids = [r.answer_blocks[0].block_id for r in results]
        for quote_idx, quote in enumerate(quotes):
            quote_id = quote.answer_blocks[0].block_id
            baseline_positional_count[quote_idx] += 1
            if quote_id in result_ids:
                baseline_positional_acceptance_count[quote_idx] += 1
    baseline_conditional_prob = baseline_positional_acceptance_count / baseline_positional_count
    permutation_conditional_prob = perm_positional_acceptance_count / perm_positional_count
    print(model_name)
    print("--------")
    print(f"baseline (measures base uncertainty when selecting an option) {100*baseline_conditional_prob}")
    print(f"permutation (measures P(position)) {100*permutation_conditional_prob}")
    print("")
    

gpt-3.5-turbo
--------
baseline (measures base uncertainty when selecting an option) [44.44444444  0.         66.66666667]
permutation (measures P(position)) [16.66666667 38.88888889 44.44444444]

gpt-4
--------
baseline (measures base uncertainty when selecting an option) [66.66666667 38.88888889 50.        ]
permutation (measures P(position)) [44.44444444 50.         27.77777778]



In [254]:
sum(perm_positionalacceptance_count)

22

In [255]:
perm_positional_count

array([18, 18, 18], dtype=uint8)