In [235]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from copy import deepcopy
import numpy as np
import plotly.express as px
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from dataclasses import dataclass, field
from typing import List, Union, Tuple, Any, Optional, Dict
from enum import Enum
from tqdm import tqdm
import numpy as np
import random
import backoff as backoff
import openai
# Take the API key from the environment rather than hard-coding it.
# os.environ.get() returns None when OPENAI_API_KEY is unset, so this line itself
# never fails; a missing key is not detected here.
openai.api_key = os.environ.get("OPENAI_API_KEY")

import sys
sys.path.append("../")
from bazaar.bots import Bot

%matplotlib inline

In [2]:
# Show the working directory: the relative paths used in this notebook
# ("../" on sys.path, "../data/...") are resolved against it.
os.getcwd()

'/network/scratch/w/weissmar/tn/info-bazaar/notebooks'

In [99]:
# Load the step-1 dataset (path is relative to the notebook's working directory).
# Later cells iterate it as a dict whose values carry 'blocks' and 'authorships'.
# Decode explicitly as UTF-8 instead of relying on the platform's default encoding,
# and let json.load parse straight from the file handle.
with open("../data/machine-learning/dataset_step_1.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

In [None]:

# Prompt template used by `hyde`: it asks the model to write a fictional excerpt
# that answers the question substituted for the `{question}` placeholder.
# The leading and trailing newlines are part of the prompt text.
HYDE_PROMPT = """
Below is a question. Your task is to create a paragraph from a fictional document that exactly answers that question. This excerpt might look out of context, but that’s ok — the important bit is that it unambiguously answers the question. Respond only with the text of this fictional excerpt, and nothing else.

Question: {question}
"""


@backoff.on_exception(
    backoff.expo,
    (openai.error.RateLimitError, openai.error.ServiceUnavailableError),
)
def hyde(question):
    """Ask a chat model for a fictional excerpt that answers `question`.

    A fresh "hydebot" Bot (model "gpt-3.5-turbo") is created per call, sent
    HYDE_PROMPT with `question` filled in, and asked to complete.

    The decorator retries the whole call with exponential backoff whenever
    OpenAI raises RateLimitError or ServiceUnavailableError; no retry limit
    is configured here.

    Args:
        question: Value substituted into the prompt's `{question}` placeholder
            via str.format.

    Returns:
        Whatever `Bot.complete()` returns (presumably the generated text;
        confirm against bazaar.bots.Bot).
    """
    hyde_bot = Bot("hydebot", "gpt-3.5-turbo", "You are a helpful assistant.")
    prompt = HYDE_PROMPT.format(question=question)
    hyde_bot.add_message(prompt)
    return hyde_bot.complete()
def calculate_embedding(text):
    """Embed `text` with OpenAI's "text-embedding-ada-002" model.

    Newlines in `text` are replaced by single spaces before the request.

    Args:
        text: String to embed.

    Returns:
        The value stored at ["data"][0]["embedding"] of the API response.

    NOTE(review): unlike `hyde`, this call is not wrapped in a backoff retry,
    so rate-limit errors propagate to the caller.
    """
    flattened = text.replace("\n", " ")
    response = openai.Embedding.create(
        input=[flattened],
        model="text-embedding-ada-002",
    )
    first_item = response["data"][0]
    return first_item["embedding"]


In [208]:
@dataclass
class Principal:
    """Base class for named participants.

    BuyerPrincipal, Institution and Author in this notebook all derive from it.
    """
    name: str  # display name of the principal

@dataclass
class BuyerPrincipal(Principal):
    """A principal that holds the list of queries it wants answered."""
    # `Query` is declared further down in this cell, so it is referenced as a
    # string; a bare `Query` here raises NameError on a fresh kernel.
    queries: List["Query"]

@dataclass(frozen=True)
class Nugget:
    """Immutable question/answer pair with an associated embedding."""
    question: str
    answer: str
    # NOTE(review): annotated as str, whereas Block.embedding is List[float];
    # confirm which representation is intended.
    embedding: str

@dataclass(frozen=True)
class Block:
    """Immutable record describing one content block.

    Instances are built with ``Block(**block)`` from the dataset's block dicts,
    so the field names must match the keys of those dicts exactly.

    NOTE(review): frozen=True together with the default eq=True makes
    @dataclass generate a __hash__ over all fields; if `embedding` (or
    `nuggets`) really holds a list, calling hash() on a Block raises TypeError.
    """
    block_id: str  # used as the key in the principals' `blocks` dicts
    content: str
    num_tokens: int
    embedding: List[float]
    # No default, so a value must always be supplied; the block-loading cell
    # replaces a missing/None entry with [] before constructing the Block.
    nuggets: Optional[List[Nugget]]

@dataclass
class Institution(Principal):
    """Institution principal that owns a dict of blocks keyed by block id.

    The hand-written ``__init__`` below takes precedence over the one
    ``@dataclass`` would generate, so instances are populated from arbitrary
    keyword arguments instead of the declared fields. ``@dataclass`` still
    generates ``__eq__`` (field-by-field comparison); hashing uses ``id`` only.
    """
    id: str
    display_name: str
    ror: str
    country_code: str
    type: str
    blocks: Dict[str, Block] = field(default_factory=dict)

    def __init__(self, name=None, *args, **kwargs):
        """Create an institution from keyword arguments.

        Args:
            name: Principal name; when falsy, ``display_name`` from ``kwargs``
                is used (empty string if that is absent too).
            *args: Accepted for signature compatibility and ignored.
            **kwargs: Each key/value pair is set as an attribute on the instance.
        """
        if not name:
            name = kwargs.get('display_name', '')
        super().__init__(name)
        # The field's default_factory is never applied because this __init__
        # replaces the generated one; without this line an instance created
        # without `blocks=` would have no `blocks` attribute at all.
        self.blocks = {}
        for k, v in kwargs.items():
            setattr(self, k, v)

    def __hash__(self):
        """Hash by ``id`` so institutions can be used in sets and as dict keys."""
        return hash(self.id)

@dataclass
class Author(Principal):
    """Author principal that owns a dict of blocks keyed by block id.

    The hand-written ``__init__`` below takes precedence over the one
    ``@dataclass`` would generate, so instances are populated from arbitrary
    keyword arguments instead of the declared fields. ``@dataclass`` still
    generates ``__eq__`` (field-by-field comparison); hashing uses ``id`` only.
    """
    id: str
    display_name: str
    orcid: str
    last_known_institution: Optional[Institution] = None
    related_concepts: Optional[List[str]] = None
    blocks: Dict[str, Block] = field(default_factory=dict)

    def __init__(self, name=None, *args, **kwargs):
        """Create an author from keyword arguments.

        Args:
            name: Principal name; when falsy, ``display_name`` from ``kwargs``
                is used (empty string if that is absent too).
            *args: Accepted for signature compatibility and ignored.
            **kwargs: Each key/value pair is set as an attribute on the instance.
        """
        if not name:
            name = kwargs.get('display_name', '')
        super().__init__(name)
        # The field's default_factory is never applied because this __init__
        # replaces the generated one; without this line an instance created
        # without `blocks=` would have no `blocks` attribute at all.
        self.blocks = {}
        for k, v in kwargs.items():
            setattr(self, k, v)

    def __hash__(self):
        """Hash by ``id`` so authors can be used in sets and as dict keys."""
        return hash(self.id)


@dataclass
class Vendor:
    """A seller acting for a principal, with the blocks it offers and their prices."""
    principal: Union[Author, Institution]
    block_price: List[int]
    observed_blocks: Optional[List[Block]] = field(default_factory=list)
    response_time_guarantee: int = 0

    # Declared static because the original signature has no `self`: as a plain
    # method, `vendor.check_bb(queries)` raised TypeError. This keeps
    # `Vendor.check_bb(queries)` working and makes the instance call valid too.
    # `Query` is quoted because it is declared further down in this cell.
    @staticmethod
    def check_bb(bb_queries: List["Query"]):
        """Walk over bulletin-board queries (placeholder: does nothing yet).

        Args:
            bb_queries: Queries currently posted on the bulletin board.

        Returns:
            None.
        """
        for bb_query in bb_queries:
            pass
    

@dataclass
class Quote:
    """An offer for a query: the matched question, the answering block and its price.

    `eta` is optional and defaults to None.
    """
    query: str
    most_similar_question_answered_in_block: str
    answer_block: str
    block_price: int
    eta: Optional[int] = None

class Urgency(Enum):
    """Urgency level attached to a Query (see ``Query.urgency``).

    The query-building cell picks one member uniformly at random per query.
    """
    DAY = "day"
    HOUR = "hour"
    INSTANT = "instant"

@dataclass
class Query:
    """A buyer's question together with its budget cap and urgency."""
    query: str  # the question text
    # NOTE(review): annotated int, but the query-building cell fills this from a
    # log-normal sample (not an integer); confirm the intended type.
    max_budget: int
    urgency: Urgency

@dataclass
class BulletinBoard:
    """Container for the queries currently posted."""
    queries: List[Query]

In [203]:
# Build the queries and set them on a Principal: one Query per nugget question.
# Budgets are log-normal; mu and sigma parameterise the UNDERLYING normal
# distribution (so the median budget is exp(mu) = 1.0), not the budget itself.
mu = 0.0  # mean of the underlying normal distribution
sigma = 1.0  # standard deviation of the underlying normal distribution
urgency_levels = list(Urgency)

queries = []
for arxiv_id, data in tqdm(dataset.items()):
    for block in data['blocks']:
        # 'nuggets' can be absent or None (the block-loading cell guards against
        # None as well); `.get('nuggets', [])` would try to iterate None.
        for nugget in block.get('nuggets') or []:
            # Draw a plain scalar; passing size=1 returned a 1-element array.
            max_budget = float(np.random.lognormal(mean=mu, sigma=sigma))
            urgency = random.choice(urgency_levels)
            query = Query(nugget['question'], max_budget, urgency)
            queries.append(query)
buyer_principal = BuyerPrincipal(name="skinner", queries=queries)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 89/89 [00:00<00:00, 11243.09it/s]


In [182]:
# Build all authors and institutions, each keyed by its id and created once.
authors = {}
institutions = {}

for arxiv_id, data in tqdm(dataset.items()):
    for authorship in data['authorships']:
        for ins in authorship['institutions']:
            if ins['id'] not in institutions:
                institution = Institution(**ins, blocks={})
                institutions[institution.id] = institution
        # Test membership by id: `authors` is keyed by id, so checking an Author
        # object against it never matched and every authorship rebuilt the
        # author, replacing the entry already stored.
        author_record = authorship['author']
        if author_record['id'] not in authors:
            author = Author(**author_record, blocks={})
            authors[author.id] = author
        

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 89/89 [00:00<00:00, 18858.90it/s]


In [183]:

# Attach every block of a paper to each of the paper's authors and to each of
# those authors' institutions, keyed by block id.
for arxiv_id, data in tqdm(dataset.items()):
    for block in data['blocks']:
        # A missing or None 'nuggets' entry becomes an empty list; this updates
        # the dict inside `dataset` in place.
        if block.get("nuggets") is None:
            block["nuggets"] = []
        block_obj = Block(**block)
        block_id = block_obj.block_id

        for authorship in data['authorships']:
            for ins in authorship['institutions']:
                owner_institution = institutions[ins['id']]
                owner_institution.blocks[block_id] = block_obj
            owner_author = authors[authorship['author']['id']]
            owner_author.blocks[block_id] = block_obj

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 89/89 [00:00<00:00, 2673.62it/s]


In [185]:
# Create vendors and assign some blocks to them
def assign_blocks_to_vendor(principal, block_fraction=0.1):
    """Move a random fraction of a principal's blocks into a new Vendor.

    Args:
        principal: Object with a mutable ``blocks`` dict (block id -> Block),
            e.g. an Author or Institution. The selected blocks are REMOVED
            from ``principal.blocks``, so calling this again on the same
            principal hands out a fraction of whatever is left.
        block_fraction: Share of the principal's blocks to hand over. The
            count is truncated toward zero and capped at the number of blocks
            available, so there is always exactly one price per block.

    Returns:
        A Vendor whose ``observed_blocks`` is a list of the selected Block
        objects (matching the ``List[Block]`` declared on Vendor),
        ``block_price`` is a list with one log-normal sample per observed
        block in the same order, and ``response_time_guarantee`` is a random
        integer from 1 to 9.
    """
    num_blocks = len(principal.blocks)
    num_blocks_to_vendor = min(num_blocks, int(num_blocks * block_fraction))

    mu = 0.0  # mean of the underlying normal distribution
    sigma = 1.0  # standard deviation of the underlying normal distribution
    block_prices = np.random.lognormal(mean=mu, sigma=sigma, size=num_blocks_to_vendor)

    block_keys = list(principal.blocks.keys())
    random.shuffle(block_keys)

    # pop() both fetches the block and removes it from the principal.
    observed_blocks = [
        principal.blocks.pop(key) for key in block_keys[:num_blocks_to_vendor]
    ]

    # randint's upper bound is exclusive, hence values 1..9.
    response_time_guarantee = np.random.randint(1, 10)

    return Vendor(
        principal=principal,
        block_price=block_prices.tolist(),
        observed_blocks=observed_blocks,
        response_time_guarantee=response_time_guarantee,
    )

# Build one Vendor per author, followed by one per institution.
# NOTE: assign_blocks_to_vendor removes the handed-over blocks from each
# principal, so re-running this cell draws from the blocks that remain.
vendors = []
for author in authors.values():
    vendors.append(assign_blocks_to_vendor(author))
for institution in institutions.values():
    vendors.append(assign_blocks_to_vendor(institution))


In [205]:
# OK! We have:
# - A Principal with Queries
# - A BulletinBoard (BB) you can post queries to.
# - Some Authors and Institutions with Blocks and Nuggets
# - Some Vendors that have been assigned Blocks to hawk (i.e. sell)
# - All the pricing parameters sampled


In [236]:
# Post a single query on the bulletin board. pop() removes that query from
# buyer_principal.queries, so re-running this cell posts a different query.
bb = BulletinBoard(queries=[buyer_principal.queries.pop()])

In [239]:
# Generate a fictional answer passage for the first posted query. Pass the
# question text: handing over the Query object itself would put its dataclass
# repr (budget and urgency included) into the prompt.
doc = hyde(bb.queries[0].query)

In [242]:
# Embed the generated passage (not the original question text).
emb = calculate_embedding(doc)

In [253]:
# Count every (vendor, observed block, posted query) combination by appending
# one placeholder string per combination; the list's length is the count.
# The loop variables `block` and `query` overwrite the notebook-level names
# of the same spelling set by earlier cells.
fucks = []
for vendor in vendors:
    for block in vendor.observed_blocks:
        for query in bb.queries:
            fucks.append("fuck.")
print(f"number of vendors: {len(vendors)}")
print(f"number of fucks: {len(fucks)}")

number of fucks: 2765


In [192]:
# Hand-built example Quote; `eta` is omitted and therefore stays None.
quote = Quote('What is AI?', 'What is Machine Learning?', 'AI is a field of computer science...', 200)
