In [None]:
# Install Weave Library
# (note: this will become `pip install weave` soon - this is a temp dev state)
# !pip uninstall weave -y >/dev/null 2>&1
# !pip install git+https://github.com/wandb/weave@master -U -qqq >/dev/null 2>&1
# !echo "Installed Weave!"

In [None]:
# Authenticate with W&B
# (note: this explicit login step and its import can be eliminated in the future)

import wandb

wandb.login()

In [None]:
# Initialize Weave
# (note: we can eliminate the `weaveflow` import in future)
# NOTE(review): the note above mentions `weaveflow`, but this cell only imports
# `weave` - the note looks stale; confirm and remove.

import weave

# The value returned by `weave.init` is kept in `client`.
# "wf_eval" is presumably the project name - TODO confirm.
client = weave.init("wf_eval")

In [None]:
# Authenticate with OpenAI
import os
from getpass import getpass

# Prompt when the variable is unset *or* empty: a blank OPENAI_API_KEY would
# otherwise skip the prompt and go straight to failing the check below.
if not os.getenv("OPENAI_API_KEY"):
    # Strip stray whitespace picked up while pasting, which would also fail
    # the prefix check.
    os.environ["OPENAI_API_KEY"] = getpass(
        "Paste your OpenAI key from: https://platform.openai.com/account/api-keys\n"
    ).strip()
# Cheap format check only; it does not verify the key against OpenAI.
assert os.getenv("OPENAI_API_KEY", "").startswith(
    "sk-"
), "This doesn't look like a valid OpenAI API key"
print("OpenAI API key configured")

In [None]:
import json
import re

from openai import OpenAI


@weave.op()
def simple_openai_complete(message: str, model: str, system_prompt: str) -> str:
    """Run a single chat completion and return the text of the first choice.

    Args:
        message: Content of the user turn.
        model: Model name passed to the chat completions call.
        system_prompt: Content of the system turn, sent ahead of the user turn.

    Returns:
        The message content of the first choice in the response.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": message},
    ]
    # A new client is constructed on every call.
    response = OpenAI().chat.completions.create(model=model, messages=conversation)
    first_choice = response.choices[0]
    return first_choice.message.content


@weave.op()
def lookup_docs(user_message: str) -> list[str]:
    """Ask the chat model to act as a vector database and return related documents.

    The model is instructed to reply with a JSON array of strings; the reply is
    parsed and validated here.

    Args:
        user_message: The query to look up documents for.

    Returns:
        The documents as a list of strings, or ``[]`` when the reply is not a
        JSON array made up entirely of strings (the reason is printed).
    """
    docs = simple_openai_complete(
        f"Query: {user_message}",
        "gpt-3.5-turbo",
        "Please act like a vector database, returning up to 3 documents that relate to a query. Feel free to be creative, make up names & details, etc... - as if you have access to relevant documents containing critical information. Do not include docuemnt titles, just content. You must format your response as a JSON.load-able array of strings.",
    )
    try:
        parsed = json.loads(docs)
        # Check the shape explicitly: iterating a bare JSON string would yield
        # single characters and iterating a JSON object would yield its keys.
        if not isinstance(parsed, list):
            raise TypeError(f"expected a JSON array, got {type(parsed).__name__}")
        if not all(isinstance(doc, str) for doc in parsed):
            raise TypeError("expected every document to be a string")
    except (TypeError, ValueError) as e:
        # json.JSONDecodeError is a ValueError; json.loads raises TypeError when
        # handed a non-string such as None.
        print(e)
        return []

    return parsed


@weave.type()
class GptRagModel:
    """Chat model that answers a query using documents from `lookup_docs`.

    Attributes:
        base_model: Model name forwarded to `simple_openai_complete`.
        system_prompt: System prompt forwarded to `simple_openai_complete`.
    """

    base_model: str
    system_prompt: str

    @weave.op()
    def predict(self, input: str) -> str:
        """Look up documents for `input` and return the model's answer to it.

        The documents are comma-joined and embedded, together with the query,
        in a single user message.
        """
        joined_docs = ",".join(lookup_docs(input))
        prompt = f"Given the following documents, please formulate a short, consise answer.\n Documents: {joined_docs}.\n\n Query: {input}."
        return simple_openai_complete(prompt, self.base_model, self.system_prompt)


@weave.type()
class SimpleDataset:
    """Container for the list of example strings that `evaluate` iterates over."""

    # One string per example; each is passed to `eval_iter` as `example`.
    examples: list[str]


@weave.op()
def brevity_score(example: str, prediction: str) -> float:
    """Score a prediction by how short it is.

    Returns 1 for an empty prediction and shrinks toward 0 as the prediction
    gets longer. `example` is accepted but not used.
    """
    length = len(prediction)
    return 1 / (length + 1)


# A lone ASCII digit: not part of a longer number ("10") or a decimal ("7.5"),
# but allowed to be followed by sentence punctuation ("7.").
_LONE_DIGIT = re.compile(r"(?<![0-9.])[0-9](?!\.?[0-9])")


def _parse_relevance(reply) -> int:
    """Turn the grader model's raw reply into an integer score.

    A reply that `int()` accepts is returned unchanged. Otherwise the first
    lone digit in the reply is used, so replies such as "7." or "Score: 7"
    still parse.

    Raises:
        ValueError: If no score can be found in the reply.
    """
    text = (reply or "").strip()
    try:
        return int(text)
    except ValueError:
        pass
    found = _LONE_DIGIT.search(text)
    if found is None:
        raise ValueError(
            f"Could not find a 0-9 relevance score in model reply: {reply!r}"
        )
    return int(found.group())


@weave.op()
def relevance_score(example: str, prediction: str) -> int:
    """Ask the chat model to rate how relevant `prediction` is to `example`.

    Args:
        example: The prompt that was given to the model under evaluation.
        prediction: The answer the model under evaluation produced.

    Returns:
        The integer score parsed from the grader's reply (the grader is asked
        for a number between 0 and 9 inclusive).

    Raises:
        ValueError: If the grader's reply contains no usable score.
    """
    reply = simple_openai_complete(
        f"Prompt:{example}.\nAnswer:{prediction}.\nScore:",
        "gpt-3.5-turbo",
        "Score relevance of the output to the input. Emit ONLY a number between 0 and 9 inclusive. Nothing else",
    )
    return _parse_relevance(reply)


@weave.op()
def score(example: str, prediction: str) -> dict:
    """Compute every metric for one example/prediction pair.

    Returns:
        A dict with the keys "brevity" and "relevance".
    """
    # Brevity is computed before relevance, matching the order of the keys.
    brevity = brevity_score(example, prediction)
    relevance = relevance_score(example, prediction)
    return {"brevity": brevity, "relevance": relevance}


@weave.op()
def eval_iter(model: GptRagModel, example: str) -> dict:
    """Run `model` on a single example and score the resulting prediction."""
    prediction = model.predict(example)
    return score(example, prediction)


@weave.op()
def aggregate_scores(score_dicts) -> float:
    """Average the brevity * relevance product over all score dicts.

    Args:
        score_dicts: Sized collection of dicts with "brevity" and "relevance"
            keys, as produced by `score`.

    Returns:
        The mean product, or 0.0 when `score_dicts` is empty.
    """
    # Guard the division: an empty collection used to raise ZeroDivisionError.
    if len(score_dicts) == 0:
        return 0.0
    total = sum(
        score_dict["brevity"] * score_dict["relevance"] for score_dict in score_dicts
    )
    return total / len(score_dicts)


@weave.op()
def evaluate(model: GptRagModel, dataset: SimpleDataset):
    """Score `model` on every example in `dataset` and return the aggregate.

    Examples are evaluated one at a time, in dataset order, and the per-example
    score dicts are handed to `aggregate_scores`.
    """
    per_example_scores = [eval_iter(model, example) for example in dataset.examples]
    return aggregate_scores(per_example_scores)

In [None]:
# Every model uses the same base model; only the system prompt differs.
BASE_MODEL = "gpt-3.5-turbo"
SYSTEM_PROMPTS = (
    "You are a helpful assistant.",
    "You are a very bored, sarcastic assistant.",
    "You are a evil, unhelpful assistant.",
)
models = [GptRagModel(BASE_MODEL, system_prompt) for system_prompt in SYSTEM_PROMPTS]

# Example queries that each model is evaluated on.
EXAMPLE_QUERIES = [
    "What is the square root of pi?",
    "Describe inception, not the movie, the concept.",
    "How much is Apple worth today?",
]
dataset = SimpleDataset(EXAMPLE_QUERIES)

# Evaluate each model in turn; the returned aggregate score is not kept here.
for model in models:
    evaluate(model, dataset)