In [None]:
import typing
import weave
from weave import weaveflow
import models
import example_adaptor

In [None]:
# Initialise weave before any other weave/weaveflow call in this notebook.
# NOTE(review): 'self-reflection5' is presumably the project name that results are recorded under — confirm.
weave.init('self-reflection5')

In [None]:
# Publish a HuggingFace dataset through weaveflow and keep the returned reference
# for the evaluation runs at the bottom of the notebook.
# NOTE(review): positional args look like (dataset name, config, split, 10, 0); the
# trailing 10 and 0 are presumably a sample count and an offset/seed — confirm against
# the weaveflow.publish_huggingface_dataset signature.
dataset_ref = weaveflow.publish_huggingface_dataset("hotpot_qa", 'distractor', "validation", 10, 0)

In [None]:
@weave.type()
class HotpotExampleAdaptor(example_adaptor.ExampleAdaptor):
    """Example adaptor that renders an example's context and question as one prompt string."""

    @weave.op()
    def example_to_prompt(self, example: typing.Any) -> typing.Any:
        """Build the prompt text for a single example.

        Reads ``example['context']['sentences']`` (an iterable of iterables of
        strings) and ``example['question']``. Sentences are joined with newlines
        within each group, the groups are joined with newlines, and the question
        is appended after a blank line.
        """
        # One newline-joined string per sentence group.
        paragraphs = [
            '\n'.join(sentences)
            for sentences in example['context']['sentences']
        ]
        context = '\n'.join(paragraphs)
        question = example['question']
        return f'Context\n\n{context}\n\n{question}'

# Single adaptor instance shared by both models built below.
# NOTE(review): the string is the only constructor argument; presumably ExampleAdaptor
# uses it as the instruction/system text sent with every prompt — confirm in example_adaptor.
adaptor = HotpotExampleAdaptor("Please provide a minimal answer, with no accompanying explanation and no punctuation.")

In [None]:
# Both models under comparison share the same chat model and the same adaptor, so the
# only difference between them is the model class (SingleStepModel vs SelfReflectModel).
chat_model = weaveflow.OpenaiChatModel('gpt-3.5-turbo')
single_step_model = models.SingleStepModel(chat_model, adaptor)
reflect_model = models.SelfReflectModel(chat_model, adaptor)

In [None]:
@weave.op()
def example_answer(example: typing.Any) -> str:
    """Return the value stored under the 'answer' key of a dataset example."""
    answer = example['answer']
    return answer

# Exact-match evaluator; it is given the example_answer op to extract the expected answer.
eval_exact = weaveflow.EvaluateExactMatch(example_answer)
# Separate chat model used only for LLM-based grading, wrapped for structured output.
# NOTE(review): the second positional argument 0.7 is presumably the sampling temperature — confirm.
eval_chat_model = weaveflow.StructuredOutputChatModelSystemPrompt(
    weaveflow.OpenaiChatModel('gpt-3.5-turbo', 0.7))
@weave.op()
def make_llm_eval_messages(example: typing.Any, prediction: typing.Any) -> typing.Any:
    """Build the chat messages that ask an LLM judge to score a prediction.

    Args:
        example: Dataset example; ``example['question']`` and ``example['answer']``
            are read. ``example['answer']`` is the reference answer (the same
            field the exact-match evaluator compares against).
        prediction: The model output being graded.

    Returns:
        A one-element list containing a single ``{'role': 'user', 'content': ...}``
        message dict.
    """
    prompt_args = {
        'question': example['question'],
        'answer': example['answer'],
        'prediction': prediction
    }
    # The model's prediction is the "Answer" under review and the dataset's answer is
    # the "Correct answer". These two placeholders were previously swapped, which told
    # the judge that the model output was the ground truth.
    prompt = """
Please score the following, on a scale of 1-5, with one being worse. Also provide your rationale.

Question: {question}

Answer: {prediction}

Correct answer: {answer}
""".format(**prompt_args)
    return [{'role': 'user', 'content': prompt}]
# LLM-graded evaluator: uses eval_chat_model with prompts built by make_llm_eval_messages.
eval_llm = weaveflow.EvaluateLLM(eval_chat_model, make_llm_eval_messages)
# Combine both evaluators under the keys 'exact' and 'llm'.
# Named `multi_eval` rather than `eval` so the builtin eval() is not shadowed for the
# rest of the kernel session.
multi_eval = weaveflow.EvaluateMulti({
    'exact': eval_exact,
    'llm': eval_llm
})

# Apply the combined evaluator to named steps of each model's output.
# NOTE(review): 'first_answer' and 'final_answer' are presumably step names produced by
# models.SingleStepModel / models.SelfReflectModel — confirm in models.py.
evaluation_single = weaveflow.EvaluateNamedSteps(['final_answer'], multi_eval)
evaluation_reflect = weaveflow.EvaluateNamedSteps(['first_answer', 'final_answer'], multi_eval)

In [None]:
# Run both evaluations against the same published dataset reference.
# The second call is the cell's last expression, so its return value is what the
# notebook displays; the first call's return value is discarded.
weaveflow.evaluate(evaluation_single, dataset_ref, single_step_model)
weaveflow.evaluate(evaluation_reflect, dataset_ref, reflect_model)