In [1]:
from pathlib import Path
from dotenv import load_dotenv

# Alternative to the .env file: uncomment the lines below and fill in real
# values to set the API keys directly (never commit real keys).
# import os
#
# os.environ["OPENAI_API_KEY"] = "..."
# os.environ["PAREA_API_KEY"] = "..."

# Load environment variables from ../.env.test (the path is relative to the
# notebook's working directory). override=True is python-dotenv's flag for
# letting values from the file replace variables already set in the environment.
load_dotenv(Path("../.env.test"), override=True)

True

In [2]:
import nest_asyncio

# Patch asyncio so that an already-running event loop can be re-entered;
# presumably needed because Jupyter runs its own loop while the libraries
# used below start theirs — TODO confirm.
nest_asyncio.apply()

In [3]:
from openai import OpenAI
from parea import Parea, trace

# Create the Parea and OpenAI clients with default arguments; presumably both
# pick up their API keys from the environment — TODO confirm.
parea = Parea()
openai = OpenAI()  # NOTE(review): this variable has the same name as the `openai` package

# Hand the OpenAI client to Parea so Parea can wrap it.
parea.wrap_openai_client(openai)

In [4]:
from parea.schemas import Log, EvaluationResult


def score_answer(log: Log) -> EvaluationResult:
    """Score a traced call by exact match on its final answer.

    Output and target may each contain working followed by a ``#### ``
    marker and the final answer; only the text after the last marker is
    compared. Surrounding whitespace is ignored so that an answer such as
    ``"42\n"`` still matches ``"42"``.

    Args:
        log: The trace log; its ``target`` and ``output`` fields are read.

    Returns:
        An ``EvaluationResult`` named ``"correctness"`` with score 1 on a
        match and 0 otherwise, or ``None`` when the log has no target (there
        is nothing to score against).
    """
    if not log.target:
        return None

    def final_answer(text):
        # A missing output (None) is treated as an empty answer instead of
        # raising AttributeError on .split().
        return (text or "").split("#### ")[-1].strip()

    is_correct = final_answer(log.output) == final_answer(log.target)
    return EvaluationResult("correctness", int(is_correct))


@trace(eval_funcs=[score_answer])
def langchain_chain(inputs):
    """Baseline chain without few-shot examples.

    Builds a prompt from a fixed system instruction plus a user message
    containing the ``{question}`` template variable, pipes it through
    ``gpt-3.5-turbo`` and a string output parser, and invokes it once.

    Args:
        inputs: Passed straight to ``invoke``; the prompt template expects a
            ``question`` variable.

    Returns:
        Whatever the string output parser yields for the model's reply.
    """
    from langchain_core.output_parsers import StrOutputParser
    from langchain_core.prompts import ChatPromptTemplate
    from langchain_openai import ChatOpenAI

    system_instruction = "You are an expert math solver. Your answer must be just the number with no separators, and nothing else. Follow the format of the examples."

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_instruction),
            ("user", "{question}"),
        ]
    )
    llm = ChatOpenAI(model="gpt-3.5-turbo")

    # prompt -> model -> plain string
    pipeline = prompt | llm | StrOutputParser()
    return pipeline.invoke(inputs)


In [5]:
# Dataset and worker settings; the same dict is unpacked again when the
# optimizer's evaluator is built later in the notebook.
experiment_kwargs = {
    "data": "gsm8k-testset",
    "n_workers": 2,
}

# Baseline run: evaluate the plain chain on the test collection.
baseline_experiment = parea.experiment(
    name="wibbly-wobbly",
    func=langchain_chain,
    **experiment_kwargs,
)
baseline_experiment.run()

Run name set to: cushy-jube, since a name was not provided.
Fetching test collection: gsm8k-testset
Fetched 5 test cases from collection: gsm8k-testset 



100%|██████████| 5/5 [00:02<00:00,  2.11it/s]
0it [00:04, ?it/s]


Experiment wibbly-wobbly Run cushy-jube stats:
{
  "latency": "0.84",
  "input_tokens": "0.00",
  "output_tokens": "0.00",
  "total_tokens": "0.00",
  "cost": "0.00000",
  "correctness": "0.40"
}


View experiment & traces at: https://app.parea.ai/experiments/wibbly-wobbly/efcf8e4f-90f3-4c69-993e-c18ca6fa6bd3



In [6]:
from zenbase.types import LMRequest, deflm

# Step 1: Add the deflm decorator
# Step 2: Incorporate few-shot demonstrations

@deflm # Step 1
@trace(eval_funcs=[score_answer])
def zen_chain(request: LMRequest):
    """Few-shot variant of the math-solver chain.

    The prompt is the fixed system instruction, then one user/assistant
    message pair per demo in ``request.zenbase.task_demos``, then the actual
    question as the ``{question}`` template variable.

    Args:
        request: The request object; ``request.zenbase.task_demos`` supplies
            the demonstrations (each read via ``inputs["question"]`` and
            ``outputs["target"]``) and ``request.inputs`` is passed to
            ``invoke``.

    Returns:
        Whatever the string output parser yields for the model's reply.
    """
    from langchain_core.output_parsers import StrOutputParser
    from langchain_core.prompts import ChatPromptTemplate
    from langchain_openai import ChatOpenAI

    def as_literal(text):
        # from_messages treats each string as a format template, so a "{" or
        # "}" inside demo text would be parsed as a template variable.
        # Doubling the braces makes the demo text render verbatim.
        return text.replace("{", "{{").replace("}", "}}")

    messages = [
        (
            "system",
            "You are an expert math solver. Your answer must be just the number with no separators, and nothing else. Follow the format of the examples.",
        )
    ]

    # Step 2
    for demo in request.zenbase.task_demos:
        messages += [
            ("user", as_literal(demo.inputs["question"])),
            ("assistant", as_literal(demo.outputs["target"])),
        ]

    # Only this final message is a real template; it receives the question.
    messages.append(("user", "{question}"))

    chain = (
        ChatPromptTemplate.from_messages(messages)
        | ChatOpenAI(model="gpt-3.5-turbo")
        | StrOutputParser()
    )

    return chain.invoke(request.inputs)


In [7]:
from zenbase.optim.metric.labeled_few_shot import LabeledFewShot
from zenbase.helpers.parea import ZenParea

# Fetch the "gsm8k-demoset" collection from Parea and convert it into demos
# with the ZenParea helper.
demoset = ZenParea.collection_demos(parea.get_collection("gsm8k-demoset"))
# Few-shot optimizer drawing from that demoset
# (shots=3 presumably means 3 demos per candidate prompt — TODO confirm).
optimizer = LabeledFewShot(demoset=demoset, shots=3)

# Optimize zen_chain, evaluating with the same dataset/worker settings as the
# baseline experiment (experiment_kwargs).
# NOTE(review): samples=2 appears to correspond to the two experiment runs
# shown in this cell's output — confirm against the zenbase docs.
best_fn, candidate_results = optimizer.perform(
    zen_chain,
    evaluator=ZenParea.metric_evaluator(**experiment_kwargs),
    samples=2,
    rounds=1,
)

Run name set to: loved-clip, since a name was not provided.
Fetching test collection: gsm8k-testset
Fetched 5 test cases from collection: gsm8k-testset 



100%|██████████| 5/5 [00:04<00:00,  1.02it/s]
0it [00:04, ?it/s]


Experiment zenbase-synchronized-mission-critical-website Run loved-clip stats:
{
  "latency": "1.70",
  "input_tokens": "0.00",
  "output_tokens": "0.00",
  "total_tokens": "0.00",
  "cost": "0.00000",
  "correctness": "0.60"
}


View experiment & traces at: https://app.parea.ai/experiments/zenbase-synchronized-mission-critical-website/e931ed2a-33a9-49c1-9e01-248d2e3dcd95

Run name set to: paced-kiwi, since a name was not provided.
Fetching test collection: gsm8k-testset
Fetched 5 test cases from collection: gsm8k-testset 



100%|██████████| 5/5 [00:04<00:00,  1.02it/s]
0it [00:04, ?it/s]


Experiment zenbase-robust-exuding-adapter Run paced-kiwi stats:
{
  "latency": "1.65",
  "input_tokens": "0.00",
  "output_tokens": "0.00",
  "total_tokens": "0.00",
  "cost": "0.00000",
  "correctness": "0.60"
}


View experiment & traces at: https://app.parea.ai/experiments/zenbase-robust-exuding-adapter/0b3f37bd-3786-4630-8342-0b35efa2579a



In [8]:
# best_fn is called directly with an inputs dict; the bare `output` line
# makes the notebook display the result.
output = best_fn({"question": "What is 2+2?"})
output

'4'

In [11]:
# You can even run your function asynchronously in a coroutine.
# Called with no argument, %autoawait only reports whether top-level `await`
# is enabled in this IPython session.
%autoawait

# best_fn.coroutine is awaited with the same kind of inputs dict as the
# synchronous call.
await best_fn.coroutine({
    "question": "I have 30 percent of company and Mamad has 40 percent of company and there are 10M shares, how many shares are unassigned?"
})

IPython autoawait is `on`, and set to use `asyncio`


'The shares assigned to me are 30% of 10M = 0.30 * 10M = 3M shares.\nThe shares assigned to Mamad are 40% of 10M = 0.40 * 10M = 4M shares.\nSo, the total assigned shares are 3M + 4M = 7M shares.\nTherefore, the number of unassigned shares is 10M - 7M = 3M shares.\n#### 3000000'

In [10]:
# You can also save the zenbase params for re-use
import pickle

# Round-trip the optimized params through pickle.
# NOTE: only unpickle data you produced yourself — pickle.loads can execute
# arbitrary code when given untrusted bytes.
pickled_zenbase = pickle.dumps(best_fn.zenbase)
# This assignment replaces the `zenbase` attribute on zen_chain itself.
zen_chain.zenbase = pickle.loads(pickled_zenbase)

zen_chain({"question": "What is 2 + 2?"}) # uses the best few-shot demos

'4'