In [11]:
from uptrain import Settings
from uptrain.operators import TextCompletion, JsonReader

import os
import polars as pl
import nest_asyncio
# Patch asyncio so an event loop can be re-entered while the notebook's own
# loop is already running (presumably required by uptrain's operators -- confirm).
nest_asyncio.apply()

In [4]:
# Remote source of the benchmark dataset and the local path it is cached at.
url = "https://uptrain-assets.s3.ap-south-1.amazonaws.com/data/uptrain_benchmark.jsonl"
dataset_path = os.path.join('./', "uptrain_benchmark.jsonl")

# Download only when no local copy exists yet.
if not os.path.exists(dataset_path):
    import httpx
    r = httpx.get(url)
    # Fail loudly on an HTTP error; otherwise the error body would be written
    # to disk and, because of the exists() guard above, never re-downloaded.
    r.raise_for_status()
    with open(dataset_path, "wb") as f:
        f.write(r.content)

# Load the newline-delimited JSON file into a polars DataFrame and show it.
dataset = pl.read_ndjson(dataset_path)
print(dataset)

shape: (25, 3)
┌───────────────────────────────────┬───────────────────────────────────┬─────┐
│ question                          ┆ context                           ┆ idx │
│ ---                               ┆ ---                               ┆ --- │
│ str                               ┆ str                               ┆ i64 │
╞═══════════════════════════════════╪═══════════════════════════════════╪═════╡
│ How to get a grip on finance?'    ┆ Try downloading a finance app li… ┆ 1   │
│ How do “held” amounts appear on … ┆ "The ""hold"" is just placeholde… ┆ 2   │
│ Does negative P/E ratio mean sto… ┆ P/E is the number of years it wo… ┆ 3   │
│ Should a retail trader choose a … ┆ "That\'s like a car dealer adver… ┆ 4   │
│ Possibility to buy index funds a… ┆ "As user quid states in his answ… ┆ 5   │
│ …                                 ┆ …                                 ┆ …   │
│ Discuss the role of inflation in… ┆ Inflation is a pervasive economi… ┆ 21  │
│ Explain the concept of 

In [5]:
# Read the benchmark file that the download cell saves locally
# ("./uptrain_benchmark.jsonl"); nothing in this notebook creates
# "./benchmark.jsonl", so reading that path breaks on a fresh run.
dataset_path = os.path.join("./", "uptrain_benchmark.jsonl")

# Single source for the model id, used both in the settings and in the "model" column.
claude_model = "claude-3-opus-20240229"
claude_settings = Settings(model=claude_model, rpm_limit=4)

# Load the benchmark rows through uptrain's JsonReader operator.
dataset = JsonReader(fpath=dataset_path).setup(settings=claude_settings).run()["output"]

# Add a constant "model" column holding the model id for every row.
dataset = dataset.with_columns([pl.lit(claude_model).alias("model")])

# Generate a completion for each "question" and store it in the "response" column.
dataset_with_claude_responses = (
    TextCompletion(col_in_prompt="question", col_out_completion="response")
    .setup(settings=claude_settings)
    .run(dataset)["output"]
)
dataset_with_claude_responses


100%|██████████| 25/25 [05:31<00:00, 13.25s/it]
  with ThreadPoolExecutor(max_workers=1) as executor:


question,context,idx,model,response
str,str,i64,str,str
"""How to get a g…","""Try downloadin…",1,"""claude-3-opus-…","""Getting a grip…"
"""How do “held” …","""""The """"hold"""" …",2,"""claude-3-opus-…","""When a credit …"
"""Does negative …","""P/E is the num…",3,"""claude-3-opus-…","""A negative P/E…"
"""Should a retai…","""""That\'s like …",4,"""claude-3-opus-…","""The decision t…"
"""Possibility to…","""""As user quid …",5,"""claude-3-opus-…","""Yes, it is pos…"
…,…,…,…,…
"""Discuss the ro…","""Inflation is a…",21,"""claude-3-opus-…","""Inflation is a…"
"""Explain the co…",""" The Earth's …",22,"""claude-3-opus-…","""Plate tectonic…"
"""How did the su…",""" The Surreal…",23,"""claude-3-opus-…","""The Surrealist…"
"""Discuss the im…",""" Globalizatio…",24,"""claude-3-opus-…","""Globalization …"


In [12]:
# Settings for the second model, whose completions are stored as "ground_truth".
gpt_settings = Settings(
    model="gpt-4",
    rpm_limit=100,
)

# Overwrite the "model" column with the new model id for every row.
dataset = dataset_with_claude_responses.with_columns(
    [pl.lit("gpt-4").alias("model")]
)

# Generate a completion for each "question" and write it to the "ground_truth" column.
experiment_dataset = (
    TextCompletion(
        col_in_prompt="question",
        col_out_completion="ground_truth",
    )
    .setup(settings=gpt_settings)
    .run(dataset)["output"]
)
experiment_dataset

100%|██████████| 25/25 [00:28<00:00,  1.14s/it]


question,context,idx,model,response,ground_truth
str,str,i64,str,str,str
"""How to get a g…","""Try downloadin…",1,"""gpt-4""","""Getting a grip…","""1. Educate You…"
"""How do “held” …","""""The """"hold"""" …",2,"""gpt-4""","""When a credit …","""""Held"" amounts…"
"""Does negative …","""P/E is the num…",3,"""gpt-4""","""A negative P/E…","""A negative P/E…"
"""Should a retai…","""""That\'s like …",4,"""gpt-4""","""The decision t…","""Whether a reta…"
"""Possibility to…","""""As user quid …",5,"""gpt-4""","""Yes, it is pos…","""Yes, it is pos…"
…,…,…,…,…,…
"""Discuss the ro…","""Inflation is a…",21,"""gpt-4""","""Inflation is a…","""Inflation is a…"
"""Explain the co…",""" The Earth's …",22,"""gpt-4""","""Plate tectonic…","""Plate tectonic…"
"""How did the su…",""" The Surreal…",23,"""gpt-4""","""The Surrealist…","""The Surrealist…"
"""Discuss the im…",""" Globalizatio…",24,"""gpt-4""","""Globalization …","""Globalization …"


In [30]:
# Display the current contents of experiment_dataset for inspection.
experiment_dataset

question,idx,response,ground_truth
str,i64,str,str
"""How to get a g…",1,"""Getting a grip…","""1. Educate You…"
"""How do “held” …",2,"""When a credit …","""""Held"" amounts…"
"""Does negative …",3,"""A negative P/E…","""A negative P/E…"
"""Should a retai…",4,"""The decision t…","""Whether a reta…"
"""Possibility to…",5,"""Yes, it is pos…","""Yes, it is pos…"
…,…,…,…
"""Discuss the ro…",21,"""Inflation is a…","""Inflation is a…"
"""Explain the co…",22,"""Plate tectonic…","""Plate tectonic…"
"""How did the su…",23,"""The Surrealist…","""The Surrealist…"
"""Discuss the im…",24,"""Globalization …","""Globalization …"


In [29]:
from uptrain import EvalLLM, ResponseMatching

# evaluate_locally=False: the evaluation request is not run locally.
settings = Settings(evaluate_locally=False)

# Drop the "context" and "model" columns before evaluation. Only columns that
# are still present are dropped, so re-running this cell after they have
# already been removed does not raise.
unused_columns = [
    col for col in ("context", "model") if col in experiment_dataset.columns
]
if unused_columns:
    experiment_dataset = experiment_dataset.drop(unused_columns)

# Run the response-matching check, using the "ground_truth" column as reference.
eval_llm = EvalLLM(settings=settings)
results = eval_llm.evaluate(
    data=experiment_dataset,
    checks=[
        ResponseMatching(
            method="llm",
            col_ground_truth="ground_truth",
        )
    ]
)

[32m2024-03-06 15:45:10.510[0m | [1mINFO    [0m | [36muptrain.framework.evalllm[0m:[36mevaluate_on_server[0m:[36m341[0m - [1mSending evaluation request for rows 0 to <50 to the Uptrain[0m


[32m2024-03-06 15:45:43.867[0m | [1mINFO    [0m | [36muptrain.framework.evalllm[0m:[36mevaluate[0m:[36m330[0m - [1mServer is not running![0m
