In [None]:
import json

import pandas as pd
from tensorzero import AsyncTensorZeroGateway

In [None]:
# Base URL of the TensorZero gateway; it is passed as `gateway_url` when the
# client is built later in this notebook. The default targets localhost, port 3000.
TENSORZERO_GATEWAY_URL = "http://localhost:3000"

In [None]:
# Select only a subset of the dataset to speed things up.
# These are the maximum number of rows `load_dataset` keeps from the train and
# validation splits, respectively, after shuffling each split.
NUM_TRAIN_DATAPOINTS = 500
NUM_VAL_DATAPOINTS = 500

In [None]:
def load_dataset(path: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Read the CSV at ``path`` and return shuffled, truncated train/validation frames.

    The file must provide an ``input`` column (text), an ``output`` column
    (JSON-encoded strings, decoded here with ``json.loads``) and a ``split``
    column where ``0`` marks training rows and ``1`` marks validation rows.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A ``(train_df, val_df)`` tuple. Each frame is shuffled with a fixed
        seed, re-indexed from zero, and capped at ``NUM_TRAIN_DATAPOINTS`` /
        ``NUM_VAL_DATAPOINTS`` rows respectively.
    """
    frame = pd.read_csv(path)
    # Decode the JSON-encoded output column into Python objects.
    frame.output = frame.output.apply(json.loads)

    # Inputs shorter than 100 characters are generally pretty noisy; drop them.
    long_enough = frame["input"].str.len() >= 100
    frame = frame[long_enough]

    def _prepare_split(split_id: int, limit: int) -> pd.DataFrame:
        # Pick one split, shuffle it deterministically, then keep the first `limit` rows.
        subset = frame[frame["split"] == split_id]
        shuffled = subset.sample(frac=1, random_state=0).reset_index(drop=True)
        return shuffled.iloc[:limit]

    return (
        _prepare_split(0, NUM_TRAIN_DATAPOINTS),
        _prepare_split(1, NUM_VAL_DATAPOINTS),
    )

In [None]:
# Load the shuffled, size-capped train and validation splits from the CSV file.
train_df, val_df = load_dataset("data/conllpp.csv")

# Sanity check: report the (rows, columns) shape of each split.
print(f"Train data shape: {train_df.shape}")
print(f"Validation data shape: {val_df.shape}")

In [None]:
def df_to_datapoints(df: pd.DataFrame) -> list[dict]:
    """Turn every row of ``df`` into a datapoint dict for the ``extract_entities`` function.

    Args:
        df: Frame with an ``input`` column, used as the content of a single
            user message, and an ``output`` column, passed through unchanged.

    Returns:
        One dict per row, in row order, with the keys ``function_name``,
        ``input`` and ``output``. An empty frame yields an empty list.
    """

    def _build_datapoint(record: pd.Series) -> dict:
        # Wrap the raw input text in a one-message user conversation.
        user_message = {"role": "user", "content": record["input"]}
        return {
            "function_name": "extract_entities",
            "input": {"messages": [user_message]},
            "output": record["output"],
        }

    return [_build_datapoint(record) for _, record in df.iterrows()]


# Convert each split into the list of datapoint dicts built by `df_to_datapoints`.
train_datapoints = df_to_datapoints(train_df)
val_datapoints = df_to_datapoints(val_df)

In [None]:
# Build the async TensorZero client against TENSORZERO_GATEWAY_URL.
# Top-level `await` works here because the notebook kernel runs cells inside an event loop.
# NOTE(review): presumably requires the gateway to be reachable at that URL — confirm.
t0 = await AsyncTensorZeroGateway.build_http(gateway_url=TENSORZERO_GATEWAY_URL)

In [None]:
# Send the training datapoints to the "ner_train" dataset through the gateway client.
# The trailing `;` keeps the notebook from displaying the call's return value.
# NOTE(review): re-running this cell presumably inserts the same datapoints again — confirm.
await t0.bulk_insert_datapoints(dataset_name="ner_train", datapoints=train_datapoints);

In [None]:
# Send the validation datapoints to the "ner_val" dataset through the gateway client.
# The trailing `;` keeps the notebook from displaying the call's return value.
# NOTE(review): re-running this cell presumably inserts the same datapoints again — confirm.
await t0.bulk_insert_datapoints(dataset_name="ner_val", datapoints=val_datapoints);