In [None]:
from zeno_client import ZenoClient
import pandas as pd
import os
import dotenv

# Load settings (e.g. ZENO_API_KEY, read in the next cell) from a local .env
# file into os.environ. override=True asks python-dotenv to let values from
# the .env file replace variables that are already set in the environment.
dotenv.load_dotenv(override=True)

In [None]:
# Build the Zeno client from the ZENO_API_KEY environment variable.
# os.environ[...] raises KeyError if the variable is not set, so a missing
# key fails here rather than at the first API call.
client = ZenoClient(os.environ["ZENO_API_KEY"])

In [None]:
# Read the Alpaca JSON file directly from the Stanford Alpaca GitHub repository.
ALPACA_DATA_URL = "https://github.com/tatsu-lab/stanford_alpaca/raw/main/alpaca_data.json"
df = pd.read_json(ALPACA_DATA_URL)

In [None]:
# Derive the columns uploaded to Zeno: an id, the per-row data payload, and
# metadata columns computed from the instruction / input / output text.
df["id"] = df.index

# One {"instruction": ..., "input": ...} dict per row. to_dict(orient="records")
# builds the same dicts as a row-wise apply, without calling a Python lambda
# once per row.
df["data"] = df[["instruction", "input"]].to_dict(orient="records")

# First space-delimited token of the instruction (e.g. the leading verb).
df["type"] = df["instruction"].str.split(" ").str[0]

# True where the optional input field is a non-empty string.
df["has input"] = df["input"] != ""

# Character counts of each text field.
df["instruction length"] = df["instruction"].str.len()
df["input length"] = df["input"].str.len()
# Column name fixed: it was previously misspelled "ouput length".
df["output length"] = df["output"].str.len()

In [None]:
# Collapse the long tail of instruction "types" into a single "other" bucket,
# keeping only the 20 most frequent first words.
#
# The first word is recomputed from the instruction text instead of being read
# back from df["type"]; otherwise re-running this cell would count "other" as a
# type of its own and push a genuine type out of the top 20.
first_word = df["instruction"].str.split(" ").str[0]
top_20_types = first_word.value_counts().nlargest(20).index

# Keep the first word where it is one of the top 20, otherwise label it "other".
df["type"] = first_word.where(first_word.isin(top_20_types), "other")

In [None]:
# View specification passed to Zeno: the data field is a vertical stack of the
# "instruction" and "input" keys, each shown as labelled text; the label and
# the output are shown as plain text.
instance_view = {
    "data": {
        "type": "vstack",
        "keys": {
            "instruction": {"label": "instruction:", "type": "text"},
            "input": {"label": "input:", "type": "text"},
        },
    },
    "label": {"type": "text"},
    "output": {"type": "text"},
}

# Create the public Zeno project that the dataset is uploaded to.
proj = client.create_project(
    name="Alpaca Dataset",
    description="Explore the data that makes up the Alpaca instruction-tuned dataset.",
    public=True,
    view=instance_view,
)

In [None]:
# Upload the frame to the project: "id" identifies each row, "data" holds the
# instruction/input payload, and the reference "output" text is the label.
proj.upload_dataset(
    df,
    id_column="id",
    data_column="data",
    label_column="output",
)