<img src="http://wandb.me/logo-im-png" width="400" alt="Weights & Biases" />

# Generate Synthetic Trace Data

Generate some synthetic OpenAI data that we can explore in an LLM Monitoring Debugging Board.

# Step 0: Setup

In [None]:
import uuid
from datetime import timedelta
import weave

from weave.legacy.weave import ops_arrow
from weave.legacy.weave.monitoring import monitor, StreamTable

In [None]:
from weave.syndata_mon import random_predictions

In [None]:
# Destination of the synthetic stream; the logging cell joins these three
# values as "<WB_ENTITY>/<WB_PROJECT>/<WB_STREAM>".
# WB_ENTITY is an empty placeholder so the cell parses -- fill it in before
# running the rest of the notebook.
WB_ENTITY = ""  # replace with your W&B username or team name
WB_PROJECT = "weave"
WB_STREAM = "synthetic_openai_stream"

In [None]:
preds = random_predictions(10)

# Reshape the synthetic predictions into the record format used by the
# weave.legacy.weave.monitoring.openai integration.

# Give every synthetic model_version an (api_key, openai_model) pair.
# The story this tells in the logs: a new API key has appeared recently, and
# that key has started using gpt-4, which produces a cost spike.
versions = sorted(preds.column('model_version').unique())
version_map = {}
for rank, version in enumerate(reversed(versions)):
    # rank 0 is the last version in sorted order (treated as the most recent).
    # The second and third most recent versions use the other API key, and
    # only the second most recent one is on gpt-4.
    on_other_key = rank in (1, 2)
    on_gpt4 = rank == 1
    version_map[version] = (
        'sk-U9...a22c' if on_other_key else 'sk-U4...yK7z',
        'gpt-4-0613' if on_gpt4 else 'gpt-3.5-turbo-0613',
    )

# Build one span record per prediction, shaped like a chat-completion call.
spans = []
for pred in preds:
    api_key, model = version_map[pred['model_version']]
    # gpt-4 calls get their synthetic latency tripled.
    latency_mult = 3 if model == 'gpt-4-0613' else 1

    request = {
        'messages': [
            {"role": "user", "content": pred['prompt']},
        ],
    }
    response = {
        'id': 'chatcmpl-%s' % uuid.uuid4(),
        'object': 'chat.completion',
        'created': pred['timestamp'].timestamp(),
        'model': model,
        'choices': [
            {
                'index': 0,
                'message': {
                    'role': 'assistant',
                    'content': pred['completion'],
                },
                'finish_reason': 'stop',
            },
        ],
    }
    call_attributes = {
        'api_key': api_key,
        'username': pred['username'],
    }
    prompt_tokens = pred['prompt_tokens']
    completion_tokens = pred['completion_tokens']
    usage = {
        'prompt_tokens': prompt_tokens,
        'completion_tokens': completion_tokens,
        'total_tokens': (prompt_tokens + completion_tokens),
    }

    span = monitor.Span(
        'openai.api_resources.chat_completion.type.create',
        inputs=request,
        output=response,
        attributes=call_attributes,
        summary=usage,
    )
    # The span starts at the prediction's timestamp and lasts for the
    # (scaled) synthetic latency, expressed in seconds.
    started_at = pred['timestamp']
    span.start_time = started_at
    span.end_time = started_at + timedelta(seconds=pred['latency'] * latency_mult)

    # Each logged row is the span's dict form plus a top-level 'timestamp'
    # (a 'timestamp' key in span.asdict(), if any, would take precedence).
    spans.append({'timestamp': started_at, **span.asdict()})

In [None]:
# Address the stream table as "<entity>/<project>/<stream>".
stream_path = f"{WB_ENTITY}/{WB_PROJECT}/{WB_STREAM}"
st = StreamTable(stream_path)

# Log every synthetic span dict as its own entry, in generation order.
for row in spans:
    st.log(row)

# Signal that logging is done.
# NOTE(review): presumably this flushes pending rows -- confirm against StreamTable.
st.finish()
