# Model monitoring dashboard example

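This notebook currently requires the plotly library to be installed. The embedding cells also read the OpenAI API key from the `OPENAI_API_KEY` environment variable, so set it before running them.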


In [None]:
import weave
from weave.legacy.scripts import syndata_mon

## Create synthetic data


In [None]:
# Build a small table of random predictions and save it with Weave.
# The board cell at the end of this notebook reads `predictions`, so this
# cell has to run for a fresh-kernel "Run All" to succeed.
preds = syndata_mon.random_predictions(10)

predictions = weave.save(preds, 'predictions')
#len(preds.column('prompt').to_pylist_raw())

In [None]:
import logging
import json
import os
from pathlib import Path

import tiktoken
import faiss
import numpy as np
from tenacity import (
    before_sleep_log,
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)
import openai

logger = logging.getLogger(__name__)
openai.api_key = os.environ["OPENAI_API_KEY"]

# Shared retry policy for the OpenAI API wrappers below: up to 4 attempts with
# exponential backoff (4-10 s), retrying only on the listed OpenAI error types.
# Each retry is logged at WARNING level, and the last error is re-raised as-is.
retry_openai_decorator = retry(
    retry=retry_if_exception_type(
        (
            openai.error.Timeout,
            openai.error.APIError,
            openai.error.APIConnectionError,
            openai.error.RateLimitError,
            openai.error.ServiceUnavailableError,
        )
    ),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    stop=stop_after_attempt(4),
    before_sleep=before_sleep_log(logger, logging.WARNING),
    reraise=True,
)

@retry_openai_decorator
def openai_embed(model, input):
    """Call `openai.Embedding.create` for `input` with `model` and return its response.

    The call is wrapped by `retry_openai_decorator`, so the listed OpenAI
    errors are retried before being re-raised.
    """
    response = openai.Embedding.create(model=model, input=input)
    return response

@retry_openai_decorator
def openai_chatcompletion(model, messages):
    """Call `openai.ChatCompletion.create` and return its response.

    Args:
        model: Name of the chat model to use. Previously this argument was
            ignored and "gpt-3.5-turbo" was always used.
        messages: Chat messages, passed through unchanged.

    The call is wrapped by `retry_openai_decorator`, so the listed OpenAI
    errors are retried before being re-raised.
    """
    return openai.ChatCompletion.create(model=model, messages=messages)

# Helper to efficiently embed a set of documents using the OpenAI embedding API
# This is from langchain

# Maximum number of tokens per embedding input; embed_texts splits longer
# token sequences into pieces of at most this length.
embedding_ctx_length = 8191
# Embedding model name passed to embed_texts later in this notebook.
OPENAI_EMBEDDING_MODEL = "text-embedding-ada-002"
# Number of token sequences embed_texts sends per embedding request.
chunk_size = 1000

from typing import List

def embed_texts(texts: List[str], embedding_model: str) -> List[List[float]]:
    """Embed each text with the OpenAI embedding API and return unit-length vectors.

    Every text is tokenized with tiktoken and split into pieces of at most
    ``embedding_ctx_length`` tokens. The pieces are sent to ``openai_embed`` in
    batches of ``chunk_size``. The piece embeddings of one text are combined
    into an average weighted by piece token count, then L2-normalized.

    Args:
        texts: Texts to embed.
        embedding_model: Model name, used both to pick the tiktoken encoding
            and for the embedding requests.

    Returns:
        One embedding (a list of floats) per input text, in input order.
    """
    encoding = tiktoken.model.encoding_for_model(embedding_model)

    # tokens[k] is one context-sized piece; indices[k] is the position in
    # `texts` of the text that piece came from.
    tokens = []
    indices = []
    for i, text in enumerate(texts):
        if embedding_model.endswith("001"):
            # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
            # replace newlines, which can negatively affect performance.
            text = text.replace("\n", " ")
        token = encoding.encode(
            text,
            disallowed_special="all",
        )
        for j in range(0, len(token), embedding_ctx_length):
            tokens.append(token[j : j + embedding_ctx_length])
            indices.append(i)

    # Embed the pieces in batches; the result order matches `tokens`.
    batched_embeddings = []
    for i in range(0, len(tokens), chunk_size):
        response = openai_embed(
            embedding_model,
            input=tokens[i : i + chunk_size],
        )
        batched_embeddings.extend(r["embedding"] for r in response["data"])

    # Group piece embeddings (and their token counts) by originating text.
    results: List[List[List[float]]] = [[] for _ in range(len(texts))]
    num_tokens_in_batch: List[List[int]] = [[] for _ in range(len(texts))]
    for piece_index, text_index in enumerate(indices):
        results[text_index].append(batched_embeddings[piece_index])
        num_tokens_in_batch[text_index].append(len(tokens[piece_index]))

    embeddings: List[List[float]] = []
    for i in range(len(texts)):
        if len(results[i]) == 0:
            # The text produced no tokens, so embed the empty string instead.
            # This used to call `embed_with_retry`, which is not defined in
            # this notebook; `openai_embed` is the retrying wrapper here.
            average = np.asarray(
                openai_embed(
                    embedding_model,
                    input="",
                )["data"][0]["embedding"]
            )
        else:
            average = np.average(
                results[i], axis=0, weights=num_tokens_in_batch[i]
            )
        embeddings.append((average / np.linalg.norm(average)).tolist())

    return embeddings

In [None]:
import pandas

# Path of the exported CSV to analyse. The default is the original author's
# local file; set the QUESTIONS_CSV environment variable to point at your own
# copy instead of editing this cell.
data = pandas.read_csv(
    os.environ.get(
        'QUESTIONS_CSV',
        '/Users/shawn/datasets/wandb_export_2023-06-03T15_01_20.066-07_00.csv',
    )
)

In [None]:
# Display the loaded DataFrame as this cell's output.
data

In [None]:
# Embed only the first 100 entries of the 'question' column.
embeddings = embed_texts(
    texts=data['question'][:100],
    embedding_model=OPENAI_EMBEDDING_MODEL,
)

In [None]:
from weave.legacy.weave.ecosystem import umap

In [None]:
# Import Weave's umap module under an alias: a later cell runs `import umap`
# (the umap-learn package), which rebinds the bare name `umap` and would make
# this cell fail if it were re-run afterwards.
from weave.legacy.weave.ecosystem import umap as weave_umap

weave_umap.umap_projection(embeddings, {})

In [None]:
# Sanity check: number of embedding vectors returned by embed_texts.
len(embeddings)

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Dimension reduction and clustering libraries
import umap
import hdbscan
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

In [None]:
# Project the embeddings with UMAP (fixed seed); the first two columns are
# used as x/y coordinates in the scatter plots below.
standard_reducer = umap.UMAP(random_state=42)
standard_embedding = standard_reducer.fit_transform(embeddings)

In [None]:
# No `c=` values are supplied, so a colormap has nothing to map; passing
# `cmap` here was ignored by matplotlib and only produced a warning.
plt.scatter(standard_embedding[:, 0], standard_embedding[:, 1], s=0.1);

In [None]:
# Second UMAP projection, configured separately from the plotting one and
# used as the input to HDBSCAN below.
clusterable_reducer = umap.UMAP(
    random_state=42,
    n_components=2,
    min_dist=0.0,
    n_neighbors=30,
)
clusterable_embedding = clusterable_reducer.fit_transform(embeddings)

In [None]:
# Sanity check: shape of the projection that is fed to the clusterer.
clusterable_embedding.shape

In [None]:
# No `c=` values are supplied, so a colormap has nothing to map; passing
# `cmap` here was ignored by matplotlib and only produced a warning.
plt.scatter(clusterable_embedding[:, 0], clusterable_embedding[:, 1], s=0.1);

In [None]:
# Cluster the clusterable projection with HDBSCAN; `labels` holds one cluster
# label per projected row.
clusterer = hdbscan.HDBSCAN(min_cluster_size=50, min_samples=10)
labels = clusterer.fit_predict(clusterable_embedding)

In [None]:
# Distinct cluster labels and how many points received each one.
np.unique(labels, return_counts=True)

In [None]:
# Rows with a non-negative label are treated as belonging to a cluster.
clustered = labels >= 0
unclustered_points = standard_embedding[~clustered]
clustered_points = standard_embedding[clustered]

# Unclustered points: faint grey background layer.
plt.scatter(
    unclustered_points[:, 0],
    unclustered_points[:, 1],
    color=(0.5, 0.5, 0.5),
    s=0.1,
    alpha=0.5,
)
# Clustered points: coloured by cluster label.
plt.scatter(
    clustered_points[:, 0],
    clustered_points[:, 1],
    c=labels[clustered],
    s=0.1,
    cmap='Spectral',
);

In [None]:
# Only a slice of the questions is embedded, so the projection and labels can
# have fewer rows than `data`. Assigning them to whole columns would raise a
# length-mismatch error in that case. Write them onto exactly the rows that
# were embedded (the first len(standard_embedding) rows); other rows stay NaN.
embedded_rows = data.index[:len(standard_embedding)]
data.loc[embedded_rows, 'embedding_x'] = standard_embedding[:, 0]
data.loc[embedded_rows, 'embedding_y'] = standard_embedding[:, 1]
data.loc[embedded_rows, 'cluster_id'] = labels.astype(str)

In [None]:
# Render the DataFrame, including the embedding and cluster columns added
# above, with Weave.
weave.show(data)

## Create a Weave Board for the data


In [None]:
from weave.legacy.weave.panels_py import panel_autoboard

# NOTE(review): `predictions` must already be defined when this cell runs.
# In this notebook it is only assigned in the "Create synthetic data" cell, so
# confirm that cell is active before running this one.
panel_autoboard.auto_panels(predictions)