In [1]:
import weave
import logging
import json
import os
from pathlib import Path

import tiktoken
import faiss
import numpy as np
import openai
import weave.monitoring.openai as weave_openai

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Read the OpenAI API key from the environment rather than hard-coding it;
# this raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Helper to efficiently embed a set of documents using the OpenAI embedding API
# This is from langchain

# Maximum number of tokens sent as one embedding input; openai_embed_texts
# splits longer token lists into slices of this length.
embedding_ctx_length = 8191
# Name of an OpenAI embedding model.
OPENAI_EMBEDDING_MODEL = "text-embedding-ada-002"
# Number of token slices sent per embedding request by openai_embed_texts.
chunk_size = 1000

from typing import List

def openai_embed(model, input):
    """Request embeddings from the OpenAI Embedding endpoint.

    Args:
        model: Name of the embedding model to use.
        input: Payload forwarded unchanged as the ``input`` argument.

    Returns:
        Whatever ``openai.Embedding.create`` returns for the request.
    """
    request = {"model": model, "input": input}
    return openai.Embedding.create(**request)

def openai_embed_texts(texts: List[str], embedding_model: str) -> List[List[float]]:
    """Embed each text with the OpenAI embedding API and return unit-length vectors.

    Each text is tokenized and split into slices of at most
    ``embedding_ctx_length`` tokens. The slices are embedded in batches of
    ``chunk_size`` per request, and the slice embeddings belonging to one text
    are combined with a token-count-weighted average that is then L2-normalised.

    Args:
        texts: The texts to embed.
        embedding_model: Model name, used both to select the tokenizer and as
            the model for the embedding requests.

    Returns:
        One embedding (a list of floats) per input text, in input order.
    """
    encoding = tiktoken.model.encoding_for_model(embedding_model)

    # tokens[k] is one token slice; indices[k] is the position in `texts`
    # of the text that slice was cut from.
    tokens = []
    indices = []
    for i, text in enumerate(texts):
        if embedding_model.endswith("001"):
            # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
            # replace newlines, which can negatively affect performance.
            text = text.replace("\n", " ")
        token = encoding.encode(
            text,
            disallowed_special="all",
        )
        for j in range(0, len(token), embedding_ctx_length):
            tokens.append(token[j : j + embedding_ctx_length])
            indices.append(i)

    # Embed the slices, chunk_size slices per request.
    batched_embeddings = []
    for i in range(0, len(tokens), chunk_size):
        response = openai_embed(
            embedding_model,
            input=tokens[i : i + chunk_size],
        )
        batched_embeddings.extend(r["embedding"] for r in response["data"])

    # Regroup slice embeddings (and slice lengths, used as weights) by source text.
    results: List[List[List[float]]] = [[] for _ in texts]
    num_tokens_in_batch: List[List[int]] = [[] for _ in texts]
    for k, text_index in enumerate(indices):
        results[text_index].append(batched_embeddings[k])
        num_tokens_in_batch[text_index].append(len(tokens[k]))

    embeddings: List[List[float]] = []
    for i in range(len(texts)):
        slice_embeddings = results[i]
        if not slice_embeddings:
            # A text that tokenizes to nothing (e.g. "") produced no slices;
            # embed the empty string directly. openai_embed is the embedding
            # helper defined in this notebook.
            response = openai_embed(embedding_model, input="")
            # Convert to an array so the normalisation below does not rely
            # on NumPy's reflected division of a plain list.
            average = np.asarray(response["data"][0]["embedding"])
        else:
            average = np.average(
                slice_embeddings, axis=0, weights=num_tokens_in_batch[i]
            )
        embeddings.append((average / np.linalg.norm(average)).tolist())

    return embeddings

In [3]:
import dataclasses
import typing
import tempfile

# class FaissIndexType(weave.types.Type):
#     instance_classes = [faiss.Index]
    
#     def save_instance(self, obj, artifact, name):
#         with artifact.writeable_file_path(f"{name}.faissindex") as write_path:
#             faiss.write_index(obj, write_path)

#     def load_instance(self, artifact, name, extra):
#         with artifact.open(f"{name}.faissindex", binary=True) as f:
#             return faiss.read_index(f)

class Document(typing.TypedDict):
    """A single document record: a source identifier plus its text."""
    # Identifier of the document's source; the loading cell in this notebook
    # stores the file name here.
    path: str
    # Full text of the document; this is what gets embedded and put in prompts.
    contents: str

@weave.type()
class DocumentDataset:
    """A collection of Document records registered as a weave type."""
    # The documents; FAISSStore.search looks rows up by position in this list.
    rows: list[Document]

@weave.type()
class EmbeddingModel:
    """Base weave type for embedding models.

    Code in this notebook (FAISSStore.search, make_faiss_store) calls
    ``embed_texts(texts)`` on instances, so subclasses are expected to
    provide that method.
    """
    pass

@weave.type()
class OpenAIEmbeddingModel(EmbeddingModel):
    """Embedding model that delegates to openai_embed_texts."""

    # Name of the OpenAI embedding model handed to openai_embed_texts.
    model: str

    # @weave.op()
    def embed_texts(self, texts: list[str]) -> List[List[float]]:
        """Embed ``texts`` with this model and wrap the vectors in a WeaveList.

        Args:
            texts: The texts to embed.

        Returns:
            A ``weave.WeaveList`` holding one embedding per input text.
        """
        vectors = openai_embed_texts(texts, self.model)
        return weave.WeaveList(vectors)

@weave.type()
class FAISSStore:
    """A FAISS index paired with the documents and embedding model behind it."""

    # L2 index whose i-th vector is looked up as docs.rows[i] by search().
    index: faiss.IndexFlatL2
    # The documents that were indexed.
    docs: DocumentDataset
    # Model used to embed queries at search time.
    embedding_model: EmbeddingModel

    @weave.op()
    def search(self, query: str) -> list[Document]:
        """Return up to 4 documents nearest to ``query`` in embedding space.

        Args:
            query: Free-text query to embed and look up.

        Returns:
            The matching documents, in the order the index returns them.
            Fewer than 4 are returned when the index holds fewer than 4
            vectors.
        """
        embedded_query = self.embedding_model.embed_texts([query])[0]
        # FAISS expects a 2-D float32 array: one row per query.
        query_vector = np.array([embedded_query], dtype=np.float32)
        _scores, indices = self.index.search(query_vector, 4)
        # FAISS pads the label array with -1 when it has fewer than k
        # results; indexing rows[-1] would wrongly return the last document,
        # so drop those placeholders.
        return [self.docs.rows[i] for i in indices[0] if i >= 0]
        
        
def make_faiss_store(docs: DocumentDataset, embedding_model: EmbeddingModel) -> FAISSStore:
    """Embed every document's contents and build a FAISSStore over the vectors.

    Args:
        docs: Dataset whose rows provide the ``'contents'`` text to embed.
        embedding_model: Model whose ``embed_texts`` produces the vectors.

    Returns:
        A FAISSStore holding the populated L2 index, ``docs`` and
        ``embedding_model``.
    """
    contents = weave.WeaveList([row['contents'] for row in docs.rows])
    vectors = embedding_model.embed_texts(contents)

    # The index dimensionality is taken from the first embedding.
    dimension = len(vectors[0])
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(vectors, dtype=np.float32))

    return FAISSStore(index, docs, embedding_model)
    

@weave.type()
class DocbotModel(weave.Model):
    """Question-answering model: retrieve documents, fill a prompt, ask a chat model."""

    # Store searched for documents relevant to each query.
    vector_store: FAISSStore
    # Format string with ``{context}`` and ``{question}`` placeholders.
    prompt_template: str
    # Chat model name passed to the ChatCompletion call.
    model_name: str

    @weave.op()
    def predict(self, query: str) -> str:
        """Answer ``query`` using retrieved documents as context.

        Args:
            query: The user's question.

        Returns:
            The message content of the first choice in the chat completion.
        """
        retrieved = self.vector_store.search(query)
        context = '\n\n'.join([doc['contents'] for doc in retrieved])
        prompt = self.prompt_template.format(context=context, question=query)

        messages = [{"role": "user", "content": prompt}]
        response = weave_openai.ChatCompletion.create(model=self.model_name, messages=messages)

        first_choice = response['choices'][0]
        return first_choice['message']['content']

In [4]:
# Get markdown files from our docs repo

# Root of the docs checkout. Set the DOCBOT_DOC_DIR environment variable to
# point somewhere else without editing the notebook.
DOC_DIR = os.environ.get('DOCBOT_DOC_DIR', '/Users/shawn/code2/docodile')
DOC_SUFFIX = '.md'
# Upper bound on how many documents go into the dataset.
MAX_DOCS = 99

docs = []
# glob() yields paths in no guaranteed order; sort so the MAX_DOCS subset
# below is the same on every run.
for file in sorted(Path(DOC_DIR).glob('**/*' + DOC_SUFFIX)):
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with file.open('r', encoding='utf-8') as f:
        docs.append({'path': file.name, 'contents': f.read()})

docs_dataset = DocumentDataset(docs[:MAX_DOCS])

In [5]:
# Start a weave session for this project.
weave.init('weaveflow-docbot-11')
# Embed the dataset and build the vector store, using the embedding model
# name defined once in OPENAI_EMBEDDING_MODEL rather than repeating the literal.
vs = make_faiss_store(docs_dataset, OpenAIEmbeddingModel(OPENAI_EMBEDDING_MODEL))
# Prompt filled in by DocbotModel.predict: {context} receives the retrieved
# document contents and {question} the user's query.
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:"""
model = DocbotModel(vs, prompt_template, 'gpt-3.5-turbo')

View project at http://localhost:3000/browse2/shawn/weaveflow-docbot-11


In [6]:
# Ask the docbot a sample question; the returned answer string is the cell's output.
model.predict('How do I run a sweep?')

Published OpDef to http://localhost:3000/browse2/shawn/weaveflow-docbot-11/OpDef/DocbotModel-predict/737f1b88847d5d015171
Published Model to http://localhost:3000/browse2/shawn/weaveflow-docbot-11/Model/DocbotModel/c1581b9c98b5245c3107
Published OpDef to http://localhost:3000/browse2/shawn/weaveflow-docbot-11/OpDef/FAISSStore-search/e680f857961ffe143f7a
Published FAISSStore to http://localhost:3000/browse2/shawn/weaveflow-docbot-11/FAISSStore/FAISSStore/717d48c6503cfb26c447


'To run a sweep, you can use the `wandb sweep` command followed by the path to your sweep configuration YAML file or the sweep ID. \n\nFor example:\n```\nwandb sweep my_sweep.yaml\n```\nor\n```\nwandb sweep 123abc\n```\n\nYou can also include additional options such as setting the project and entity, specifying a launch config, and more. \n\nOnce you have created a sweep, you can use the `wandb agent` command with the sweep ID to generate hyperparameter suggestions from the sweep and train your model.\n\nFor example:\n```\nwandb agent 123abc\n```\n\nNote that the above information is specific to using the `wandb` library for managing and running sweeps.'