In [1]:
%load_ext autoreload
%autoreload 2
from pathlib import Path
import sys

# Make the parent of the current working directory importable. The membership
# check keeps the cell idempotent: re-running it does not pile up duplicate
# entries in sys.path.
_parent_dir = str(Path.cwd().parent.resolve())
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

from pprint import PrettyPrinter
pp = PrettyPrinter()

# Verbose debug logging to stdout. Comment this block out for quieter output.

import logging

root = logging.getLogger()
root.setLevel(logging.DEBUG)

handler = logging.StreamHandler(sys.stdout)
# The name lets a later run of this cell recognize handlers installed by an
# earlier run.
handler.set_name("notebook_stdout_debug")
handler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)

# Drop handlers left behind by previous runs of this cell; otherwise every
# re-run would add one more handler and each log record would be printed once
# per handler.
for stale in [h for h in root.handlers if h.get_name() == handler.get_name()]:
    root.removeHandler(stale)
root.addHandler(handler)


In [2]:
# Configure API keys for the providers used below.
# NOTE(review): the star import hides which names this notebook takes from
# `trulens_eval.keys`; only `set_openai_key` is used in this cell. Consider an
# explicit import once it is confirmed no later cell relies on other names.
from trulens_eval.keys import *
set_openai_key()

2023-06-16 17:21:56,903 - numexpr.utils - INFO - Note: NumExpr detected 10 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2023-06-16 17:21:56,927 - numexpr.utils - INFO - NumExpr defaulting to 8 threads.
KEY SET: OPENAI_API_KEY
KEY SET: PINECONE_API_KEY
KEY SET: PINECONE_ENV
KEY SET: HUGGINGFACE_API_KEY
KEY SET: SLACK_TOKEN
KEY SET: SLACK_SIGNING_SECRET
KEY SET: COHERE_API_KEY


In [3]:
from types import ModuleType
from typing import Callable, Tuple, TypeVar
from trulens_eval.util import get_local_in_call_stack

import inspect
import openai
from langchain.schema import LLMResult
from langchain.callbacks.openai_info import OpenAICallbackHandler
import pydantic

import logging
from typing import Any
# Logger for the tracker code defined in this cell.
logger = logging.getLogger(__name__)

# Result type of the thunk handed to `Tracker.track_cost`.
T = TypeVar("T")

# Attribute name set on every wrapper built by `Tracker.wrap_function`. Its
# value is the original (unwrapped) function, and its presence marks a function
# as already instrumented.
INSTRUMENT = "__tru_instrument"

class Tracker(pydantic.BaseModel):
    """
    Track api requests for usage.

    Functions wrapped by this tracker report each call to callback objects
    created from `callback_class`:

    - `global_callback` is notified of every wrapped call.
    - A per-scope callback created by `track_cost` is notified only of the
      calls made while the thunk given to `track_cost` is running.

    Fields:
        global_callback: callback instance notified of every wrapped call.
        endpoint_name: label used in log messages and to build `callback_name`.
        callback_name: key under which `track_cost` publishes its per-scope
            callback for wrappers further down the call stack.
        callback_class: zero-argument callable producing callback objects.
            The objects must provide `on_llm_end(response=...)`.
    """

    class Config:
        pass

    # Track costs not run inside "track_cost" here.
    global_callback: Any
    endpoint_name: str = "api"
    callback_name: str
    callback_class: type

    def __init__(self, callback_class: type, endpoint_name: str = "api", *args, **kwargs):
        """
        Create a tracker.

        Args:
            callback_class: class (or zero-argument factory) of the callback
                objects; one instance is created here as `global_callback`.
            endpoint_name: label for this tracker; `callback_name` is derived
                from it as `callback_<endpoint_name>`.
            *args, **kwargs: forwarded to `pydantic.BaseModel.__init__`.
        """
        kwargs['endpoint_name'] = endpoint_name
        kwargs['global_callback'] = callback_class()
        kwargs['callback_class'] = callback_class
        kwargs['callback_name'] = f"callback_{endpoint_name}"

        super().__init__(*args, **kwargs)

    def instrument(self, mod: ModuleType, method_name: str):
        """
        Wrap `method_name` on every attribute of `mod` that has it.

        Each attribute listed by `dir(mod)` is inspected; where it exposes
        `method_name`, that method is replaced in place by the wrapper from
        `wrap_function`.

        Args:
            mod: module whose attributes are scanned.
            method_name: name of the method to wrap on each attribute.
        """
        for m in dir(mod):
            sub = getattr(mod, m)

            if hasattr(sub, method_name):
                # Include the attribute name; without it every log line for a
                # module reads the same and cannot be told apart.
                logger.debug(f"Instrumenting {mod.__package__}.{m}.{method_name} for {self.endpoint_name}")
                func = getattr(sub, method_name)
                w = self.wrap_function(func)
                setattr(sub, method_name, w)

    def track_cost(self, thunk: Callable[[], T]) -> Tuple[T, Any]: # Any -> langchain llm callback handler
        """
        Tally only the api usage performed within the execution of the given
        thunk. Returns the thunk's result alongside the callback (a fresh
        instance of `callback_class`) that includes the usage information.

        Args:
            thunk: zero-argument callable to run.

        Returns:
            `(thunk(), callback)`.
        """

        cb = self.callback_class()

        # Keep this here for access by wrappers higher in call stack. It must
        # be a real local variable: wrappers locate this frame and read the
        # local named "scoped_callbacks" from it. Writing into `locals()` is
        # not reliable for this; since Python 3.13 `locals()` returns a
        # snapshot inside functions, so such writes never reach the frame.
        scoped_callbacks = {self.callback_name: cb}  # noqa: F841 (read via frame inspection)

        return thunk(), cb

    @staticmethod
    def __find_tracker(f):
        """Return True iff code object `f` is the code of `Tracker.track_cost`."""
        return f is Tracker.track_cost.__code__

    def wrap_function(self, func):
        """
        Return a wrapper around `func` that reports each call to this
        tracker's callbacks.

        The wrapper calls `func`, builds an `LLMResult` carrying the `model`
        keyword argument (if given) and the result's `usage` entry (if the
        result is a mapping that has one), and passes it to
        `global_callback.on_llm_end` and, when running inside `track_cost`, to
        that scope's callback. The result of `func` is returned unchanged.

        If `func` already carries the `INSTRUMENT` attribute it is returned
        as is.
        """
        # Function-scope import keeps this class self-contained.
        from collections.abc import Mapping

        if hasattr(func, INSTRUMENT):
            # TODO: What if we want to instrument the same method for multiple
            # endpoints?
            logger.debug(f"{func.__name__} already instrumented")
            return func

        def wrapper(*args, **kwargs):
            logger.debug(f"Calling wrapped {func.__name__} for {self.endpoint_name}.")

            res = func(*args, **kwargs)

            model_name = kwargs.get('model')

            # Only look for usage in mapping results. A bare `'usage' in res`
            # would iterate (and thereby consume) a generator result and would
            # raise on results that are not containers at all.
            # NOTE(review): usage stays None for such results; confirm that
            # `callback_class.on_llm_end` tolerates `token_usage=None`.
            usage = res.get('usage') if isinstance(res, Mapping) else None

            llm_res = LLMResult(
                generations=[[]],
                llm_output=dict(token_usage=usage, model_name=model_name),
                run=None
            )

            # The key must match the local variable name used in `track_cost`.
            scoped = get_local_in_call_stack(
                key="scoped_callbacks",
                func=self.__find_tracker,
                offset=0
            )

            self.global_callback.on_llm_end(response=llm_res)

            # No enclosing `track_cost`, or the enclosing one belongs to a
            # tracker with a different callback name: nothing scoped to notify.
            cb = scoped.get(self.callback_name) if scoped is not None else None
            if cb is not None:
                cb.on_llm_end(response=llm_res)

            return res

        # Mark as instrumented and keep a handle on the original function.
        setattr(wrapper, INSTRUMENT, func)
        wrapper.__name__ = func.__name__
        wrapper.__signature__ = inspect.signature(func)

        logger.debug(f"Instrumenting {func.__name__} for {self.endpoint_name} .")

        return wrapper

class OpenAITracker(Tracker):
    """
    Tracker specialised for the openai endpoint.

    Uses langchain's OpenAICallbackHandler as the callback class, and on
    construction instruments the `create` method of the attributes of the
    `openai` module.
    """

    def __init__(self, *args, **kwargs):
        # Endpoint name and callback class are fixed for this tracker and
        # override whatever the caller supplied under these keys.
        kwargs.update(
            endpoint_name="openai",
            callback_class=OpenAICallbackHandler,
        )

        super().__init__(*args, **kwargs)

        # Import locally so the module object is used even if the global name
        # `openai` has been rebound elsewhere.
        import openai as openai_module
        self.instrument(openai_module, "create")

In [4]:
# Constructing the tracker has a side effect: OpenAITracker.__init__ calls
# `instrument(openai, "create")`, which produces the debug lines below.
oait = OpenAITracker()

2023-06-16 17:21:58,577 - __main__ - DEBUG - Instrumenting openai.create for openai
2023-06-16 17:21:58,578 - __main__ - DEBUG - Instrumenting create for openai .
2023-06-16 17:21:58,578 - __main__ - DEBUG - Instrumenting openai.create for openai
2023-06-16 17:21:58,578 - __main__ - DEBUG - Instrumenting create for openai .
2023-06-16 17:21:58,579 - __main__ - DEBUG - Instrumenting openai.create for openai
2023-06-16 17:21:58,579 - __main__ - DEBUG - Instrumenting create for openai .
2023-06-16 17:21:58,579 - __main__ - DEBUG - Instrumenting openai.create for openai
2023-06-16 17:21:58,580 - __main__ - DEBUG - Instrumenting create for openai .
2023-06-16 17:21:58,580 - __main__ - DEBUG - Instrumenting openai.create for openai
2023-06-16 17:21:58,580 - __main__ - DEBUG - Instrumenting create for openai .
2023-06-16 17:21:58,580 - __main__ - DEBUG - Instrumenting openai.create for openai
2023-06-16 17:21:58,581 - __main__ - DEBUG - Instrumenting create for openai .
2023-06-16 17:21:58,58

In [5]:
from trulens_eval import feedback
# Feedback provider whose `qs_relevance` call is tracked in the cells below.
provider_openai = feedback.OpenAI()
# provider_openai.qs_relevance("Who is Piotr?", "Piotr is a person.")

2023-06-16 17:21:59,996 - trulens_eval.util - DEBUG - *** Creating new Endpoint singleton instance for name = openai ***
2023-06-16 17:21:59,996 - trulens_eval.provider_apis - DEBUG - *** Creating openai endpoint ***


openai api: 0requests [00:00, ?requests/s]

In [6]:
def make_request():
    """Run a single `qs_relevance` request against `provider_openai`."""
    question = "Who is Piotr?"
    statement = "Piotr is a person."
    return provider_openai.qs_relevance(question, statement)

# `track_cost` returns the thunk's result together with the callback that
# tallied the usage of calls made while the thunk ran.
res, cb = oait.track_cost(make_request)

2023-06-16 17:22:01,016 - __main__ - DEBUG - Calling wrapped create for openai.
2023-06-16 17:22:01,017 - openai - DEBUG - message='Request to OpenAI API' method=post path=https://api.openai.com/v1/chat/completions
2023-06-16 17:22:01,017 - openai - DEBUG - api_version=None data='{"model": "gpt-3.5-turbo", "temperature": 0.0, "messages": [{"role": "system", "content": "You are a RELEVANCE classifier; providing the relevance of the given STATEMENT to the given QUESTION.\\nRespond only as a number from 1 to 10 where 1 is the least relevant and 10 is the most relevant.\\nNever elaborate.\\n\\nQUESTION: Who is Piotr?\\n\\nSTATEMENT: Piotr is a person.\\n\\nRELEVANCE: "}]}' message='Post details'
2023-06-16 17:22:01,018 - urllib3.util.retry - DEBUG - Converted retries value: 2 -> Retry(total=2, connect=None, read=None, redirect=None, status=None)
2023-06-16 17:22:01,056 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): api.openai.com:443
2023-06-16 17:22:01,636 - urllib3

In [7]:
# Display the callback returned by `oait.track_cost(make_request)`.
cb

Tokens Used: 82
	Prompt Tokens: 81
	Completion Tokens: 1
Successful Requests: 1
Total Cost (USD): $0.000164

In [None]:
from langchain.callbacks import get_openai_callback


# Issue the same request under langchain's own callback context manager.
# NOTE(review): `cb` here rebinds the name assigned from `oait.track_cost(...)`
# in an earlier cell.
with get_openai_callback() as cb:
    provider_openai.qs_relevance("Who is Piotr?", "Piotr is a person.")
    # Copy the totals out of the callback for display in the next cell.
    total_tokens = cb.total_tokens
    total_cost = cb.total_cost
    

In [None]:
# Display the cost read from the `get_openai_callback` callback above.
total_cost

In [None]:
# `Tru` is otherwise only imported by a later cell, so import it here to keep
# this cell runnable in top-to-bottom order on a fresh kernel.
from trulens_eval import Tru

Tru().reset_database()

In [None]:
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader

# Load the documents under 'llama_index/data' (path is relative to the
# notebook's working directory) and build a vector store index from them.
documents = SimpleDirectoryReader('llama_index/data').load_data()
index = GPTVectorStoreIndex.from_documents(documents)

# Query engine that is wrapped by `Tru().Llama(...)` in a later cell.
query_engine = index.as_query_engine()
# response = query_engine.query("What did the author do growing up?")
# print(response)

In [None]:
# For aggregation,
import numpy as np

from trulens_eval import feedback, Feedback, Query, Tru

# Construct feedback functions.

hugs = feedback.Huggingface()
# Deliberately not named `openai`: that name is bound to the `openai` module
# imported in an earlier cell, and rebinding it to a provider object would
# shadow the module for every later cell.
openai_provider = feedback.OpenAI()

# Language match between question/answer.
f_lang_match = Feedback(hugs.language_match).on(
    text1=Query.RecordInput, text2=Query.RecordOutput
)

# Question/answer relevance between overall question and answer.
f_qa_relevance = Feedback(openai_provider.relevance).on(
    prompt=Query.RecordInput, response=Query.RecordOutput
)

# Question/statement relevance between question and each context chunk.
# The per-chunk scores are reduced to a single value with np.min.
f_qs_relevance = feedback.Feedback(openai_provider.qs_relevance).on(
    question=Query.RecordInput,
    statement=Query.Record.model.retriever.retrieve.rets[:].node.text
).aggregate(np.min)

# Feedback functions passed to the app wrapper; all are currently disabled.
# Uncomment entries to enable them.
feedbacks = [
#    f_lang_match, 
#    f_qa_relevance, 
#    f_qs_relevance
]

In [None]:
# Wrap the query engine with the (possibly empty) list of feedback functions.
# NOTE(review): `l` is easy to misread as `1`/`I`; renaming needs a coordinated
# change in the cells below that use it.
l = Tru().Llama(engine=query_engine, feedbacks=feedbacks, chain_id="default")

In [None]:
# Materialize the iterable returned by `l.instrumented()` so it is displayed.
list(l.instrumented())

In [None]:
# Run one query through the wrapped engine; the call returns a pair that is
# unpacked into `res` and `record`.
# NOTE(review): `res` rebinds the name assigned from `oait.track_cost(...)` in
# an earlier cell.
res, record = l.query_with_record("Who is Shayak?")

In [None]:
# Start the dashboard, passing the parent of the current working directory as
# `_dev`.
# NOTE(review): the meaning of `force` and `_dev` is not visible in this
# notebook; confirm against `Tru.start_dashboard`.
proc = Tru().start_dashboard(force=True, _dev=Path.cwd().parent)
# thread = Tru().start_evaluator(restart=True)

In [None]:
# Exploration: list the attribute names of `llama_index.llm_predictor`.
import llama_index
dir(llama_index.llm_predictor)

In [None]:
# Exploration: show the concrete type of the wrapped app's response builder.
# This reaches through underscore-prefixed (private) attributes, so it may
# break when the library's internals change.
type(l.app._response_synthesizer._response_builder)