### Comparing Flan Model Sizes

Here we'll build a simple app with langchain and load large and small flan.

Then we'll ask it a few football questions and compare the quality of the responses.

### Import libraries

In [1]:
import os

from IPython.display import JSON

import numpy as np

# Imports main tools:
from trulens_eval import TruChain, Feedback, Huggingface, Tru
# Imports main tools:
from trulens_eval import Feedback
from trulens_eval import feedback
from trulens_eval import FeedbackMode
from trulens_eval import Select
from trulens_eval import TP
from trulens_eval import Tru
from trulens_eval.utils.langchain import WithFeedbackFilterDocuments

# Tru object manages the database of apps, records, and feedbacks; and the
# dashboard to display these
tru = Tru()

# Imports from langchain to build app. You may need to install langchain first
# with the following:
# ! pip install langchain>=0.0.170
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts.chat import ChatPromptTemplate, PromptTemplate
from langchain.prompts.chat import HumanMessagePromptTemplate
from langchain import PromptTemplate
from langchain.llms import OpenAI
from langchain import LLMChain

No .env found in /Users/jreini/Desktop/development/trulens/trulens_eval/examples/vector-dbs/pinecone or its parents. You may need to specify secret keys in another manner.


### Set API Keys

In [2]:
from trulens_eval.keys import setup_keys

setup_keys(
    OPENAI_API_KEY="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",
    HUGGINGFACE_API_KEY="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"
)

os.environ['HUGGINGFACEHUB_API_TOKEN'] = "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"
os.environ['OPENAI_API_KEY'] = "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"

✅ Key OPENAI_API_KEY set explicitly.
✅ Key HUGGINGFACE_API_KEY set explicitly.


### Set up prompt template

In [3]:
template = """Question: {question}

Answer: """
prompt = PromptTemplate(
        template=template,
    input_variables=['question']
)

# user question
question = "Which NFL team won the Super Bowl in the 2010 season?"

### Set up feedback functions

In [4]:
# API endpoints for models used in feedback functions:
hugs = feedback.Huggingface()
openai = feedback.OpenAI()

# Language match between question/answer.
f_lang_match = Feedback(hugs.language_match).on_input_output()
# By default this will evaluate feedback on main app input and main app output.

# Question/answer relevance between overall question and answer.
f_qa_relevance = Feedback(openai.relevance).on_input_output()
# By default this will evaluate feedback on main app input and main app output.

# Question/statement relevance between question and each context chunk.
f_qs_relevance = feedback.Feedback(openai.qs_relevance).on_input().on(
    Select.Record.app.combine_docs_chain._call.args.inputs.input_documents[:].page_content
).aggregate(np.min)
# First feedback argument is set to main app input, and the second is taken from
# the context sources as passed to an internal `combine_docs_chain._call`.

all_feedbacks = [f_lang_match, f_qa_relevance, f_qs_relevance]

✅ In language_match, input text1 will be set to *.__record__.main_input or `Select.RecordInput` .
✅ In language_match, input text2 will be set to *.__record__.main_output or `Select.RecordOutput` .
✅ In relevance, input prompt will be set to *.__record__.main_input or `Select.RecordInput` .
✅ In relevance, input response will be set to *.__record__.main_output or `Select.RecordOutput` .
✅ In qs_relevance, input question will be set to *.__record__.main_input or `Select.RecordInput` .
✅ In qs_relevance, input statement will be set to *.__record__.app.combine_docs_chain._call.args.inputs.input_documents[:].page_content .


### Load a couple sizes of Flan and ask questions

In [5]:
from langchain import HuggingFaceHub, LLMChain

model = 'google/flan-t5-small'

# initialize Hub LLM
hub_llm = HuggingFaceHub(
        repo_id = model,
    model_kwargs = {'temperature':1e-10}
)

# create prompt template > LLM chain
llm_chain = LLMChain(
    prompt=prompt,
    llm=hub_llm
)

# Trulens instrumentation.
tc = tru.Chain(
        app_id=f"{model}/v1",
        chain=llm_chain,
        feedbacks=all_feedbacks
    )

tc('Who won the superbowl in 2010?')    
tc('Who won the heisman in 1995')

✅ app google/flan-t5-small/v1 -> default.sqlite
✅ feedback def. feedback_definition_hash_36e27643030c60771697d9b90efef699 -> default.sqlite
✅ feedback def. feedback_definition_hash_4d0a28967bbc08fd2f0e93a95b9a94ab -> default.sqlite
✅ feedback def. feedback_definition_hash_f6905e7a2bf42e13d252ecd56ebb5f25 -> default.sqlite
✅ record record_hash_8dc34057bbe527fd1367079bf69d7816 from google/flan-t5-small/v1 -> default.sqlite


{'question': 'Who won the heisman in 1995', 'text': 'samuel wilson'}

✅ record record_hash_f9692dad61bba85cd437f780f28748d9 from google/flan-t5-small/v1 -> default.sqlite


In [6]:
model = 'google/flan-t5-large'

# initialize Hub LLM
hub_llm = HuggingFaceHub(
        repo_id = model,
    model_kwargs = {'temperature':1e-10}
)

# create prompt template > LLM chain
llm_chain = LLMChain(
    prompt=prompt,
    llm=hub_llm
)

# Trulens instrumentation.
tc = tru.Chain(
        app_id=f"{model}/v1",
        chain=llm_chain,
        feedbacks=all_feedbacks
    )

tc('Who won the superbowl in 2010?')    
tc('Who won the heisman in 1995')

✅ app google/flan-t5-large/v1 -> default.sqlite
✅ feedback def. feedback_definition_hash_36e27643030c60771697d9b90efef699 -> default.sqlite
✅ feedback def. feedback_definition_hash_4d0a28967bbc08fd2f0e93a95b9a94ab -> default.sqlite
✅ feedback def. feedback_definition_hash_f6905e7a2bf42e13d252ecd56ebb5f25 -> default.sqlite
✅ record record_hash_bf442db5bb9ce0eff4991649852c4685 from google/flan-t5-large/v1 -> default.sqlite


{'question': 'Who won the heisman in 1995', 'text': 'joe hudson'}

✅ record record_hash_9d0e8c8cf06a9776b9fc2d7b32c8c501 from google/flan-t5-large/v1 -> default.sqlite


### Load OpenAI Models

In [7]:
model = 'text-davinci-003'

davinci = OpenAI(model_name=model)

llm_chain = LLMChain(
    prompt=prompt,
    llm=davinci
)

# Trulens instrumentation.
tc = tru.Chain(
        app_id=f"{model}/v1",
        chain=llm_chain,
        feedbacks=all_feedbacks
    )

tc('Who won the superbowl in 2010?')    
tc('Who won the heisman in 1995')

✅ app text-davinci-003/v1 -> default.sqlite
✅ feedback def. feedback_definition_hash_36e27643030c60771697d9b90efef699 -> default.sqlite
✅ feedback def. feedback_definition_hash_4d0a28967bbc08fd2f0e93a95b9a94ab -> default.sqlite
✅ feedback def. feedback_definition_hash_f6905e7a2bf42e13d252ecd56ebb5f25 -> default.sqlite
✅ record record_hash_0bc9bb8a16eb76eb79cca70541cd64da from text-davinci-003/v1 -> default.sqlite


{'question': 'Who won the heisman in 1995',
 'text': ' Eddie George won the Heisman Trophy in 1995.'}

✅ record record_hash_7f40192b8d91c40ac6529aa67a6a3b7d from text-davinci-003/v1 -> default.sqlite


In [8]:
tru.run_dashboard()

Starting dashboard ...


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.4.23:8503 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>