### Comparing Flan Model Sizes

Here we'll build a simple app with LangChain and load both a small and a large Flan model.

Then we'll ask each model a few football questions and compare the quality of the responses.

### Import libraries

In [None]:
import os

from IPython.display import JSON

import numpy as np

# Imports main tools:
from trulens_eval import TruChain, Feedback, Huggingface, Tru
# Imports main tools:
from trulens_eval import Feedback
from trulens_eval import feedback
from trulens_eval import FeedbackMode
from trulens_eval import Select
from trulens_eval import TP
from trulens_eval import Tru
from trulens_eval.utils.langchain import WithFeedbackFilterDocuments

# The Tru object manages the database of apps, records, and feedbacks, as well
# as the dashboard used to display them. It is used further down to wrap each
# chain (`tru.Chain`) and to start the dashboard (`tru.run_dashboard`).
tru = Tru()

# Imports from langchain to build app. You may need to install langchain first
# with the following (the requirement is quoted so the shell does not treat
# `>` as an output redirection):
# ! pip install "langchain>=0.0.170"
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts.chat import ChatPromptTemplate, PromptTemplate
from langchain.prompts.chat import HumanMessagePromptTemplate
from langchain import PromptTemplate
from langchain.llms import OpenAI
from langchain import LLMChain

### Set API Keys

In [None]:
from trulens_eval.keys import check_keys

# Names of the credentials for the OpenAI and Hugging Face services used in
# this notebook. They are handed to `check_keys` in this exact order.
required_keys = (
    "OPENAI_API_KEY",
    "HUGGINGFACE_API_KEY",
    "HUGGINGFACEHUB_API_TOKEN",
)

# NOTE(review): presumably verifies that each key is available — confirm
# against the trulens_eval.keys documentation.
check_keys(*required_keys)

### Set up prompt template

In [None]:
# Bare question/answer prompt; `{question}` is its only placeholder.
template = """Question: {question}

Answer: """

# Wrap the raw text in a PromptTemplate that takes a single `question` input.
prompt = PromptTemplate(input_variables=['question'], template=template)

# user question
question = "Which NFL team won the Super Bowl in the 2010 season?"

### Set up feedback functions

In [None]:
# API endpoints for models used in feedback functions:
hugs = feedback.Huggingface()
openai = feedback.OpenAI()

# Language match between question/answer.
# `on_input_output()` evaluates the feedback on the main app input and the
# main app output.
f_lang_match = Feedback(hugs.language_match).on_input_output()

# Question/answer relevance between overall question and answer, likewise
# evaluated on the main app input and main app output.
f_qa_relevance = Feedback(openai.relevance).on_input_output()

# Question/statement relevance between question and each context chunk,
# aggregated with the minimum over chunks. The first feedback argument is set
# to the main app input, and the second is taken from the context sources as
# passed to an internal `combine_docs_chain._call`.
#
# This selector only makes sense for retrieval-style apps that contain a
# `combine_docs_chain`. The apps built in this notebook are plain
# prompt -> LLM chains with no such component, so this feedback is defined for
# reference but deliberately NOT included in `all_feedbacks` below.
f_qs_relevance = Feedback(openai.qs_relevance).on_input().on(
    Select.Record.app.combine_docs_chain._call.args.inputs.input_documents[:].page_content
).aggregate(np.min)

# Feedbacks that apply to a plain LLMChain (input/output only, no context).
all_feedbacks = [f_lang_match, f_qa_relevance]

### Load a couple sizes of Flan and ask questions

In [None]:
from langchain import HuggingFaceHub, LLMChain

# Hugging Face Hub repository of the model evaluated in this cell.
model = 'google/flan-t5-small'

# Hub-hosted LLM, sampled with a near-zero temperature.
hub_llm = HuggingFaceHub(repo_id=model, model_kwargs={'temperature': 1e-10})

# Chain that feeds the prompt template into the Hub LLM.
llm_chain = LLMChain(llm=hub_llm, prompt=prompt)

# TruLens instrumentation: wrap the chain under an app id derived from the
# model name and attach the feedback functions.
# NOTE(review): confirm `chain=` is the keyword expected by the installed
# trulens_eval version.
tc = tru.Chain(
    chain=llm_chain,
    app_id=f"{model}/v1",
    feedbacks=all_feedbacks,
)

# Ask the instrumented chain the two football questions; the result of the
# last call is what the cell displays.
tc('Who won the superbowl in 2010?')
tc('Who won the heisman in 1995')

In [None]:
# Hugging Face Hub repository of the model evaluated in this cell.
model = 'google/flan-t5-large'

# Hub-hosted LLM, sampled with a near-zero temperature.
hub_llm = HuggingFaceHub(repo_id=model, model_kwargs={'temperature': 1e-10})

# Chain that feeds the prompt template into the Hub LLM.
llm_chain = LLMChain(llm=hub_llm, prompt=prompt)

# TruLens instrumentation: wrap the chain under an app id derived from the
# model name and attach the feedback functions.
tc = tru.Chain(
    chain=llm_chain,
    app_id=f"{model}/v1",
    feedbacks=all_feedbacks,
)

# Ask the instrumented chain the two football questions; the result of the
# last call is what the cell displays.
tc('Who won the superbowl in 2010?')
tc('Who won the heisman in 1995')

### Load OpenAI Models

In [None]:
# OpenAI model evaluated in this cell.
model = 'text-davinci-003'

# LangChain wrapper around the OpenAI completion model named above.
davinci = OpenAI(model_name=model)

# Chain that feeds the prompt template into the OpenAI LLM.
llm_chain = LLMChain(llm=davinci, prompt=prompt)

# TruLens instrumentation: wrap the chain under an app id derived from the
# model name and attach the feedback functions.
tc = tru.Chain(
    chain=llm_chain,
    app_id=f"{model}/v1",
    feedbacks=all_feedbacks,
)

# Ask the instrumented chain the two football questions; the result of the
# last call is what the cell displays.
tc('Who won the superbowl in 2010?')
tc('Who won the heisman in 1995')

In [None]:
# Start the dashboard managed by `tru` to inspect the apps run above.
tru.run_dashboard()