In [None]:
! pip install langchain_community tiktoken langchain-openai langchainhub chromadb -U langchain langchain_chroma

Collecting langchain_community
  Downloading langchain_community-0.0.34-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-openai
  Downloading langchain_openai-0.1.4-py3-none-any.whl (33 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.15-py3-none-any.whl (4.6 kB)
Collecting chromadb
  Downloading chromadb-0.5.0-py3-none-any.whl (526 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.8/526.8 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl (817 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[

In [None]:
import os

In [None]:
# Tracing and credential settings, passed to the client libraries through environment variables.
os.environ["LANGCHAIN_TRACING_V2"] = 'true'
# The variable is spelled LANGCHAIN_ENDPOINT (with an underscore); a misspelled name is never read.
os.environ["LANGCHAIN_ENDPOINT"] = 'https://api.smith.langchain.com'
# Fill these in locally before running; do not commit real keys with the notebook.
os.environ["LANGCHAIN_API_KEY"] = ''
os.environ["OPENAI_API_KEY"] = ''
os.environ["LANGCHAIN_PROJECT"] = ""

In [None]:
import bs4
from langchain import hub
from langchain_community.document_loaders import TextLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI

Politifact: https://www.politifact.com/factchecks/2024/apr/25/social-media/columbia-not-all-remote-offers-students-hybrid-lea/

In [None]:
# LangChain chat-model handle.
# NOTE(review): `llm` is not referenced again in the visible cells; RagBot calls the
# OpenAI client directly.
llm = ChatOpenAI(
    model="gpt-3.5-turbo-0125",
)

In [None]:
### INDEXING ###
# Load: read every listed text file and gather the results in one list.
# Uncomment a path to add that file to the index.
source_paths = [
    "/content/content_1.txt",
    "/content/unrelated_1.txt",
    # "/content/Columbia_spector.txt",
    # "/content/support_1_NYTimes.txt",
    # "/content/support_2_NYPost.txt",
]

documents = []
for source_path in source_paths:
    loader = TextLoader(source_path)
    documents.extend(loader.load())

In [None]:
# Split: cut the loaded documents into chunks (chunk_size=1000, no overlap between chunks).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
splits = text_splitter.split_documents(documents)

# Embed: index the chunks in a Chroma vector store using OpenAI embeddings.
embedding_model = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)

# Retriever with default search settings.
# To limit the number of hits, use: vectorstore.as_retriever(search_kwargs={"k": 1})
retriever = vectorstore.as_retriever()

In [None]:
import openai
from langsmith import traceable
from langsmith.wrappers import wrap_openai
class RagBot:
    """Minimal retrieval-augmented fact-checking bot.

    For each question it retrieves documents through the supplied retriever and
    asks an OpenAI chat model to answer with those documents as the evidence.
    """

    def __init__(self, retriever, model: str = "gpt-3.5-turbo-0125"):
        """Store the retriever and create the OpenAI client.

        Args:
            retriever: Object exposing ``invoke(question)`` that returns the
                documents to use as evidence for the question.
            model: Name of the OpenAI chat model to query.
        """
        self._retriever = retriever
        # Wrapping the client instruments the LLM
        self._client = wrap_openai(openai.Client())
        self._model = model

    @staticmethod
    def _format_context(docs) -> str:
        """Join the text of the retrieved documents into one prompt-ready string."""
        # Send the documents' text, not the repr of the Document list: the repr
        # ("[Document(page_content='...") is noise the model tends to echo back.
        return "\n\n".join(getattr(doc, "page_content", str(doc)) for doc in docs)

    @traceable
    def get_answer(self, question: str):
        """Answer ``question`` using the documents retrieved for it.

        Args:
            question: The full user prompt to send to the model.

        Returns:
            A dict with two keys: ``"answer"`` (the model's reply text) and
            ``"contexts"`` (the retrieved documents, each converted with ``str``).
        """
        similar = self._retriever.invoke(question)
        context = self._format_context(similar)
        # Adjacent string literals (not backslash continuations) keep long runs of
        # indentation whitespace out of the prompt.
        system_prompt = (
            "You worked as an experienced fact checker for 20 years and you're "
            "good at sniffing out false claims or fake news which are unsupported by "
            "or contradict the source documents delimited by triple backticks. "
            "Answer the user's question from those documents only, and do not "
            "repeat or continue the source documents in your reply.\n"
            f"```Source Docs:\n{context}\n```"
        )
        response = self._client.chat.completions.create(
            model=self._model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": question},
            ],
        )

        # Evaluators will expect "answer" and "contexts"
        return {
            "answer": response.choices[0].message.content,
            "contexts": [str(doc) for doc in similar],
        }

# Build a bot around the retriever, using RagBot's default model.
rag_bot = RagBot(retriever)

In [None]:
# Single-claim example: the statement to check and the prompt wrapped around it.
# The prompt says "veracity" so it matches the JSON key the model is asked to return.
statement = "Judge Rosemary M. Collyer was bribed by House Republicans to rule in their favor against the Obama administration."
question = f"""Question: Evaluate the veracity of the statement delimited by angle
brackets. If it was unsupported by or contradict to the evidence, it's more likely to be false, return 0 along with a reason;
otherwise, it's more likely to be true, return 1 along with a reason.
<Statement: {statement}>
Your answer should be in JSON format, whose keys are veracity and reason"""

In [None]:
# Run the single-claim prompt through the bot; `response` is the dict returned by
# get_answer, with the keys "answer" and "contexts".
rag_bot = RagBot(retriever)
response = rag_bot.get_answer(question)

In [None]:
response["answer"]

'{\n    "veracity": 0,\n    "reason": "The provided evidence does not support or suggest any bribery towards Judge Rosemary M. Collyer by House Republicans. The issue at hand primarily revolves around legal disputes concerning spending powers and constitutional violations, not bribery."\n}'

In [None]:
import json
# Parse the model's reply as JSON; json.loads raises json.JSONDecodeError when the
# reply is not valid JSON.
json.loads(response["answer"])

{'veracity': 0,
 'reason': 'The provided evidence does not support or suggest any bribery towards Judge Rosemary M. Collyer by House Republicans. The issue at hand primarily revolves around legal disputes concerning spending powers and constitutional violations, not bribery.'}

In [None]:
# Claims to check against the indexed documents.
# NOTE(review): presumably all of these are fabricated claims; confirm against the source article.
statement_list = [
    "The Trump administration plans to eliminate all health insurance subsidies immediately after inauguration, leaving millions without any health coverage.",
    "House Republicans celebrated the potential collapse of the healthcare system as a victory against the Obama legacy.",
    "Donald J. Trump personally directed the legal strategies to ensure the failure of the Affordable Care Act as part of his first 100 days' agenda.",
    "The Obama administration secretly admitted that the health care subsidies were unconstitutional but continued them to gain political favor.",
    "All major health insurers have already started withdrawing from the market in anticipation of the lawsuit's success",
    "The White House pressured the Treasury Department to illegally fund the health insurance subsidies despite public opposition.",
    "Legal experts unanimously agree that the lawsuit against the Obama administration has no merit and is purely politically motivated.",
    "The halt of the health care lawsuit was secretly agreed upon by the Obama administration to protect its legacy",
    "House Republicans have already drafted legislation to replace the Affordable Care Act entirely by the end of the month."
]

In [None]:
len(statement_list)

9

In [None]:
# Check every claim in statement_list: print each raw reply and collect the parsed
# verdicts. `veracity[k]` / `reason[k]` line up with `statement_list[k]`; an entry is
# None when the reply could not be parsed or lacked that key.
veracity = []
reason = []
# One bot (and therefore one OpenAI client) serves every claim; building it is loop-invariant.
rag_bot = RagBot(retriever)
for statement in statement_list:
    question = f"""Question: Evaluate the veracity of the statement delimited by angle
    brackets. If it was unsupported by or contradict to the evidence, it's more likely to be false, return 0 along with a reason;
    otherwise, it's more likely to be true, return 1 along with a reason.
    <Statement: {statement}>
    Your answer should be in JSON format, whose keys are veracity and reason"""
    response = rag_bot.get_answer(question)
    print(response["answer"])
    # The model does not always reply with valid JSON (some replies are echoed source
    # text), so parse defensively instead of letting one bad reply abort the loop.
    try:
        parsed = json.loads(response["answer"])
    except (json.JSONDecodeError, TypeError):
        parsed = None
    if not isinstance(parsed, dict):
        parsed = {}
    veracity.append(parsed.get("veracity"))
    reason.append(parsed.get("reason"))

{
    "veracity": 0,
    "reason": "The statement is not supported by the evidence. The documents indicate discussions on potential options and resolutions post-inauguration, but there is no explicit plan mentioned to eliminate all health insurance subsidies immediately after inauguration."
}
```Source Docs: [Document(page_content='had the
```Source Docs: [Document(page_content='demanding an end to the law for
{
    "veracity": 0,
    "reason": "The provided documents do not mention that the Obama administration admitted the health care subsidies were unconstitutional or that they continued them for political favor. This statement is unsupported by the evidence."
}
{
    "veracity": 0,
    "reason": "The evidence suggests that insurers receiving subsidies might drop coverage, but it does not indicate that all major health insurers have already started withdrawing from the market."
}
{
    "veracity": 0,
    "reason": "The provided documents do not support the claim of illegal funding o

In [None]:
# Second batch of claims to check.
# NOTE: despite the list's name, the first 2 entries are fake claims (the same two
# strings also appear in statement_list). The remaining 5 are presumably true; confirm
# against the source article.
true_statement_list = [
    # first 2 fake
    "House Republicans celebrated the potential collapse of the healthcare system as a victory against the Obama legacy.",
    "Donald J. Trump personally directed the legal strategies to ensure the failure of the Affordable Care Act as part of his first 100 days' agenda.",
    "Congressional Republicans are concerned about the potential implications if they win their health care lawsuit against the Obama administration.",
    "The lawsuit challenges the administration's authority to spend billions on health insurance subsidies without congressional approval.",
    "Judge Rosemary M. Collyer ruled that the Obama administration had been distributing health insurance subsidies in violation of the Constitution.",
    "The outcome of the lawsuit could destabilize the health care program and lead to a lack of insurance coverage for millions.",
    "The Trump administration will review the case and all related aspects of the Affordable Care Act upon taking office."
]

In [None]:
len(true_statement_list)

7

In [None]:
# Check every claim in true_statement_list and print the model's raw reply for each.
# One bot (and therefore one OpenAI client) serves every claim; building it is loop-invariant.
rag_bot = RagBot(retriever)
for statement in true_statement_list:
    question = f"""Question: Evaluate the veracity of the statement delimited by angle
    brackets. If it was unsupported by or contradict to the evidence, it's more likely to be false, return 0 along with a reason;
    otherwise, it's more likely to be true, return 1 along with a reason.
    <Statement: {statement}>
    Your answer should be in JSON format, whose keys are veracity and reason"""
    response = rag_bot.get_answer(question)
    print(response["answer"])

```Source Docs: [Document(page_content='had the
{
    "veracity": 0,
    "reason": "The provided source documents do not support the claim that Donald J. Trump personally directed the legal strategies to ensure the failure of the Affordable Care Act as part of his first 100 days' agenda. The documents mention discussions and considerations of the lawsuit involving the ACA, but there is no explicit mention of Trump personally directing legal strategies for its failure."
}
{
    "veracity": 1,
    "reason": "The source documents indicate that Congressional Republicans are indeed concerned about the potential implications if they win their health care lawsuit against the Obama administration. The implications include the possibility of causing the health care program to implode and leaving millions of people without access to health insurance."
}
{
    "veracity": 1,
    "reason": "The statement is supported by the evidence provided in the documents. The lawsuit indeed challenges the admi

In [None]:
### Evaluate ###
from langsmith.evaluation import RunEvaluator, EvaluationResult
from langchain.evaluation import load_evaluator


class FaithfulnessEvaluator(RunEvaluator):
    """Run evaluator that grades how faithful a run's output is to its reference context."""

    def __init__(self):
        """Load the "labeled_score_string" evaluator with a single faithfulness criterion."""
        faithfulness_criteria = {
            "faithful": "How faithful is the submission to the reference context?"
        }
        # NOTE(review): normalize_by=10 presumably rescales the grader's score; confirm
        # against the LangChain evaluator documentation.
        self.evaluator = load_evaluator(
            "labeled_score_string",
            criteria=faithfulness_criteria,
            normalize_by=10,
        )

    def evaluate_run(self, run, example) -> EvaluationResult:
        """Grade one run against the documents stored on its dataset example.

        Args:
            run: The traced run; its first output value is taken as the prediction
                and ``run.inputs["question"]`` as the input.
            example: The dataset example; ``example.inputs["documents"]`` is used
                as the reference.

        Returns:
            An EvaluationResult keyed "labeled_criteria:faithful" carrying the
            fields returned by the underlying evaluator.
        """
        prediction = next(iter(run.outputs.values()))
        asked_question = run.inputs["question"]
        # We are treating the documents as the reference context in this case.
        reference_context = example.inputs["documents"]
        graded = self.evaluator.evaluate_strings(
            prediction=prediction,
            input=asked_question,
            reference=reference_context,
        )
        return EvaluationResult(key="labeled_criteria:faithful", **graded)

In [None]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that `string` encodes to under the named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name passed to ``tiktoken.get_encoding`` (e.g. "cl100k_base").

    Returns:
        The length of the encoded token sequence.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)
# NOTE: `question` is whatever the most recently executed cell assigned. On a
# top-to-bottom run that is the prompt built for the last entry of true_statement_list.
num_tokens_from_string(question, "cl100k_base")

57

In [None]:
# Embed the current prompt and show the length of the resulting embedding vector.
emb = OpenAIEmbeddings()
query_result = emb.embed_query(question)
len(query_result)

1536

In [None]:
#### Retrieval ####
# Fetch the documents the retriever returns for the current prompt, to inspect them directly.
# prompt = hub.pull("")
docs = retriever.invoke(question)

In [None]:
len(docs)

4

1) Can't track in LangSmith
2) What about adding other files?
