In [1]:
#!pip3 install langchain==0.0.348 openai weaviate-client ragas

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from dotenv import find_dotenv, load_dotenv

# Locate a .env file first, then load it; the call's return value is the
# cell output.
env_file = find_dotenv()
load_dotenv(env_file)

True

In [4]:
import os
import numpy as np

In [5]:
from langchain.chat_models import ChatOpenAI

# Chat model with temperature 0; the same `llm` object is piped into the
# RAG chain later in the notebook.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
)

# Baseline: ask the bare model, without any retrieved context.
query = "What did the president say about Justice Breyer"
llm.predict(query)

"As an AI language model, I don't have real-time information or the ability to browse the internet. Therefore, I cannot provide you with the most recent statements made by the president about Justice Breyer. However, it's worth noting that the president's opinions on Justice Breyer may vary depending on the specific context and time period you are referring to. If you are looking for recent statements, I recommend checking reliable news sources or official statements from the president."

# Step 1: Collect and load data
The first step is to identify the key sources of data you want to leverage. 
To index our knowledge base, we first need to load the data. 

Specify a `DocumentLoader` to load in your unstructured data as `Documents`.

A `Document` is an object that holds the text (`page_content`) and a `metadata` dict.

Since our source is a single plain-text file, we download it and load it with one of LangChain’s many built-in document loaders — `TextLoader`. (LangChain provides other loaders for other kinds of sources.)

In [6]:
import requests
from langchain.document_loaders import TextLoader

url = "https://raw.githubusercontent.com/langchain-ai/langchain/master/docs/docs/modules/state_of_the_union.txt"

# Fail fast on network problems: bound the wait and reject non-2xx responses,
# otherwise an error page would silently be saved and indexed as the speech.
res = requests.get(url, timeout=30)
res.raise_for_status()

# Write and read with an explicit encoding: the text contains non-ASCII
# punctuation (curly apostrophes) and the platform default is not always UTF-8.
with open("state_of_the_union.txt", "w", encoding="utf-8") as f:
    f.write(res.text)

loader = TextLoader("./state_of_the_union.txt", encoding="utf-8")
documents = loader.load()

documents

[Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. \n\nGroups of citize

# Step 2: Chunk your Document
Split the `Document` into chunks for embedding and vector storage. 

It’s important to chunk the data as we want to embed a meaningful length of context within our vector index. 

Embedding just a word or two is too little information to match relevant vectors, and embedding entire pages would be too long to fit within the context window of the prompt. Try to strike the right balance for your use case and dataset.

There are many text splitters that Langchain supports.


Here we use the `CharacterTextSplitter` with a chunk size of 500.
We also set a small overlap length (50) so that text continuity is preserved between our chunks.

In [7]:
from langchain.text_splitter import CharacterTextSplitter

# Chunks of size 500 with an overlap of 50 between neighbouring chunks.
text_splitter = CharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
chunks = text_splitter.split_documents(documents)

# Step 3: Embed and store chunks
To be able to look up our document splits, we first need to store them where we can later look them up.

The most common way to do this is to embed the contents of each document split.

We store the embedding and splits in a vectorstore. 

For this demo, I used Weaviate as the vector database, and OpenAI for the Embeddings. 

Once our vector store is indexed, it’s time to define our retriever. The retriever is the component that fetches the relevant documents from the vector database; how it does so is determined by its search algorithm.

In [8]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Weaviate
import weaviate
from weaviate.embedded import EmbeddedOptions

# Client backed by an embedded Weaviate instance.
client = weaviate.Client(embedded_options=EmbeddedOptions())

# Embed the chunks with OpenAI embeddings and store them in Weaviate.
embedding_model = OpenAIEmbeddings()
vectorstore = Weaviate.from_documents(
    documents=chunks,
    embedding=embedding_model,
    client=client,
    by_text=False,
)

# Retriever configured with k=3 results per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

Started /Users/leonie/.cache/weaviate-embedded: process ID 20222


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2023-12-12T10:38:28+01:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2023-12-12T10:38:28+01:00"}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"cache_minilm_oR6E2ceA4AcJ","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2023-12-12T10:38:28+01:00","took":36542}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"cache_text2vec_cohere_tqmRz1Bf69qc","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2023-12-12T10:38:28+01:00","took":38666}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"cache_text2vec_openai_ke0jOxmTjA0s","level":"info","limit":1000000000000,"msg":"prefilled vector

{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"langchain_3bf499a5579744a9b781d44633a2c49c_yJIwQVQ1quz2","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2023-12-12T10:38:29+01:00","took":45375}
/opt/homebrew/lib/python3.11/site-packages/langchain/embeddings/openai.py:501: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  response = response.dict()
/opt/homebrew/lib/python3.11/site-packages/pydantic/main.py:962: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/


# Step 4: Prompt
Prompts can be loaded from the LangChain prompt hub (e.g., a ready-made RAG prompt).

The prompt can also be easily customized, as shown below.

In [9]:
# Import from `langchain.prompts`: the top-level `from langchain import
# PromptTemplate` re-export is deprecated.
from langchain.prompts import PromptTemplate

# RAG prompt: the retrieved passages fill {context}, the user query fills
# {question}.
template = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:
"""

prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

print(prompt)

input_variables=['context', 'question'] template="You are an assistant for question-answering tasks. \nUse the following pieces of retrieved context to answer the question. \nIf you don't know the answer, just say that you don't know. \nUse three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:\n"


# Step 5: Generate
Distill the retrieved documents into an answer using an LLM/Chat model (e.g., `gpt-3.5-turbo`).

We use the Runnable protocol to define the chain.

Runnable protocol pipes together components in a transparent way.

Here we use the custom RAG prompt defined in Step 4; a ready-made RAG prompt from the LangChain prompt hub could be used instead.

In [10]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# Input mapping: the retriever supplies "context", while "question" is
# supplied by a passthrough.
chain_inputs = {"context": retriever, "question": RunnablePassthrough()}

# Pipeline: gather inputs -> fill the prompt -> call the chat model -> parse
# the output to a string.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [12]:
# Ragas wants the columns ['question', 'answer', 'contexts', 'ground_truths']:
#   "question"      <-- question from faq doc
#   "answer"        <-- answer from generated result
#   "contexts"      <-- context
#   "ground_truths" <-- actual answer

from datasets import Dataset

questions = [
    "What did the president say about Justice Breyer?",
    "What did the president say about Intel's CEO?",
    "What did the president say about gun violence?",
]
ground_truths = [
    ["The president said that Justice Breyer has dedicated his life to serve the country and thanked him for his service."],
    ["The president said that Pat Gelsinger is ready to increase Intel's investment to $100 billion."],
    ["The president asked Congress to pass proven measures to reduce gun violence."],
]
answers = []
contexts = []

# Inference: for each question, store the chain's answer and the text of the
# documents the retriever returns for that same question.
for query in questions:
    answers.append(rag_chain.invoke(query))
    retrieved = retriever.get_relevant_documents(query)
    contexts.append([doc.page_content for doc in retrieved])

# Assemble the four parallel columns.
data = dict(
    question=questions,
    answer=answers,
    contexts=contexts,
    ground_truths=ground_truths,
)

# Convert dict to dataset
dataset = Dataset.from_dict(data)


/opt/homebrew/lib/python3.11/site-packages/langchain/embeddings/openai.py:501: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  response = response.dict()
/opt/homebrew/lib/python3.11/site-packages/pydantic/main.py:962: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
/opt/homebrew/lib/python3.11/site-packages/langchain/chat_models/openai.py:445: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  response = response.dict()
/opt/homebrew/lib/python3.11/site-packages/pydantic/main.py:962: Pydantic

In [13]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_relevancy,
    context_recall,
    context_precision,
)

# Metrics scored in this run (context_relevancy is imported but not used here).
selected_metrics = [
    context_precision,
    context_recall,
    faithfulness,
    answer_relevancy,
]

result = evaluate(dataset=dataset, metrics=selected_metrics)

result

evaluating with [context_precision]


100%|█████████████████████████████████████████████| 1/1 [00:02<00:00,  2.09s/it]


evaluating with [context_recall]


100%|█████████████████████████████████████████████| 1/1 [00:08<00:00,  8.84s/it]


evaluating with [faithfulness]


100%|█████████████████████████████████████████████| 1/1 [00:41<00:00, 41.45s/it]


evaluating with [answer_relevancy]


  0%|                                                     | 0/1 [00:00<?, ?it/s]/opt/homebrew/lib/python3.11/site-packages/langchain/embeddings/openai.py:501: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  response = response.dict()
/opt/homebrew/lib/python3.11/site-packages/pydantic/main.py:962: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
100%|█████████████████████████████████████████████| 1/1 [00:05<00:00,  5.51s/it]


{'context_precision': 0.5000, 'context_recall': 1.0000, 'faithfulness': 0.8333, 'answer_relevancy': 0.8650}

In [14]:
import pandas as pd

# Do not truncate long text columns when the frame is displayed.
pd.options.display.max_colwidth = None

# Convert the evaluation result to a DataFrame and display it.
df = result.to_pandas()
df

Unnamed: 0,question,contexts,answer,ground_truths,context_precision,context_recall,faithfulness,answer_relevancy
0,What did the president say about Justice Breyer?,"[Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court., And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence., A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system.]","The president thanked Justice Breyer for his service and referred to him as an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. The president also mentioned that he nominated Circuit Court of Appeals Judge Ketanji Brown Jackson, who will continue Justice Breyer's legacy of excellence.",[The president said that Justice Breyer has dedicated his life to serve the country and thanked him for his service.],0.5,1.0,1.0,0.852447
1,What did the president say about Intel's CEO?,"[But that’s just the beginning. \n\nIntel’s CEO, Pat Gelsinger, who is here tonight, told me they are ready to increase their investment from \n$20 billion to $100 billion. \n\nThat would be one of the biggest investments in manufacturing in American history. \n\nAnd all they’re waiting for is for you to pass this bill. \n\nSo let’s not wait any longer. Send it to my desk. I’ll sign it. \n\nAnd we will really take off. \n\nAnd Intel is not alone. \n\nThere’s something happening in America., This is where Intel, the American company that helped build Silicon Valley, is going to build its $20 billion semiconductor “mega site”. \n\nUp to eight state-of-the-art factories in one place. 10,000 new good-paying jobs. \n\nSome of the most sophisticated manufacturing in the world to make computer chips the size of a fingertip that power the world and our everyday lives. \n\nSmartphones. The Internet. Technology we have yet to invent. \n\nBut that’s just the beginning., For the past 40 years we were told that if we gave tax breaks to those at the very top, the benefits would trickle down to everyone else. \n\nBut that trickle-down theory led to weaker economic growth, lower wages, bigger deficits, and the widest gap between those at the top and everyone else in nearly a century. \n\nVice President Harris and I ran for office with a new economic vision for America.]",The president did not mention Intel's CEO specifically in the given context.,[The president said that Pat Gelsinger is ready to increase Intel's investment to $100 billion.],0.0,1.0,0.5,0.833773
2,What did the president say about gun violence?,"[And I ask Congress to pass proven measures to reduce gun violence. Pass universal background checks. Why should anyone on a terrorist list be able to purchase a weapon? \n\nBan assault weapons and high-capacity magazines. \n\nRepeal the liability shield that makes gun manufacturers the only industry in America that can’t be sued. \n\nThese laws don’t infringe on the Second Amendment. They save lives., As I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \n\nWhile it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice., Let’s stop seeing each other as enemies, and start seeing each other for who we really are: Fellow Americans. \n\nWe can’t change how divided we’ve been. But we can change how we move forward—on COVID-19 and other issues we must face together. \n\nI recently visited the New York City Police Department days after the funerals of Officer Wilbert Mora and his partner, Officer Jason Rivera. \n\nThey were responding to a 9-1-1 call when a man shot and killed them with a stolen gun.]","The president called for Congress to pass measures to reduce gun violence, including universal background checks and a ban on assault weapons and high-capacity magazines. He also mentioned the need to repeal the liability shield for gun manufacturers.",[The president asked Congress to pass proven measures to reduce gun violence.],1.0,1.0,1.0,0.90887


In [15]:
# Save the scores to CSV; index=False keeps the DataFrame's row index out of
# the file, so it does not appear as an unnamed leading column.
df.to_csv('results.csv', index=False)