This notebook demonstrates the RAG (retrieval-augmented generation) technique: it reads data from a text file and answers users' questions based on that data. OpenAI embeddings are used to convert the text into vectors for the vector database.

Two methods are explored in this notebook:

1) Using the similarity search function of VectorDB.
2) Using OpenAI's LLM to search for relevant information.

The text file used contains data from news articles. For guidance on scraping a news website to obtain news article data, please refer to the news_extraction file.

In [2]:
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser

In [53]:
# !pip install python-dotenv
# !pip install langchain
# !pip install openai
# !pip install chromadb

In [7]:
import os
# Placeholder: replace 'your_dir' with the directory the notebook should run from.
directory = r'your_dir'

# Change the current working directory to the specified directory.
# NOTE(review): this changes the working directory for the whole kernel, so relative
# paths in later cells are resolved against `directory`.
os.chdir(directory)

In [10]:
import getpass
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

# Do not hardcode the API key in the notebook, and do not overwrite a key that was
# already provided through the environment or the .env file loaded above.
# Only when no key is available, ask for it interactively (the input is not echoed).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")


In [57]:
from langchain.document_loaders import TextLoader

# Load the merged news-article text file (replace 'your_dir' with the folder that contains it).
loader = TextLoader("your_dir/merged_content.txt",encoding='utf-8')
# Read the file exactly once and keep the resulting documents for the splitting step.
pages = loader.load()

In [61]:
from langchain.text_splitter import CharacterTextSplitter

# Split the text on newlines into chunks of roughly 200 characters (measured with len).
# Neighbouring chunks overlap by up to 150 characters so that information spanning a
# chunk boundary stays connected.
# Note: the warnings printed by the split step show that some chunks still come out
# longer than the requested 200 characters.
text_splitter = CharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=150,
    separator="\n",
    length_function=len,
)

In [65]:
# Apply the splitter to the loaded documents; `splits` holds the resulting chunks.
splits = text_splitter.split_documents(pages)

Created a chunk of size 295, which is longer than the specified 200
Created a chunk of size 306, which is longer than the specified 200
Created a chunk of size 387, which is longer than the specified 200


In [126]:
#len(splits)

In [69]:
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding model object that is later handed to the vector store.
# NOTE(review): this cell's output shows a deprecation warning — presumably for this
# class/import path; confirm against the installed langchain version.
embedding = OpenAIEmbeddings()

  warn_deprecated(


Vectorstore

In [71]:
from langchain.vectorstores import Chroma

# Folder in which Chroma keeps the vector store on disk so it can be reloaded later.
persist_directory = 'your_dir/docs/chroma_news/'

# Build the vector store from the document chunks.
# NOTE(review): re-running this cell against an existing persist_directory may add the
# same chunks again — confirm before re-executing.
vectordb = Chroma.from_documents(
    persist_directory=persist_directory,  # Chroma-specific: where the store is saved on disk
    embedding=embedding,                  # the OpenAI embedding model
    documents=splits,                     # the chunks created from our documents
)

In [72]:
# Print how many entries the collection holds.
# `_collection` is underscore-prefixed, i.e. a non-public attribute by convention.
print(vectordb._collection.count())

5


Using similarity_search function to find information in vectorDB

In [122]:
question = "your_question" # Placeholder: replace with the question you want to ask
# Ask the vector store for the k=3 entries most similar to the question.
docs = vectordb.similarity_search(question,k=3)
# Uncomment to view the content of the first returned document:
#print(docs[0].page_content)

In [124]:
#if you want to see metadata
#for doc in docs:
#    print(doc.metadata)

In [120]:
#print(docs[1].page_content) #See content of a page

In [118]:
#print(docs[2].page_content)

Using openai model to add a conversational feature to our search.

In [81]:
import datetime  # no longer used in this cell; kept in case other cells rely on the import

# The original code selected the dated snapshot "gpt-3.5-turbo-0301" only when run
# before 2023-09-02. That date has passed and the snapshot has been retired, so the
# branch was dead code; always use the current "gpt-3.5-turbo" alias.
llm_name = "gpt-3.5-turbo"
print(llm_name)

gpt-3.5-turbo


In [83]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

# Re-open the vector store from disk. `persist_directory` must already be defined
# (it is set in the cell that created the store).
embedding = OpenAIEmbeddings()
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding) # load the vector DB that was created previously

In [84]:
# Sanity check: print the number of entries in the reloaded collection.
print(vectordb._collection.count())

5


In [87]:
from langchain.chat_models import ChatOpenAI
# Chat model used to generate answers; the model is chosen by `llm_name`.
# temperature=0 asks for the least random output.
llm = ChatOpenAI(model_name=llm_name, temperature=0)

  warn_deprecated(


Retrieval QnA chain

In [89]:
from langchain.chains import RetrievalQA

# Question-answering chain that combines the chat model with a retriever
# backed by the vector store.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())

In [91]:
# Run the chain with .invoke(); calling the chain object directly is deprecated
# (it triggers a deprecation warning). The answer text is under result["result"].
result = qa_chain.invoke({"query": question})

  warn_deprecated(


In [116]:
#result["result"]

You can also provide a default prompt to your LLM, so that users don't have to write it every time.

In [95]:
from langchain.prompts import PromptTemplate

# Build prompt
# The template has two placeholders, {context} and {question}.
# NOTE(review): presumably the chain fills {context} with the retrieved chunks and
# {question} with the user's query — confirm against the chain type in use.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
# Turn the raw template string into a PromptTemplate object.
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [97]:
# Run chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True, #This will let us easily inspect the documents that we retrieve
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

qa_chain_mr = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    chain_type="refine" #this type of chain iterates over results to get you the best result possible
)

In [99]:
question = "Is there any infrmation on Welcome to Playtesting in Guildford!?"
# Run the chain with .invoke(); calling the chain object directly is deprecated.
result = qa_chain.invoke({"query": question})


In [103]:
# Run the "refine" chain on the same question with .invoke(); calling the chain
# object directly is deprecated.
result_refine = qa_chain_mr.invoke({"query": question})

In [114]:
#result_refine["result"]