Install the packages used by the retrieval-augmented generation (RAG) pipeline

In [None]:
!pip install langchain langchain-community langchain-core langchain-openai
!pip install chromadb
!pip install tiktoken google-search-results
!pip install pydantic
!pip install typing-inspect typing_extensions
!pip install loguru

Importing libraries

In [None]:
import openai
import os

from langchain_community.document_loaders.directory import DirectoryLoader

from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.chroma import Chroma

from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from loguru import logger

from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

Configure the environment variables required by the OpenAI and LangChain APIs

In [None]:
import getpass

# Credentials are never written into the notebook. Each key is taken from the
# existing environment when present; otherwise it is requested through a
# hidden prompt so that it does not end up in the saved cell source or output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# Turn on LangChain tracing for the runs made in this notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LangChain API key: ")

In [None]:
# Make the user's Google Drive visible to this Colab runtime so the data
# folder can be read like a local directory.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder holding the source documents, located inside the Google Drive mounted
# at /content/drive. Set the RAG_DATA_DIR environment variable to point at your
# own folder, or edit the fallback path below before running.
directory_path = os.environ.get("RAG_DATA_DIR", "/content/drive/MyDrive/data")

In [None]:
from langchain_community.document_loaders import TextLoader

Load the text documents into LangChain using `DirectoryLoader`

In [None]:
# Build a loader over every file under `directory_path` that matches the
# "**/*.txt" pattern; each match is read with TextLoader and a progress bar is
# shown while loading.
loader1 = DirectoryLoader(
    directory_path,
    loader_cls=TextLoader,
    show_progress=True,
    glob="**/*.txt",
)

# Load the matched files and split them into chunks (no splitter is passed,
# so the loader's default is used).
txt_docs = loader1.load_and_split()

100%|██████████| 23/23 [00:00<00:00, 167.78it/s]


Create embeddings and generate local ChromaDB vector database using the documents provided

In [None]:
import chromadb

# Embedding model used to vectorize the document chunks.
embeddings = OpenAIEmbeddings()

# Chroma client created with default settings; it is handed to the vector
# store below so the store and `client` refer to the same database.
client = chromadb.Client()

# Local vector database containing document embeddings
txt_docsearch = Chroma.from_documents(
    documents=txt_docs,
    embedding=embeddings,
    client=client,
)

# Prints tokens to stdout as the model produces them.
handler = StreamingStdOutCallbackHandler()

# Define LLM
# The handler only receives tokens when the model itself runs in streaming
# mode and the handler is attached to the model: callbacks passed to a chain
# constructor are scoped to that chain and are not forwarded to its LLM.
llm = ChatOpenAI(
    model_name="gpt-4",
    temperature=0.6,
    streaming=True,
    callbacks=[handler],
)

# Create Retriever
# "stuff" places all retrieved chunks into a single prompt for the LLM.
qa_txt = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    callbacks=[handler],
    retriever=txt_docsearch.as_retriever(),
)

In [None]:
# Question to ask over the indexed documents. Replace the placeholder text with
# a real question before running; `q` must be defined here, otherwise the call
# below fails with NameError on a fresh kernel.
q = 'Empty query here'
qa_txt.invoke(q)