In [1]:
# Install required packages
%pip install --upgrade --quiet  langchain langchain-community langchainhub langchain-openai langchain-chroma bs4 --user

Note: you may need to restart the kernel to use updated packages.


In [2]:
import dotenv
dotenv.load_dotenv()

True

In [3]:
from langchain import hub
from langchain_community.document_loaders import TextLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_core.runnables import RunnableParallel

In [4]:
import os

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [5]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-3.5-turbo-0125", max_tokens=1000)

In [6]:
with open('textbook_as_text.txt', 'r', encoding= 'utf-8') as f:
    text = f.read()

loader = TextLoader(
    'textbook_as_text.txt',
    encoding='utf-8'
)

docs = loader.load()

In [7]:
chunk_size, chunk_overlap = 1000, 200
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
# Save vectorstore to disk
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings(), persist_directory="./chroma_db")

In [8]:
vectorstore2 = Chroma(persist_directory="./chroma_db", embedding_function=OpenAIEmbeddings())

In [9]:
# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore2.as_retriever(search_type="similarity", search_kwargs={"k": 2})
prompt = hub.pull("rlm/rag-prompt")


In [10]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


In [11]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [12]:
# see how many vectors are in the Chroma vectorstore2
print(dir(vectorstore2))

['_Chroma__query_collection', '_LANGCHAIN_DEFAULT_COLLECTION_NAME', '__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_asimilarity_search_with_relevance_scores', '_client', '_client_settings', '_collection', '_cosine_relevance_score_fn', '_embedding_function', '_euclidean_relevance_score_fn', '_get_retriever_tags', '_max_inner_product_relevance_score_fn', '_persist_directory', '_select_relevance_score_fn', '_similarity_search_with_relevance_scores', 'aadd_documents', 'aadd_texts', 'add_documents', 'add_images', 'add_texts', 'adelete', 'afrom_documents', 'afrom_texts', 'amax_marginal_relevance_search', 'amax_margin

In [13]:
practice_questions = [
    "What are the main DDL keywords?",
    "Can you explain when to use Group By in an SQL query? Can you give me an example?",
    "What are the first three levels of database normalisation?",
    "What does the distinct keyword mean in an SQL query?"
]

In [14]:
rag_chain.invoke(practice_questions[1] + " Please include the page number in your answer.")

'You use GROUP BY in an SQL query to group rows that have the same values into summary rows. An example would be using GROUP BY to find the total sales for each product category. (Page 321)'