# Langchain doc retriever with ChromaDB

Get the OpenAI API key from the .env file.

In [19]:
from dotenv import load_dotenv
# Import the function load_dotenv from the dotenv package.

import os
# Import the os module for interacting with the operating system.

# Read the variables defined in the .env file into the process environment.
load_dotenv()

# Look up 'OPENAI_API_KEY' in the environment; the result is None when the
# variable is not set.
# The author keeps a .gitignore file — presumably it excludes .env so the key
# is never committed (verify).
openai_api_key = os.environ.get('OPENAI_API_KEY')

In [20]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader

In [21]:
# Load and process the text files.
# The 'papers.txt' file was created in the 'get_the_text_from_arxiv.ipynb' notebook.
loader = TextLoader('papers.txt')

# Read the file through the loader; the result is what gets split into chunks next.
papers = loader.load()

In [22]:
# Split the loaded text into smaller, overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
papers = text_splitter.split_documents(papers) 

# chunk_size=1000: each chunk holds at most roughly 1000 characters.
# The chunks must not exceed the input limit of the OpenAI embeddings model
# (text-embedding-ada-002 according to the original note).
# NOTE(review): 1000 characters is presumably well below that limit — confirm.

# chunk_overlap=200: consecutive chunks share up to 200 characters, i.e. a chunk
# starts with the last characters of the previous chunk.
# The overlap is not necessary, but it positively affects the results.

len(papers) # We have 114 chunks

114

# Create ChromaDB

In [23]:
# Embed and store the text.
# Supplying a persist directory will store the embeddings on disk;
# 'db' is the directory name used for that.
persist_directory = 'db'

In [24]:
embedding = OpenAIEmbeddings()   

# We are using the OpenAI embeddings model.
# It is what converts the 'papers' text into vectors.
# Running print(embedding) would show the features of the model.
# It is deliberately not run here because, per the original author, the
# output includes the 'openai_api_key'.

In [25]:
# Convert the 'papers' chunks (all our papers from ArXiv) into vectors with
# the OpenAI embeddings and store them in Chroma. When this cell is run, a
# database is created in the persist directory and the data is kept inside it.
vectordb = Chroma.from_documents(
    documents=papers,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [26]:
# Persist the db to disk.
vectordb.persist()  # The converted vectors are saved to disk.
vectordb = None     # The data is on disk now, so release the in-memory reference.

# The purpose of saving to disk is so that the vectors can be retrieved
# from the saved location each time the program runs.
# If we don't save them, the entire text will need to be converted
# into vectors again every time it runs.

In [27]:
# Reopen the persisted database from disk; from here on it is used as normal.
# The same embedding object is handed over as the embedding function.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
)


#### make a retriever

In [29]:
retriever = vectordb.as_retriever()

# The vector database is exposed as a retriever, i.e. a search engine
# used for text searches.
# The 'retriever' object is what the question-answering chain receives
# when the chain is created.

# Make a Chain

In [31]:
# Build the chain that answers questions.
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type='stuff',            # the type of retrieval QA chain to use
    retriever=retriever,           # the retriever used to fetch documents
    return_source_documents=True,  # also return the source documents for each answer
)


In [32]:
# The question that is put to the chain.
query = 'For which tasks has Llama-2 already been used successfully? What are promising areas of application for Llama-2?'

qa_chain(query)

# This is the raw response value of the chain.
# It contains the entries 'query', 'result', and 'source_documents'.


{'query': 'For which tasks has Llama-2 already been used successfully? What are promising areas of application for Llama-2?',
 'result': ' Llama-2 has been successfully used for fine-tuning large language models for dialogue use cases and financial news analysis. Promising areas of application for Llama-2 include language tasks such as text analysis, summarization, and named entity extraction with sentiment analysis, as well as other use cases that require large language models such as predictive features in supervised machine learning models.',
 'source_documents': [Document(page_content='Title: Llama 2: Open Foundation and Fine-Tuned Chat Models\nSummary:   In this work, we develop and release Llama 2, a collection of pretrained and\nfine-tuned large language models (LLMs) ranging in scale from 7 billion to 70\nbillion parameters. Our fine-tuned LLMs, called Llama 2-Chat, are optimized for\ndialogue use cases. Our models outperform open-source chat models on most\nbenchmarks we teste

In [33]:
def process_llm_response(llm_response):
    """Print the answer of a QA-chain response followed by its sources.

    Args:
        llm_response: Dict returned by the chain. ``'result'`` holds the
            answer text. ``'source_documents'`` (optional) holds the
            retrieved documents, each exposing a ``metadata`` dict.

    The answer is printed first, then a ``Sources:`` header, then every
    distinct ``metadata['source']`` value once, in the order the documents
    were returned. Documents without a ``'source'`` entry are skipped, and a
    missing or empty ``'source_documents'`` prints the header only.
    """
    print(llm_response['result'])
    print('\n\nSources:')

    documents = llm_response.get('source_documents') or []
    # dict.fromkeys removes duplicates while keeping first-seen order, so
    # several chunks taken from the same file are reported only once.
    sources = dict.fromkeys(doc.metadata.get('source') for doc in documents)
    for source in sources:
        if source is not None:
            print(source)

# This function is written to make the outputs easier to read.
# I will use this function for the query output.


# Example Questions

Question 1:

In [34]:
# Question 1: run the query through the chain and print the answer with its source.
query = 'For which tasks has Llama-2 already been used successfully? What are promising areas of application for Llama-2?'
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Llama-2 has been successfully used for analyzing financial news from a market perspective, highlighting main points, summarizing text, and extracting named entities with appropriate sentiments. Some promising areas of application for Llama-2 include financial news analysis, multitask analysis, and potentially as a substitute for closed-source models in dialogue use cases.


Sources:
papers.txt


Question 2:

In [35]:
# Question 2: run the query through the chain and print the answer with its source.
query = 'Name at least 5 domain-specific LLMs that have been created by fine-tuning Llama-2.'
llm_response = qa_chain(query)
process_llm_response(llm_response)

 I don't know.


Sources:
papers.txt


Question 3:

In [36]:
# Question 3: run the query through the chain and print the answer with its source.
query = 'What can you find out about the model structure of Llama-2 (required memory, required computing capacity, number of parameters, available quantizations)?'
llm_response = qa_chain(query)
process_llm_response(llm_response)


Unfortunately, the summary does not provide enough information about the model structure of Llama-2 to answer this question. It mentions the scale of the models (ranging from 7 billion to 70 billion parameters) and that they are optimized for dialogue use cases, but it does not mention anything about required memory, computing capacity, or available quantizations.


Sources:
papers.txt
