Importing libraries

In [1]:
import streamlit as st
from langchain.chains import LLMChain
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_chroma import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain_community.llms import HuggingFaceHub
from langchain.retrievers.multi_query import MultiQueryRetriever
import os, logging
from dotenv import load_dotenv

In [None]:
load_dotenv()


True

In [4]:
#API KEY for Huggingface

HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

if HUGGINGFACEHUB_API_TOKEN == None:
    HUGGINGFACEHUB_API_TOKEN = st.secrets["HUGGINGFACEHUB_API_TOKEN"]


In [5]:
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN

In [7]:
#loading the data
loader = PyPDFDirectoryLoader(path='documents/', glob="**/*.pdf")
pdfs = loader.load()
len(pdfs)

418

In [8]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = splitter.split_documents(pdfs)

In [9]:
#making the embeddings
embeddings = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')
db = Chroma(persist_directory='./db', embedding_function=embeddings)

  embeddings = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')
  from tqdm.autonotebook import tqdm, trange


In [10]:
model_path = "openai-community/gpt2"

llm = HuggingFaceHub(repo_id=model_path, model_kwargs={'temperature': 0.5, 'max_length': 200})

llm

  llm = HuggingFaceHub(repo_id=model_path, model_kwargs={'temperature': 0.5, 'max_length': 200})


HuggingFaceHub(client=<InferenceClient(model='openai-community/gpt2', timeout=None)>, repo_id='openai-community/gpt2', task='text-generation', model_kwargs={'temperature': 0.5, 'max_length': 200})

In [12]:
template = """
You are an AI assistant. You have access to the content of several PDF documents from Google, Uber and Tesla. Compare the information from these documents to answer the following question: Question: {question}
"""

prompt_template = PromptTemplate(template=template, input_variables=['question'])
prompt_template

PromptTemplate(input_variables=['question'], input_types={}, partial_variables={}, template='\nYou are an AI assistant. You have access to the content of several PDF documents from Google, Uber and Tesla. Compare the information from these documents to answer the following question: Question: {question}\n')

In [13]:
#retriving the data
retriever = MultiQueryRetriever.from_llm(retriever=db.as_retriever(), llm=llm, prompt=prompt_template)

retriever

MultiQueryRetriever(retriever=VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x000001D1E5858C70>, search_kwargs={}), llm_chain=PromptTemplate(input_variables=['question'], input_types={}, partial_variables={}, template='\nYou are an AI assistant. You have access to the content of several PDF documents from Google, Uber and Tesla. Compare the information from these documents to answer the following question: Question: {question}\n')
| HuggingFaceHub(client=<InferenceClient(model='openai-community/gpt2', timeout=None)>, repo_id='openai-community/gpt2', task='text-generation', model_kwargs={'temperature': 0.5, 'max_length': 200})
| LineListOutputParser())

In [14]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [None]:
#sample question
question = "What are the differences in the business of Tesla and Uber?"

unique_documents = retriever.get_relevant_documents(query=question)
print(unique_documents)

INFO:langchain.retrievers.multi_query:Generated queries: ['You are an AI assistant. You have access to the content of several PDF documents from Google, Uber and Tesla. Compare the information from these documents to answer the following question: Question: What are the differences in the business of Tesla and Uber?', 'Answer: The business of Tesla and Uber is different. A Tesla is a "business" that is run by a company that is owned and operated by an individual. Uber is a "business" that is run by an individual who is a member of a group of people who are on a team.', 'Tesla is a "business" that is run by a company that is owned and operated by an individual. Uber is a "business" that is run by an individual who is a member']


[]
