# Loading multiple PDF from a directory

In [1]:
from langchain_community.document_loaders import DirectoryLoader

In [2]:
loader = DirectoryLoader("1_knowledge_base", 
                         glob="**/*.pdf",
                         show_progress=True,
                         use_multithreading=True)
docs = loader.load()
len(docs)

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4
  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 25/25 [00:53<00:00,  2.16s/it]


25

# Splitting the documents into chunks

In [21]:
from dotenv import load_dotenv
load_dotenv()

True

In [14]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini")

In [5]:
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)


'''vectorstore = InMemoryVectorStore.from_documents(
    documents=splits, embedding=OpenAIEmbeddings()
)

retriever = vectorstore.as_retriever()'''

'vectorstore = InMemoryVectorStore.from_documents(\n    documents=splits, embedding=OpenAIEmbeddings()\n)\n\nretriever = vectorstore.as_retriever()'

# Vectorize the split chunks and store in Deeplake

In [None]:
# update this part accordingly.

my_activeloop_org_id = "sundiu"
my_activeloop_dataset_name = "canadapost_guides"
dataset_path = f"hub://sundiu/canadapost_guides"


In [7]:
from langchain_community.vectorstores import DeepLake

embeddings = OpenAIEmbeddings()
db = DeepLake(dataset_path=dataset_path, 
              embedding=embeddings, 
              overwrite=True)
ids = db.add_documents(docs)



Your Deep Lake dataset has been successfully created!


Creating 25 embeddings in 1 batches of size 25:: 100%|██████████| 1/1 [00:11<00:00, 11.35s/it]

Dataset(path='hub://sundiu/canadapost_guides', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
   text       text      (25, 1)      str     None   
 metadata     json      (25, 1)      str     None   
 embedding  embedding  (25, 1536)  float32   None   
    id        text      (25, 1)      str     None   





# The following codes are for testing only

In [12]:
retriever = db.as_retriever()
retriever.search_kwargs['distance_metric'] = 'cos'
retriever.search_kwargs['k'] = 5

In [19]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder

contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are a manager in Canada Post for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use five sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)



In [20]:
from langchain_core.messages import AIMessage, HumanMessage

chat_history = []

question = "What is Registered mail?"
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})
chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=ai_msg_1["answer"]),
    ]
)

second_question = "How is it different from non-registered mail?"
ai_msg_2 = rag_chain.invoke({"input": second_question, "chat_history": chat_history})

print(ai_msg_2["answer"])

Registered Mail provides additional security features compared to non-registered mail, including proof of mailing, tracking, delivery confirmation, and a signature at delivery. Registered Mail includes liability coverage for loss or damage, while non-registered mail does not. Additionally, Registered Mail is handled separately with more stringent security measures throughout the mailing process. Non-registered mail lacks these features and may not offer the same level of accountability or proof of delivery. Overall, Registered Mail is ideal for sending valuable or sensitive items securely.


In [16]:
results = rag_chain.invoke({"input": "What are the prohibited items for airmail items?"})


In [17]:
results['context']

[Document(metadata={'source': '1_knowledge_base/non-mailable-matter.pdf'}, page_content='Non-mailable matter\n\nLast Updated: October 21, 2024\n\nTable of contents\n\n1. What is non-mailable matter?\n\n2. Illegal mail\n\n3. Solicitations by mail\n\n4. Controlled items\n\n4.1 Controlled items overview\n\n4.2 Tobacco and vaping products\n\n4.3 Firearms\n\n4.4 Intoxicating beverages\n\n4.5 Drugs and other controlled substances\n\n4.6 Medical or biological materials\n\n4.7 Perishable items\n\n4.8 Liquids, liquefiable substances, and powders\n\n4.9 Animals\n\n4.10 Plants\n\n5. Prohibited items\n\n5.1 Prohibited items overview\n\n5.2 Replica or inert munitions\n\n5.3 Other prohibited items and restrictions\n\n6. Dangerous goods\n\n6.1 Dangerous goods overview\n\n6.2 Transportation of dangerous goods classes and index\n\n6.3 Commodities\n\nNon-mailable matter\n\n2\n\n3\n\n3\n\n5\n\n5\n\n6\n\n8\n\n9\n\n10\n\n10\n\n11\n\n11\n\n13\n\n13\n\n14\n\n14\n\n16\n\n17\n\n22\n\nPage 1\n\n1. What is non-m

In [18]:
results['answer']

'Prohibited items for airmail include any items that are dangerous or illegal under Canadian law or the laws of the destination country. This includes explosives, highly flammable materials, radioactive substances, and items that may cause injury or damage. Additionally, items containing food perishables or live animals that do not meet shipping requirements are also prohibited. For detailed information on specific prohibited items, you can refer to the Canada Border Services Agency website or the Non-mailable matter regulations.'

In [41]:
for n in iter(results['context']):
    print(n.metadata)

{'source': '1_knowledge_base/non-mailable-matter.pdf'}
{'source': '1_knowledge_base/abcs-of-mailing.pdf'}
{'source': '1_knowledge_base/customs-requirements.pdf'}
{'source': '1_knowledge_base/parcel-services-us-and-international.pdf'}
{'source': '1_knowledge_base/registered-mail-us-and-international.pdf'}


In [37]:
results['context'][1].metadata

{'source': '1_knowledge_base/abcs-of-mailing.pdf'}

In [44]:
# Parse and display the context metadata in a tidy format
def display_context_metadata(context_metadata):
    # Use a set to store unique document titles
    unique_documents = set()
    
    for entry in context_metadata:
        # Extract the filename from the metadata source and remove the extension
        filename = entry.metadata['source'].split('/')[-1].replace('.pdf', '')
        
        # Convert filename to a more readable title format
        readable_title = filename.replace('-', ' ').title()
        
        # Add the formatted title to the set of unique documents
        unique_documents.add(readable_title)
    
    # Format the unique document titles as a numbered list
    formatted_references = "Reference:\n" + "\n".join(
        [f"{i+1}) {title}" for i, title in enumerate(unique_documents)]
    )
    
    return formatted_references

# Example usage with results['context'] from LangChain
context_metadata = results['context']  # Assume this is the metadata list from LangChain
references_text = display_context_metadata(context_metadata)
print(references_text)


Reference:
1) Abcs Of Mailing
2) Registered Mail Us And International
3) Customs Requirements
4) Non Mailable Matter
5) Parcel Services Us And International
