#### Load Data From PDF

In [31]:
from langchain_community.document_loaders import PyPDFLoader

def load_pdf(file_path: str):
    """
    Load a PDF file and return its content as a list of documents.

    Args:
        file_path (str): The path to the PDF file.

    Returns:
        list: A list of documents extracted from the PDF.
    """
    loader = PyPDFLoader(file_path)
    documents = loader.load()
    
    return documents  # This returns a list of Document objects

In [32]:
documents = load_pdf("../data/Passport1.pdf")
print(documents)

[Document(metadata={'producer': 'FREE PDFill PDF and Image Writer', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': '2024-01-31T19:54:07+05:30', 'title': 'Microsoft Word - Instructions for Online TD - Local Applicants -english.doc', 'author': 'ewis', 'moddate': '2024-01-31T19:54:08+05:30', 'source': '../data/Passport1.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='Department of Immigration and Emigration \nOnline Submission of Travel Document Applications –  Local Applicants \n  \n \nInstructions to Online Apply for a Passport \n(Please read the instructions carefully) \n \n1.  How to apply? \n1.1. You can apply for either urgent service or normal service.  \n1.2. If you select the urgent service, your travel docum ent will be issued after three days of capturing \nyour fingerprints. \n1.3. If you select the normal service, your travel docum ent will be issued after thirty days of capturing \nyour fingerprints. \n \n2.  Eligibility \n2.1 The eligibility c

In [33]:
type(documents)

list

In [34]:
print(documents[0].page_content)

Department of Immigration and Emigration 
Online Submission of Travel Document Applications –  Local Applicants 
  
 
Instructions to Online Apply for a Passport 
(Please read the instructions carefully) 
 
1.  How to apply? 
1.1. You can apply for either urgent service or normal service.  
1.2. If you select the urgent service, your travel docum ent will be issued after three days of capturing 
your fingerprints. 
1.3. If you select the normal service, your travel docum ent will be issued after thirty days of capturing 
your fingerprints. 
 
2.  Eligibility 
2.1 The eligibility criteria in order to avail this service are as follows: 
2.1.1  Your age shall be 16 years or above as at th e date of submitting your application. 
2.1.2  You shall possess your valid passport, if an y.  
2.1.3  Your NIC /Passport/Full Name shall not be bl acklisted. 
2.1.4  You shall not have a travel ban imposed by t he Courts of Law. 
2.1.5 You shall not have a travel ban imposed by th e Tri-forces or any o

In [35]:
len(documents)

4

In [36]:
len(documents[0].page_content)

2015

- You can directly give the output of load_pdf() to the split_text function

#### Lets Split the text into smaller chunks

In [37]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List
from langchain_core.documents import Document

def split_text(documents: List[Document], chunk_size=1000, chunk_overlap=100):
    """
    Split a list of LangChain Document objects into smaller chunks.

    Args:
        documents (List[Document]): The documents to split.
        chunk_size (int): Max size of each chunk.
        chunk_overlap (int): Overlap between chunks.

    Returns:
        List[Document]: Smaller chunks with metadata preserved.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = splitter.split_documents(documents) 
    return chunks
# You can directly give the output of load_pdf() to the split_text function

In [38]:
chunks = split_text(documents)
chunks

[Document(metadata={'producer': 'FREE PDFill PDF and Image Writer', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': '2024-01-31T19:54:07+05:30', 'title': 'Microsoft Word - Instructions for Online TD - Local Applicants -english.doc', 'author': 'ewis', 'moddate': '2024-01-31T19:54:08+05:30', 'source': '../data/Passport1.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='Department of Immigration and Emigration \nOnline Submission of Travel Document Applications –  Local Applicants \n  \n \nInstructions to Online Apply for a Passport \n(Please read the instructions carefully) \n \n1.  How to apply? \n1.1. You can apply for either urgent service or normal service.  \n1.2. If you select the urgent service, your travel docum ent will be issued after three days of capturing \nyour fingerprints. \n1.3. If you select the normal service, your travel docum ent will be issued after thirty days of capturing \nyour fingerprints. \n \n2.  Eligibility \n2.1 The eligibility c

In [39]:
type(chunks)

list

In [40]:
for i in chunks:
    print(i.page_content)
    print("----------------------------------------------")

Department of Immigration and Emigration 
Online Submission of Travel Document Applications –  Local Applicants 
  
 
Instructions to Online Apply for a Passport 
(Please read the instructions carefully) 
 
1.  How to apply? 
1.1. You can apply for either urgent service or normal service.  
1.2. If you select the urgent service, your travel docum ent will be issued after three days of capturing 
your fingerprints. 
1.3. If you select the normal service, your travel docum ent will be issued after thirty days of capturing 
your fingerprints. 
 
2.  Eligibility 
2.1 The eligibility criteria in order to avail this service are as follows: 
2.1.1  Your age shall be 16 years or above as at th e date of submitting your application. 
2.1.2  You shall possess your valid passport, if an y.  
2.1.3  Your NIC /Passport/Full Name shall not be bl acklisted. 
2.1.4  You shall not have a travel ban imposed by t he Courts of Law.
----------------------------------------------
2.1.4  You shall not have a

In [41]:
print(chunks[0].page_content)

Department of Immigration and Emigration 
Online Submission of Travel Document Applications –  Local Applicants 
  
 
Instructions to Online Apply for a Passport 
(Please read the instructions carefully) 
 
1.  How to apply? 
1.1. You can apply for either urgent service or normal service.  
1.2. If you select the urgent service, your travel docum ent will be issued after three days of capturing 
your fingerprints. 
1.3. If you select the normal service, your travel docum ent will be issued after thirty days of capturing 
your fingerprints. 
 
2.  Eligibility 
2.1 The eligibility criteria in order to avail this service are as follows: 
2.1.1  Your age shall be 16 years or above as at th e date of submitting your application. 
2.1.2  You shall possess your valid passport, if an y.  
2.1.3  Your NIC /Passport/Full Name shall not be bl acklisted. 
2.1.4  You shall not have a travel ban imposed by t he Courts of Law.


#### Embedding the Chunks

In [42]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from typing import List

def embed_documents(documents: List[Document], model_name: str = "all-MiniLM-L6-v2"):
    """
    Embed the list of documents using a pre-trained HuggingFace model.

    Args:
        documents (List[Document]): The documents to embed.
        model_name (str): The Hugging Face embedding model to use (default: "all-MiniLM-L6-v2").

    Returns:
        List[List[float]]: A list of embeddings (vector representation of the documents).
    """
    # Initialize embedding model
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    
    # Embed the documents
    embeddings = embedding_model.embed_documents([doc.page_content for doc in documents])

    return embeddings


In [43]:
embeddings = embed_documents(chunks, model_name="all-MiniLM-L6-v2")
embeddings

HTTP Error 429 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 2s [Retry 2/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 4s [Retry 3/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 8s [Retry 4/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 8s [Retry 5/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
HTTP Error 429 thrown while requesting HEAD https://hugg

KeyboardInterrupt: 

In [None]:
type(embeddings)

list

In [None]:
len(embeddings)

10

####  Store embeddings in a Vector Database

In [44]:
# load the text loader 
from langchain_community.document_loaders import TextLoader

In [45]:
loader = TextLoader("./test_data/speech.txt")
documents = loader.load()
documents

[Document(metadata={'source': './test_data/speech.txt'}, page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.\n\nJust because we fight without rancor and without selfish object, seeking nothing for ourselves but what we shall wish to share with all free peoples, we shall, I feel confident, conduct our operations as belligerents without passion and ourselves observe with proud punctilio the principles of right and of fair play we profess to be fighting for.\n\n…\n\nIt will be all the easier for us to conduct ourselves as belligerents in a high spirit of right and

In [46]:
# split the text into smaller chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [47]:
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
splitter

<langchain_text_splitters.character.RecursiveCharacterTextSplitter at 0x26c9a280b50>

In [None]:
chunks = splitter.split_documents(documents) 
chunks

[Document(metadata={'source': './test_data/speech.txt'}, page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.'),
 Document(metadata={'source': './test_data/speech.txt'}, page_content='Just because we fight without rancor and without selfish object, seeking nothing for ourselves but what we shall wish to share with all free peoples, we shall, I feel confident, conduct our operations as belligerents without passion and ourselves observe with proud punctilio the principles of right and of fair play we profess to be fighting for.\n\n…'),
 Document(metadata={'source'

In [48]:
# Embed the chunks using HuggingFace embeddings
from langchain_huggingface import HuggingFaceEmbeddings

In [None]:
# Initialize the embedding model
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
embedding_model

HTTP Error 429 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 2s [Retry 2/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 4s [Retry 3/5].


HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [50]:
embeddings = embedding_model.embed_documents([doc.page_content for doc in documents])
embeddings

[[-0.024979230016469955,
  0.060726016759872437,
  -0.07438061386346817,
  -0.03582463040947914,
  -0.00046451992238871753,
  0.010692726820707321,
  0.05784572288393974,
  -0.044231779873371124,
  -0.02094457484781742,
  0.0728975236415863,
  0.043148502707481384,
  -0.01467179786413908,
  0.06261970847845078,
  0.01238931342959404,
  -0.0034853811375796795,
  0.02171238139271736,
  -0.05261489748954773,
  -0.05676167458295822,
  -0.12298355996608734,
  0.13666757941246033,
  -0.0205841064453125,
  0.0547347366809845,
  0.06125496327877045,
  0.09263648837804794,
  -0.12914802134037018,
  -4.801653267350048e-05,
  0.034465234726667404,
  -0.011513428762555122,
  0.014853750355541706,
  -0.03045596554875374,
  0.04792337119579315,
  -0.11821431666612625,
  0.06203392148017883,
  0.01731581799685955,
  0.028626980260014534,
  0.027012474834918976,
  0.04707871749997139,
  -0.10649622231721878,
  0.047832392156124115,
  -0.106280118227005,
  -0.007873442023992538,
  -0.09647613763809204,

In [52]:
# Store the embeddings in a vector database
from langchain_chroma import Chroma

In [53]:
vectorstore  = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
)
vectorstore

<langchain_chroma.vectorstores.Chroma at 0x26c9b067550>

In [63]:
query = " How to apply?"
docs = vectorstore.similarity_search(query)
docs

[Document(id='3d30e808-d4a5-4a5a-a760-d8a71601fff6', metadata={'moddate': '2024-01-31T19:54:08+05:30', 'page': 1, 'source': '../data/Passport1.pdf', 'author': 'ewis', 'producer': 'FREE PDFill PDF and Image Writer', 'total_pages': 4, 'title': 'Microsoft Word - Instructions for Online TD - Local Applicants -english.doc', 'page_label': '2', 'creationdate': '2024-01-31T19:54:07+05:30', 'creator': 'PScript5.dll Version 5.2.2'}, page_content='5.5 Immediately after the acknowledgment of your ap plication, you will receive an SMS. You are \nrequested NOT to report to the relevant Service Fac ilitating Center to submit fingerprints until you \nreceive the SMS. \n5.6 You shall be ready to furnish the following doc uments to the sub office of the Department for \nRegistration of Persons at Divisional Secretariat.   \n  5.6.1. Message of appointment for providing finge rprints (Mandatory)'),
 Document(id='b6eee32b-7e64-4f2a-82d3-2019370a2db6', metadata={'total_pages': 4, 'moddate': '2024-01-31T19:

In [64]:
docs[1].page_content

'4.  General Instructions   \n4.1 Please log on to the following URL to apply online. \nhttps://www.immigration.gov.lk/  \n4.2 The application must be completed in English. \n4.3 The applicants who have applied under “Urgent servi ce” will be facilitated to receive passports by \ncourier service and under the “Normal service” to r eceive passports by registered post.  \n4.4 The courier charges/postal charges will be borne by the Department.  \n4.5 If you have already obtained a photograph, please e nter the acknowledgement number of the \nphotograph. \n4.6 All documents required to be uploaded shall be in J PEG format and size of file has to be less than 5 \nMB. \n5. Instructions on Service Facilitating Centers \n5.1 You are requested to select one of the following Service Facilitating Centers.  \n  5.1.1  Head office of the Department  \n  5.1.2 Regional offices of the Department \n 5.1.3 Sub offices of the Department for Registrati on of Persons established at nominated'

In [71]:
from langchain_chroma import Chroma
from langchain_core.documents import Document
from typing import List
import os

def store_embeddings_with_vectors(
    documents: List[Document],
    embeddings: List[List[float]],
    persist_directory: str = "./chroma_store"
):
    """
    Store pre-computed document embeddings in a Chroma vector database.

    Args:
        documents (List[Document]): The original documents.
        embeddings (List[List[float]]): Precomputed embeddings.
        persist_directory (str): Directory to save the Chroma vector DB.
    
    Returns:
        Chroma: The Chroma vector store instance.
    """
    # Ensure persistence directory exists
    os.makedirs(persist_directory, exist_ok=True)

    # Create Chroma vector store using pre-computed embeddings
    vectorstore = Chroma.from_documents(
        documents=documents,
        embeddings=embeddings,
        persist_directory=persist_directory
    )

    # Save the vector store to disk
    vectorstore.persist()

    print(f"Stored {len(documents)} documents with pre-computed embeddings to {persist_directory}")
    return vectorstore


In [72]:
store_embeddings_with_vectors(chunks, embeddings, persist_directory="./chroma_store")   

TypeError: Chroma.__init__() got an unexpected keyword argument 'embeddings'