In [1]:
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA


In [2]:
#Read pdfs from the folder

loader = PyPDFDirectoryLoader("c:/Users/vamsh/OneDrive/Desktop/Langchain/Langchain/huggingface/us_census")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
final_documents = text_splitter.split_documents(documents)
final_documents[0]

Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.2 (Windows)', 'creationdate': '2023-09-09T07:52:17-04:00', 'author': 'U.S. Census Bureau', 'keywords': 'acsbr-015', 'moddate': '2023-09-12T14:44:47+01:00', 'title': 'Health Insurance Coverage Status and Type by Geography: 2021 and 2022', 'trapped': '/false', 'source': 'c:\\Users\\vamsh\\OneDrive\\Desktop\\Langchain\\Langchain\\huggingface\\us_census\\acsbr-015.pdf', 'total_pages': 18, 'page': 0, 'page_label': '1'}, page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015\nIssued September 2023\nDouglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect people’s access to \nhealth coverage. For example, between 2021 and 2022, \nthe labor market continued to improve, which may \nhave affected private coverage in the United States \nduring that time.1 Public 

In [3]:
len(final_documents)

316

In [11]:
## Embedding using hUggingFace

huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name = "sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'}
)

In [12]:
import numpy as np  
np.array(huggingface_embeddings.embed_query(final_documents[0].page_content))

array([-2.49750037e-02,  7.48803746e-03,  6.07665442e-02,  2.87433956e-02,
        7.38343671e-02,  1.22540459e-01, -1.49952834e-02, -7.34637398e-03,
       -1.00141063e-01, -1.65566173e-03, -1.06491204e-02,  9.63608548e-02,
       -2.55548172e-02, -1.02058493e-01,  1.60375722e-02,  1.36243366e-02,
        1.47681190e-02, -3.22496593e-02, -2.84116454e-02,  8.82342458e-02,
       -3.01534869e-02,  2.01462917e-02, -5.24294749e-02, -3.65702435e-03,
        1.23102581e-02,  5.61478641e-03,  4.09074202e-02, -1.13199349e-03,
       -9.47215408e-03,  6.47103265e-02,  8.98134559e-02, -5.35427313e-03,
        2.55072191e-02,  1.17485765e-02,  2.94315796e-02, -4.34861965e-02,
       -3.71153727e-02,  2.39203274e-02, -9.60926637e-02,  1.52798379e-02,
       -3.21496390e-02, -4.64301072e-02, -8.10123980e-02,  1.04659721e-01,
        2.16492992e-02, -1.19760667e-03, -4.37057056e-02,  8.71711299e-02,
       -1.37320897e-02,  5.47637120e-02,  7.84206903e-04,  3.08593363e-02,
        3.65715213e-02,  

In [13]:
vector_store = FAISS.from_documents(final_documents[:120], huggingface_embeddings)

In [14]:
## Query using similarity search

query = "What is Health Insurance Covergage?"
relevant_documents = vector_store.similarity_search(query)

print(relevant_documents[0].page_content)

private health insurance as a plan provided through an employer 
or a union, coverage purchased directly by an individual from an 
insurance company or through an exchange (such as healthcare.
gov), or coverage through TRICARE. Public insurance coverage 
includes federal programs (such as Medicare, Medicaid, and the 
Children’s Health Insurance Program or CHIP), individual state 
health plans, and CHAMPVA (Civilian Health and Medical Program 
at the Department of Veterans Affairs), as well as care provided 
by the Department of Veterans Affairs. In the ACS, people are 
considered insured if they were covered by any of these types 
of health insurance at time of interview. People are considered 
uninsured if they were not covered by any of these types of health 
insurance at time of interview or if they only had coverage through 
the Indian Health Service (IHS), as IHS coverage is not considered 
comprehensive.


In [15]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={'k':3})
print(retriever)

tags=['FAISS', 'HuggingFaceBgeEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000027F4234ACC0> search_kwargs={'k': 3}


In [16]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ['HUGGINGFACE_API_KEY']=os.getenv("HUGGINGFACE_API_KEY")

In [None]:
from langchain_community.llms import HuggingFaceHub

hf = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    model_kwargs={"temperature":0.1,"max_length":500}
)