In [1]:
!!pip install -U langchain-community faiss-cpu pypdf pillow torch transformers huggingface-hub




In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate

from langchain.chains import RetrievalQA



In [3]:
## Read the PDFs from the folder
loader = PyPDFDirectoryLoader("./census")
documents = loader.load()

# Break the loaded documents into 1000-character chunks that overlap by 200
# characters, so text cut at a chunk boundary also appears in the next chunk.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
final_documents = text_splitter.split_documents(documents)

# Show the first chunk as a quick sanity check.
final_documents[0]

Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.5 (Windows)', 'creationdate': '2023-10-19T11:35:38-04:00', 'author': 'U.S. Census Bureau', 'keywords': 'household income in states and metropolitan areas 2022', 'moddate': '2023-11-30T12:35:09+00:00', 'title': 'Household Income in States and Metropolitan Areas: 2022', 'trapped': '/false', 'source': 'census/acsbr-017.pdf', 'total_pages': 9, 'page': 0, 'page_label': '1'}, page_content='KEY DEFINITIONS\nHousehold income: Includes income of the \nhouseholder and all other people 15 years and \nolder in the household, whether or not they are \nrelated to the householder.\nMedian: The point that divides the household \nincome distribution into halves, one half with \nincome above the median and the other with \nincome below the median. The median is based \non the income distribution of all households, \nincluding those with no income.\nGini index: A summary measure of income \ninequality. The Gini index v

In [4]:
# Number of chunks produced by the text splitter.
len(final_documents)

316

In [5]:
## Embedding Using Huggingface
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never hardcode an access token in a notebook. The token that was
# previously committed in this cell must be treated as leaked and revoked at
# https://huggingface.co/settings/tokens.
# Read the token from the HF_TOKEN environment variable, falling back to an
# interactive prompt so the value is never stored in the notebook.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

# BGE sentence-embedding model, run on CPU. Embeddings are L2-normalised via
# `normalize_embeddings=True`.
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",  # alternative: sentence-transformers/all-MiniLM-l6-v2
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

  huggingface_embeddings=HuggingFaceBgeEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
import numpy as np

# Embed the first chunk once and reuse the vector; the original ran the
# embedding model twice on the same text just to print the value and its shape.
sample_embedding = np.array(
    huggingface_embeddings.embed_query(final_documents[0].page_content)
)
print(sample_embedding)
print(sample_embedding.shape)

[-2.57332940e-02 -3.59337442e-02 -1.84767619e-02 -3.56312394e-02
  4.11861651e-02  1.88778229e-02  9.25847515e-02 -2.68622991e-02
  3.54797877e-02  3.44294310e-02  5.45945019e-02 -4.76480871e-02
  3.22299078e-02  2.65094806e-02  2.59273220e-03 -7.35303666e-03
 -2.04164125e-02  5.36096990e-02 -2.23209150e-02  1.09560834e-02
  1.45142689e-01 -4.14664373e-02 -5.62099591e-02 -9.70577728e-03
  1.53211296e-01  2.54773442e-02 -5.28363325e-03 -3.46149132e-02
 -4.65290993e-02 -1.32081121e-01  4.12024967e-02  3.17118652e-02
  3.41978483e-02  1.28360530e-02 -2.74796691e-03  3.88467405e-03
  6.42073900e-03  5.70742823e-02  3.01501970e-03 -1.28241684e-02
 -3.67485061e-02 -4.79584094e-03  2.59009860e-02 -2.68718079e-02
 -5.77792935e-02  2.40455344e-02 -4.64231893e-02  4.44490239e-02
 -1.91176459e-02  4.33444455e-02 -7.53747672e-02 -3.72263370e-03
 -2.46607773e-02  6.12502098e-02  2.41764542e-02 -4.93594296e-02
  6.81420267e-02 -1.69117581e-02 -7.62011716e-03  5.50068505e-02
  4.86901775e-02  2.48413

In [7]:
## VectorStore Creation
# NOTE(review): only the first 120 chunks are indexed, so anything in the
# remaining chunks cannot be retrieved. Presumably this keeps the CPU embedding
# step short; confirm whether the full corpus should be indexed.
indexed_documents = final_documents[:120]
vectorstore = FAISS.from_documents(indexed_documents, huggingface_embeddings)

In [8]:
## Query using Similarity Search
query = "WHAT IS HEALTH INSURANCE COVERAGE?"

# Fetch the chunks closest to the query directly from the FAISS index.
# (Variable name fixed: it was previously misspelled `relevant_docments`.)
relevant_documents = vectorstore.similarity_search(query)

# Show the text of the best match.
print(relevant_documents[0].page_content)

16 U.S. Census Bureau
Employer-Provided Health 
Insurance Coverage
Earnings paid to employees are 
only one component of a worker’s 
total compensation. In addition to 
wages and salaries, which repre -
sent over 60 percent of employ-
ers’ costs for employee compen -
sation (Bureau of Labor Statistics, 
2018), employers may confer 
nonwage benefits to employees 
like paid leave, retirement and 
savings plans, and health insur -
ance. According to the Bureau 
of Labor Statistics (2018), health 
insurance made up 8.3 percent 
of employers’ costs for the total 
compensation awarded to civilian 
workers in December 2017.
Given the overall cost of health 
insurance for employers, and the 
importance of health insurance 
to workers, this report explores 
workers’ private health insurance 
as well as their source of cover-
age. In contrast with other sur-
veys, the SIPP collects detailed 
information on individuals’ health 
insurance at a monthly level. For 
each month, the SIPP collects


In [9]:
# Expose the FAISS index as a retriever that performs a similarity search and
# returns 3 chunks per query (k=3).
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
print(retriever)

tags=['FAISS', 'HuggingFaceBgeEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7c607c3088c0> search_kwargs={'k': 3}


In [10]:
import os
from getpass import getpass

# The original assignment was an unterminated string literal (a SyntaxError),
# most likely left behind when a pasted token was stripped out. Keep secrets out
# of the notebook: reuse a value already present in the environment, otherwise
# prompt for one without echoing it.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = (
        os.environ.get("HF_TOKEN") or getpass("Hugging Face Hub API token: ")
    )

In [12]:
import torch

# Report whether a CUDA device is visible to PyTorch.
cuda_available = torch.cuda.is_available()
print(cuda_available)  # Should return True on a GPU runtime

# Only query the device name when CUDA is present; get_device_name(0) raises
# on a CPU-only runtime, which would abort a "Run All".
if cuda_available:
    print(torch.cuda.get_device_name(0))  # Show the name of your GPU
else:
    print("No CUDA device detected; switch the runtime to a GPU before loading the LLM.")


True
Tesla T4


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the Mistral-7B base model with transformers directly, in half precision
# on the GPU, and generate a short continuation for a sample question.
# NOTE(review): the recorded output warns that `torch_dtype` is deprecated in
# favour of `dtype`; switch once the minimum transformers version is settled.
model_id = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
model = model.to("cuda")

input_text = "What is the health insurance coverage?"
encoded = tokenizer(input_text, return_tensors="pt")
# Move every input tensor onto the same device as the model.
inputs = {name: tensor.to("cuda") for name, tensor in encoded.items()}

# Inference only: no gradients needed.
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=50)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The Hugging Face Hub is an platform with over 350k models, 75k datasets, and 150k demo apps (Spaces), all open source and publicly available, in an online platform where people can easily collaborate and build ML together.

In [6]:
!pip uninstall -y torch torchvision torchaudio
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117


Found existing installation: torch 2.8.0+cu126
Uninstalling torch-2.8.0+cu126:
  Successfully uninstalled torch-2.8.0+cu126
Found existing installation: torchvision 0.23.0+cu126
Uninstalling torchvision-0.23.0+cu126:
  Successfully uninstalled torchvision-0.23.0+cu126
Found existing installation: torchaudio 2.8.0+cu126
Uninstalling torchaudio-2.8.0+cu126:
  Successfully uninstalled torchaudio-2.8.0+cu126
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu117
Collecting torch
  Downloading torch-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting torchvision
  Downloading torchvision-0.23.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading torchaudio-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)
Collecting nvi

In [1]:
# Hugging Face models can be run locally through the HuggingFacePipeline class.
import torch
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

# Without an explicit device the pipeline stays on the CPU, where a 7B model is
# impractically slow (the recorded run had to be interrupted). Use the first GPU
# when one is available; -1 keeps the pipeline on the CPU.
pipeline_device = 0 if torch.cuda.is_available() else -1

# NOTE(review): if an earlier cell already holds a copy of this model on the
# GPU, free it first (`del model; torch.cuda.empty_cache()`) — two fp16 copies
# of a 7B model are unlikely to fit on a single 16 GB card.
hf = HuggingFacePipeline.from_model_id(
    model_id="mistralai/Mistral-7B-v0.1",
    task="text-generation",
    device=pipeline_device,
    # Half precision on GPU halves the memory footprint; keep the default dtype on CPU.
    model_kwargs={"torch_dtype": torch.float16} if pipeline_device >= 0 else None,
    # Greedy decoding: `do_sample=False` is the supported way to get deterministic
    # output. `temperature=0` is not a valid sampling temperature.
    pipeline_kwargs={"do_sample": False, "max_new_tokens": 300},
)

llm = hf
# `query` must already be defined by the similarity-search cell above.
llm.invoke(query)

KeyboardInterrupt: 

In [None]:
# Prompt text for the QA chain. `{context}` and `{question}` are the two
# placeholders declared as input variables when the PromptTemplate is built.
# The literal is whitespace-sensitive (it is sent to the model verbatim), so it
# is kept exactly as written.
prompt_template="""
Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

{context}
Question:{question}

Helpful Answers:
 """

In [None]:
# Wrap the raw template string; the chain fills in `context` and `question`.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [None]:
# Build the question-answering chain: retrieved chunks are "stuffed" into the
# custom prompt and passed to the local LLM. Source documents are returned
# alongside the answer.
qa_chain_kwargs = {"prompt": prompt}
retrievalQA = RetrievalQA.from_chain_type(
    llm=hf,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=qa_chain_kwargs,
    return_source_documents=True,
)

In [None]:
# Question for the QA chain: a section heading from the report, including the
# line breaks it has in the PDF text.
query = (
    "DIFFERENCES IN THE\n"
    "UNINSURED RATE BY STATE\n"
    "IN 2022"
)

In [None]:
# Run the retrieval-QA chain on the query and show the generated answer,
# which the chain returns under the "result" key.
chain_input = {"query": query}
result = retrievalQA.invoke(chain_input)
print(result["result"])


Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

comparison of ACS and CPS ASEC measures 
of health insurance coverage, refer to < www.
census.gov/topics/health/health-insurance/
guidance.html >.
9 Respondents may have more than one 
health insurance coverage type at the time 
of interview. As a result, adding the total 
number of people with private coverage and 
the total number with public coverage will 
sum to more than the total number with any 
coverage.• From 2021 to 2022, nine states 
reported increases in private 
coverage, while seven reported 
decreases (Appendix Table B-2). 
DIFFERENCES IN THE 
UNINSURED RATE BY STATE 
IN 2022
In 2022, uninsured rates at the 
time of interview ranged across 
states from a low of 2.4 percent 
in Massachusetts to a high of 16.6 
percent in Texas, compared to the 
national rate of 8.0 percent.10 Ten 
of the 15 states with uninsured 
10 The uninsured rates in the Distr