In [None]:
!pip install transformers sentence-transformers langchain==0.3.25 torch faiss-cpu numpy langchain_community pypdf sentence_transformers langchain_huggingface

In [None]:
import os
from urllib.request import urlretrieve

import numpy as np
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.vectorstores import FAISS

# Data loading

In [None]:
# Download documents from U.S. Census Bureau to local directory.
# Files that are already present are skipped, so re-running the notebook
# (Restart & Run All) does not fetch the same PDFs again.
os.makedirs("us_census", exist_ok=True)
files = [
    "https://www.census.gov/content/dam/Census/library/publications/2022/demo/p70-178.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-017.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-016.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-015.pdf",
]
for url in files:
    file_path = os.path.join("us_census", url.rpartition("/")[2])
    if os.path.exists(file_path):
        continue
    # Download under a temporary name and rename on success, so an interrupted
    # transfer never leaves a truncated file that looks like a finished PDF.
    partial_path = file_path + ".part"
    urlretrieve(url, partial_path)
    os.replace(partial_path, file_path)

In [None]:
# Read every PDF found in the local download directory and report how many
# documents the loader produced.
pdf_directory = "./us_census/"
loader = PyPDFDirectoryLoader(pdf_directory)

docs_before_split = loader.load()
num_loaded = len(docs_before_split)
print(num_loaded)

63


# Chunking

In [None]:
# Break the loaded documents into smaller, slightly overlapping chunks.
splitter_settings = {"chunk_size": 700, "chunk_overlap": 50}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
docs_after_split = text_splitter.split_documents(docs_before_split)

# Display the first chunk as a sanity check.
docs_after_split[0]

Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.5 (Windows)', 'creationdate': '2023-10-19T11:35:38-04:00', 'author': 'U.S. Census Bureau', 'keywords': 'household income in states and metropolitan areas 2022', 'moddate': '2023-11-30T12:35:09+00:00', 'title': 'Household Income in States and Metropolitan Areas: 2022', 'trapped': '/false', 'source': 'us_census/acsbr-017.pdf', 'total_pages': 9, 'page': 0, 'page_label': '1'}, page_content='KEY DEFINITIONS\nHousehold income: Includes income of the \nhouseholder and all other people 15 years and \nolder in the household, whether or not they are \nrelated to the householder.\nMedian: The point that divides the household \nincome distribution into halves, one half with \nincome above the median and the other with \nincome below the median. The median is based \non the income distribution of all households, \nincluding those with no income.\nGini index: A summary measure of income \ninequality. The Gini inde

In [None]:
# Keep only the first MAX_CHUNKS chunks for the remaining cells
# (presumably to keep the demo small and fast; raise the limit to index more).
MAX_CHUNKS = 50
docs_after_split = docs_after_split[:MAX_CHUNKS]

# Embedding Model Initialization

In [None]:
# all-MiniLM-L6-v2 is a general-purpose sentence-transformers model, not a BGE model,
# so it is loaded through the plain HuggingFaceEmbeddings wrapper.
# HuggingFaceBgeEmbeddings prepends a BGE-specific retrieval instruction to every query
# before encoding it, which this model was not trained with and which skews query vectors.
huggingface_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",
    model_kwargs={"device": "cpu"},  # run the encoder on CPU
    encode_kwargs={"normalize_embeddings": True},  # request normalized embedding vectors
)

  huggingface_embeddings = HuggingFaceBgeEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Embedding Model Test

In [None]:
# Embed the text of the first chunk to check that the embedding model works,
# then show the resulting vector and its shape.
first_chunk_text = docs_after_split[0].page_content
first_chunk_vector = huggingface_embeddings.embed_query(first_chunk_text)
sample_embedding = np.asarray(first_chunk_vector)
print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

Sample embedding of a document chunk:  [-4.50778641e-02  1.50797488e-02  2.46991422e-02 -6.65423111e-04
 -4.38780524e-02 -1.07891159e-02 -5.07413708e-02  3.44191343e-02
 -5.41927777e-02 -9.30074137e-03 -1.30154910e-02 -5.37789166e-02
  2.13211495e-02  1.02954237e-02 -9.92281437e-02 -6.54240847e-02
 -3.09051052e-02 -1.58301797e-02  1.41140707e-02  6.87583685e-02
  9.43290964e-02  2.07248311e-02  7.63684185e-03 -4.11620140e-02
  1.31620556e-01 -3.66067402e-02  7.67383585e-03 -3.41178812e-02
  2.31406316e-02  1.17237911e-01  2.92219147e-02  7.96835274e-02
  1.58325985e-01 -3.14751416e-02 -3.04871630e-02 -2.70445719e-02
  5.88974282e-02 -1.05799651e-02  9.17627141e-02 -3.91758494e-02
  2.69899424e-02 -5.07464521e-02 -5.43461367e-03 -5.83154075e-02
  2.54384838e-02 -2.80841272e-02  1.25270542e-02  6.83660731e-02
  4.43656109e-02  3.13634425e-02 -9.45478603e-02  6.73463047e-02
  4.78056408e-02  3.92849743e-02  5.03921211e-02  1.96698923e-02
 -1.25939446e-02 -4.76628207e-02  5.71067259e-02  5

# Embedding Creation & Vector DB Data Ingestion

In [None]:
# Embed each chunk with the embedding model and build a FAISS vector store from the results.
vectorstore = FAISS.from_documents(
    documents=docs_after_split,
    embedding=huggingface_embeddings,
)

# Retrieving Chunks

In [None]:
# Sample question, change to other questions you are interested in.
query = "Details of Clayton Gumber and Briana Sullivan"

# Ask the vector store for the TOP_K chunks most similar to the query.
TOP_K = 4
relevant_documents = vectorstore.similarity_search(query, k=TOP_K)

top_hit = relevant_documents[0]
print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query. Display the first one:\n')
print(top_hit.page_content)

There are 4 documents retrieved which are relevant to the query. Display the first one:

Occupation, Earnings, and Job 
Characteristics
July 2022
P70-178
Clayton Gumber and Briana Sullivan
Current Population Reports
INTRODUCTION
Work is a critical component of our lives and provides 
a way to obtain material and nonmonetary benefits 
like employer-provided health insurance. Scholars 
suggest that our identities are also tied to the notion 
of “what we do” (Christiansen, 1999), and that who 
we are is determined partly by our occupational iden -
tity (Skorikov and Vondracek, 2011). However, work 
is time consuming—the American Time Use Survey 
shows that in 2017 workers spent an average 8.21 
hours each day engaged in work and work-related


In [None]:
# Merge the retrieved chunks into a single context string.
# A blank line is placed between chunks: concatenating them directly fuses the
# last word of one chunk with the first word of the next, which garbles the
# context handed to the LLM.
final_content = "\n\n".join(doc.page_content for doc in relevant_documents)
final_content

'Occupation, Earnings, and Job \nCharacteristics\nJuly 2022\nP70-178\nClayton Gumber and Briana Sullivan\nCurrent Population Reports\nINTRODUCTION\nWork is a critical component of our lives and provides \na way to obtain material and nonmonetary benefits \nlike employer-provided health insurance. Scholars \nsuggest that our identities are also tied to the notion \nof “what we do” (Christiansen, 1999), and that who \nwe are is determined partly by our occupational iden -\ntity (Skorikov and Vondracek, 2011). However, work \nis time consuming—the American Time Use Survey \nshows that in 2017 workers spent an average 8.21 \nhours each day engaged in work and work-relatedaccuracy-statements.html >.\nThe 2018 ACS 1-year Accuracy \nof the Data document is located \nat <www.census.gov/programs-\nsurveys/acs/technical-documen -\ntation/code-lists.html>.\nCONTACTS\nFor more information on the \nSIPP, including data and meth -\nodology, please contact the SIPP \nCoordination and Outreach staff \

In [None]:
# Prompt layout: answering rules first, then the retrieved context, then the
# user's question, ending with an answer cue for the model to continue from.
PROMPT_TEMPLATE = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

{final_content}

Question: {query}

Helpful Answer:
"""

# Fill the two placeholders with the retrieved context and the query.
final_prompt = PROMPT_TEMPLATE.format(final_content=final_content, query=query)
final_prompt

'Use the following pieces of context to answer the question at the end. Please follow the following rules:\n1. If you don\'t know the answer, don\'t try to make up an answer. Just say "I can\'t find the final answer but you may want to check the following links".\n2. If you find the answer, write the answer in a concise way with five sentences maximum.\n\nOccupation, Earnings, and Job \nCharacteristics\nJuly 2022\nP70-178\nClayton Gumber and Briana Sullivan\nCurrent Population Reports\nINTRODUCTION\nWork is a critical component of our lives and provides \na way to obtain material and nonmonetary benefits \nlike employer-provided health insurance. Scholars \nsuggest that our identities are also tied to the notion \nof “what we do” (Christiansen, 1999), and that who \nwe are is determined partly by our occupational iden -\ntity (Skorikov and Vondracek, 2011). However, work \nis time consuming—the American Time Use Survey \nshows that in 2017 workers spent an average 8.21 \nhours each day

In [None]:
!pip install groq

# Answer Generation

In [None]:
import os
from getpass import getpass

from groq import Groq

# Never hardcode the API key in the notebook: it would be saved and shared with the file.
# Read it from the GROQ_API_KEY environment variable, and prompt for it (without echoing)
# only when the variable is not set.
groq_api_key = os.environ.get("GROQ_API_KEY") or getpass("Enter your Groq API key: ")
client = Groq(api_key=groq_api_key)

# Send the assembled RAG prompt (rules + retrieved context + question) as the user message.
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant."
        },
        {
            "role": "user",
            "content": final_prompt,
        }
    ],
    model="llama-3.3-70b-versatile",
)

# Show the text of the first returned choice.
print(chat_completion.choices[0].message.content)

Clayton Gumber and Briana Sullivan are the authors of the report "Occupation, Earnings, and Job Characteristics" (Current Population Reports, P70-178). They are affiliated with the U.S. Census Bureau. Clayton Gumber can be contacted at Clayton.M.Gumber@census.gov for further information on the report's content. Briana Sullivan is a co-author of the report, but her contact information is not provided. For more information on the Survey of Income and Program Participation (SIPP), readers can contact the SIPP Coordination and Outreach staff at census.sipp@census.gov.


In [None]:
!pip freeze

absl-py==1.4.0
accelerate==1.7.0
aiofiles==24.1.0
aiohappyeyeballs==2.6.1
aiohttp==3.11.15
aiosignal==1.3.2
alabaster==1.0.0
albucore==0.0.24
albumentations==2.0.8
ale-py==0.11.1
altair==5.5.0
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.9.0
argon2-cffi==25.1.0
argon2-cffi-bindings==21.2.0
array_record==0.7.2
arviz==0.21.0
astropy==7.1.0
astropy-iers-data==0.2025.6.2.0.38.23
astunparse==1.6.3
atpublic==5.1
attrs==25.3.0
audioread==3.0.1
autograd==1.8.0
babel==2.17.0
backcall==0.2.0
backports.tarfile==1.2.0
beautifulsoup4==4.13.4
betterproto==2.0.0b6
bigframes==2.5.0
bigquery-magics==0.9.0
bleach==6.2.0
blinker==1.9.0
blis==1.3.0
blobfile==3.0.0
blosc2==3.3.4
bokeh==3.7.3
Bottleneck==1.4.2
bqplot==0.12.45
branca==0.8.1
build==1.2.2.post1
CacheControl==0.14.3
cachetools==5.5.2
catalogue==2.0.10
certifi==2025.4.26
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.2
chex==0.1.89
clarabel==0.11.0
click==8.2.1
cloudpathlib==0.21.1
cloudpickle==3.1.1
cmake==3.31.6
cmdstanpy