In [None]:
pip install transformers sentence-transformers langchain torch faiss-cpu numpy



In [None]:
!pip install pypdf



In [None]:
import os
from urllib.request import urlretrieve
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [None]:
# Fetch the U.S. Census Bureau reports used as the knowledge base and store
# them in a local folder. `exist_ok=True` lets the cell be re-run safely.
download_dir = "us_census"
os.makedirs(download_dir, exist_ok=True)

files = [
    "https://www.census.gov/content/dam/Census/library/publications/2022/demo/p70-178.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-017.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-016.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-015.pdf",
]

for url in files:
    # The local file name is the last path segment of the URL.
    file_name = url.rsplit("/", 1)[-1]
    file_path = os.path.join(download_dir, file_name)
    urlretrieve(url, file_path)

In [None]:
# Load the PDF files from the local directory.
loader = PyPDFDirectoryLoader("./us_census/")
docs_before_split = loader.load()

# Cut the loaded documents into overlapping chunks so each piece is small
# enough to embed and retrieve on its own; the 50-unit overlap keeps some
# shared context across neighbouring chunk boundaries.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=50)
docs_after_split = text_splitter.split_documents(docs_before_split)

# Display the first chunk as a sanity check.
docs_after_split[0]

Document(metadata={'source': 'us_census/p70-178.pdf', 'page': 0}, page_content='Occupation, Earnings, and Job \nCharacteristics\nJuly 2022P70-178Clayton Gumber and Briana SullivanCurrent Population Reports\nINTRODUCTION\nWork is a critical component of our lives and provides \na way to obtain material and nonmonetary benefits \nlike employer-provided health insurance. Scholars \nsuggest that our identities are also tied to the notion \nof “what we do” (Christiansen, 1999), and that who \nwe are is determined partly by our occupational iden -\ntity (Skorikov and Vondracek, 2011). However, work is time consuming—the American Time Use Survey \nshows that in 2017 workers spent an average 8.21')

In [None]:
def avg_doc_length(docs):
    """Return the average number of characters per document, rounded down.

    Args:
        docs: A sized iterable of objects exposing a ``page_content`` string
            (e.g. the documents produced by the loader / text splitter).

    Returns:
        The floor of the mean ``len(doc.page_content)`` as an ``int``, or ``0``
        when ``docs`` is empty (instead of raising ``ZeroDivisionError``).
    """
    if not docs:
        return 0
    # A generator avoids building a throwaway list just to sum it.
    return sum(len(doc.page_content) for doc in docs) // len(docs)
# Average document size before and after chunking, computed with the same helper.
avg_char_before_split, avg_char_after_split = map(
    avg_doc_length, (docs_before_split, docs_after_split)
)

# Report how the splitter changed the number and size of the documents.
print(f'Before split, there were {len(docs_before_split)} documents loaded, with average characters equal to {avg_char_before_split}.')
print(f'After split, there were {len(docs_after_split)} documents (chunks), with average characters equal to {avg_char_after_split} (average chunk length).')

Before split, there were 63 documents loaded, with average characters equal to 3830.
After split, there were 400 documents (chunks), with average characters equal to 618 (average chunk length).


In [None]:
# Embedding model used for both indexing and querying.
# Alternatively use "sentence-transformers/all-MiniLM-l6-v2" for a light and faster experience.
embedding_model_name = "BAAI/bge-small-en-v1.5"

huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name=embedding_model_name,
    model_kwargs={"device": "cpu"},  # run the encoder on CPU
    encode_kwargs={"normalize_embeddings": True},  # ask the encoder for normalized vectors
)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Embed the first chunk to inspect what the embedding model produces.
first_chunk_text = docs_after_split[0].page_content
sample_embedding = np.array(huggingface_embeddings.embed_query(first_chunk_text))

print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

Sample embedding of a document chunk:  [ 4.45974525e-03  2.45670304e-02 -9.26598441e-03 -1.57660805e-02
  2.28831787e-02  5.06654419e-02  6.28824010e-02 -4.42066453e-02
  1.57757532e-02 -8.59610178e-03  4.39736098e-02  3.47311832e-02
 -3.64995413e-02 -2.15880629e-02 -2.73355823e-02  1.86276669e-03
  7.97495432e-03 -1.32879382e-02 -3.86800282e-02  2.56957323e-03
  3.25495377e-03 -7.44617265e-03 -3.43394615e-02 -1.11159673e-02
  3.93330865e-02  2.13205907e-02 -1.46255931e-02 -1.28421504e-02
  5.33078052e-03 -9.22772735e-02 -2.51137326e-03  3.05945091e-02
  7.01794848e-02  4.55944873e-02  5.70631102e-02  1.62480865e-02
 -2.30799839e-02  5.79102375e-02 -8.16257391e-03  2.15840735e-03
  1.87474198e-03  1.72113162e-02 -3.75275128e-02  2.44996371e-03
 -1.03556486e-02  8.26411098e-02 -4.40971740e-02 -4.65424396e-02
 -6.70143217e-02  2.73963213e-02  4.45195194e-03  6.88931206e-03
  3.49869542e-02  1.23116158e-01  1.69170480e-02 -6.50550285e-03
  4.83519249e-02  4.96574561e-04 -1.58660021e-02 -3

In [None]:
# Embed every chunk and build the FAISS index that is searched at query time.
vectorstore = FAISS.from_documents(
    documents=docs_after_split,
    embedding=huggingface_embeddings,
)

In [None]:
# Sample question, change to other questions you are interested in.
query = """What were the trends in median household income across
           different states in the United States between 2021 and 2022."""

# Query the vector store directly (no LLM involved yet) to see which chunks
# it considers closest to the question.
relevant_documents = vectorstore.similarity_search(query)
print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query. Display the first one:\n')

top_match = relevant_documents[0]
print(top_match.page_content)

There are 4 documents retrieved which are relevant to the query. Display the first one:

in 2022 was $74,755, according 
Figure 1.
Median Household Income in the Past 12 Months in the United States: 2005–2022
 
Note: Estimates for 2020 experimental data not shown. For more information on the 2020 experimental data products, 
refer to <www.census.gov/programs-surveys/acs/technical-documentation/user-notes/2021-02.html>. Information on conﬁdentiality protection, sampling error, nonsampling error, and deﬁnitions is available at <www.census.gov/acs>.
Source: U.S. Census Bureau, 2005–2022 American Community Survey, 1-year estimates.Recession
/zero.tab/five.tab/five.tab/six.tab/zero.tab/six.tab/five.tab/seven.tab/zero.tab/seven.tab/five.tab/eight.tab/zero.tab


In [None]:
# Wrap the vector store as a retriever that uses similarity search and
# returns the 3 most relevant chunks per query.
retriever_search_kwargs = {"k": 3}
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=retriever_search_kwargs,
)

In [None]:
from getpass import getpass

from langchain_community.llms import HuggingFaceHub

# SECURITY: never hardcode API tokens in a notebook -- anything saved here is
# readable by everyone the file is shared with. Any token that was previously
# committed in this notebook must be considered leaked and should be revoked at
# https://huggingface.co/settings/tokens.
# The token is read from the HUGGINGFACEHUB_API_TOKEN environment variable;
# when it is not set, the user is prompted for it without echoing the input.
hf_api_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN") or getpass("Hugging Face API token: ")

# Hosted Mistral-7B endpoint on the Hugging Face Hub, queried WITHOUT any
# retrieved context (baseline to compare against the RAG chain).
hf = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",  # Ensure the model ID is correct
    model_kwargs={"temperature": 0.1, "max_length": 500},
    huggingfacehub_api_token=hf_api_token,
)

query = """What were the trends in median household income across different states in the United States between 2021 and 2022."""
response = hf.invoke(query)
print(response)


  warn_deprecated(


What were the trends in median household income across different states in the United States between 2021 and 2022.

## Introduction

The median household income in the United States increased by 1.3% from 2021 to 2022, according to data from the U.S. Census Bureau. The median household income in 2022 was $67,521, up from $66,516 in 2021.

The increase in median household income was driven by an increase in earnings for full-time


In [None]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

# Text-generation pipeline built from the "gpt2" model id.
generation_kwargs = {"temperature": 0.7, "max_new_tokens": 300}  # Set temperature to a positive value
hf = HuggingFacePipeline.from_model_id(
    model_id="gpt2",
    task="text-generation",
    pipeline_kwargs=generation_kwargs,
)
# `llm` is the name the RetrievalQA chain is built with.
llm = hf

# Ask the bare model (no retrieved context) the sample question.
query = "What were the trends in median household income across different states in the United States between 2021 and 2022?"
response = hf.invoke(query)
print(response)

What were the trends in median household income across different states in the United States between 2021 and 2022?

The average income of the median household in a different state is lower than the average income for the same state.

Did you have a choice between buying private insurance, or paying a premium?

The average premium for private insurance in a state is $1,735.

Did you own insurance or insurance with you?

Private insurance with you is a good choice but you could probably get paid less if you buy a premium.

What if I have a medical condition and am sick?

You could buy insurance with you for $1,000 if you have a medical condition, but you could easily get paid less if you do have a physical condition.

What if your condition is not life-threatening or you get sick?

If you can't reach for help, you could get paid more.

How much money do you have to buy insurance with your insurance?

You could pay up to $1,000 for insurance.

How much are you paying for insurance?

You 

In [None]:
# Instruction template for the QA chain. It has two placeholders:
#   {context}  - where the context for the question is inserted
#   {question} - where the user's question is inserted
prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

{context}

Question: {question}

Helpful Answer:
"""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [None]:


# Assemble the retrieval-augmented QA chain from the initialized LLM, the
# retriever and the custom prompt, using the "stuff" chain type.
qa_chain_kwargs = {"prompt": PROMPT}

retrievalQA = RetrievalQA.from_chain_type(
    llm=llm,  # the LLM initialized in an earlier cell
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=qa_chain_kwargs,
    return_source_documents=True,  # keep the retrieved chunks in the result for inspection
)

In [None]:
# Run the sample question through the QA chain and show the generated answer.
result = retrievalQA.invoke({"query": query})
answer_text = result["result"]
print(answer_text)

Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

hold income in 2022 was $24,112 
(Table 1 and Figure 2). Median 
household income was lower than 
the U.S. median in 30 states and 
Puerto Rico. It was higher than the 
U.S. median in 17 states and the 
District of Columbia. The medians 
for Arizona, Oregon, and Vermont were not statistically different from 
the U.S. median.
From 2021 to 2022, five states—
Alabama, Alaska, Delaware, Florida, 
and Utah—showed a statistically 
significant increase in real median 
household income; 17 states 
showed a decrease. Real median 
household income in 2022 was not 
statistically different from that in 
2021 for 28 states, the District of 
Columbia, and 

In [None]:
# List the chunks the chain returned as sources for the answer, so it can be
# traced back to its source file and page.
relevant_docs = result['source_documents']

# The summary line is printed once, before the listing. It used to be repeated
# inside the loop as well, which echoed it after every single document.
print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')
print("*" * 100)
for doc_number, doc in enumerate(relevant_docs, start=1):
    print(f"Relevant Document #{doc_number}:\nSource file: {doc.metadata['source']}, Page: {doc.metadata['page']}\nContent: {doc.page_content}")
    print("-"*100)

There are 3 documents retrieved which are relevant to the query.
****************************************************************************************************
Relevant Document #1:
Source file: us_census/acsbr-017.pdf, Page: 3
Content: hold income in 2022 was $24,112 
(Table 1 and Figure 2). Median 
household income was lower than 
the U.S. median in 30 states and 
Puerto Rico. It was higher than the 
U.S. median in 17 states and the 
District of Columbia. The medians 
for Arizona, Oregon, and Vermont were not statistically different from 
the U.S. median.
From 2021 to 2022, five states—
Alabama, Alaska, Delaware, Florida, 
and Utah—showed a statistically 
significant increase in real median 
household income; 17 states 
showed a decrease. Real median 
household income in 2022 was not 
statistically different from that in 
2021 for 28 states, the District of 
Columbia, and Puerto Rico  
(Table 1).
----------------------------------------------------------------------------------