In [2]:
!pip install PyPDF2 pandas sentence-transformers faiss-cpu


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2, faiss-cpu
Successfully installed PyPDF2-3.0.1 faiss-cpu-1.9.0.post1


In [3]:
import requests

# Download the PDF
url = "https://www.hunter.cuny.edu/dolciani/pdf_files/workshop-materials/mmc-presentations/tables-charts-and-graphs-with-examples-from.pdf"
pdf_path = "tables-charts-and-graphs.pdf"

response = requests.get(url)
with open(pdf_path, "wb") as file:
    file.write(response.content)


In [4]:
from PyPDF2 import PdfReader

def extract_text_from_pdf(file_path, page_number):
    pdf_reader = PdfReader(file_path)
    page = pdf_reader.pages[page_number]
    return page.extract_text()

# Extract text from page 2 (index 1) and page 6 (index 5)
page_2_text = extract_text_from_pdf(pdf_path, 1)
page_6_text = extract_text_from_pdf(pdf_path, 5)

print("Page 2 Text:\n", page_2_text)
print("\nPage 6 Text:\n", page_6_text)


Page 2 Text:
 

Page 6 Text:
 Table of Yearly U.S. GDP by 
Industry (in millions of dollars)
Year 2010 2011 2012 2013 2014 2015
All Industries 26093515 27535971 28663246 29601191 30895407 31397023
Manufacturing 4992521 5581942 5841608 5953299 6047477 5829554
Finance,
Insurance, Real 
Estate, Rental, 
Leasing4522451 4618678 4797313 5031881 5339678 5597018
Arts, 
Entertainment, 
Recreation, 
Accommodation,
and Food Service964032 1015238 1076249 1120496 1189646 1283813
Other 15614511 16320113 16948076 17495515 18318606 18686638Source: U.S. Bureau of Labor Statistics


In [6]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Prepare data for RAG
data = [page_2_text, page_6_text]
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode data
embeddings = model.encode(data)
embeddings = np.array(embeddings)  # Ensure embeddings are in NumPy array format

# Verify embedding shape
print("Embedding Shape:", embeddings.shape)

# Create FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])  # Ensure embedding dimension matches
index.add(embeddings)

# Query RAG model
def query_rag(query, index, model, data):
    query_vec = model.encode([query])
    query_vec = np.array(query_vec)  # Convert query vector to NumPy array
    distances, indices = index.search(query_vec, k=1)
    return data[indices[0][0]]

# Example queries
query_1 = "Unemployment rate for a Bachelor's degree"
query_2 = "Table data on page 6"

# Retrieve results
result_1 = query_rag(query_1, index, model, data)
result_2 = query_rag(query_2, index, model, data)

print("Query 1 Result:\n", result_1)
print("\nQuery 2 Result:\n", result_2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding Shape: (2, 384)
Query 1 Result:
 Table of Yearly U.S. GDP by 
Industry (in millions of dollars)
Year 2010 2011 2012 2013 2014 2015
All Industries 26093515 27535971 28663246 29601191 30895407 31397023
Manufacturing 4992521 5581942 5841608 5953299 6047477 5829554
Finance,
Insurance, Real 
Estate, Rental, 
Leasing4522451 4618678 4797313 5031881 5339678 5597018
Arts, 
Entertainment, 
Recreation, 
Accommodation,
and Food Service964032 1015238 1076249 1120496 1189646 1283813
Other 15614511 16320113 16948076 17495515 18318606 18686638Source: U.S. Bureau of Labor Statistics

Query 2 Result:
 Table of Yearly U.S. GDP by 
Industry (in millions of dollars)
Year 2010 2011 2012 2013 2014 2015
All Industries 26093515 27535971 28663246 29601191 30895407 31397023
Manufacturing 4992521 5581942 5841608 5953299 6047477 5829554
Finance,
Insurance, Real 
Estate, Rental, 
Leasing4522451 4618678 4797313 5031881 5339678 5597018
Arts, 
Entertainment, 
Recreation, 
Accommodation,
and Food Service96403

In [7]:
def chatbot_interface(query):
    if "unemployment" in query.lower():
        return parse_unemployment_data(page_2_text)
    elif "table" in query.lower():
        return table_data
    else:
        return "I can only answer questions about unemployment rates or table data."

# Example interaction
user_query = input("Ask me a question: ")
response = chatbot_interface(user_query)
print("\nChatbot Response:\n", response)


Ask me a question: what is content page2

Chatbot Response:
 I can only answer questions about unemployment rates or table data.
