In [2]:
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The text of all pages joined in page order, with no separator
        inserted between pages.
    """
    # The context manager closes the document once every page has been read.
    with fitz.open(pdf_path) as pdf_doc:
        page_texts = [pdf_page.get_text() for pdf_page in pdf_doc]
    return "".join(page_texts)

# Example PDF file path. The path is relative, so it is resolved against the
# kernel's current working directory.
pdf_path = "tables-charts-and-graphs-with-examples-from.pdf"  # Replace with your PDF file path
# Full text of the document; the chunking cell below consumes this string.
pdf_content = extract_text_from_pdf(pdf_path)

In [3]:
# Load the embedding model once at module level; both
# create_chunks_and_embeddings and handle_query read this global.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to create chunks and embeddings
def create_chunks_and_embeddings(content, chunk_size=512):
    """Split text into fixed-size character chunks and embed each chunk.

    Args:
        content: The text to split.
        chunk_size: Maximum number of characters per chunk; must be
            positive. The final chunk may be shorter.

    Returns:
        A ``(chunks, embeddings)`` tuple: the list of chunk strings and the
        result of ``embedding_model.encode(chunks)``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # Reject bad sizes up front: zero makes range() fail with an unrelated
    # message, and a negative step silently yields no chunks at all.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Consecutive, non-overlapping character windows. Boundaries are purely
    # positional, so a chunk can start or end in the middle of a word.
    chunks = [content[start:start + chunk_size] for start in range(0, len(content), chunk_size)]
    embeddings = embedding_model.encode(chunks)
    return chunks, embeddings

# Chunk the extracted PDF text and embed every chunk
chunks, embeddings = create_chunks_and_embeddings(pdf_content)

# Cast to a float32 numpy array before handing the vectors to FAISS
embeddings = np.array(embeddings, dtype='float32')

# Build a flat L2-distance index sized to the embedding dimension
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

In [4]:
def handle_query(query, k=5):
    """Return the indices of the chunks most similar to a query.

    Args:
        query: The question text to embed and search for.
        k: Number of nearest neighbours to request from the index
            (defaults to 5).

    Returns:
        A numpy array of chunk indices taken from the first (only) row of
        the search result. It holds fewer than ``k`` entries when the index
        contains fewer than ``k`` vectors.
    """
    # Embed the query the same way the chunks were embedded.
    query_embedding = embedding_model.encode([query])
    query_vector = np.array(query_embedding).astype('float32')

    # Distances are not needed by callers, only the neighbour ids.
    _, neighbor_ids = index.search(query_vector, k=k)
    top_ids = neighbor_ids[0]

    # FAISS pads the result with -1 when it finds fewer than k neighbours.
    # Used as a Python list index, -1 would silently select the last chunk,
    # so drop those placeholders here.
    return top_ids[top_ids >= 0]

# Example query
user_query = "What is the unemployment information based on type of degree?"
relevant_indices = handle_query(user_query)

# Look up the text of each retrieved chunk, keeping the search order
relevant_chunks = []
for chunk_idx in relevant_indices:
    relevant_chunks.append(chunks[chunk_idx])

In [5]:
from transformers import pipeline

# Load a pre-trained language model for response generation.
# generate_response reads this global; the recorded output of this cell
# shows the pipeline running on CPU.
llm = pipeline("text-generation", model="gpt2")

def generate_response(relevant_chunks, user_query, max_new_tokens=50, return_full_text=True):
    """Generate an answer to a question from retrieved context chunks.

    Args:
        relevant_chunks: Iterable of chunk strings used as context; they are
            joined with single spaces in the order given.
        user_query: The question to answer.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt (defaults to 50).
        return_full_text: When True (the default, matching the previous
            behaviour) the returned string starts with the whole prompt,
            context included. Pass False to get only the generated
            continuation.

    Returns:
        The ``generated_text`` of the first (only) returned sequence.
    """
    context = " ".join(relevant_chunks)
    prompt = f"Based on the following information, answer the question: {user_query}\n\nContext: {context}\n\nAnswer:"

    outputs = llm(
        prompt,
        # Bound only the newly generated tokens, not prompt + output.
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        return_full_text=return_full_text,
        # The model has no pad token; naming EOS explicitly is what the
        # library falls back to anyway and avoids the per-call warning.
        pad_token_id=llm.tokenizer.eos_token_id,
    )
    return outputs[0]['generated_text']

# Generate a response from the retrieved chunks and print it.
# NOTE: as the recorded output shows, the returned text begins with the
# prompt itself (question and context) followed by the model's continuation.
response = generate_response(relevant_chunks, user_query)
print(response)

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Based on the following information, answer the question: What is the unemployment information based on type of degree?

Context:  Bureau of Labor Statistics
19%
18%
4%
59%
2015 U.S. GDP (in millions of dollars)
Manufacturing
Finance, insurance, real
estate, rental, and
leasing
Arts, entertainment,
recreation,
accommodation, and
food services
Other
• The chart below is called a pie chart.  It shows what 
percent “of the pie” a particular category occupies 
out of the whole.
• If total GDP in 2015 is the entire pie, then 
manufacturing makes up 19% of that pie and finance 
makes up 18%.  Notice that visually speaking, since 19% 
and 18 ng different groups of 
variables.  We used it to compare different components 
of US GDP.  We did the same with the pie chart; 
depending on your purposes you may choose to use a 
pie chart or a bar graph.
x
y
0
0
1
3
2
6
3
9
4
12
5
15
6
18
7
21
8
24
•
If given a table of data, we should be able to plot it.  Below is 
some sample data; plot the data with 