In [1]:
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Initialize the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to scrape website content
def scrape_website(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract text content (simplified example)
    paragraphs = soup.find_all('p')
    text_content = [p.get_text() for p in paragraphs]
    
    return text_content

# Convert the scraped content into embeddings
def create_embeddings(content):
    embeddings = model.encode(content)
    return embeddings

# Store embeddings in FAISS vector database
def store_embeddings_in_faiss(embeddings, content):
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)  # L2 distance for similarity
    index.add(np.array(embeddings))
    
    # Optionally store content metadata (e.g., text) for reference
    return index, content

# Example website scraping
urls = [
    'https://www.uchicago.edu/',
    'https://www.washington.edu/',
    'https://www.stanford.edu/',
    'https://und.edu/'
]

# Scraping and processing each URL
content_list = []
for url in urls:
    scraped_content = scrape_website(url)
    embeddings = create_embeddings(scraped_content)
    index, content = store_embeddings_in_faiss(embeddings, scraped_content)
    content_list.append(content)

# Save FAISS index for later use
faiss.write_index(index, "website_content.index")


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [3]:
def query_to_embedding(query):
    return model.encode([query])

def retrieve_relevant_chunks(query_embedding, index, top_k=3):
    # Perform similarity search
    D, I = index.search(np.array(query_embedding), k=top_k)  # Retrieve top K results
    
    # Check if there are valid results
    if I.shape[1] == 0 or len(I[0]) == 0:
        return []  # Return empty if no results found
    
    # Ensure that the indices are within range of content_list
    relevant_chunks = []
    for i in I[0]:
        if i < len(content_list):  # Check if the index is valid
            relevant_chunks.append(content_list[i])
        else:
            print(f"Invalid index: {i}")
    
    return relevant_chunks


# Example Query
user_query = "What is the main focus of research at the University of Washington?"

query_embedding = query_to_embedding(user_query)
relevant_chunks = retrieve_relevant_chunks(query_embedding, index)

# Display relevant chunks
print(relevant_chunks)


Invalid index: 17
Invalid index: 10
[['20241217T020920Z-16ccd9d5bb924x6khC1MAAs95s00000018700000000002ns']]


In [6]:
from transformers import pipeline

# Initialize a question-answering pipeline using a pre-trained model
qa_pipeline = pipeline("question-answering", model="facebook/bart-large")

def generate_response(query, relevant_chunks):
    # Flatten relevant_chunks to a single list of strings if it's nested
    flat_chunks = [item for sublist in relevant_chunks for item in (sublist if isinstance(sublist, list) else [sublist])]
    print("Flat Chunks:", flat_chunks)

    # Combine the chunks into a single context string
    context = " ".join(flat_chunks)  # Join all chunks into a single context
    response = qa_pipeline(question=query, context=context)
    
    return response['answer']


# Generate the response for the user query
response = generate_response(user_query, relevant_chunks)
print("Answer:", response)
relevant_chunks = [["This is chunk 1."], ["This is chunk 2."]]
flat_chunks = ["This is chunk 1.", "This is chunk 2."]



Some weights of BartForQuestionAnswering were not initialized from the model checkpoint at facebook/bart-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


Flat Chunks: ['20241217T020920Z-16ccd9d5bb924x6khC1MAAs95s00000018700000000002ns']
Answer: 20241217
