In [1]:
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Function to scrape content from a website
def scrape_website(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract text content
    paragraphs = soup.find_all('p')
    text_content = ' '.join([para.get_text() for para in paragraphs])
    
    return text_content

# Example URLs
urls = [
    "https://www.stanford.edu/",
    "https://und.edu/"
]

# Scrape content from each URL
website_contents = {}
for url in urls:
    website_contents[url] = scrape_website(url)

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
# Load the embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to create chunks and embeddings
def create_chunks_and_embeddings(content, chunk_size=512):
    # Split content into chunks
    chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
    embeddings = embedding_model.encode(chunks)
    return chunks, embeddings

# Store chunks and embeddings
all_chunks = []
all_embeddings = []

for url, content in website_contents.items():
    chunks, embeddings = create_chunks_and_embeddings(content)
    all_chunks.extend(chunks)
    all_embeddings.extend(embeddings)

# Convert to numpy array for FAISS
all_embeddings = np.array(all_embeddings).astype('float32')

# Create a FAISS index
index = faiss.IndexFlatL2(all_embeddings.shape[1])  # L2 distance
index.add(all_embeddings)  # Add embeddings to the index

In [5]:
def handle_query(query):
    # Convert the query to an embedding
    query_embedding = embedding_model.encode([query])
    
    # Perform a similarity search
    D, I = index.search(np.array(query_embedding).astype('float32'), k=5)  # Retrieve top 5 results
    return I[0]  # Return indices of the most relevant chunks

# Query
user_query = "Write in brief about Stanford University?"
relevant_indices = handle_query(user_query)

# Retrieve relevant chunks
relevant_chunks = [all_chunks[i] for i in relevant_indices]

In [6]:
from transformers import pipeline

# Load a pre-trained language model for response generation
llm = pipeline("text-generation", model="gpt2")

def generate_response(relevant_chunks, user_query):
    context = " ".join(relevant_chunks)
    prompt = f"Based on the following information, answer the question: {user_query}\n\nContext: {context}\n\nAnswer:"
    
    # Use max_new_tokens instead of max_length
    response = llm(prompt, max_new_tokens=50, num_return_sequences=1)  # Generate up to 50 new tokens
    return response[0]['generated_text']

# Generate a response
response = generate_response(relevant_chunks, user_query)
print(response)

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Based on the following information, answer the question: Write in brief about Stanford University?

Context: 
      Other ways to search:
        Map
Profiles
 Stanford Explore Stanford Stanford was founded almost 150 years ago on a bedrock of societal purpose. Our mission is to contribute to the world by educating students for lives of leadership and contribution with integrity; advancing fundamental knowledge and cultivating creativity; leading in pioneering research for effective clinical therapies; and accelerating solutions and amplifying their impact. Stories about people, research, and innovation across the e evolutions of yourself.” Class of 2024 Building a vibrant community of creative and accomplished people from around the world A residential campus with diverse housing, exceptional dining, and over 600 student organizations Student Affairs A rich tradition of fostering creativity and a vibrant arts district on campus Stanford Arts State-of-the-art facilities and fitness pro