<a href="https://colab.research.google.com/github/tharakagaddam/Sithafal/blob/main/Chat_with_Website_Using_RAG_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# -*- coding: utf-8 -*-
"""Chat with Website Using RAG Pipeline.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1GgnRSWQ9R4WnyDc0OjQZaRgG9-xVx7vO

# 2. Chat with Website Using RAG Pipeline
"""

!pip install requests beautifulsoup4 sentence-transformers faiss-cpu

import hashlib
import hmac
import json
import os

import faiss
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

# Gemini API Key Setup
# SECURITY: a credential is committed in this file. Supply the key through the
# GEMINI_API_KEY environment variable instead; the literal below is kept only
# as a backward-compatible fallback and should be rotated and removed.
API_KEY_GEMINI = os.environ.get(
    "GEMINI_API_KEY",
    'AIzaSyA6RFC3qRYB8zCmbgZjVYJugMRmT8WlPJA',
)

# Gemini API Authentication
def gemini_api_auth():
    """Probe the Gemini ticker endpoint and report whether it answered with HTTP 200.

    Sends a GET request carrying ``API_KEY_GEMINI`` in the ``X-GEMINI-APIKEY``
    header and prints a success message on a 200 response.

    Raises:
        Exception: if the endpoint answers with any status other than 200.
            The message carries the decoded JSON body when there is one,
            otherwise the raw response text.
    """
    # NOTE(review): "pubticker" looks like a public market-data endpoint, so a
    # 200 here presumably proves reachability rather than key validity, and the
    # key is still transmitted to that host. Confirm this is the intended check.
    headers = {
        "Content-Type": "application/json",
        "X-GEMINI-APIKEY": API_KEY_GEMINI,
    }
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(
        "https://api.gemini.com/v1/pubticker/btcusd",
        headers=headers,
        timeout=10,
    )
    if response.status_code == 200:
        print("Gemini API Authentication Successful!")
        return

    # Error bodies are not guaranteed to be JSON; fall back to the raw text so
    # a decode failure does not mask the actual authentication failure.
    try:
        detail = response.json()
    except ValueError:
        detail = response.text
    raise Exception(f"API Authentication Failed: {detail}")

# Scrape website using requests
def fetch_website_data(site_url, timeout=10):
    """Download a page with ``requests`` and return the text of its ``<p>`` tags.

    Args:
        site_url: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The text of every ``<p>`` element on the page, joined with single
        spaces (an empty string when the page has no paragraphs).

    Raises:
        Exception: if the server responds with a status other than 200.
        requests.RequestException: on connection errors or timeouts.
    """
    # A browser-like User-Agent is sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    response = requests.get(site_url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Error Fetching {site_url}: {response.status_code}")

    soup = BeautifulSoup(response.text, "html.parser")
    return " ".join(para.get_text() for para in soup.find_all("p"))

# Scrape website using Selenium for JavaScript-rendered content
def fetch_js_content_with_selenium(js_url):
    """Render a page in headless Chrome and return the text of its ``<p>`` tags.

    Intended as a fallback for pages whose content is produced by JavaScript.

    Args:
        js_url: URL of the page to load in the browser.

    Returns:
        The text of every ``<p>`` element in the rendered page source, joined
        with single spaces.

    Raises:
        ImportError: if the ``selenium`` package is not installed.
        Exception: any error raised by Selenium while starting the driver or
            loading the page.
    """
    # The file's import section never brings these names into scope, so the
    # function previously failed with NameError. They are imported here so the
    # rest of the notebook still works when Selenium is not installed.
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service

    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    service = Service("/usr/bin/chromedriver")
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(js_url)
        page_html = driver.page_source
    finally:
        # Always shut the browser down, even when loading the page fails,
        # so no orphaned Chrome/chromedriver processes are left behind.
        driver.quit()

    soup = BeautifulSoup(page_html, "html.parser")
    return " ".join(para.get_text() for para in soup.find_all("p"))

# Chunking the text
def text_chunking(raw_text, segment_size=300):
    """Split text into consecutive chunks of at most ``segment_size`` words.

    The text is tokenised on whitespace; each chunk is its words re-joined
    with single spaces. The final chunk may be shorter than ``segment_size``.

    Args:
        raw_text: The text to split.
        segment_size: Maximum number of words per chunk.

    Returns:
        A list of chunk strings, empty when ``raw_text`` contains no words.
    """
    tokens = raw_text.split()
    segments = []
    for start in range(0, len(tokens), segment_size):
        window = tokens[start:start + segment_size]
        segments.append(" ".join(window))
    return segments

# Embedding and storing in FAISS
# The sentence-embedding model is created once at module level; both the
# indexing helper and the query helper below encode text with this instance.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def store_embeddings_in_faiss(chunks):
    """Embed text chunks and load the vectors into a new FAISS L2 index.

    Args:
        chunks: Non-empty sequence of text chunks to embed with the
            module-level ``embedding_model``.

    Returns:
        A ``(faiss_index, chunk_embeddings)`` tuple: the populated
        ``faiss.IndexFlatL2`` and the embeddings that were added to it.

    Raises:
        ValueError: if ``chunks`` is empty. There would be no embedding
            dimension to size the index with, and the failure would otherwise
            surface as an opaque error when reading ``shape[1]``.
    """
    if not chunks:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks.")

    chunk_embeddings = embedding_model.encode(chunks)
    # The index dimensionality is taken from the second axis of the embeddings.
    vector_dimension = chunk_embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(vector_dimension)
    faiss_index.add(chunk_embeddings)
    return faiss_index, chunk_embeddings

# Query Handling
def search_similar_vectors(input_query, faiss_index, text_chunks, top_results=5):
    """Return the text chunks whose embeddings are nearest to the query.

    Args:
        input_query: The user's query string.
        faiss_index: FAISS index whose vectors line up positionally with
            ``text_chunks``.
        text_chunks: The chunks that were embedded into ``faiss_index``.
        top_results: Maximum number of chunks to return.

    Returns:
        Up to ``top_results`` chunks in the order FAISS returned them. The
        list is empty when there are no chunks or ``top_results`` is not
        positive.
    """
    # Never ask for more neighbours than there are chunks.
    neighbour_count = min(top_results, len(text_chunks))
    if neighbour_count <= 0:
        return []

    query_embedding = embedding_model.encode([input_query])
    _, indices = faiss_index.search(query_embedding, neighbour_count)
    # FAISS pads missing neighbours with -1; indexing with -1 would silently
    # return the last chunk, so keep only labels that are valid positions.
    return [text_chunks[i] for i in indices[0] if 0 <= i < len(text_chunks)]

# Generate a simple response
def build_response(query_input, matched_context):
    """Format the query and its retrieved context into a single reply string.

    Args:
        query_input: The query as entered by the user.
        matched_context: The retrieved context text to show under the query.

    Returns:
        A string with the query line, a blank line, and then the context
        under a "Relevant Context:" heading.
    """
    query_line = f"Query: {query_input}"
    context_section = f"Relevant Context:\n{matched_context}"
    return f"{query_line}\n\n{context_section}"

# Complete Workflow
try:
    # Authenticate with Gemini API (any failure aborts the whole run)
    gemini_api_auth()

    # List of URLs to scrape
    website_urls = [
        "https://www.uchicago.edu/",
        "https://www.washington.edu/",
        "https://www.stanford.edu/",
        "https://und.edu/"
    ]

    # Scrape content from websites
    processed_chunks = []
    for site_url in website_urls:
        try:
            print(f"Scraping Data From: {site_url}")
            website_content = fetch_website_data(site_url)
        except Exception as err:
            # Report why the plain HTTP scrape failed instead of discarding it.
            print(f"Requests Scrape Failed: {err}")
            print(f"Switching to Selenium for: {site_url}")
            try:
                website_content = fetch_js_content_with_selenium(site_url)
            except Exception as fallback_err:
                # One unreachable site should not discard the content already
                # gathered from the others; skip it and carry on.
                print(f"Skipping {site_url}: {fallback_err}")
                continue

        chunks = text_chunking(website_content)
        processed_chunks.extend(chunks)

    # Nothing to index means nothing to search; fail with a clear message
    # rather than an obscure error from the embedding/indexing step.
    if not processed_chunks:
        raise RuntimeError("No text could be scraped from any of the configured websites.")

    # Store embeddings into FAISS index
    faiss_index, embeddings = store_embeddings_in_faiss(processed_chunks)

    # Querying the system
    user_query = input("Enter Your Query: ")
    retrieved_results = search_similar_vectors(user_query, faiss_index, processed_chunks)
    context_combined = "\n".join(retrieved_results)

    # Generate response
    final_response = build_response(user_query, context_combined)
    print("\nSystem Response:")
    print(final_response)

except Exception as e:
    # Top-level boundary of the notebook cell: report the failure and stop.
    print("An Error Occurred:", e)


Gemini API Authentication Successful!
Scraping Data From: https://www.uchicago.edu/
Scraping Data From: https://www.washington.edu/
Scraping Data From: https://www.stanford.edu/
Scraping Data From: https://und.edu/
Enter Your Query: tell uchicago.edu

System Response:
Query: tell uchicago.edu

Relevant Context:
A diversity of people and ideas, coupled with free and open discourse, lays the foundation for students and scholars to bring forth original ideas that define fields and enrich human life. UChicago students develop the habits of mind and intellectual skills needed to confront complex challenges. UChicago researchers have contributed to some of the world’s greatest discoveries, advancements, and bodies of knowledge. Faculty have a free and challenging environment in which to pursue the most original research. As a community partner, we invest in Chicago’s South Side across such areas as health, education, economic growth, and the arts. We are an international community of scholar