<a href="https://colab.research.google.com/github/tharakagaddam/Sithafal/blob/main/Chat_with_Website_Using_RAG_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# -*- coding: utf-8 -*-
"""Chat with Website Using RAG Pipeline.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1GgnRSWQ9R4WnyDc0OjQZaRgG9-xVx7vO

# 2. Chat with Website Using RAG Pipeline
"""

!pip install requests beautifulsoup4 sentence-transformers faiss-cpu

import hashlib
import hmac
import json
import os

import faiss
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

# Gemini API Key Setup
# The key is read from the GEMINI_API_KEY environment variable so the secret
# is not stored in the notebook source. An empty string is used when the
# variable is unset.
# NOTE: any key that was ever committed in this file should be treated as
# leaked and rotated.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")

# Gemini API Authentication
def authenticate_gemini():
    """Probe the Gemini REST API with the configured key header.

    Sends a GET request for the BTC/USD ticker with ``GEMINI_API_KEY`` in the
    ``X-GEMINI-APIKEY`` header and treats an HTTP 200 response as success.

    NOTE(review): the ``pubticker`` path looks like a public market-data
    endpoint, so a 200 here may not actually validate the key -- confirm.

    Raises:
        Exception: if the response status code is not 200. The message carries
            the decoded JSON body when available, otherwise the raw text.
    """
    headers = {
        "Content-Type": "application/json",
        "X-GEMINI-APIKEY": GEMINI_API_KEY,
    }
    # An explicit timeout keeps an unresponsive server from blocking forever.
    response = requests.get(
        "https://api.gemini.com/v1/pubticker/btcusd",
        headers=headers,
        timeout=10,
    )
    if response.status_code == 200:
        print("Authenticated successfully with Gemini API!")
        return

    # Error bodies are not guaranteed to be JSON; fall back to the raw text so
    # a decode failure does not hide the real authentication error.
    try:
        detail = response.json()
    except ValueError:
        detail = response.text
    raise Exception(f"Authentication failed: {detail}")

# Scrape website using requests
def scrape_website(url, timeout=10):
    """Fetch a page over HTTP and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The text of every ``<p>`` element on the page, joined with single
        spaces.

    Raises:
        Exception: if the server answers with a status code other than 200.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch {url}: {response.status_code}")

    soup = BeautifulSoup(response.text, "html.parser")
    return " ".join(para.get_text() for para in soup.find_all("p"))

# Scrape website using Selenium for JavaScript-rendered content
def scrape_website_with_selenium(url):
    """Load a page in headless Chrome and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to open in the browser.

    Returns:
        The text of every ``<p>`` element in the loaded page source, joined
        with single spaces.

    Raises:
        ImportError: if the ``selenium`` package is not installed.
    """
    # Imported here because the file's import block does not bring these
    # names into scope; without them the function fails with NameError.
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service

    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    service = Service("/usr/bin/chromedriver")
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        return " ".join(para.get_text() for para in soup.find_all("p"))
    finally:
        # Always shut the browser down, even when loading or parsing fails,
        # so a failed scrape does not leave a Chrome process behind.
        driver.quit()

# Chunking the text
def chunk_text(text, chunk_size=300):
    """Split text into consecutive chunks of at most ``chunk_size`` words.

    The text is split on whitespace, and each chunk is the words of one
    window re-joined with single spaces. The last chunk may be shorter.

    Args:
        text: String to split.
        chunk_size: Maximum number of words per chunk.

    Returns:
        List of chunk strings, in original order.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(' '.join(window))
    return pieces

# Embedding and storing in FAISS
# One SentenceTransformer instance is created at module level and shared by
# embed_and_store() and query_vector_search(), so text chunks and queries are
# encoded by the same model.
model = SentenceTransformer('all-MiniLM-L6-v2')

def embed_and_store(chunks):
    """Encode text chunks and load the vectors into a flat L2 FAISS index.

    Args:
        chunks: Iterable of text strings to embed.

    Returns:
        Tuple ``(vector_db, embeddings)``: the ``faiss.IndexFlatL2`` that
        holds the vectors, and the array returned by ``model.encode``.

    Raises:
        ValueError: if ``chunks`` is empty. There would be no embedding
            dimension to size the index with.
    """
    chunks = list(chunks)
    if not chunks:
        # Fail early with a clear message instead of an IndexError from
        # reading shape[1] of an empty encode() result.
        raise ValueError("Cannot build a vector index from an empty list of chunks")

    embeddings = model.encode(chunks)
    dimension = embeddings.shape[1]
    vector_db = faiss.IndexFlatL2(dimension)
    vector_db.add(embeddings)
    return vector_db, embeddings

# Query Handling
def query_vector_search(query, vector_db, chunks, top_k=5):
    """Return the stored chunks nearest to a query.

    Args:
        query: Query string; encoded with the module-level ``model``.
        vector_db: Index object exposing ``search(vectors, k)``.
        chunks: Sequence of chunk texts, indexed by the ids that
            ``vector_db.search`` returns.
        top_k: Maximum number of chunks to return.

    Returns:
        List of at most ``top_k`` chunk strings, in the order returned by the
        index search.
    """
    query_vec = model.encode([query])
    _, indices = vector_db.search(query_vec, top_k)
    # FAISS fills missing result slots with -1 when the index holds fewer
    # than top_k vectors. Skip those so chunks[-1] is not returned as a hit.
    return [chunks[i] for i in indices[0] if i >= 0]

# Generate a simple response
def generate_response(query, context):
    """Format the query and its retrieved context as one display string.

    Args:
        query: The user's question.
        context: Text of the retrieved chunks.

    Returns:
        A string with the query on the first line, a blank line, a
        "Relevant Context:" header, and then the context.
    """
    return f"Query: {query}\n\nRelevant Context:\n{context}"

# Complete Workflow
# Authenticates, scrapes each URL (requests first, Selenium as fallback),
# chunks and indexes the text, then answers one query read from stdin.
try:
    # Authenticate with Gemini
    authenticate_gemini()

    # List of URLs to scrape
    urls = [
        "https://www.uchicago.edu/",
        "https://www.washington.edu/",
        "https://www.stanford.edu/",
        "https://und.edu/"
    ]

    # Scrape content from websites
    all_chunks = []
    for url in urls:
        try:
            print(f"Scraping: {url}")
            website_text = scrape_website(url)
        except Exception as e:
            # Report why the plain HTTP scrape failed before falling back,
            # instead of discarding the error.
            print(f"Requests scrape failed for {url}: {e}")
            print(f"Using Selenium for: {url}")
            try:
                website_text = scrape_website_with_selenium(url)
            except Exception as selenium_error:
                # One unreachable site should not abort the remaining URLs.
                print(f"Skipping {url}: {selenium_error}")
                continue
        chunks = chunk_text(website_text)
        all_chunks.extend(chunks)

    # Without any text there is nothing to index or search.
    if not all_chunks:
        raise ValueError("No text could be scraped from any of the URLs")

    # Store embeddings in FAISS
    vector_db, embeddings = embed_and_store(all_chunks)
    # Query the system
    query = input("Enter user query")
    retrieved_chunks = query_vector_search(query, vector_db, all_chunks)
    context = "\n".join(retrieved_chunks)

    # Generate response
    response = generate_response(query, context)
    print("\nResponse:")
    print(response)

except Exception as e:
    # Top-level boundary of the script: report the failure instead of
    # showing a traceback.
    print("Error occurred:", e)

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Authenticated successfully with Gemini API!
Scraping: https://www.uchicago.edu/
Scraping: https://www.washington.edu/
Scraping: https://www.stanford.edu/
Scraping: https://und.edu/
Enter user querytell me about uchicago

Response:
Query: tell me about uchicago

Relevant Context:
A diversity of people and ideas, coupled with free and open discourse, lays the foundation for students and scholars to bring forth original ideas that define fields and enrich human life. UChicago students develop the habits of mind and intellectual skills needed to confront complex challenges. UChicago researchers have contributed to some of the world’s greatest discoveries, advancements, and bodies of knowledge. Faculty have a free and challenging environment in which to pursue the most original research. As a community partner, we invest in Chicago’s South Side across such areas as health, education, economic growth, and the arts. We are an international community of scholars working to solve the world's mo