In [27]:
# google-colab-selenium is imported by scraper.py (as `gs`) to create the Chrome driver.
%pip install google-colab-selenium



In [28]:
!pip install openai==0.28



In [29]:
!pip install selenium
!pip install faiss-cpu
!pip install sentence-transformers
!pip install google-colab-selenium




In [30]:
%%writefile scraper.py

import os
import time

import faiss
import google_colab_selenium as gs
import numpy as np
import openai
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from sentence_transformers import SentenceTransformer



# The openai client is pointed at Together's API base URL. The key is read from the
# TOGETHER_API_KEY environment variable instead of being committed in the notebook;
# any key that was previously hardcoded here should be treated as leaked and rotated.
# If the variable is unset, api_key is None and the failure surfaces on the first API call.
openai.api_key = os.environ.get("TOGETHER_API_KEY")
openai.api_base = "https://api.together.xyz/v1"

# Sentence-embedding model used by both build_faiss_index and retrieve_context.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Get Headless Chrome Driver
def get_driver():
    """Create a Chrome WebDriver via google_colab_selenium with ``--headless`` set.

    Returns:
        The driver object returned by ``gs.Chrome``. The caller is responsible
        for calling ``quit()`` on it.
    """
    headless_options = Options()
    headless_options.add_argument("--headless")
    driver = gs.Chrome(options=headless_options)
    return driver

# Extract Meta Tags
def extract_meta_tags(soup):
    """Collect ``<meta>`` tag contents keyed by their ``name`` (or ``property``).

    Args:
        soup: Parsed document exposing ``find_all``; each returned tag must
            support ``.get(attribute)``.

    Returns:
        dict mapping each tag's ``name`` attribute (falling back to ``property``
        when ``name`` is missing or empty) to its ``content`` attribute. Tags
        lacking a key or a content value are skipped; when a key repeats, the
        last tag's content is kept.
    """
    keyed_contents = (
        (tag.get("name") or tag.get("property"), tag.get("content"))
        for tag in soup.find_all("meta")
    )
    return {key: value for key, value in keyed_contents if key and value}

# Scrape + Filter Meaningful Text
def scrape_page(url, wait_seconds=5):
    """Load ``url`` in headless Chrome and return the page's text chunks.

    Args:
        url: Address of the page to load.
        wait_seconds: Fixed pause after navigation before the page source is
            read (presumably to let client-side rendering finish). Defaults to
            5, the value that was previously hard-coded.

    Returns:
        list[str]: Text of every ``<div>`` / ``<title>`` element longer than 30
        characters, followed by the content of the page's ``<meta>`` tags.
        Exact duplicates are removed, keeping first-occurrence order.

    Raises:
        Whatever the driver raises while loading the page; the browser is shut
        down before the exception propagates.
    """
    driver = get_driver()
    try:
        driver.get(url)
        time.sleep(wait_seconds)
        page_html = driver.page_source
    finally:
        # Always release the browser, even when navigation fails; otherwise every
        # failed scrape leaves a Chrome process behind.
        driver.quit()

    soup = BeautifulSoup(page_html, 'lxml')

    # Extract text from <div> and <title>, computing each element's text only once.
    block_texts = []
    for tag in soup.find_all(['div', 'title']):
        text = tag.get_text(strip=True)
        if len(text) > 30:  # skip tiny content
            block_texts.append(text)

    # Add meta tag content
    meta_texts = list(extract_meta_tags(soup).values())

    # A wrapper <div> whose only content is another <div> yields the same text twice.
    # Dropping exact duplicates keeps identical chunks from crowding out distinct
    # ones in top-k retrieval. dict.fromkeys preserves first-seen order.
    return list(dict.fromkeys(block_texts + meta_texts))

#  Build FAISS Vector Index
def build_faiss_index(chunks):
    """Embed ``chunks`` and store the embeddings in a flat L2 FAISS index.

    Args:
        chunks: Non-empty sequence of text strings to embed.

    Returns:
        tuple: ``(index, vectors, chunks)`` where ``index`` is the populated
        ``faiss.IndexFlatL2``, ``vectors`` is the float32 embedding array that
        was added to it, and ``chunks`` is the input, returned unchanged.

    Raises:
        ValueError: If ``chunks`` is empty. Without this check the failure
            would surface as an opaque IndexError on ``vectors.shape[1]``.
    """
    if len(chunks) == 0:
        raise ValueError("Cannot build a FAISS index from an empty list of text chunks.")

    # FAISS works on float32 data; coerce once and reuse the same array for the
    # index and the return value.
    vectors = np.asarray(embedder.encode(chunks), dtype="float32")
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)
    return index, vectors, chunks

#  Retrieve Top Relevant Chunks
def retrieve_context(query, index, vectors, chunks, top_k=3):
    """Return the chunks whose embeddings are nearest to ``query``.

    Args:
        query: Text to embed and search for.
        index: Object exposing ``search(query_vectors, k)`` that returns
            ``(distances, ids)``, such as the index from ``build_faiss_index``.
        vectors: Unused here; accepted so existing callers keep working.
        chunks: Texts in the same order as the vectors stored in ``index``.
        top_k: Maximum number of chunks to return.

    Returns:
        list[str]: Up to ``top_k`` chunks in the order returned by the index.
        Empty when there are no chunks or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with -1, and chunks[-1] would silently return the last chunk.
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    query_vec = np.asarray(embedder.encode([query]), dtype="float32")
    _distances, neighbor_ids = index.search(query_vec, k)

    # Also drop any id outside chunks, in case the index and chunks disagree in size.
    return [chunks[i] for i in neighbor_ids[0] if 0 <= i < len(chunks)]

# Use LLM with Retrieved Context
def model_llm(user_input, index, vectors, chunks, chat_history):
    """Answer ``user_input`` with the chat model, grounded in retrieved chunks.

    The user's message is appended to ``chat_history`` before the API call and
    the assistant's reply after it, so the list passed in is modified in place
    (and keeps the user message even if the API call raises).

    Args:
        user_input: The user's latest message.
        index, vectors, chunks: Passed through to ``retrieve_context``.
        chat_history: List of ``{"role", "content"}`` message dicts so far.

    Returns:
        tuple: ``(ai_reply, chat_history)`` with the reply text and the same,
        now extended, history list.
    """
    chat_history.append({"role": "user", "content": user_input})

    # The retrieved website text is embedded in the system prompt.
    context = "\n\n".join(retrieve_context(user_input, index, vectors, chunks))
    system_prompt = (
        "You are a helpful assistant. Only use the following website content to answer. "
        "If the answer is not found in this content, reply with 'I don’t know.'\n\n"
        f"{context}"
    )
    system_message = {"role": "system", "content": system_prompt}

    completion = openai.ChatCompletion.create(
        model="mistralai/Mistral-7B-Instruct-v0.1",
        messages=[system_message, *chat_history],
        temperature=0.85,
        top_p=0.95,
        presence_penalty=0.5,
        frequency_penalty=0.4,
        max_tokens=400,
    )

    ai_reply = completion["choices"][0]["message"]["content"]
    chat_history.append({"role": "assistant", "content": ai_reply})
    return ai_reply, chat_history


Overwriting scraper.py


In [31]:
!pip install streamlit
!pip install cloudflared



In [32]:
# Download the cloudflared Linux amd64 binary from the latest GitHub release into the
# working directory and mark it executable; a later cell runs it as ./cloudflared.
!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared
!chmod +x cloudflared

--2025-04-13 02:02:43--  https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/cloudflare/cloudflared/releases/download/2025.4.0/cloudflared-linux-amd64 [following]
--2025-04-13 02:02:44--  https://github.com/cloudflare/cloudflared/releases/download/2025.4.0/cloudflared-linux-amd64
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/106867604/f756c1d5-fdc6-4b60-9a49-bdc7883319c0?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20250413%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250413T020244Z&X-Amz-Expires=300&X-Amz-Signature=bb81c26e69ef58e153b954e2ac3a258500610c3826b32437549f9f744a81edb6&X-Amz-S

In [33]:
%%writefile app.py
import streamlit as st
from scraper import scrape_page, build_faiss_index, model_llm

st.set_page_config(page_title="AI Web Chatbot")
st.title("AI Chatbot")

# Ensure session state is ready before any of the logic below reads it
if "index" not in st.session_state:
    st.session_state.index = None

if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

# URL Input
url = st.text_input("Enter a website URL to scrape")

# Scrape Button
if st.button("Scrape Website"):
    # Ignore stray whitespace so a blank-looking input is rejected rather than scraped.
    target_url = url.strip()
    if target_url:
        with st.spinner("Scraping and preparing knowledge base..."):
            try:
                # Scrape site content
                chunks = scrape_page(target_url)
                if not chunks:
                    # Report an empty scrape clearly instead of letting the index
                    # builder fail on it with an unrelated-looking error.
                    raise ValueError("no usable text was found on that page")

                # Build FAISS index
                index, vectors, chunk_texts = build_faiss_index(chunks)

                # Save everything in session state; a new site starts a fresh chat
                st.session_state.index = index
                st.session_state.vectors = vectors
                st.session_state.chunks = chunk_texts
                st.session_state.chat_history = []

                st.success("Website scraped! You can now chat.")
            except Exception as e:
                st.error(f"Failed to scrape/index: {e}")
    else:
        st.warning("Please enter a valid website URL.")

# Chat Interface
# Compare against None explicitly instead of relying on the index object's truthiness.
if st.session_state.index is not None:
    user_input = st.chat_input("Ask something about the website...")

    if user_input:
        with st.spinner("Thinking..."):
            try:
                ai_response, updated_history = model_llm(
                    user_input,
                    st.session_state.index,
                    st.session_state.vectors,
                    st.session_state.chunks,
                    st.session_state.chat_history
                )
                st.session_state.chat_history = updated_history
            except Exception as e:
                # model_llm appends the user's message before calling the API, so
                # only the error reply needs to be added here.
                ai_response = f"Error: {e}"
                st.session_state.chat_history.append({"role": "assistant", "content": ai_response})

    # Display Full Chat History
    for msg in st.session_state.chat_history:
        with st.chat_message(msg["role"]):
            st.markdown(msg["content"])


Overwriting app.py


In [34]:
import os
import threading

def run_app():
    """Run ``streamlit run app.py`` through the shell; returns when that command exits."""
    launch_command = 'streamlit run app.py'
    os.system(launch_command)


# Start the server from a separate thread so this cell returns and later cells can run.
thread = threading.Thread(target=run_app)
thread.start()

In [35]:
# Request a Cloudflare quick tunnel that forwards to local port 8501
# (assumed to be where `streamlit run app.py` is listening; confirm if the port is changed).
!./cloudflared tunnel --url http://localhost:8501

[90m2025-04-13T02:02:45Z[0m [32mINF[0m Thank you for trying Cloudflare Tunnel. Doing so, without a Cloudflare account, is a quick way to experiment and try it out. However, be aware that these account-less Tunnels have no uptime guarantee, are subject to the Cloudflare Online Services Terms of Use (https://www.cloudflare.com/website-terms/), and Cloudflare reserves the right to investigate your use of Tunnels for violations of such terms. If you intend to use Tunnels in production you should use a pre-created named tunnel by following: https://developers.cloudflare.com/cloudflare-one/connections/connect-apps
[90m2025-04-13T02:02:45Z[0m [32mINF[0m Requesting new quick Tunnel on trycloudflare.com...
[90m2025-04-13T02:02:48Z[0m [32mINF[0m +--------------------------------------------------------------------------------------------+
[90m2025-04-13T02:02:48Z[0m [32mINF[0m |  Your quick Tunnel has been created! Visit it at (it may take some time to be reachable):  |
[90m2025