In [None]:
# Install required packages
!pip install -U flask pyngrok langchain langchain-community sentence-transformers faiss-cpu beautifulsoup4 requests transformers

# Import required libraries
import os
from flask import Flask, request, jsonify
from pyngrok import ngrok
import threading
from langchain_community.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import WebBaseLoader
from transformers import pipeline

def _sample_courses():
    """Return the built-in course list used when the site yields no data."""
    return [
        {
            'title': 'Python Programming',
            'description': 'Complete Python course from basics to advanced',
            'content': 'Python Programming: Complete Python course from basics to advanced'
        },
        {
            'title': 'Web Development',
            'description': 'Full-stack web development bootcamp',
            'content': 'Web Development: Full-stack web development bootcamp'
        },
        {
            'title': 'Data Science',
            'description': 'Comprehensive data science and ML course',
            'content': 'Data Science: Comprehensive data science and ML course'
        }
    ]


def scrape_brainlox():
    """Extract course data using Langchain WebBaseLoader.

    Returns:
        A non-empty list of dicts, each with the keys 'title',
        'description' and 'content'. When the page cannot be loaded, or
        loads but contains no text, the built-in sample courses are
        returned instead so that callers always have something to index.
    """
    try:
        loader = WebBaseLoader("https://brainlox.com/courses/category/technical")
        documents = loader.load()

        # Transform documents into a simplified structure
        courses = []
        for doc in documents:
            text = doc.page_content
            if not text or not text.strip():
                # A blank page has nothing worth embedding.
                continue
            # Only mark the description as truncated when it really is.
            summary = (text[:150] + "...") if len(text) > 150 else text
            courses.append({
                'title': doc.metadata.get('title', 'Untitled Course'),
                'description': summary,
                'content': text
            })
    except Exception as e:
        # Best-effort boundary: network, parsing and loader errors all land here.
        print(f"Error loading data with WebBaseLoader: {str(e)}")
        return _sample_courses()

    if not courses:
        # An empty list cannot be indexed downstream, so use the samples.
        print("No course data found with WebBaseLoader; using sample courses")
        return _sample_courses()
    return courses

class VectorStoreManager:
    """Build, persist and reload the FAISS index of course documents."""

    def __init__(self):
        # Embedding model used both when building and when reloading the index.
        self.embeddings = HuggingFaceEmbeddings(
            model_name="all-MiniLM-L6-v2"
        )
        # Location passed to save_local()/load_local().
        self.index_path = '/content/brainlox_courses.faiss'

    def create_vectorstore(self, courses):
        """Create and save FAISS vectorstore.

        Args:
            courses: iterable of dicts with 'content' and 'title' keys.

        Returns:
            The new FAISS vectorstore, or None when there is nothing to
            index or when building/saving fails (the error is printed).
        """
        if not courses:
            print("Error creating vectorstore: no courses to index")
            return None

        try:
            documents = [
                Document(page_content=course['content'],
                         metadata={'title': course['title']})
                for course in courses
            ]

            vectorstore = FAISS.from_documents(documents, self.embeddings)
            vectorstore.save_local(self.index_path)
            return vectorstore
        except Exception as e:
            print(f"Error creating vectorstore: {str(e)}")
            return None

    def load_vectorstore(self):
        """Load existing FAISS vectorstore.

        Returns:
            The stored FAISS vectorstore, or None when no index exists at
            ``self.index_path`` or when loading fails (the error is printed).
        """
        try:
            if not os.path.exists(self.index_path):
                return None
            # SECURITY: load_local unpickles the stored docstore, which can run
            # arbitrary code. The flag is enabled only because this index is
            # written by create_vectorstore() above; never point index_path at
            # a file obtained from an untrusted source.
            return FAISS.load_local(
                self.index_path,
                self.embeddings,
                allow_dangerous_deserialization=True,
            )
        except Exception as e:
            print(f"Error loading vectorstore: {str(e)}")
            return None

class Chatbot:
    """Conversational question answering over the course vectorstore."""

    def __init__(self, vectorstore):
        """Build the LLM and retrieval chain.

        Args:
            vectorstore: store exposing ``as_retriever()``; must not be None.

        Raises:
            ValueError: if ``vectorstore`` is None.
        """
        if vectorstore is None:
            raise ValueError("Vectorstore cannot be None")

        self.llm = HuggingFacePipeline(
            pipeline=pipeline("text2text-generation", model="google/flan-t5-base")
        )

        self.chain = ConversationalRetrievalChain.from_llm(
            self.llm,
            vectorstore.as_retriever(),
            return_source_documents=True
        )
        # (question, answer) pairs passed back to the chain on every call.
        self.chat_history = []

    def get_response(self, question):
        """Get response from the chatbot.

        Args:
            question: the user's question.

        Returns:
            A dict with 'answer' (str) and 'sources' (list of source titles).
            On any failure a fixed apology is returned with no sources and
            the chat history is left unchanged.
        """
        try:
            response = self.chain.invoke(
                {"question": question, "chat_history": self.chat_history}
            )
            answer = response['answer']
            # Tolerate documents without a title so one odd document cannot
            # discard an answer that was already generated.
            sources = [
                doc.metadata.get('title', 'Untitled Course')
                for doc in response.get('source_documents', [])
            ]
        except Exception as e:
            print(f"Error getting response: {str(e)}")
            return {
                'answer': "I apologize, but I encountered an error. Please try again.",
                'sources': []
            }

        # Record the exchange only once the full response has been built, so
        # the history never contains an answer the caller did not receive.
        self.chat_history.append((question, answer))
        return {'answer': answer, 'sources': sources}

# Initialize vector store manager and chatbot.
# Reuse a previously saved index when one can be loaded; otherwise scrape the
# course data and build a fresh index.
print("Initializing vector store...")
vector_manager = VectorStoreManager()
vectorstore = vector_manager.load_vectorstore()
if vectorstore is None:
    print("Creating vector store...")
    courses = scrape_brainlox()
    vectorstore = vector_manager.create_vectorstore(courses)
    if vectorstore is None:
        # create_vectorstore() reports failure by returning None; stop here
        # with a clear message instead of announcing success.
        raise RuntimeError(
            "Vector store could not be created; see the error printed above."
        )
    print("Vector store created successfully!")

chatbot = Chatbot(vectorstore)

# Create the Flask application that serves the chatbot API
app = Flask(__name__)


@app.route("/")
def home():
    """Return a plain-text greeting that points clients at the /chat endpoint."""
    greeting = "Welcome to the Brainlox Chatbot API! Use /chat to interact."
    return greeting

@app.route("/chat", methods=["POST"])
def chat():
    """Answer a question posted as JSON: ``{"question": "<text>"}``.

    Returns:
        The chatbot's response as JSON, or a 400 JSON error when the body
        is missing, is not a JSON object, or carries no non-blank string
        under "question".
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # aborting the request, so every bad input gets the same 400 below.
    data = request.get_json(silent=True)
    question = data.get("question", "") if isinstance(data, dict) else ""
    if not isinstance(question, str) or not question.strip():
        return jsonify({"error": "Question is required"}), 400

    response = chatbot.get_response(question)
    return jsonify(response)

def run_flask(port=6000):
    """Run the Flask app.

    Args:
        port: local TCP port to serve on. It must be the same port that
            start_ngrok() tunnels to; pick another value if this one is
            already in use.
    """
    app.run(port=port)

def start_ngrok(port=6000):
    """Start Ngrok and expose the Flask app.

    Args:
        port: local port to tunnel to; must match the port the Flask app
            is served on by run_flask().

    Returns:
        The tunnel object returned by ``ngrok.connect``.
    """
    public_url = ngrok.connect(port)
    print(f" * Ngrok tunnel: {public_url}")
    return public_url

if __name__ == "__main__":
    # Read the Ngrok auth token from the environment instead of embedding it
    # in source. Any token that was ever committed here should be treated as
    # leaked and revoked in the ngrok dashboard.
    auth_token = os.environ.get("NGROK_AUTHTOKEN")
    if not auth_token:
        raise RuntimeError(
            "Set the NGROK_AUTHTOKEN environment variable to your ngrok auth token."
        )
    ngrok.set_auth_token(auth_token)

    # Start Flask in a background thread, then open the tunnel to it
    flask_thread = threading.Thread(target=run_flask)
    flask_thread.start()

    ngrok_url = start_ngrok()


Initializing vector store...
Error loading vectorstore: The de-serialization relies loading a pickle file. Pickle files can be modified to deliver a malicious payload that results in execution of arbitrary code on your machine.You will need to set `allow_dangerous_deserialization` to `True` to enable deserialization. If you do this, make sure that you trust the source of the data. For example, if you are loading a file that you created, and know that no one else has modified the file, then this is safe to do. Do not set this to `True` if you are loading a file from an untrusted source (e.g., some random site on the internet.).
Creating vector store...
Vector store created successfully!


Device set to use cuda:0


 * Serving Flask app '__main__'
 * Debug mode: off
 * Ngrok tunnel: NgrokTunnel: "https://501f-34-83-251-195.ngrok-free.app" -> "http://localhost:6000"


Address already in use
Port 6000 is in use by another program. Either identify and stop that program, or start the server with a different port.


In [None]:
import requests

# Chat endpoint on the public tunnel. The host changes every time the tunnel
# is restarted, so replace it with the URL printed by the server cell.
ngrok_url = "https://501f-34-83-251-195.ngrok-free.app/chat"

# Define the payload
payload = {"question": "what is AI"}

# Send the POST request; the timeout keeps the cell from hanging forever if
# the tunnel is gone or the model is stuck.
response = requests.post(ngrok_url, json=payload, timeout=120)

# Check and print the response
if response.status_code == 200:
    print("Response:", response.json())
else:
    print("Error:", response.status_code, response.text)

Token indices sequence length is longer than the specified maximum sequence length for this model (3240 > 512). Running this sequence through the model will result in indexing errors
INFO:werkzeug:127.0.0.1 - - [07/Feb/2025 20:40:19] "POST /chat HTTP/1.1" 200 -


Response: {'answer': 'AI is a computer program that helps people learn about the world around them.', 'sources': ['Brainlox: Learn technical courses.']}
