# This is an RAG Chatbot that can answer questions based on an article about Vendée Globe

### Imports and warnings off

In [4]:
import requests
from bs4 import BeautifulSoup
from langchain.schema import Document  
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import pipeline, AutoTokenizer
from langchain.memory import ConversationBufferWindowMemory
from langchain.schema import AIMessage, HumanMessage 
import warnings
warnings.filterwarnings("ignore")

### Scraping function

In [5]:
def content_from_url(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        paragraphs = soup.find_all('p')
        content = " ".join([para.get_text() for para in paragraphs])
        return content.strip()
    else:
        print(f"Failed: {response.status_code}")
        return None

### Initializing FAISS Database and Q&A pipeline

In [6]:
def init_db(content, url, model_path, model_name):
    documents = [Document(page_content=content, metadata={"source": url})]
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = text_splitter.split_documents(documents)

    model_kwargs = {'device': 'cpu'} 
    encode_kwargs = {'normalize_embeddings': False}
    tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, truncation=True, max_length=512)

    embeddings = HuggingFaceEmbeddings(
        model_name=model_path,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )
    db = FAISS.from_documents(docs, embeddings)
    retriever = db.as_retriever(search_kwargs={"k": 4})

    qa_pipeline_instance = pipeline(
        "question-answering",
        model=model_name,
        tokenizer=tokenizer,
        return_tensors='pt'
    )

    return retriever, qa_pipeline_instance

### Chatbot class

In [10]:
class Chatbot:
    def __init__(self, retriever, qa_pipeline, memory_length=6, prompt_template="Hello! I am your assistant. {context}"):
        self.retriever = retriever
        self.qa_pipeline = qa_pipeline
        self.memory = ConversationBufferWindowMemory(k=memory_length)
        self.prompt_template = prompt_template
        self.prompt = self.prompt_template.format(context="How can I help you today?")
        self.memory.chat_memory.add_message(AIMessage(content=self.prompt))  # Add prompt to memory   

    def ask(self, question):
        # Retrieve relevant documents
        search_docs = self.retriever.invoke(question)
        context = "\n".join([doc.page_content for doc in search_docs])
        
        try:
            
            answer = self.qa_pipeline(question=question, context=context)
            
            self.memory.chat_memory.add_message(HumanMessage(content=question))
            self.memory.chat_memory.add_message(AIMessage(content=answer['answer']))
            
            return answer['answer']
        except Exception as e:
            return f"Error: {e}"
    
    def print_memory(self):
        return self.memory.load_memory_variables({})['history']
    
    def reset_memory(self):
        self.memory.clear()
        self.memory.chat_memory.add_message(AIMessage(content=self.prompt))
    
    def chat(self):
        print(self.prompt)
        print("Type 'exit' to end the chat.")
        while True:
            user_input = input("You: ")
            if user_input.lower() == 'exit':
                print("Chat ended.")
                break
            response = self.ask(user_input)
            print(f"Bot: {response}")


### Building

In [8]:
url = "https://www.vendeeglobe.org/en/article/first-interim-debrief-after-one-month-10th-edition-vendee-globe"
content =content_from_url(url)

if content:
    model_path = "sentence-transformers/all-MiniLM-l6-v2"
    model_name = "Intel/dynamic_tinybert"
    
    retriever, qa_pipeline_instance = init_db(content, url, model_path, model_name)
    
    # Init
    chatbot = Chatbot(
        retriever, 
        qa_pipeline_instance, 
        memory_length=5, 
        prompt_template="Hi! I am your vendee globe article bot. {context}"
    )
    
    print("Use `chatbot.chat()` to start a conversation,\n`chatbot.print_memory()` to view memory, and\n`chatbot.reset_memory()` to clear memory.")
else:
    print("Failed to fetch url content.")

chatbot = Chatbot(
    retriever, 
    qa_pipeline_instance, 
    memory_length=6
)

Use `chatbot.chat()` to start a conversation,
`chatbot.print_memory()` to view memory, and
`chatbot.reset_memory()` to clear memory.


## Test

In [9]:
chatbot.chat()

Hello! I am your assistant. How can I help you today?
Type 'exit' to end the chat.


You:  who is the captain of Macif?


Bot: Charlie Dalin


You:  exit


Chat ended.
