<a href="https://colab.research.google.com/github/dodeeric/langchain-ai-assistant-with-hybrid-rag/blob/main/BMAE_AI_Assistant_with_hybrid_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### AI Assistant (LLM Chatbot) with Hybrid RAG
Hybrid RAG: keyword search (bm25) and semantic search (vector db)

## Scrape Web pages and save the results in a JSON file

In [None]:
!pip install --upgrade --quiet bs4

import requests, json, time
from bs4 import BeautifulSoup

In [None]:
# Function: Scrape Commons Summary = scs
# Scrape the summary section and the metadata fields of a Wikimedia Commons web page.

def scrape_commons_summary(url):
    """
    Scrape the summary section and the metadata fields of a Wikimedia Commons web page.

    Input: URL of the page
    Output: JSON string with: url: url, metadata: metadata, text: summary text
            (text is an empty string when the page has no file-information table)
    Raises: requests.HTTPError if the server answers with a 4xx/5xx status,
            requests.Timeout if the server does not answer within 30 seconds
    """

    # Get the HTML code. The timeout keeps one unresponsive server from blocking the whole run.
    response = requests.get(url, timeout=30)
    # Stop on an HTTP error instead of scraping the error page
    response.raise_for_status()
    # Transform the HTML code from a Response object type into a BeautifulSoup object type to be scraped by Beautiful Soup
    soup = BeautifulSoup(response.text, "html.parser")

    # Get the Summary content which is in a specific table
    content_table = soup.find('div', {'class': 'hproduct commons-file-information-table'})
    if content_table is None:
        # No summary table on this page: keep the metadata and return an empty text
        summary = ""
    else:
        # The actual summary text is within 'td' tags.
        # Joining the cells with \n keeps each key and each value on its own line: k1\n v1\n k2\n v2\n
        summary = '\n'.join(td.get_text() for td in content_table.find_all('td'))
        # Replace '\n\n' by '\n' until no empty line is left
        while '\n\n' in summary:
            summary = summary.replace('\n\n', '\n')

    # Get the metadata fields: one entry per meta tag having both a property and a content attribute
    metadata = {}
    for tag in soup.find_all("meta"):
        meta_property = tag.get("property")
        meta_content = tag.get("content")
        if meta_property and meta_content:
            metadata[meta_property] = meta_content

    # Build the JSON string with: url: url, metadata: metadata, text: summary text
    page = {
        "url": url, # String
        "metadata": metadata, # Dictionary
        "text": summary, # String
    }
    # json.dumps writes non-ASCII characters as \uXXXX escape codes (ensure_ascii defaults to True)
    return json.dumps(page)

In [None]:
# Scrape the URLs (one per line in the text file) and save the results in a Python list

file_path = "/content/drive/MyDrive/colab/commons-urls-ds1"

data = []
# "utf-8-sig" reads the file as UTF-8 and drops a BOM at the start of the file if there is one
with open(f"{file_path}.txt", "r", encoding="utf-8-sig") as url_file:
    for line in url_file:
        url = line.replace("\ufeff", "").strip()  # Remove any BOM left inside the file, then the whitespace
        if not url:
            continue  # Blank line (often the last line of the file): there is no URL to request
        page_json = scrape_commons_summary(url)
        data.append(page_json)
        time.sleep(1)  # Wait one second before the next request

# Save the Python list in a JSON file. File name: xxx.txt --> xxx-scs.json
with open(f"{file_path}-scs.json", "w") as output_file:
    json.dump(data, output_file)

In [None]:
# Read the JSON file back to check its content (an error is raised if the file is not correctly formatted JSON)
with open("/content/drive/MyDrive/colab/commons-urls-ds1-scs.json", "r") as file:
    data_read = json.loads(file.read())

## Index

Open the JSON file and embed the items in a Chroma vector DB.

In [None]:
!pip install --upgrade --quiet jq langchain langchain-community langchain-openai langchain-chroma langchainhub rank_bm25

import jq
from google.colab import userdata
from langchain_community.document_loaders import JSONLoader
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain.retrievers import BM25Retriever, EnsembleRetriever

import os

# Read the API keys from the Colab secrets and export them as environment variables.
# os.environ is used rather than the %env magic because %env echoes every assignment
# (value included) in the cell output, which would save the API keys in the notebook,
# and because %env takes the text after "=" literally, quotes included.
OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")
LANGCHAIN_API_KEY = userdata.get("LANGCHAIN_API_KEY")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY
os.environ["LANGCHAIN_TRACING_V2"] = "true"

In [None]:
# Load each item of the JSON file as one document and embed the documents in a Chroma vector DB

file_path = "/content/drive/MyDrive/colab/commons-urls-ds1-scs.json"
collection_name = "bmae-json"

# One document (chunk) per item of the top-level JSON array
loader = JSONLoader(
    file_path=file_path,
    jq_schema=".[]",
    text_content=False,
)
documents = loader.load()

# Model used to embed the JSON items and the questions
# (text-embedding-3-large returns 3072-dimension vectors by default)
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")

# Embed the documents and store them in a collection persisted on Google Drive
vector_db = Chroma.from_documents(
    documents,
    embedding_model,
    collection_name=collection_name,
    persist_directory="/content/drive/MyDrive/colab/chromadb2",
)

## Retrieve and generate

In [None]:
# Components of the LLM chatbot with a hybrid RAG chain.
# (The question is embedded with the same model as the data; that model is attached to "vector_db".)

llm = ChatOpenAI(
    model="gpt-4-turbo-2024-04-09",
    temperature=0.1,
)

# Semantic search: vector retriever on the Chroma DB, returning 6 documents
vector_retriever = vector_db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6},
)

# Keyword search: BM25 retriever built from the same documents, returning 6 documents
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k = 6

# Ensemble retriever: mix of both retrievers, with equal weights
ensemble_retriever = EnsembleRetriever(
    retrievers=[keyword_retriever, vector_retriever],
    weights=[0.5, 0.5],
)

# Download the prompt template (system prompt + context (RAG documents) + user question)
prompt = hub.pull("dodeeric/rag-prompt-bmae")

# Take the text content of each doc, and concatenate them in one string to pass to the prompt (context)
def format_docs_clear_text(docs):
    """
    Concatenate the text content of the documents in one string (the context passed to the prompt).

    The escape codes found in the text (unicode escapes, escaped newlines, escaped quotes)
    are converted to the characters they stand for.
    Input: iterable of documents having a page_content string
    Output: one string, with the documents separated by an empty line
    """

    def clear_text(text):
        # Encoding with latin-1 + backslashreplace (and not utf-8) keeps the characters which are
        # already in clear text: unicode_escape reads each byte as latin-1, so UTF-8 bytes would
        # come back as mojibake (é --> Ã©).
        decoded = text.encode('latin-1', 'backslashreplace').decode('unicode_escape')
        # JSON writes a character outside the BMP as two \uXXXX surrogate codes, which
        # unicode_escape leaves as two separate surrogates: the UTF-16 round trip merges them
        # into the real character (a surrogate without its pair becomes U+FFFD).
        return decoded.encode('utf-16', 'surrogatepass').decode('utf-16', 'replace')

    return "\n\n".join(clear_text(doc.page_content) for doc in docs)

# Function to display the text content of the prompt in ai_assistant_chain
def print_and_pass(data):
    """
    Print the content received (the prompt, when placed after it in ai_assistant_chain) and return it unchanged.
    Input: any value
    Output: the same value, so that the function can be inserted as a step of the chain
    """
    message = f"Prompt content sent to the LLM: {data}"
    print(message)
    return data

# Hybrid RAG chain. The input (the question) is sent to the ensemble retriever, whose documents
# are formatted in one "context" string; the input is also passed as is as "question".
# Both fill the prompt template, which is sent to the LLM.
ai_assistant_chain = (
    {"context": ensemble_retriever | format_docs_clear_text, "question": RunnablePassthrough()}
    | prompt
    #| print_and_pass  # Uncomment to print the prompt content sent to the LLM
    | llm
    | StrOutputParser() # Convert the LLM output to a string
)

Query the AI Assistant:

In [None]:
# Question asked to the assistant (French for: "Can you show me portraits of King Leopold I?")
question = "Pouvez-vous me montrer des portraits du roi Léopold Ier ?"

In [None]:
# Run the hybrid RAG chain on the question (last expression of the cell, so the notebook displays the answer)
ai_assistant_chain.invoke(question)

In [None]:
# Query the vector DB only (semantic search, without the keyword search and without the LLM)
# and print the resulting RAG context
docs = vector_db.similarity_search(question, k=2) # List of 2 Documents; page_content of a Document: string
rag_context = format_docs_clear_text(docs) # One string composed of the k JSON items
print(rag_context)