In [1]:
import os
from dotenv import load_dotenv
from huggingface_hub import login

# Vector DB
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_huggingface import HuggingFaceEmbeddings, ChatHuggingFace, HuggingFacePipeline

# Chat
import torch
from langchain_openai import ChatOpenAI
from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
from langchain_huggingface import HuggingFacePipeline, ChatHuggingFace
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

# UI
import gradio as gr

In [2]:
# Filesystem layout: every vector store lives under <working directory>/vector_dbs
ABSOLUTE_PATH = os.path.abspath(os.getcwd())
VDB_PATH = os.path.join(ABSOLUTE_PATH, "vector_dbs")


def set_path(db_name: str, vdb_path: str = VDB_PATH) -> str:
    """Build the path of the vector store named ``db_name``.

    Args:
        db_name: Folder name of the vector store (e.g. ``"openai"``).
        vdb_path: Parent directory that holds the stores. The default is the
            value ``VDB_PATH`` had when this function was defined.

    Returns:
        ``vdb_path`` and ``db_name`` joined with the platform path separator.
    """
    store_path = os.path.join(vdb_path, db_name)
    return store_path


# One FAISS store per embedding provider
FAISSDB_OPENAI_PATH = set_path(db_name="openai")
FAISSDB_HF_PATH = set_path(db_name="hugging_face")

In [3]:
# Read credentials from the environment. load_dotenv(override=True) lets values
# from a local .env file replace variables that are already exported.
load_dotenv(override=True)


def _require_env(var_name: str) -> str:
    """Return the value of environment variable ``var_name``.

    Raises:
        RuntimeError: If the variable is unset or empty. Failing here gives a
            clear message instead of sending a placeholder credential to a
            remote service.
    """
    value = os.getenv(var_name)
    if not value:
        raise RuntimeError(
            f"Environment variable {var_name} is not set. "
            "Add it to your .env file or export it before running this cell."
        )
    return value


# Load OpenAI Embeddings
# The key is only validated here; OpenAIEmbeddings() is called without arguments
# and is expected to pick OPENAI_API_KEY up from the environment itself.
OPENAI_API_KEY = _require_env("OPENAI_API_KEY")
openai_embeddings = OpenAIEmbeddings()

# Load Hugging Face Embeddings
HF_TOKEN = _require_env("HUGGINGFACE_TOKEN")
login(HF_TOKEN)
hf_embeddings = HuggingFaceEmbeddings(model="sentence-transformers/all-MiniLM-L6-v2")

# Reaching this line means both constructors returned without raising.
print("Embeddings are loaded.")

Embeddings are loaded.


In [4]:
# Load FAISS vectorstore function
def load_faiss_db(db_path: str, embeddings) -> "FAISS | None":
    """Load a FAISS vector store that was persisted at ``db_path``.

    Args:
        db_path: Location of the saved store, passed to ``FAISS.load_local``.
        embeddings: Embedding model passed to ``FAISS.load_local``.
            NOTE(review): presumably this must be the same model that produced
            the stored vectors — confirm against the notebook that built them.

    Returns:
        The loaded ``FAISS`` store, or ``None`` when ``db_path`` does not exist
        (a message is printed in that case). Callers must handle ``None``.
    """
    # Guard clause: a missing store is reported and signalled with an explicit None.
    if not os.path.exists(db_path):
        print(f"{db_path} not found in directory.\n")
        return None

    print(f"Loading {db_path} vector stores ...")
    # SECURITY: allow_dangerous_deserialization=True permits unpickling the saved
    # store, which can execute arbitrary code. Only load stores you created
    # yourself or otherwise trust.
    vectorstores = FAISS.load_local(db_path, embeddings=embeddings, allow_dangerous_deserialization=True)

    num_docs = vectorstores.index.ntotal
    dim = vectorstores.index.d
    print(f"{db_path} vector stores are loaded.")
    print(f"Found {num_docs} documents with {dim} dimensions.\n")
    return vectorstores
# Load both persisted indexes, each with the embedding object named after its store
faiss_openai_vectorstores = load_faiss_db(
    db_path=FAISSDB_OPENAI_PATH,
    embeddings=openai_embeddings,
)
faiss_hf_vectorstores = load_faiss_db(
    db_path=FAISSDB_HF_PATH,
    embeddings=hf_embeddings,
)

Loading D:\Learn\LLM\llm_engineering\my_projects\nawatech_test\case2\vector_dbs\openai vector stores ...
D:\Learn\LLM\llm_engineering\my_projects\nawatech_test\case2\vector_dbs\openai vector stores are loaded.
Found 11 documents with 1536 dimensions.

Loading D:\Learn\LLM\llm_engineering\my_projects\nawatech_test\case2\vector_dbs\hugging_face vector stores ...
D:\Learn\LLM\llm_engineering\my_projects\nawatech_test\case2\vector_dbs\hugging_face vector stores are loaded.
Found 11 documents with 384 dimensions.



# RAG

In [5]:
# Ollama model
# These values are handed to ChatOpenAI (api_key / base_url / model) when the
# Ollama model is selected in setup_conversation_chain.
# NOTE(review): "ollama" looks like a placeholder rather than a real secret — confirm.
OLLAMA_API_KEY = "ollama"
OLLAMA_BASE_URL = "http://localhost:11434/v1"
llama_model = "llama3.2:latest"

# GPT model
# Model name passed to ChatOpenAI when the GPT model is selected.
gpt_model = "gpt-4o-mini"

In [7]:
# Hugging Face Model
hf_model = "meta-llama/Llama-3.2-1B-Instruct"

# Step 1 - model weights, loaded in 4-bit (NF4, double quantization, bfloat16 compute)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
llama_hf_model = AutoModelForCausalLM.from_pretrained(
    hf_model,
    quantization_config=quantization_config,
    device_map="auto",
)

# Step 2 - matching tokenizer; its EOS token is reused as the padding token
# NOTE(review): trust_remote_code=True allows repository-supplied code to run;
# confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(hf_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

if all((llama_hf_model, tokenizer)):
    print("Hugging Face model and its tokenizer are loaded.")

Hugging Face model and its tokenizer are loaded.


In [9]:
def _build_llm(model: str, temperature: float):
    """Instantiate the chat model selected by ``model`` (private helper)."""
    if model == gpt_model:
        llm = ChatOpenAI(temperature=temperature, model=gpt_model)
        print(f"{gpt_model} model loaded.")
        return llm

    if model == llama_model:
        llm = ChatOpenAI(temperature=temperature, model=llama_model, api_key=OLLAMA_API_KEY, base_url=OLLAMA_BASE_URL)
        print(f"{llama_model} model loaded.")
        return llm

    # `and`, not `or`: the Hugging Face branch must be taken only when it was
    # explicitly requested AND the model/tokenizer are loaded. With `or`, every
    # unknown model name silently fell back to the Hugging Face model and the
    # ValueError below could never be reached.
    if model == hf_model and llama_hf_model is not None and tokenizer is not None:
        print(f"Loading {hf_model} model ...")

        # Sampling requires a strictly positive temperature; for temperature <= 0
        # switch to greedy decoding instead of letting generation fail.
        if temperature > 0:
            generation_kwargs = {"do_sample": True, "temperature": temperature}
        else:
            generation_kwargs = {"do_sample": False}

        text_pipeline = pipeline(
            "text-generation",
            model=llama_hf_model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            pad_token_id=tokenizer.eos_token_id,
            return_full_text=False,
            **generation_kwargs,
        )
        hf_llm = HuggingFacePipeline(pipeline=text_pipeline)
        print("Hugging face pipeline created.")

        llm = ChatHuggingFace(llm=hf_llm, model_id=hf_model)
        print(f"{hf_model} loaded.")
        return llm

    raise ValueError(f"Unknown or invalid model: {model}")


def setup_conversation_chain(model: str, vectorstores, temperature: float):
    """Build a conversational RAG chain from a chat model, a vector store and memory.

    Args:
        model: One of the module-level model names ``gpt_model``, ``llama_model``
            or ``hf_model``.
        vectorstores: Vector store exposing ``as_retriever()``; used as the
            retrieval backend of the chain.
        temperature: Sampling temperature forwarded to the chat model. For the
            Hugging Face pipeline a value <= 0 selects greedy decoding.

    Returns:
        A ``ConversationalRetrievalChain`` wired to the selected model, the
        retriever and a fresh ``ConversationBufferMemory``.

    Raises:
        ValueError: If ``vectorstores`` is ``None``, if ``model`` is not one of
            the known names, or if the Hugging Face model is requested while its
            model/tokenizer are ``None``.
    """
    # Fail fast with a clear message; load_faiss_db returns None for a missing store.
    if vectorstores is None:
        raise ValueError(
            "vectorstores is None; load the FAISS vector store before building the conversation chain."
        )

    # 1. Set-up the model
    llm = _build_llm(model, temperature)

    # 2. Set-up the retriever: the retriever is an abstraction over the VectorStore that will be used during RAG
    retriever = vectorstores.as_retriever()
    print(f"{vectorstores} has been set-up as retriever.")

    # 3. Set-up the conversation memory for the chat
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    print("chat_history memroy has been set-up.")

    # Putting it together: set-up the conversation chain with the selected model, the vector store and memory
    conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

    return conversation_chain

# Smoke test: build one chain with the GPT model over the OpenAI-embedded store
conversation_chain = setup_conversation_chain(gpt_model, faiss_openai_vectorstores, 0.7)
if conversation_chain:
    print("\nConversation chain is ready to be tested and used.")

gpt-4o-mini model loaded.
<langchain_community.vectorstores.faiss.FAISS object at 0x0000027466A36D10> has been set-up as retriever.
chat_history memroy has been set-up.

Conversation chain is ready to be tested and used.


# Testing the Conversation Chain

In [10]:
# Chain used by chat(): GPT model over the OpenAI-embedded store at temperature 0.1.
# This rebinds the module-level conversation_chain.
conversation_chain = setup_conversation_chain(gpt_model, faiss_openai_vectorstores, 0.1)

def chat(message, history):
    """Answer one user message with the module-level ``conversation_chain``.

    Args:
        message: The user's latest message; sent to the chain under the
            ``"question"`` key.
        history: Accepted to fit the ``(message, history)`` callback signature
            but not read here.

    Returns:
        The ``"answer"`` entry of the chain's response.
    """
    return conversation_chain.invoke({"question": message})["answer"]

gpt-4o-mini model loaded.
<langchain_community.vectorstores.faiss.FAISS object at 0x0000027466A36D10> has been set-up as retriever.
chat_history memroy has been set-up.


In [11]:
# Wrap chat() in a Gradio chat UI and start serving it. `view` receives whatever
# launch() returns, not the ChatInterface object itself.
view = gr.ChatInterface(
    fn=chat,
    type="messages",
).launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.
